diff options
Diffstat (limited to 'tools/testing/selftests')
248 files changed, 19757 insertions, 2563 deletions
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2ff68702fd41..1195bd85af38 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -249,10 +249,17 @@ else $(error Error: set INSTALL_PATH to use install) endif +FORMAT ?= .gz +TAR_PATH = $(abspath ${INSTALL_PATH}/kselftest-packages/kselftest.tar${FORMAT}) +gen_tar: install + @mkdir -p ${INSTALL_PATH}/kselftest-packages/ + @tar caf ${TAR_PATH} --exclude=kselftest-packages -C ${INSTALL_PATH} . + @echo "Created ${TAR_PATH}" + clean: @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; -.PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean +.PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean gen_tar diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index c30079c86998..1bb204cee853 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -30,8 +30,6 @@ test_tcpnotify_user test_libbpf test_tcp_check_syncookie_user test_sysctl -test_hashmap -test_btf_dump test_current_pid_tgid_new_ns xdping test_cpp @@ -39,4 +37,5 @@ test_cpp /no_alu32 /bpf_gcc /tools - +/runqslower +/bench diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7729892e0b04..22aaec74ea0a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -2,6 +2,8 @@ include ../../../../scripts/Kbuild.include include ../../../scripts/Makefile.arch +CXX ?= $(CROSS_COMPILE)g++ + CURDIR := $(abspath .) TOOLSDIR := $(abspath ../../..) 
LIBDIR := $(TOOLSDIR)/lib @@ -20,9 +22,10 @@ CLANG ?= clang LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) -CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) -I$(CURDIR) \ - -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \ - -I$(APIDIR) \ +SAN_CFLAGS ?= +CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \ + -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread @@ -32,7 +35,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ - test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \ + test_netcnt test_tcpnotify_user test_sock_fields test_sysctl \ test_progs-no_alu32 \ test_current_pid_tgid_new_ns @@ -76,7 +79,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ - test_lirc_mode2_user xdping test_cpp runqslower + test_lirc_mode2_user xdping test_cpp runqslower bench TEST_CUSTOM_PROGS = urandom_read @@ -141,7 +144,8 @@ VMLINUX_BTF := $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) $(OUTPUT)/runqslower: $(BPFOBJ) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(SCRATCH_DIR)/ VMLINUX_BTF=$(VMLINUX_BTF) \ - BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) + BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) && \ + cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) @@ -241,7 +245,7 @@ define GCC_BPF_BUILD_RULE $(BPF_GCC) $3 $4 -O2 -c $1 -o $2 endef -SKEL_BLACKLIST := btf__% test_pinning_invalid.c +SKEL_BLACKLIST := btf__% 
test_pinning_invalid.c test_sk_assign.c # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. @@ -263,6 +267,7 @@ TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(filter-out $(SKEL_BLACKLIST), \ $$(TRUNNER_BPF_SRCS))) +TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) # Evaluate rules now with extra TRUNNER_XXX variables above already defined $$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2)) @@ -323,7 +328,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_BPF_SKELS) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) - cd $$(@D) && $$(CC) $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ @@ -352,6 +357,7 @@ endef TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ + network_helpers.c testing_helpers.c \ flow_dissector_load.h TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(wildcard progs/btf_dump_test_case_*.c) @@ -403,6 +409,24 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) $(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ +# Benchmark runner +$(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h + $(call msg,CC,,$@) + $(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ +$(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h +$(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h +$(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ + $(OUTPUT)/perfbuf_bench.skel.h +$(OUTPUT)/bench.o: bench.h testing_helpers.h +$(OUTPUT)/bench: LDLIBS += -lm +$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ + $(OUTPUT)/bench_count.o \ + $(OUTPUT)/bench_rename.o \ + $(OUTPUT)/bench_trigger.o \ + 
$(OUTPUT)/bench_ringbufs.o + $(call msg,BINARY,,$@) + $(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS) + EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst new file mode 100644 index 000000000000..e885d351595f --- /dev/null +++ b/tools/testing/selftests/bpf/README.rst @@ -0,0 +1,45 @@ +================== +BPF Selftest Notes +================== +General instructions on running selftests can be found in +`Documentation/bpf/bpf_devel_QA.rst`_. + +Additional information about selftest failures is +documented here. + +bpf_iter test failures with clang/llvm 10.0.0 +============================================= + +With clang/llvm 10.0.0, the following two bpf_iter tests failed: + * ``bpf_iter/ipv6_route`` + * ``bpf_iter/netlink`` + +The symptom for ``bpf_iter/ipv6_route`` looks like + +.. code-block:: c + + 2: (79) r8 = *(u64 *)(r1 +8) + ... + 14: (bf) r2 = r8 + 15: (0f) r2 += r1 + ; BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); + 16: (7b) *(u64 *)(r8 +64) = r2 + only read is supported + +The symptom for ``bpf_iter/netlink`` looks like + +.. code-block:: c + + ; struct netlink_sock *nlk = ctx->sk; + 2: (79) r7 = *(u64 *)(r1 +8) + ... + 15: (bf) r2 = r7 + 16: (0f) r2 += r1 + ; BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol); + 17: (7b) *(u64 *)(r7 +0) = r2 + only read is supported + +This is due to an LLVM BPF backend bug. The fix + https://reviews.llvm.org/D78466 +has been pushed to the llvm 10.x release branch and will be +available in 10.0.1. The fix is available in llvm 11.0.0 trunk. 
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
new file mode 100644
index 000000000000..944ad4721c83
--- /dev/null
+++ b/tools/testing/selftests/bpf/bench.c
@@ -0,0 +1,465 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <argp.h>
+#include <linux/compiler.h>
+#include <sys/time.h>
+#include <sched.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/sysinfo.h>
+#include <sys/resource.h>
+#include <signal.h>
+#include "bench.h"
+#include "testing_helpers.h"
+
+/* runner-wide settings; the values below are the defaults used when the
+ * corresponding command-line option is not given
+ */
+struct env env = {
+	.warmup_sec = 1,
+	.duration_sec = 5,
+	.affinity = false,
+	.consumer_cnt = 1,
+	.producer_cnt = 1,
+};
+
+/* libbpf log callback: drop debug-level messages unless verbose output was
+ * requested, print everything else to stderr
+ */
+static int libbpf_print_fn(enum libbpf_print_level level,
+		    const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !env.verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+/* raise both the soft and hard RLIMIT_MEMLOCK to RLIM_INFINITY;
+ * returns whatever setrlimit() returns
+ */
+static int bump_memlock_rlimit(void)
+{
+	struct rlimit rlim_new = {
+		.rlim_cur	= RLIM_INFINITY,
+		.rlim_max	= RLIM_INFINITY,
+	};
+
+	return setrlimit(RLIMIT_MEMLOCK, &rlim_new);
+}
+
+/* Install the libbpf log callback and try to lift RLIMIT_MEMLOCK.
+ * Failing to raise the limit is reported on stderr but is not fatal.
+ */
+void setup_libbpf()
+{
+	int err;
+
+	libbpf_set_print(libbpf_print_fn);
+
+	err = bump_memlock_rlimit();
+	if (err) {
+		/* setrlimit() only returns -1; the cause is in errno.
+		 * Terminate the line so later output starts on its own line.
+		 */
+		fprintf(stderr, "failed to increase RLIMIT_MEMLOCK: %d\n", -errno);
+	}
+}
+
+/* Print one progress line for iteration @iter: hit and drop rates in
+ * millions per second, plus the per-producer hit rate. The figure in
+ * parentheses is how far @delta_ns is from one second, in microseconds.
+ */
+void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns)
+{
+	double hits_per_sec, drops_per_sec;
+	double hits_per_prod;
+
+	hits_per_sec = res->hits / 1000000.0 / (delta_ns / 1000000000.0);
+	hits_per_prod = hits_per_sec / env.producer_cnt;
+	drops_per_sec = res->drops / 1000000.0 / (delta_ns / 1000000000.0);
+
+	printf("Iter %3d (%7.3lfus): ",
+	       iter, (delta_ns - 1000000000) / 1000.0);
+
+	printf("hits %8.3lfM/s (%7.3lfM/prod), drops %8.3lfM/s\n",
+	       hits_per_sec, hits_per_prod, drops_per_sec);
+}
+
+void hits_drops_report_final(struct bench_res res[], int res_cnt)
+{
+	int i;
+	double hits_mean = 0.0, drops_mean = 0.0;
+	double hits_stddev = 0.0, drops_stddev = 0.0;
+
+	for (i = 0; i < 
res_cnt; i++) { + hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt); + drops_mean += res[i].drops / 1000000.0 / (0.0 + res_cnt); + } + + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) { + hits_stddev += (hits_mean - res[i].hits / 1000000.0) * + (hits_mean - res[i].hits / 1000000.0) / + (res_cnt - 1.0); + drops_stddev += (drops_mean - res[i].drops / 1000000.0) * + (drops_mean - res[i].drops / 1000000.0) / + (res_cnt - 1.0); + } + hits_stddev = sqrt(hits_stddev); + drops_stddev = sqrt(drops_stddev); + } + printf("Summary: hits %8.3lf \u00B1 %5.3lfM/s (%7.3lfM/prod), ", + hits_mean, hits_stddev, hits_mean / env.producer_cnt); + printf("drops %8.3lf \u00B1 %5.3lfM/s\n", + drops_mean, drops_stddev); +} + +const char *argp_program_version = "benchmark"; +const char *argp_program_bug_address = "<bpf@vger.kernel.org>"; +const char argp_program_doc[] = +"benchmark Generic benchmarking framework.\n" +"\n" +"This tool runs benchmarks.\n" +"\n" +"USAGE: benchmark <bench-name>\n" +"\n" +"EXAMPLES:\n" +" # run 'count-local' benchmark with 1 producer and 1 consumer\n" +" benchmark count-local\n" +" # run 'count-local' with 16 producer and 8 consumer thread, pinned to CPUs\n" +" benchmark -p16 -c8 -a count-local\n"; + +enum { + ARG_PROD_AFFINITY_SET = 1000, + ARG_CONS_AFFINITY_SET = 1001, +}; + +static const struct argp_option opts[] = { + { "list", 'l', NULL, 0, "List available benchmarks"}, + { "duration", 'd', "SEC", 0, "Duration of benchmark, seconds"}, + { "warmup", 'w', "SEC", 0, "Warm-up period, seconds"}, + { "producers", 'p', "NUM", 0, "Number of producer threads"}, + { "consumers", 'c', "NUM", 0, "Number of consumer threads"}, + { "verbose", 'v', NULL, 0, "Verbose debug output"}, + { "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"}, + { "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0, + "Set of CPUs for producer threads; implies --affinity"}, + { "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0, + "Set of CPUs for consumer threads; 
implies --affinity"},
+	{},
+};
+
+extern struct argp bench_ringbufs_argp;
+
+static const struct argp_child bench_parsers[] = {
+	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
+	{},
+};
+
+/* argp callback for the runner's own options. Each option is stored in
+ * the global env; durations and thread counts must be positive. The first
+ * positional argument becomes the benchmark name, any further one is
+ * rejected. Unknown keys return ARGP_ERR_UNKNOWN.
+ */
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	static int pos_args;
+
+	switch (key) {
+	case 'v':
+		env.verbose = true;
+		break;
+	case 'l':
+		env.list = true;
+		break;
+	case 'd':
+		env.duration_sec = strtol(arg, NULL, 10);
+		if (env.duration_sec <= 0) {
+			fprintf(stderr, "Invalid duration: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 'w':
+		env.warmup_sec = strtol(arg, NULL, 10);
+		if (env.warmup_sec <= 0) {
+			fprintf(stderr, "Invalid warm-up duration: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 'p':
+		env.producer_cnt = strtol(arg, NULL, 10);
+		if (env.producer_cnt <= 0) {
+			fprintf(stderr, "Invalid producer count: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 'c':
+		env.consumer_cnt = strtol(arg, NULL, 10);
+		if (env.consumer_cnt <= 0) {
+			fprintf(stderr, "Invalid consumer count: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 'a':
+		env.affinity = true;
+		break;
+	case ARG_PROD_AFFINITY_SET:
+		env.affinity = true;
+		/* a non-zero return from parse_num_list() is treated as a
+		 * malformed CPU list
+		 */
+		if (parse_num_list(arg, &env.prod_cpus.cpus,
+				   &env.prod_cpus.cpus_len)) {
+			fprintf(stderr, "Invalid format of CPU set for producers.\n");
+			argp_usage(state);
+		}
+		break;
+	case ARG_CONS_AFFINITY_SET:
+		env.affinity = true;
+		if (parse_num_list(arg, &env.cons_cpus.cpus,
+				   &env.cons_cpus.cpus_len)) {
+			fprintf(stderr, "Invalid format of CPU set for consumers.\n");
+			argp_usage(state);
+		}
+		break;
+	case ARGP_KEY_ARG:
+		if (pos_args++) {
+			fprintf(stderr,
+				"Unrecognized positional argument: %s\n", arg);
+			argp_usage(state);
+		}
+		env.bench_name = strdup(arg);
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+static void parse_cmdline_args(int argc, char **argv)
+{
+	static const struct argp argp = {
+		.options = opts,
+		.parser = parse_arg,
+		.doc = argp_program_doc,
+		
.children = bench_parsers,
+	};
+	if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
+		exit(1);
+	if (!env.list && !env.bench_name) {
+		argp_help(&argp, stderr, ARGP_HELP_DOC, "bench");
+		exit(1);
+	}
+}
+
+static void collect_measurements(long delta_ns);
+
+/* timestamp of the previous timer tick (or of timer setup) */
+static __u64 last_time_ns;
+
+/* SIGALRM handler: hand the time elapsed since the previous tick to
+ * collect_measurements() and remember the new timestamp
+ */
+static void sigalarm_handler(int signo)
+{
+	long new_time_ns = get_time_ns();
+	long delta_ns = new_time_ns - last_time_ns;
+
+	collect_measurements(delta_ns);
+
+	last_time_ns = new_time_ns;
+}
+
+/* set up periodic 1-second timer */
+static void setup_timer()
+{
+	static struct sigaction sigalarm_action = {
+		.sa_handler = sigalarm_handler,
+	};
+	struct itimerval timer_settings = {};
+	int err;
+
+	last_time_ns = get_time_ns();
+	err = sigaction(SIGALRM, &sigalarm_action, NULL);
+	if (err < 0) {
+		fprintf(stderr, "failed to install SIGALRM handler: %d\n", -errno);
+		exit(1);
+	}
+	timer_settings.it_interval.tv_sec = 1;
+	timer_settings.it_value.tv_sec = 1;
+	err = setitimer(ITIMER_REAL, &timer_settings, NULL);
+	if (err < 0) {
+		fprintf(stderr, "failed to arm interval timer: %d\n", -errno);
+		exit(1);
+	}
+}
+
+/* Restrict @thread to run on CPU @cpu only; exits the process on failure. */
+static void set_thread_affinity(pthread_t thread, int cpu)
+{
+	cpu_set_t cpuset;
+	int err;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+	/* pthread functions return the error number and leave errno alone,
+	 * so report the return value rather than errno
+	 */
+	err = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset);
+	if (err) {
+		fprintf(stderr, "setting affinity to CPU #%d failed: %d\n",
+			cpu, -err);
+		exit(1);
+	}
+}
+
+/* Return the CPU number to use for the next thread. With an explicit CPU
+ * list, that is the next listed CPU at or after cpu_set->next_cpu (the
+ * process exits if the list is exhausted); without one, CPU numbers are
+ * simply handed out consecutively.
+ */
+static int next_cpu(struct cpu_set *cpu_set)
+{
+	if (cpu_set->cpus) {
+		int i;
+
+		/* find next available CPU */
+		for (i = cpu_set->next_cpu; i < cpu_set->cpus_len; i++) {
+			if (cpu_set->cpus[i]) {
+				cpu_set->next_cpu = i + 1;
+				return i;
+			}
+		}
+		fprintf(stderr, "Not enough CPUs specified, need CPU #%d or higher.\n", i);
+		exit(1);
+	}
+
+	return cpu_set->next_cpu++;
+}
+
+static struct bench_state {
+	int res_cnt;
+	struct bench_res *results;
+	pthread_t *consumers;
+	pthread_t *producers;
+} state;
+
+const struct bench *bench = NULL;
+
+extern const struct bench 
bench_count_global; +extern const struct bench bench_count_local; +extern const struct bench bench_rename_base; +extern const struct bench bench_rename_kprobe; +extern const struct bench bench_rename_kretprobe; +extern const struct bench bench_rename_rawtp; +extern const struct bench bench_rename_fentry; +extern const struct bench bench_rename_fexit; +extern const struct bench bench_rename_fmodret; +extern const struct bench bench_trig_base; +extern const struct bench bench_trig_tp; +extern const struct bench bench_trig_rawtp; +extern const struct bench bench_trig_kprobe; +extern const struct bench bench_trig_fentry; +extern const struct bench bench_trig_fmodret; +extern const struct bench bench_rb_libbpf; +extern const struct bench bench_rb_custom; +extern const struct bench bench_pb_libbpf; +extern const struct bench bench_pb_custom; + +static const struct bench *benchs[] = { + &bench_count_global, + &bench_count_local, + &bench_rename_base, + &bench_rename_kprobe, + &bench_rename_kretprobe, + &bench_rename_rawtp, + &bench_rename_fentry, + &bench_rename_fexit, + &bench_rename_fmodret, + &bench_trig_base, + &bench_trig_tp, + &bench_trig_rawtp, + &bench_trig_kprobe, + &bench_trig_fentry, + &bench_trig_fmodret, + &bench_rb_libbpf, + &bench_rb_custom, + &bench_pb_libbpf, + &bench_pb_custom, +}; + +static void setup_benchmark() +{ + int i, err; + + if (!env.bench_name) { + fprintf(stderr, "benchmark name is not specified\n"); + exit(1); + } + + for (i = 0; i < ARRAY_SIZE(benchs); i++) { + if (strcmp(benchs[i]->name, env.bench_name) == 0) { + bench = benchs[i]; + break; + } + } + if (!bench) { + fprintf(stderr, "benchmark '%s' not found\n", env.bench_name); + exit(1); + } + + printf("Setting up benchmark '%s'...\n", bench->name); + + state.producers = calloc(env.producer_cnt, sizeof(*state.producers)); + state.consumers = calloc(env.consumer_cnt, sizeof(*state.consumers)); + state.results = calloc(env.duration_sec + env.warmup_sec + 2, + sizeof(*state.results)); + if 
(!state.producers || !state.consumers || !state.results)
+		exit(1);
+
+	if (bench->validate)
+		bench->validate();
+	if (bench->setup)
+		bench->setup();
+
+	for (i = 0; i < env.consumer_cnt; i++) {
+		err = pthread_create(&state.consumers[i], NULL,
+				     bench->consumer_thread, (void *)(long)i);
+		if (err) {
+			/* pthread_create() returns the error number and does
+			 * not set errno, so print the return value
+			 */
+			fprintf(stderr, "failed to create consumer thread #%d: %d\n",
+				i, -err);
+			exit(1);
+		}
+		if (env.affinity)
+			set_thread_affinity(state.consumers[i],
+					    next_cpu(&env.cons_cpus));
+	}
+
+	/* unless explicit producer CPU list is specified, continue after
+	 * last consumer CPU
+	 */
+	if (!env.prod_cpus.cpus)
+		env.prod_cpus.next_cpu = env.cons_cpus.next_cpu;
+
+	for (i = 0; i < env.producer_cnt; i++) {
+		err = pthread_create(&state.producers[i], NULL,
+				     bench->producer_thread, (void *)(long)i);
+		if (err) {
+			/* same as above: the error is in err, not errno */
+			fprintf(stderr, "failed to create producer thread #%d: %d\n",
+				i, -err);
+			exit(1);
+		}
+		if (env.affinity)
+			set_thread_affinity(state.producers[i],
+					    next_cpu(&env.prod_cpus));
+	}
+
+	printf("Benchmark '%s' started.\n", bench->name);
+}
+
+static pthread_mutex_t bench_done_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t bench_done = PTHREAD_COND_INITIALIZER;
+
+/* Called from the SIGALRM handler on every timer tick: store the
+ * benchmark's counters for this tick in the next results[] slot, print
+ * progress if the benchmark provides a reporter, and wake up main() once
+ * warm-up plus duration ticks have been collected.
+ */
+static void collect_measurements(long delta_ns) {
+	int iter = state.res_cnt++;
+	struct bench_res *res = &state.results[iter];
+
+	bench->measure(res);
+
+	if (bench->report_progress)
+		bench->report_progress(iter, res, delta_ns);
+
+	if (iter == env.duration_sec + env.warmup_sec) {
+		pthread_mutex_lock(&bench_done_mtx);
+		pthread_cond_signal(&bench_done);
+		pthread_mutex_unlock(&bench_done_mtx);
+	}
+}
+
+int main(int argc, char **argv)
+{
+	parse_cmdline_args(argc, argv);
+
+	if (env.list) {
+		int i;
+
+		printf("Available benchmarks:\n");
+		for (i = 0; i < ARRAY_SIZE(benchs); i++) {
+			printf("- %s\n", benchs[i]->name);
+		}
+		return 0;
+	}
+
+	setup_benchmark();
+
+	setup_timer();
+
+	pthread_mutex_lock(&bench_done_mtx);
+	pthread_cond_wait(&bench_done, &bench_done_mtx);
+	
pthread_mutex_unlock(&bench_done_mtx); + + if (bench->report_final) + /* skip first sample */ + bench->report_final(state.results + env.warmup_sec, + state.res_cnt - env.warmup_sec); + + return 0; +} + diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h new file mode 100644 index 000000000000..c1f48a473b02 --- /dev/null +++ b/tools/testing/selftests/bpf/bench.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#pragma once +#include <stdlib.h> +#include <stdbool.h> +#include <linux/err.h> +#include <errno.h> +#include <unistd.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <math.h> +#include <time.h> +#include <sys/syscall.h> + +struct cpu_set { + bool *cpus; + int cpus_len; + int next_cpu; +}; + +struct env { + char *bench_name; + int duration_sec; + int warmup_sec; + bool verbose; + bool list; + bool affinity; + int consumer_cnt; + int producer_cnt; + struct cpu_set prod_cpus; + struct cpu_set cons_cpus; +}; + +struct bench_res { + long hits; + long drops; +}; + +struct bench { + const char *name; + void (*validate)(); + void (*setup)(); + void *(*producer_thread)(void *ctx); + void *(*consumer_thread)(void *ctx); + void (*measure)(struct bench_res* res); + void (*report_progress)(int iter, struct bench_res* res, long delta_ns); + void (*report_final)(struct bench_res res[], int res_cnt); +}; + +struct counter { + long value; +} __attribute__((aligned(128))); + +extern struct env env; +extern const struct bench *bench; + +void setup_libbpf(); +void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns); +void hits_drops_report_final(struct bench_res res[], int res_cnt); + +static inline __u64 get_time_ns() { + struct timespec t; + + clock_gettime(CLOCK_MONOTONIC, &t); + + return (u64)t.tv_sec * 1000000000 + t.tv_nsec; +} + +static inline void atomic_inc(long *value) +{ + (void)__atomic_add_fetch(value, 1, __ATOMIC_RELAXED); +} + +static inline void atomic_add(long *value, long n) +{ + 
(void)__atomic_add_fetch(value, n, __ATOMIC_RELAXED);
+}
+
+static inline long atomic_swap(long *value, long n)
+{
+	return __atomic_exchange_n(value, n, __ATOMIC_RELAXED);
+}
diff --git a/tools/testing/selftests/bpf/benchs/bench_count.c b/tools/testing/selftests/bpf/benchs/bench_count.c
new file mode 100644
index 000000000000..befba7a82643
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_count.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bench.h"
+
+/* COUNT-GLOBAL benchmark */
+
+/* a single counter shared by all producer threads */
+static struct count_global_ctx {
+	struct counter hits;
+} count_global_ctx;
+
+/* producer: increment the shared counter in an endless loop */
+static void *count_global_producer(void *input)
+{
+	struct count_global_ctx *ctx = &count_global_ctx;
+
+	while (true) {
+		atomic_inc(&ctx->hits.value);
+	}
+	return NULL;
+}
+
+/* this benchmark has no consumer side; the thread returns immediately */
+static void *count_global_consumer(void *input)
+{
+	return NULL;
+}
+
+/* report the hits accumulated since the previous call and reset the counter */
+static void count_global_measure(struct bench_res *res)
+{
+	struct count_global_ctx *ctx = &count_global_ctx;
+
+	res->hits = atomic_swap(&ctx->hits.value, 0);
+}
+
+/* COUNT-local benchmark */
+
+/* array of counters, one per producer thread */
+static struct count_local_ctx {
+	struct counter *hits;
+} count_local_ctx;
+
+/* Allocate the per-producer counters. The array must be sized by the
+ * number of producers: count_local_producer() indexes it with the
+ * producer's index and count_local_measure() walks producer_cnt entries.
+ */
+static void count_local_setup()
+{
+	struct count_local_ctx *ctx = &count_local_ctx;
+
+	ctx->hits = calloc(env.producer_cnt, sizeof(*ctx->hits));
+	if (!ctx->hits)
+		exit(1);
+}
+
+/* producer: @input carries this producer's index; increment its own
+ * counter in an endless loop
+ */
+static void *count_local_producer(void *input)
+{
+	struct count_local_ctx *ctx = &count_local_ctx;
+	int idx = (long)input;
+
+	while (true) {
+		atomic_inc(&ctx->hits[idx].value);
+	}
+	return NULL;
+}
+
+/* this benchmark has no consumer side; the thread returns immediately */
+static void *count_local_consumer(void *input)
+{
+	return NULL;
+}
+
+/* Add every producer's hits since the previous call to res->hits and
+ * reset the counters. The sum is accumulated with +=, so @res is expected
+ * to start out zeroed.
+ */
+static void count_local_measure(struct bench_res *res)
+{
+	struct count_local_ctx *ctx = &count_local_ctx;
+	int i;
+
+	for (i = 0; i < env.producer_cnt; i++) {
+		res->hits += atomic_swap(&ctx->hits[i].value, 0);
+	}
+}
+
+const struct bench bench_count_global = {
+	.name = "count-global",
+	.producer_thread = count_global_producer,
+	.consumer_thread = 
count_global_consumer, + .measure = count_global_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_count_local = { + .name = "count-local", + .setup = count_local_setup, + .producer_thread = count_local_producer, + .consumer_thread = count_local_consumer, + .measure = count_local_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_rename.c b/tools/testing/selftests/bpf/benchs/bench_rename.c new file mode 100644 index 000000000000..e74cff40f4fe --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_rename.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <fcntl.h> +#include "bench.h" +#include "test_overhead.skel.h" + +/* BPF triggering benchmarks */ +static struct ctx { + struct test_overhead *skel; + struct counter hits; + int fd; +} ctx; + +static void validate() +{ + if (env.producer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-producer!\n"); + exit(1); + } + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } +} + +static void *producer(void *input) +{ + char buf[] = "test_overhead"; + int err; + + while (true) { + err = write(ctx.fd, buf, sizeof(buf)); + if (err < 0) { + fprintf(stderr, "write failed\n"); + exit(1); + } + atomic_inc(&ctx.hits.value); + } +} + +static void measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.hits.value, 0); +} + +static void setup_ctx() +{ + setup_libbpf(); + + ctx.skel = test_overhead__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + ctx.fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); + if (ctx.fd < 0) { + fprintf(stderr, "failed to open /proc/self/comm: %d\n", -errno); + exit(1); + } +} + +static void attach_bpf(struct bpf_program *prog) +{ + 
struct bpf_link *link; + + link = bpf_program__attach(prog); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } +} + +static void setup_base() +{ + setup_ctx(); +} + +static void setup_kprobe() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog1); +} + +static void setup_kretprobe() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog2); +} + +static void setup_rawtp() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog3); +} + +static void setup_fentry() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog4); +} + +static void setup_fexit() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog5); +} + +static void setup_fmodret() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog6); +} + +static void *consumer(void *input) +{ + return NULL; +} + +const struct bench bench_rename_base = { + .name = "rename-base", + .validate = validate, + .setup = setup_base, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_kprobe = { + .name = "rename-kprobe", + .validate = validate, + .setup = setup_kprobe, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_kretprobe = { + .name = "rename-kretprobe", + .validate = validate, + .setup = setup_kretprobe, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_rawtp = { + .name = "rename-rawtp", + .validate = validate, + .setup = setup_rawtp, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + 
+const struct bench bench_rename_fentry = { + .name = "rename-fentry", + .validate = validate, + .setup = setup_fentry, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_fexit = { + .name = "rename-fexit", + .validate = validate, + .setup = setup_fexit, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_fmodret = { + .name = "rename-fmodret", + .validate = validate, + .setup = setup_fmodret, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c new file mode 100644 index 000000000000..da87c7f31891 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <asm/barrier.h> +#include <linux/perf_event.h> +#include <linux/ring_buffer.h> +#include <sys/epoll.h> +#include <sys/mman.h> +#include <argp.h> +#include <stdlib.h> +#include "bench.h" +#include "ringbuf_bench.skel.h" +#include "perfbuf_bench.skel.h" + +static struct { + bool back2back; + int batch_cnt; + bool sampled; + int sample_rate; + int ringbuf_sz; /* per-ringbuf, in bytes */ + bool ringbuf_use_output; /* use slower output API */ + int perfbuf_sz; /* per-CPU size, in pages */ +} args = { + .back2back = false, + .batch_cnt = 500, + .sampled = false, + .sample_rate = 500, + .ringbuf_sz = 512 * 1024, + .ringbuf_use_output = false, + .perfbuf_sz = 128, +}; + +enum { + ARG_RB_BACK2BACK = 2000, + ARG_RB_USE_OUTPUT = 2001, + 
ARG_RB_BATCH_CNT = 2002,
+	ARG_RB_SAMPLED = 2003,
+	ARG_RB_SAMPLE_RATE = 2004,
+};
+
+static const struct argp_option opts[] = {
+	{ "rb-b2b", ARG_RB_BACK2BACK, NULL, 0, "Back-to-back mode"},
+	{ "rb-use-output", ARG_RB_USE_OUTPUT, NULL, 0, "Use bpf_ringbuf_output() instead of bpf_ringbuf_reserve()"},
+	{ "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record batch count"},
+	{ "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"},
+	{ "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample rate"},
+	{},
+};
+
+/* argp callback for the ring buffer benchmark options: each option is
+ * stored in the file-local args; negative batch counts and sample rates
+ * are rejected. Unknown keys return ARGP_ERR_UNKNOWN.
+ */
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case ARG_RB_BACK2BACK:
+		args.back2back = true;
+		break;
+	case ARG_RB_USE_OUTPUT:
+		args.ringbuf_use_output = true;
+		break;
+	case ARG_RB_BATCH_CNT:
+		args.batch_cnt = strtol(arg, NULL, 10);
+		if (args.batch_cnt < 0) {
+			/* end the line so the usage text that follows is
+			 * not glued to the message
+			 */
+			fprintf(stderr, "Invalid batch count.\n");
+			argp_usage(state);
+		}
+		break;
+	case ARG_RB_SAMPLED:
+		args.sampled = true;
+		break;
+	case ARG_RB_SAMPLE_RATE:
+		args.sample_rate = strtol(arg, NULL, 10);
+		if (args.sample_rate < 0) {
+			fprintf(stderr, "Invalid perfbuf sample rate.\n");
+			argp_usage(state);
+		}
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+/* exported into benchmark runner */
+const struct argp bench_ringbufs_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+/* RINGBUF-LIBBPF benchmark */
+
+static struct counter buf_hits;
+
+/* issue one getpgid syscall */
+static inline void bufs_trigger_batch()
+{
+	(void)syscall(__NR_getpgid);
+}
+
+/* reject unsupported thread configurations before setup */
+static void bufs_validate()
+{
+	if (env.consumer_cnt != 1) {
+		fprintf(stderr, "rb-libbpf benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+
+	if (args.back2back && env.producer_cnt > 1) {
+		fprintf(stderr, "back-to-back mode makes sense only for single-producer case!\n");
+		exit(1);
+	}
+}
+
+static void *bufs_sample_producer(void *input)
+{
+	if (args.back2back) {
+		/* initial batch to get everything started */
+		bufs_trigger_batch();
+		return NULL;
+	}
+
+	
while (true) + bufs_trigger_batch(); + return NULL; +} + +static struct ringbuf_libbpf_ctx { + struct ringbuf_bench *skel; + struct ring_buffer *ringbuf; +} ringbuf_libbpf_ctx; + +static void ringbuf_libbpf_measure(struct bench_res *res) +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static struct ringbuf_bench *ringbuf_setup_skeleton() +{ + struct ringbuf_bench *skel; + + setup_libbpf(); + + skel = ringbuf_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + skel->rodata->batch_cnt = args.batch_cnt; + skel->rodata->use_output = args.ringbuf_use_output ? 1 : 0; + + if (args.sampled) + /* record data + header take 16 bytes */ + skel->rodata->wakeup_data_size = args.sample_rate * 16; + + bpf_map__resize(skel->maps.ringbuf, args.ringbuf_sz); + + if (ringbuf_bench__load(skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + + return skel; +} + +static int buf_process_sample(void *ctx, void *data, size_t len) +{ + atomic_inc(&buf_hits.value); + return 0; +} + +static void ringbuf_libbpf_setup() +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + struct bpf_link *link; + + ctx->skel = ringbuf_setup_skeleton(); + ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf), + buf_process_sample, NULL, NULL); + if (!ctx->ringbuf) { + fprintf(stderr, "failed to create ringbuf\n"); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } +} + +static void *ringbuf_libbpf_consumer(void *input) +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + + while (ring_buffer__poll(ctx->ringbuf, -1) >= 0) { + if (args.back2back) + bufs_trigger_batch(); + } + fprintf(stderr, "ringbuf polling failed!\n"); + return NULL; +} + +/* RINGBUF-CUSTOM benchmark */ +struct 
ringbuf_custom { + __u64 *consumer_pos; + __u64 *producer_pos; + __u64 mask; + void *data; + int map_fd; +}; + +static struct ringbuf_custom_ctx { + struct ringbuf_bench *skel; + struct ringbuf_custom ringbuf; + int epoll_fd; + struct epoll_event event; +} ringbuf_custom_ctx; + +static void ringbuf_custom_measure(struct bench_res *res) +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static void ringbuf_custom_setup() +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + const size_t page_size = getpagesize(); + struct bpf_link *link; + struct ringbuf_custom *r; + void *tmp; + int err; + + ctx->skel = ringbuf_setup_skeleton(); + + ctx->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (ctx->epoll_fd < 0) { + fprintf(stderr, "failed to create epoll fd: %d\n", -errno); + exit(1); + } + + r = &ctx->ringbuf; + r->map_fd = bpf_map__fd(ctx->skel->maps.ringbuf); + r->mask = args.ringbuf_sz - 1; + + /* Map writable consumer page */ + tmp = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, + r->map_fd, 0); + if (tmp == MAP_FAILED) { + fprintf(stderr, "failed to mmap consumer page: %d\n", -errno); + exit(1); + } + r->consumer_pos = tmp; + + /* Map read-only producer page and data pages. 
*/ + tmp = mmap(NULL, page_size + 2 * args.ringbuf_sz, PROT_READ, MAP_SHARED, + r->map_fd, page_size); + if (tmp == MAP_FAILED) { + fprintf(stderr, "failed to mmap data pages: %d\n", -errno); + exit(1); + } + r->producer_pos = tmp; + r->data = tmp + page_size; + + ctx->event.events = EPOLLIN; + err = epoll_ctl(ctx->epoll_fd, EPOLL_CTL_ADD, r->map_fd, &ctx->event); + if (err < 0) { + fprintf(stderr, "failed to epoll add ringbuf: %d\n", -errno); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program\n"); + exit(1); + } +} + +#define RINGBUF_BUSY_BIT (1 << 31) +#define RINGBUF_DISCARD_BIT (1 << 30) +#define RINGBUF_META_LEN 8 + +static inline int roundup_len(__u32 len) +{ + /* clear out top 2 bits */ + len <<= 2; + len >>= 2; + /* add length prefix */ + len += RINGBUF_META_LEN; + /* round up to 8 byte alignment */ + return (len + 7) / 8 * 8; +} + +static void ringbuf_custom_process_ring(struct ringbuf_custom *r) +{ + unsigned long cons_pos, prod_pos; + int *len_ptr, len; + bool got_new_data; + + cons_pos = smp_load_acquire(r->consumer_pos); + while (true) { + got_new_data = false; + prod_pos = smp_load_acquire(r->producer_pos); + while (cons_pos < prod_pos) { + len_ptr = r->data + (cons_pos & r->mask); + len = smp_load_acquire(len_ptr); + + /* sample not committed yet, bail out for now */ + if (len & RINGBUF_BUSY_BIT) + return; + + got_new_data = true; + cons_pos += roundup_len(len); + + atomic_inc(&buf_hits.value); + } + if (got_new_data) + smp_store_release(r->consumer_pos, cons_pos); + else + break; + }; +} + +static void *ringbuf_custom_consumer(void *input) +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + int cnt; + + do { + if (args.back2back) + bufs_trigger_batch(); + cnt = epoll_wait(ctx->epoll_fd, &ctx->event, 1, -1); + if (cnt > 0) + ringbuf_custom_process_ring(&ctx->ringbuf); + } while (cnt >= 0); + fprintf(stderr, "ringbuf polling failed!\n"); + return 0; 
+} + +/* PERFBUF-LIBBPF benchmark */ +static struct perfbuf_libbpf_ctx { + struct perfbuf_bench *skel; + struct perf_buffer *perfbuf; +} perfbuf_libbpf_ctx; + +static void perfbuf_measure(struct bench_res *res) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static struct perfbuf_bench *perfbuf_setup_skeleton() +{ + struct perfbuf_bench *skel; + + setup_libbpf(); + + skel = perfbuf_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + skel->rodata->batch_cnt = args.batch_cnt; + + if (perfbuf_bench__load(skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + + return skel; +} + +static enum bpf_perf_event_ret +perfbuf_process_sample_raw(void *input_ctx, int cpu, + struct perf_event_header *e) +{ + switch (e->type) { + case PERF_RECORD_SAMPLE: + atomic_inc(&buf_hits.value); + break; + case PERF_RECORD_LOST: + break; + default: + return LIBBPF_PERF_EVENT_ERROR; + } + return LIBBPF_PERF_EVENT_CONT; +} + +static void perfbuf_libbpf_setup() +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + struct perf_event_attr attr; + struct perf_buffer_raw_opts pb_opts = { + .event_cb = perfbuf_process_sample_raw, + .ctx = (void *)(long)0, + .attr = &attr, + }; + struct bpf_link *link; + + ctx->skel = perfbuf_setup_skeleton(); + + memset(&attr, 0, sizeof(attr)); + attr.config = PERF_COUNT_SW_BPF_OUTPUT, + attr.type = PERF_TYPE_SOFTWARE; + attr.sample_type = PERF_SAMPLE_RAW; + /* notify only every Nth sample */ + if (args.sampled) { + attr.sample_period = args.sample_rate; + attr.wakeup_events = args.sample_rate; + } else { + attr.sample_period = 1; + attr.wakeup_events = 1; + } + + if (args.sample_rate > args.batch_cnt) { + fprintf(stderr, "sample rate %d is too high for given batch count %d\n", + args.sample_rate, args.batch_cnt); + exit(1); + } + + ctx->perfbuf = 
perf_buffer__new_raw(bpf_map__fd(ctx->skel->maps.perfbuf), + args.perfbuf_sz, &pb_opts); + if (!ctx->perfbuf) { + fprintf(stderr, "failed to create perfbuf\n"); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_perfbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program\n"); + exit(1); + } +} + +static void *perfbuf_libbpf_consumer(void *input) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + + while (perf_buffer__poll(ctx->perfbuf, -1) >= 0) { + if (args.back2back) + bufs_trigger_batch(); + } + fprintf(stderr, "perfbuf polling failed!\n"); + return NULL; +} + +/* PERFBUF-CUSTOM benchmark */ + +/* copies of internal libbpf definitions */ +struct perf_cpu_buf { + struct perf_buffer *pb; + void *base; /* mmap()'ed memory */ + void *buf; /* for reconstructing segmented data */ + size_t buf_size; + int fd; + int cpu; + int map_key; +}; + +struct perf_buffer { + perf_buffer_event_fn event_cb; + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; /* passed into callbacks */ + + size_t page_size; + size_t mmap_size; + struct perf_cpu_buf **cpu_bufs; + struct epoll_event *events; + int cpu_cnt; /* number of allocated CPU buffers */ + int epoll_fd; /* perf event FD */ + int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ +}; + +static void *perfbuf_custom_consumer(void *input) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + struct perf_buffer *pb = ctx->perfbuf; + struct perf_cpu_buf *cpu_buf; + struct perf_event_mmap_page *header; + size_t mmap_mask = pb->mmap_size - 1; + struct perf_event_header *ehdr; + __u64 data_head, data_tail; + size_t ehdr_size; + void *base; + int i, cnt; + + while (true) { + if (args.back2back) + bufs_trigger_batch(); + cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, -1); + if (cnt <= 0) { + fprintf(stderr, "perf epoll failed: %d\n", -errno); + exit(1); + } + + for (i = 0; i < cnt; ++i) { + cpu_buf = pb->events[i].data.ptr; + header = cpu_buf->base; 
+ base = ((void *)header) + pb->page_size; + + data_head = ring_buffer_read_head(header); + data_tail = header->data_tail; + while (data_head != data_tail) { + ehdr = base + (data_tail & mmap_mask); + ehdr_size = ehdr->size; + + if (ehdr->type == PERF_RECORD_SAMPLE) + atomic_inc(&buf_hits.value); + + data_tail += ehdr_size; + } + ring_buffer_write_tail(header, data_tail); + } + } + return NULL; +} + +const struct bench bench_rb_libbpf = { + .name = "rb-libbpf", + .validate = bufs_validate, + .setup = ringbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = ringbuf_libbpf_consumer, + .measure = ringbuf_libbpf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rb_custom = { + .name = "rb-custom", + .validate = bufs_validate, + .setup = ringbuf_custom_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = ringbuf_custom_consumer, + .measure = ringbuf_custom_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_pb_libbpf = { + .name = "pb-libbpf", + .validate = bufs_validate, + .setup = perfbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = perfbuf_libbpf_consumer, + .measure = perfbuf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_pb_custom = { + .name = "pb-custom", + .validate = bufs_validate, + .setup = perfbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = perfbuf_custom_consumer, + .measure = perfbuf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c new file mode 100644 index 000000000000..49c22832f216 --- /dev/null +++ 
b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bench.h"
+#include "trigger_bench.skel.h"
+
+/* BPF triggering benchmarks */
+static struct trigger_ctx {
+	struct trigger_bench *skel;
+} ctx;
+
+/* user-space hit counter, incremented by trigger_base_producer() */
+static struct counter base_hits;
+
+/* reject any configuration that does not use exactly one consumer */
+static void trigger_validate(void)
+{
+	if (env.consumer_cnt != 1) {
+		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+}
+
+/* trig-base producer: issue the getpgid syscall forever, counting each call */
+static void *trigger_base_producer(void *input)
+{
+	while (true) {
+		(void)syscall(__NR_getpgid);
+		atomic_inc(&base_hits.value);
+	}
+	return NULL;
+}
+
+/* trig-base: hand over the user-space hit count and swap in 0 */
+static void trigger_base_measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&base_hits.value, 0);
+}
+
+/* producer for the BPF variants: issue the getpgid syscall forever */
+static void *trigger_producer(void *input)
+{
+	while (true)
+		(void)syscall(__NR_getpgid);
+	return NULL;
+}
+
+/* hand over the skeleton's bss->hits count and swap in 0 */
+static void trigger_measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+}
+
+/* run setup_libbpf(), then open and load the skeleton; exit(1) on failure */
+static void setup_ctx(void)
+{
+	setup_libbpf();
+
+	ctx.skel = trigger_bench__open_and_load();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open and load skeleton\n");
+		exit(1);
+	}
+}
+
+/* attach @prog, exit(1) on failure; the returned link is never destroyed */
+static void attach_bpf(struct bpf_program *prog)
+{
+	struct bpf_link *link;
+
+	link = bpf_program__attach(prog);
+	if (IS_ERR(link)) {
+		fprintf(stderr, "failed to attach program!\n");
+		exit(1);
+	}
+}
+
+/* per-variant setup: load the skeleton, then attach the matching program */
+static void trigger_tp_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_tp);
+}
+
+static void trigger_rawtp_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_raw_tp);
+}
+
+static void trigger_kprobe_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_kprobe);
+}
+
+static void trigger_fentry_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_fentry);
+}
+
+static void trigger_fmodret_setup(void)
+{
+	setup_ctx();
+	attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
+}
+
+/* consumer thread body: nothing to consume, returns immediately */
+static void *trigger_consumer(void *input)
+{
+	return NULL;
+}
+
+const struct bench bench_trig_base = {
+	.name = "trig-base",
+	.validate = trigger_validate,
+	.producer_thread = trigger_base_producer,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_base_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_tp = {
+	.name = "trig-tp",
+	.validate = trigger_validate,
+	.setup = trigger_tp_setup,
+	.producer_thread = trigger_producer,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_rawtp = {
+	.name = "trig-rawtp",
+	.validate = trigger_validate,
+	.setup = trigger_rawtp_setup,
+	.producer_thread = trigger_producer,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_kprobe = {
+	.name = "trig-kprobe",
+	.validate = trigger_validate,
+	.setup = trigger_kprobe_setup,
+	.producer_thread = trigger_producer,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_fentry = {
+	.name = "trig-fentry",
+	.validate = trigger_validate,
+	.setup = trigger_fentry_setup,
+	.producer_thread = trigger_producer,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_fmodret = {
+	.name = "trig-fmodret",
+	.validate = trigger_validate,
+	.setup = trigger_fmodret_setup,
+	.producer_thread = trigger_producer,
+	.consumer_thread = trigger_consumer,
+	.measure = trigger_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
diff --git
a/tools/testing/selftests/bpf/benchs/run_bench_rename.sh b/tools/testing/selftests/bpf/benchs/run_bench_rename.sh
new file mode 100755
index 000000000000..16f774b1cdbe
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_rename.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+set -eufo pipefail
+
+# For each rename-* benchmark print fields 3+ of its last output line, cut at '('.
+for i in base kprobe kretprobe rawtp fentry fexit fmodret
+do
+	summary=$(sudo ./bench -w2 -d5 -a "rename-$i" | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
+	printf "%-10s: %s\n" "$i" "$summary"
+done
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
new file mode 100755
index 000000000000..af4aa04caba6
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+set -eufo pipefail
+
+# Expanded unquoted at every call site so it splits into command and options.
+RUN_BENCH="sudo ./bench -w3 -d10 -a"
+
+# Print the rate figure that follows "hits" in the arguments.
+function hits()
+{
+	echo "$*" | sed -E "s/.*hits\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/"
+}
+
+# Print the rate figure that follows "drops" in the arguments.
+function drops()
+{
+	echo "$*" | sed -E "s/.*drops\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/"
+}
+
+# Print a section title underlined with one '=' per character.
+function header()
+{
+	local len=${#1}
+	local i
+
+	printf "\n%s\n" "$1"
+	for i in $(seq 1 "$len"); do printf '='; done
+	printf '\n'
+}
+
+# summarize NAME OUTPUT: print NAME plus the hits/drops figures taken from the
+# last line of OUTPUT.
+function summarize()
+{
+	local bench="$1"
+	local summary
+
+	# Quote "$2" so its newlines survive and tail really picks the last line.
+	summary=$(echo "$2" | tail -n1)
+	# $summary stays unquoted: word splitting collapses runs of blanks before
+	# the sed patterns run, and globbing is already disabled by 'set -f'.
+	printf "%-20s %s (drops %s)\n" "$bench" "$(hits $summary)" "$(drops $summary)"
+}
+
+header "Single-producer, parallel producer"
+for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
+	summarize "$b" "$($RUN_BENCH "$b")"
+done
+
+header "Single-producer, parallel producer, sampled notification"
+for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
+	summarize "$b" "$($RUN_BENCH --rb-sampled "$b")"
+done
+
+header "Single-producer, back-to-back mode"
+for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
+	summarize "$b" "$($RUN_BENCH --rb-b2b "$b")"
+	summarize "$b-sampled" "$($RUN_BENCH --rb-sampled --rb-b2b "$b")"
+done
+
+header "Ringbuf back-to-back, effect of sample rate"
+for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do
+	summarize "rb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt "$b" --rb-sampled --rb-sample-rate "$b" rb-custom)"
+done
+header "Perfbuf back-to-back, effect of sample rate"
+for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do
+	summarize "pb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt "$b" --rb-sampled --rb-sample-rate "$b" pb-custom)"
+done
+
+header "Ringbuf back-to-back, reserve+commit vs output"
+summarize "reserve" "$($RUN_BENCH --rb-b2b rb-custom)"
+summarize "output" "$($RUN_BENCH --rb-b2b --rb-use-output rb-custom)"
+
+header "Ringbuf sampled, reserve+commit vs output"
+summarize "reserve-sampled" "$($RUN_BENCH --rb-sampled rb-custom)"
+summarize "output-sampled" "$($RUN_BENCH --rb-sampled --rb-use-output rb-custom)"
+
+header "Single-producer, consumer/producer competing on the same CPU, low batch count"
+for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
+	summarize "$b" "$($RUN_BENCH --rb-batch-cnt 1 --rb-sample-rate 1 --prod-affinity 0 --cons-affinity 0 "$b")"
+done
+
+header "Ringbuf, multi-producer contention"
+for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
+	summarize "rb-libbpf nr_prod $b" "$($RUN_BENCH "-p$b" --rb-batch-cnt 50 rb-libbpf)"
+done
+
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
new file mode 100755
index 000000000000..78e83f243294
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+set -eufo pipefail
+
+# For each trig-* benchmark print fields 3+ of its last output line, cut at '('.
+for i in base tp rawtp kprobe fentry fmodret
+do
+	summary=$(sudo ./bench -w2 -d5 -a "trig-$i" | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
+	printf "%-10s: %s\n" "$i" "$summary"
+done
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 60e3ae5d4e48..2118e23ac07a 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -25,6 +25,7 @@ CONFIG_XDP_SOCKETS=y
 CONFIG_FTRACE_SYSCALLS=y
 CONFIG_IPV6_TUNNEL=y
 CONFIG_IPV6_GRE=y
+CONFIG_IPV6_SEG6_BPF=y
 CONFIG_NET_FOU=m
 CONFIG_NET_FOU_IP_TUNNELS=y
 CONFIG_IPV6_FOU=m
@@ -37,3 +38,4 @@ CONFIG_IPV6_SIT=m
 CONFIG_BPF_JIT=y
 CONFIG_BPF_LSM=y
 CONFIG_SECURITY=y
+CONFIG_LIRC=y
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
new file mode 100644
index 000000000000..e36dd1a1780d
--- /dev/null
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+
+#include <sys/epoll.h>
+
+#include <linux/err.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+
+#include "bpf_util.h"
+#include "network_helpers.h"
+
+/* log_err(): print MSG to stderr, prefixed with file, line and errno text */
+#define clean_errno() (errno == 0 ? "None" : strerror(errno))
+#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
+	__FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
+
+/* IPv4 test packet: protocol is TCP, total length field is MAGIC_BYTES */
+struct ipv4_packet pkt_v4 = {
+	.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+	.iph.ihl = 5,
+	.iph.protocol = IPPROTO_TCP,
+	.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+	.tcp.urg_ptr = 123,
+	.tcp.doff = 5,
+};
+
+/* IPv6 test packet: next header is TCP, payload length is MAGIC_BYTES */
+struct ipv6_packet pkt_v6 = {
+	.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+	.iph.nexthdr = IPPROTO_TCP,
+	.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+	.tcp.urg_ptr = 123,
+	.tcp.doff = 5,
+};
+
+/*
+ * Create a non-blocking socket of @type, bind it to the all-zero address of
+ * @family (anything other than AF_INET is set up as AF_INET6) on @port and,
+ * for SOCK_STREAM, listen with a backlog of 1.
+ *
+ * Returns the socket fd, or -1 after logging the failing step.
+ */
+int start_server_with_port(int family, int type, __u16 port)
+{
+	struct sockaddr_storage addr = {};
+	socklen_t len;
+	int fd;
+
+	if (family == AF_INET) {
+		struct sockaddr_in *sin = (void *)&addr;
+
+		sin->sin_family = AF_INET;
+		sin->sin_port = htons(port);
+		len = sizeof(*sin);
+	} else {
+		struct sockaddr_in6 *sin6 = (void *)&addr;
+
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = htons(port);
+		len = sizeof(*sin6);
+	}
+
+	fd = socket(family, type | SOCK_NONBLOCK, 0);
+	if (fd < 0) {
+		log_err("Failed to create server socket");
+		return -1;
+	}
+
+	if (bind(fd, (const struct sockaddr *)&addr, len) < 0) {
+		log_err("Failed to bind socket");
+		close(fd);
+		return -1;
+	}
+
+	if (type == SOCK_STREAM) {
+		if (listen(fd, 1) < 0) {
+			log_err("Failed to listen on socket");
+			close(fd);
+			return -1;
+		}
+	}
+
+	return fd;
+}
+
+/* start_server_with_port() with port 0 */
+int start_server(int family, int type)
+{
+	return start_server_with_port(family, type, 0);
+}
+
+/* receive timeout set on client sockets by connect_fd_to_fd() */
+static const struct timeval timeo_sec = { .tv_sec = 3 };
+static const size_t timeo_optlen = sizeof(timeo_sec);
+
+/*
+ * Create a socket of @type for @family and connect it to the address that
+ * @server_fd is bound to. A connect still in progress (EINPROGRESS) is not
+ * treated as a failure.
+ *
+ * Returns the client fd, or -1 on failure (errno survives the close()).
+ */
+int connect_to_fd(int family, int type, int server_fd)
+{
+	int fd, save_errno;
+
+	fd = socket(family, type, 0);
+	if (fd < 0) {
+		log_err("Failed to create client socket");
+		return -1;
+	}
+
+	if (connect_fd_to_fd(fd, server_fd) < 0 && errno != EINPROGRESS) {
+		save_errno = errno;
+		close(fd);
+		errno = save_errno;
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Apply the timeo_sec receive timeout to @client_fd, then connect it to the
+ * local address of @server_fd as reported by getsockname().
+ *
+ * Returns 0 on success and -1 on failure. EINPROGRESS is reported as -1 too,
+ * without a log message, so the caller has to check errno.
+ */
+int connect_fd_to_fd(int client_fd, int server_fd)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int save_errno;
+
+	if (setsockopt(client_fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
+		       timeo_optlen)) {
+		log_err("Failed to set SO_RCVTIMEO");
+		return -1;
+	}
+
+	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
+		log_err("Failed to get server addr");
+		return -1;
+	}
+
+	if (connect(client_fd, (const struct sockaddr *)&addr, len) < 0) {
+		if (errno != EINPROGRESS) {
+			save_errno = errno;
+			log_err("Failed to connect to server");
+			errno = save_errno;
+		}
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Wait up to 1000 ms for EPOLLOUT or EPOLLRDHUP on @fd using a temporary
+ * epoll instance.
+ *
+ * Returns what epoll_wait() returned, or -1 if the epoll setup failed.
+ */
+int connect_wait(int fd)
+{
+	struct epoll_event ev = {}, events[2];
+	int timeout_ms = 1000;
+	int efd, nfd;
+
+	efd = epoll_create1(EPOLL_CLOEXEC);
+	if (efd < 0) {
+		log_err("Failed to open epoll fd");
+		return -1;
+	}
+
+	ev.events = EPOLLRDHUP | EPOLLOUT;
+	ev.data.fd = fd;
+
+	if (epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+		log_err("Failed to register fd=%d on epoll fd=%d", fd, efd);
+		close(efd);
+		return -1;
+	}
+
+	nfd = epoll_wait(efd, events, ARRAY_SIZE(events), timeout_ms);
+	if (nfd < 0)
+		log_err("Failed to wait for I/O event on epoll fd=%d", efd);
+
+	close(efd);
+	return nfd;
+}
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
new file mode 100644
index 000000000000..6a8009605670
--- /dev/null
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NETWORK_HELPERS_H
+#define __NETWORK_HELPERS_H
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <linux/types.h>
+typedef __u16 __sum16;
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <netinet/tcp.h>
+#include <bpf/bpf_endian.h>
+
+#define MAGIC_VAL 0x1234
+#define NUM_ITER 100000
+#define VIP_NUM 5
+#define MAGIC_BYTES 123
+
+/* ipv4 test vector */
+struct ipv4_packet {
+	struct ethhdr eth;
+	struct iphdr iph;
+	struct tcphdr tcp;
+} __packed;
+extern struct ipv4_packet pkt_v4;
+
+/* ipv6 test vector */
+struct ipv6_packet {
+	struct ethhdr eth;
+	struct ipv6hdr iph;
+	struct tcphdr tcp;
+} __packed;
+extern struct ipv6_packet pkt_v6;
+
+int start_server(int family, int type);
+int start_server_with_port(int family, int type, __u16 port);
+int connect_to_fd(int family, int type, int server_fd);
+int connect_fd_to_fd(int client_fd, int server_fd);
+int connect_wait(int client_fd);
+
+#endif
diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/prog_tests/align.c
index 0262f7b374f9..c548aded6585 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/prog_tests/align.c
@@ -1,24 +1,5 @@
-#include <asm/types.h>
-#include <linux/types.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stddef.h>
-#include <stdbool.h>
-
-#include <linux/unistd.h>
-#include <linux/filter.h>
-#include <linux/bpf_perf_event.h>
-#include <linux/bpf.h>
-
-#include <bpf/bpf.h>
-
-#include "../../../include/linux/filter.h"
-#include "bpf_rlimit.h" -#include "bpf_util.h" +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> #define MAX_INSNS 512 #define MAX_MATCHES 16 @@ -359,15 +340,15 @@ static struct bpf_align_test tests[] = { * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, - {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, + {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, }, }, { @@ -410,15 +391,15 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, - {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, + {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. 
*/ - {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ @@ -426,15 +407,15 @@ static struct bpf_align_test tests[] = { /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). */ - {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, - {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, + {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, }, }, { @@ -469,16 +450,16 @@ static struct bpf_align_test tests[] = { .matches = { {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, + {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. 
*/ - {7, "R5_w=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, + {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ - {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -486,7 +467,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. 
*/ - {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, } }, { @@ -528,7 +509,7 @@ static struct bpf_align_test tests[] = { /* New unknown value in R7 is (4n) */ {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Subtracting it from R6 blows our unsigned bounds */ - {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,var_off=(0x2; 0xfffffffffffffffc))"}, + {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* At the time the word size load is performed from R5, @@ -537,7 +518,8 @@ static struct bpf_align_test tests[] = { * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, + }, }, { @@ -579,18 +561,18 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c))"}, + {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, /* At the 
time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, }, }, }; @@ -669,51 +651,16 @@ static int do_test_single(struct bpf_align_test *test) return ret; } -static int do_test(unsigned int from, unsigned int to) +void test_align(void) { - int all_pass = 0; - int all_fail = 0; unsigned int i; - for (i = from; i < to; i++) { + for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_align_test *test = &tests[i]; - int fail; - printf("Test %3d: %s ... ", - i, test->descr); - fail = do_test_single(test); - if (fail) { - all_fail++; - printf("FAIL\n"); - } else { - all_pass++; - printf("PASS\n"); - } - } - printf("Results: %d pass %d fail\n", - all_pass, all_fail); - return all_fail ? 
EXIT_FAILURE : EXIT_SUCCESS; -} + if (!test__start_subtest(test->descr)) + continue; -int main(int argc, char **argv) -{ - unsigned int from = 0, to = ARRAY_SIZE(tests); - - if (argc == 3) { - unsigned int l = atoi(argv[argc - 2]); - unsigned int u = atoi(argv[argc - 1]); - - if (l < to && u < to) { - from = l; - to = u + 1; - } - } else if (argc == 2) { - unsigned int t = atoi(argv[argc - 1]); - - if (t < to) { - from = t; - to = t + 1; - } + CHECK_FAIL(do_test_single(test)); } - return do_test(from, to); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c new file mode 100644 index 000000000000..87c29dde1cf9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <test_progs.h> +#include "bpf_iter_ipv6_route.skel.h" +#include "bpf_iter_netlink.skel.h" +#include "bpf_iter_bpf_map.skel.h" +#include "bpf_iter_task.skel.h" +#include "bpf_iter_task_file.skel.h" +#include "bpf_iter_test_kern1.skel.h" +#include "bpf_iter_test_kern2.skel.h" +#include "bpf_iter_test_kern3.skel.h" +#include "bpf_iter_test_kern4.skel.h" + +static int duration; + +static void test_btf_id_or_null(void) +{ + struct bpf_iter_test_kern3 *skel; + + skel = bpf_iter_test_kern3__open_and_load(); + if (CHECK(skel, "bpf_iter_test_kern3__open_and_load", + "skeleton open_and_load unexpectedly succeeded\n")) { + bpf_iter_test_kern3__destroy(skel); + return; + } +} + +static void do_dummy_read(struct bpf_program *prog) +{ + struct bpf_link *link; + char buf[16] = {}; + int iter_fd, len; + + link = bpf_program__attach_iter(prog, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + return; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* not check contents, but ensure read() ends without error */ + while ((len = 
read(iter_fd, buf, sizeof(buf))) > 0) + ; + CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)); + + close(iter_fd); + +free_link: + bpf_link__destroy(link); +} + +static void test_ipv6_route(void) +{ + struct bpf_iter_ipv6_route *skel; + + skel = bpf_iter_ipv6_route__open_and_load(); + if (CHECK(!skel, "bpf_iter_ipv6_route__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_ipv6_route); + + bpf_iter_ipv6_route__destroy(skel); +} + +static void test_netlink(void) +{ + struct bpf_iter_netlink *skel; + + skel = bpf_iter_netlink__open_and_load(); + if (CHECK(!skel, "bpf_iter_netlink__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_netlink); + + bpf_iter_netlink__destroy(skel); +} + +static void test_bpf_map(void) +{ + struct bpf_iter_bpf_map *skel; + + skel = bpf_iter_bpf_map__open_and_load(); + if (CHECK(!skel, "bpf_iter_bpf_map__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_bpf_map); + + bpf_iter_bpf_map__destroy(skel); +} + +static void test_task(void) +{ + struct bpf_iter_task *skel; + + skel = bpf_iter_task__open_and_load(); + if (CHECK(!skel, "bpf_iter_task__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_task); + + bpf_iter_task__destroy(skel); +} + +static void test_task_file(void) +{ + struct bpf_iter_task_file *skel; + + skel = bpf_iter_task_file__open_and_load(); + if (CHECK(!skel, "bpf_iter_task_file__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_task_file); + + bpf_iter_task_file__destroy(skel); +} + +/* The expected string is less than 16 bytes */ +static int do_read_with_fd(int iter_fd, const char *expected, + bool read_one_char) +{ + int err = -1, len, read_buf_len, start; + char buf[16] = {}; + + read_buf_len = read_one_char ? 
1 : 16; + start = 0; + while ((len = read(iter_fd, buf + start, read_buf_len)) > 0) { + start += len; + if (CHECK(start >= 16, "read", "read len %d\n", len)) + return -1; + read_buf_len = read_one_char ? 1 : 16 - start; + } + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + return -1; + + err = strcmp(buf, expected); + if (CHECK(err, "read", "incorrect read result: buf %s, expected %s\n", + buf, expected)) + return -1; + + return 0; +} + +static void test_anon_iter(bool read_one_char) +{ + struct bpf_iter_test_kern1 *skel; + struct bpf_link *link; + int iter_fd, err; + + skel = bpf_iter_test_kern1__open_and_load(); + if (CHECK(!skel, "bpf_iter_test_kern1__open_and_load", + "skeleton open_and_load failed\n")) + return; + + err = bpf_iter_test_kern1__attach(skel); + if (CHECK(err, "bpf_iter_test_kern1__attach", + "skeleton attach failed\n")) { + goto out; + } + + link = skel->links.dump_task; + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto out; + + do_read_with_fd(iter_fd, "abcd", read_one_char); + close(iter_fd); + +out: + bpf_iter_test_kern1__destroy(skel); +} + +static int do_read(const char *path, const char *expected) +{ + int err, iter_fd; + + iter_fd = open(path, O_RDONLY); + if (CHECK(iter_fd < 0, "open", "open %s failed: %s\n", + path, strerror(errno))) + return -1; + + err = do_read_with_fd(iter_fd, expected, false); + close(iter_fd); + return err; +} + +static void test_file_iter(void) +{ + const char *path = "/sys/fs/bpf/bpf_iter_test1"; + struct bpf_iter_test_kern1 *skel1; + struct bpf_iter_test_kern2 *skel2; + struct bpf_link *link; + int err; + + skel1 = bpf_iter_test_kern1__open_and_load(); + if (CHECK(!skel1, "bpf_iter_test_kern1__open_and_load", + "skeleton open_and_load failed\n")) + return; + + link = bpf_program__attach_iter(skel1->progs.dump_task, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + /* unlink this path 
if it exists. */ + unlink(path); + + err = bpf_link__pin(link, path); + if (CHECK(err, "pin_iter", "pin_iter to %s failed: %d\n", path, err)) + goto free_link; + + err = do_read(path, "abcd"); + if (err) + goto unlink_path; + + /* file based iterator seems working fine. Let us do a link update + * of the underlying link and `cat` the iterator again, its content + * should change. + */ + skel2 = bpf_iter_test_kern2__open_and_load(); + if (CHECK(!skel2, "bpf_iter_test_kern2__open_and_load", + "skeleton open_and_load failed\n")) + goto unlink_path; + + err = bpf_link__update_program(link, skel2->progs.dump_task); + if (CHECK(err, "update_prog", "update_prog failed\n")) + goto destroy_skel2; + + do_read(path, "ABCD"); + +destroy_skel2: + bpf_iter_test_kern2__destroy(skel2); +unlink_path: + unlink(path); +free_link: + bpf_link__destroy(link); +out: + bpf_iter_test_kern1__destroy(skel1); +} + +static void test_overflow(bool test_e2big_overflow, bool ret1) +{ + __u32 map_info_len, total_read_len, expected_read_len; + int err, iter_fd, map1_fd, map2_fd, len; + struct bpf_map_info map_info = {}; + struct bpf_iter_test_kern4 *skel; + struct bpf_link *link; + __u32 page_size; + char *buf; + + skel = bpf_iter_test_kern4__open(); + if (CHECK(!skel, "bpf_iter_test_kern4__open", + "skeleton open failed\n")) + return; + + /* create two maps: bpf program will only do bpf_seq_write + * for these two maps. The goal is one map output almost + * fills seq_file buffer and then the other will trigger + * overflow and needs restart. 
+ */ + map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (CHECK(map1_fd < 0, "bpf_create_map", + "map_creation failed: %s\n", strerror(errno))) + goto out; + map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (CHECK(map2_fd < 0, "bpf_create_map", + "map_creation failed: %s\n", strerror(errno))) + goto free_map1; + + /* bpf_seq_printf kernel buffer is one page, so one map + * bpf_seq_write will mostly fill it, and the other map + * will partially fill and then trigger overflow and need + * bpf_seq_read restart. + */ + page_size = sysconf(_SC_PAGE_SIZE); + + if (test_e2big_overflow) { + skel->rodata->print_len = (page_size + 8) / 8; + expected_read_len = 2 * (page_size + 8); + } else if (!ret1) { + skel->rodata->print_len = (page_size - 8) / 8; + expected_read_len = 2 * (page_size - 8); + } else { + skel->rodata->print_len = 1; + expected_read_len = 2 * 8; + } + skel->rodata->ret1 = ret1; + + if (CHECK(bpf_iter_test_kern4__load(skel), + "bpf_iter_test_kern4__load", "skeleton load failed\n")) + goto free_map2; + + /* setup filtering map_id in bpf program */ + map_info_len = sizeof(map_info); + err = bpf_obj_get_info_by_fd(map1_fd, &map_info, &map_info_len); + if (CHECK(err, "get_map_info", "get map info failed: %s\n", + strerror(errno))) + goto free_map2; + skel->bss->map1_id = map_info.id; + + err = bpf_obj_get_info_by_fd(map2_fd, &map_info, &map_info_len); + if (CHECK(err, "get_map_info", "get map info failed: %s\n", + strerror(errno))) + goto free_map2; + skel->bss->map2_id = map_info.id; + + link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto free_map2; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + buf = malloc(expected_read_len); + if (!buf) + goto close_iter; + + /* do read */ + total_read_len = 0; + if (test_e2big_overflow) { + while ((len = read(iter_fd, buf, 
expected_read_len)) > 0) + total_read_len += len; + + CHECK(len != -1 || errno != E2BIG, "read", + "expected ret -1, errno E2BIG, but get ret %d, error %s\n", + len, strerror(errno)); + goto free_buf; + } else if (!ret1) { + while ((len = read(iter_fd, buf, expected_read_len)) > 0) + total_read_len += len; + + if (CHECK(len < 0, "read", "read failed: %s\n", + strerror(errno))) + goto free_buf; + } else { + do { + len = read(iter_fd, buf, expected_read_len); + if (len > 0) + total_read_len += len; + } while (len > 0 || len == -EAGAIN); + + if (CHECK(len < 0, "read", "read failed: %s\n", + strerror(errno))) + goto free_buf; + } + + if (CHECK(total_read_len != expected_read_len, "read", + "total len %u, expected len %u\n", total_read_len, + expected_read_len)) + goto free_buf; + + if (CHECK(skel->bss->map1_accessed != 1, "map1_accessed", + "expected 1 actual %d\n", skel->bss->map1_accessed)) + goto free_buf; + + if (CHECK(skel->bss->map2_accessed != 2, "map2_accessed", + "expected 2 actual %d\n", skel->bss->map2_accessed)) + goto free_buf; + + CHECK(skel->bss->map2_seqnum1 != skel->bss->map2_seqnum2, + "map2_seqnum", "two different seqnum %lld %lld\n", + skel->bss->map2_seqnum1, skel->bss->map2_seqnum2); + +free_buf: + free(buf); +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +free_map2: + close(map2_fd); +free_map1: + close(map1_fd); +out: + bpf_iter_test_kern4__destroy(skel); +} + +void test_bpf_iter(void) +{ + if (test__start_subtest("btf_id_or_null")) + test_btf_id_or_null(); + if (test__start_subtest("ipv6_route")) + test_ipv6_route(); + if (test__start_subtest("netlink")) + test_netlink(); + if (test__start_subtest("bpf_map")) + test_bpf_map(); + if (test__start_subtest("task")) + test_task(); + if (test__start_subtest("task_file")) + test_task_file(); + if (test__start_subtest("anon")) + test_anon_iter(false); + if (test__start_subtest("anon-read-one-char")) + test_anon_iter(true); + if (test__start_subtest("file")) + test_file_iter(); + 
if (test__start_subtest("overflow")) + test_overflow(false, false); + if (test__start_subtest("overflow-e2big")) + test_overflow(true, false); + if (test__start_subtest("prog-ret-1")) + test_overflow(false, true); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c index f10029821e16..7afa4160416f 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c @@ -1,26 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#define nr_iters 2 + void test_bpf_obj_id(void) { const __u64 array_magic_value = 0xfaceb00c; const __u32 array_key = 0; - const int nr_iters = 2; const char *file = "./test_obj_id.o"; const char *expected_prog_name = "test_obj_id"; const char *expected_map_name = "test_map_id"; const __u64 nsec_per_sec = 1000000000; - struct bpf_object *objs[nr_iters]; + struct bpf_object *objs[nr_iters] = {}; + struct bpf_link *links[nr_iters] = {}; + struct bpf_program *prog; int prog_fds[nr_iters], map_fds[nr_iters]; /* +1 to test for the info_len returned by kernel */ struct bpf_prog_info prog_infos[nr_iters + 1]; struct bpf_map_info map_infos[nr_iters + 1]; + struct bpf_link_info link_infos[nr_iters + 1]; /* Each prog only uses one map. +1 to test nr_map_ids * returned by kernel. 
*/ __u32 map_ids[nr_iters + 1]; - char jited_insns[128], xlated_insns[128], zeros[128]; + char jited_insns[128], xlated_insns[128], zeros[128], tp_name[128]; __u32 i, next_id, info_len, nr_id_found, duration = 0; struct timespec real_time_ts, boot_time_ts; int err = 0; @@ -36,14 +40,15 @@ void test_bpf_obj_id(void) CHECK(err >= 0 || errno != ENOENT, "get-fd-by-notexist-map-id", "err %d errno %d\n", err, errno); - for (i = 0; i < nr_iters; i++) - objs[i] = NULL; + err = bpf_link_get_fd_by_id(0); + CHECK(err >= 0 || errno != ENOENT, + "get-fd-by-notexist-link-id", "err %d errno %d\n", err, errno); /* Check bpf_obj_get_info_by_fd() */ bzero(zeros, sizeof(zeros)); for (i = 0; i < nr_iters; i++) { now = time(NULL); - err = bpf_prog_load(file, BPF_PROG_TYPE_SOCKET_FILTER, + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &objs[i], &prog_fds[i]); /* test_obj_id.o is a dumb prog. It should never fail * to load. @@ -60,6 +65,17 @@ void test_bpf_obj_id(void) if (CHECK_FAIL(err)) goto done; + prog = bpf_object__find_program_by_title(objs[i], + "raw_tp/sys_enter"); + if (CHECK_FAIL(!prog)) + goto done; + links[i] = bpf_program__attach(prog); + err = libbpf_get_error(links[i]); + if (CHECK(err, "prog_attach", "prog #%d, err %d\n", i, err)) { + links[i] = NULL; + goto done; + } + /* Check getting map info */ info_len = sizeof(struct bpf_map_info) * 2; bzero(&map_infos[i], info_len); @@ -107,7 +123,7 @@ void test_bpf_obj_id(void) load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + (prog_infos[i].load_time / nsec_per_sec); if (CHECK(err || - prog_infos[i].type != BPF_PROG_TYPE_SOCKET_FILTER || + prog_infos[i].type != BPF_PROG_TYPE_RAW_TRACEPOINT || info_len != sizeof(struct bpf_prog_info) || (env.jit_enabled && !prog_infos[i].jited_prog_len) || (env.jit_enabled && @@ -120,7 +136,11 @@ void test_bpf_obj_id(void) *(int *)(long)prog_infos[i].map_ids != map_infos[i].id || strcmp((char *)prog_infos[i].name, expected_prog_name), "get-prog-info(fd)", - "err %d errno %d i %d 
type %d(%d) info_len %u(%zu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n", + "err %d errno %d i %d type %d(%d) info_len %u(%zu) " + "jit_enabled %d jited_prog_len %u xlated_prog_len %u " + "jited_prog %d xlated_prog %d load_time %lu(%lu) " + "uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) " + "name %s(%s)\n", err, errno, i, prog_infos[i].type, BPF_PROG_TYPE_SOCKET_FILTER, info_len, sizeof(struct bpf_prog_info), @@ -135,6 +155,33 @@ void test_bpf_obj_id(void) *(int *)(long)prog_infos[i].map_ids, map_infos[i].id, prog_infos[i].name, expected_prog_name)) goto done; + + /* Check getting link info */ + info_len = sizeof(struct bpf_link_info) * 2; + bzero(&link_infos[i], info_len); + link_infos[i].raw_tracepoint.tp_name = (__u64)&tp_name; + link_infos[i].raw_tracepoint.tp_name_len = sizeof(tp_name); + err = bpf_obj_get_info_by_fd(bpf_link__fd(links[i]), + &link_infos[i], &info_len); + if (CHECK(err || + link_infos[i].type != BPF_LINK_TYPE_RAW_TRACEPOINT || + link_infos[i].prog_id != prog_infos[i].id || + link_infos[i].raw_tracepoint.tp_name != (__u64)&tp_name || + strcmp((char *)link_infos[i].raw_tracepoint.tp_name, + "sys_enter") || + info_len != sizeof(struct bpf_link_info), + "get-link-info(fd)", + "err %d errno %d info_len %u(%zu) type %d(%d) id %d " + "prog_id %d (%d) tp_name %s(%s)\n", + err, errno, + info_len, sizeof(struct bpf_link_info), + link_infos[i].type, BPF_LINK_TYPE_RAW_TRACEPOINT, + link_infos[i].id, + link_infos[i].prog_id, prog_infos[i].id, + (char *)link_infos[i].raw_tracepoint.tp_name, + "sys_enter")) + goto done; + } /* Check bpf_prog_get_next_id() */ @@ -247,7 +294,52 @@ void test_bpf_obj_id(void) "nr_id_found %u(%u)\n", nr_id_found, nr_iters); + /* Check bpf_link_get_next_id() */ + nr_id_found = 0; + next_id = 0; + while (!bpf_link_get_next_id(next_id, &next_id)) { + struct bpf_link_info link_info; + int link_fd, cmp_res; + + info_len = 
sizeof(link_info); + memset(&link_info, 0, info_len); + + link_fd = bpf_link_get_fd_by_id(next_id); + if (link_fd < 0 && errno == ENOENT) + /* The bpf_link is in the dead row */ + continue; + if (CHECK(link_fd < 0, "get-link-fd(next_id)", + "link_fd %d next_id %u errno %d\n", + link_fd, next_id, errno)) + break; + + for (i = 0; i < nr_iters; i++) + if (link_infos[i].id == next_id) + break; + + if (i == nr_iters) + continue; + + nr_id_found++; + + err = bpf_obj_get_info_by_fd(link_fd, &link_info, &info_len); + cmp_res = memcmp(&link_info, &link_infos[i], + offsetof(struct bpf_link_info, raw_tracepoint)); + CHECK(err || info_len != sizeof(link_info) || cmp_res, + "check get-link-info(next_id->fd)", + "err %d errno %d info_len %u(%zu) memcmp %d\n", + err, errno, info_len, sizeof(struct bpf_link_info), + cmp_res); + + close(link_fd); + } + CHECK(nr_id_found != nr_iters, + "check total link id found by get_next_id", + "nr_id_found %u(%u)\n", nr_id_found, nr_iters); + done: - for (i = 0; i < nr_iters; i++) + for (i = 0; i < nr_iters; i++) { + bpf_link__destroy(links[i]); bpf_object__close(objs[i]); + } } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c new file mode 100644 index 000000000000..f7ee8fa377ad --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <test_progs.h> + +#include "test_btf_map_in_map.skel.h" + +void test_btf_map_in_map(void) +{ + int duration = 0, err, key = 0, val; + struct test_btf_map_in_map* skel; + + skel = test_btf_map_in_map__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) + return; + + err = test_btf_map_in_map__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* inner1 = input, inner2 = input + 1 */ + val = bpf_map__fd(skel->maps.inner_map1); + 
bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0); + val = bpf_map__fd(skel->maps.inner_map2); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0); + skel->bss->input = 1; + usleep(1); + + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val); + CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val); + CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2); + + /* inner1 = input + 1, inner2 = input */ + val = bpf_map__fd(skel->maps.inner_map2); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0); + val = bpf_map__fd(skel->maps.inner_map1); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0); + skel->bss->input = 3; + usleep(1); + + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val); + CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val); + CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3); + +cleanup: + test_btf_map_in_map__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c new file mode 100644 index 000000000000..059047af7df3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <test_progs.h> + +#include "network_helpers.h" +#include "cgroup_skb_sk_lookup_kern.skel.h" + +static void run_lookup_test(__u16 *g_serv_port, int out_sk) +{ + int serv_sk = -1, in_sk = -1, serv_in_sk = -1, err; + struct sockaddr_in6 addr = {}; + socklen_t addr_len = sizeof(addr); + __u32 duration = 0; + + serv_sk = start_server(AF_INET6, SOCK_STREAM); + if (CHECK(serv_sk < 0, "start_server", "failed to start server\n")) + return; + + err = getsockname(serv_sk, (struct sockaddr *)&addr, &addr_len); + 
if (CHECK(err, "getsockname", "errno %d\n", errno)) + goto cleanup; + + *g_serv_port = addr.sin6_port; + + /* Client outside of test cgroup should fail to connect by timeout. */ + err = connect_fd_to_fd(out_sk, serv_sk); + if (CHECK(!err || errno != EINPROGRESS, "connect_fd_to_fd", + "unexpected result err %d errno %d\n", err, errno)) + goto cleanup; + + err = connect_wait(out_sk); + if (CHECK(err, "connect_wait", "unexpected result %d\n", err)) + goto cleanup; + + /* Client inside test cgroup should connect just fine. */ + in_sk = connect_to_fd(AF_INET6, SOCK_STREAM, serv_sk); + if (CHECK(in_sk < 0, "connect_to_fd", "errno %d\n", errno)) + goto cleanup; + + serv_in_sk = accept(serv_sk, NULL, NULL); + if (CHECK(serv_in_sk < 0, "accept", "errno %d\n", errno)) + goto cleanup; + +cleanup: + close(serv_in_sk); + close(in_sk); + close(serv_sk); +} + +static void run_cgroup_bpf_test(const char *cg_path, int out_sk) +{ + struct cgroup_skb_sk_lookup_kern *skel; + struct bpf_link *link; + __u32 duration = 0; + int cgfd = -1; + + skel = cgroup_skb_sk_lookup_kern__open_and_load(); + if (CHECK(!skel, "skel_open_load", "open_load failed\n")) + return; + + cgfd = test__join_cgroup(cg_path); + if (CHECK(cgfd < 0, "cgroup_join", "cgroup setup failed\n")) + goto cleanup; + + link = bpf_program__attach_cgroup(skel->progs.ingress_lookup, cgfd); + if (CHECK(IS_ERR(link), "cgroup_attach", "err: %ld\n", PTR_ERR(link))) + goto cleanup; + + run_lookup_test(&skel->bss->g_serv_port, out_sk); + + bpf_link__destroy(link); + +cleanup: + close(cgfd); + cgroup_skb_sk_lookup_kern__destroy(skel); +} + +void test_cgroup_skb_sk_lookup(void) +{ + const char *cg_path = "/foo"; + int out_sk; + + /* Create a socket before joining testing cgroup so that its cgroup id + * differs from that of testing cgroup. Moving selftests process to + * testing cgroup won't change cgroup id of an already created socket. 
+ */ + out_sk = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0); + if (CHECK_FAIL(out_sk < 0)) + return; + + run_cgroup_bpf_test(cg_path, out_sk); + + close(out_sk); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c new file mode 100644 index 000000000000..f259085cca6a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2020 Cloudflare + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <string.h> + +#include <linux/pkt_cls.h> + +#include <test_progs.h> + +#include "progs/test_cls_redirect.h" +#include "test_cls_redirect.skel.h" + +#define ENCAP_IP INADDR_LOOPBACK +#define ENCAP_PORT (1234) + +struct addr_port { + in_port_t port; + union { + struct in_addr in_addr; + struct in6_addr in6_addr; + }; +}; + +struct tuple { + int family; + struct addr_port src; + struct addr_port dst; +}; + +static int start_server(const struct sockaddr *addr, socklen_t len, int type) +{ + int fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + return -1; + if (CHECK_FAIL(bind(fd, addr, len) == -1)) + goto err; + if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) + goto err; + + return fd; + +err: + close(fd); + return -1; +} + +static int connect_to_server(const struct sockaddr *addr, socklen_t len, + int type) +{ + int fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + return -1; + if (CHECK_FAIL(connect(fd, addr, len))) + goto err; + + return fd; + +err: + close(fd); + return -1; +} + +static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) +{ + const struct sockaddr_in6 *in6; + const struct sockaddr_in *in; + + switch (sa->sa_family) { + case AF_INET: + in = (const struct sockaddr_in *)sa; + ap->in_addr = in->sin_addr; + ap->port = in->sin_port; + return true; + + case AF_INET6: + in6 = (const struct sockaddr_in6 *)sa; + 
ap->in6_addr = in6->sin6_addr; + ap->port = in6->sin6_port; + return true; + + default: + return false; + } +} + +static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, + int *server, int *conn, struct tuple *tuple) +{ + struct sockaddr_storage ss; + socklen_t slen = sizeof(ss); + struct sockaddr *sa = (struct sockaddr *)&ss; + + *server = start_server(addr, len, type); + if (*server < 0) + return false; + + if (CHECK_FAIL(getsockname(*server, sa, &slen))) + goto close_server; + + *conn = connect_to_server(sa, slen, type); + if (*conn < 0) + goto close_server; + + /* We want to simulate packets arriving at conn, so we have to + * swap src and dst. + */ + slen = sizeof(ss); + if (CHECK_FAIL(getsockname(*conn, sa, &slen))) + goto close_conn; + + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst))) + goto close_conn; + + slen = sizeof(ss); + if (CHECK_FAIL(getpeername(*conn, sa, &slen))) + goto close_conn; + + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src))) + goto close_conn; + + tuple->family = ss.ss_family; + return true; + +close_conn: + close(*conn); + *conn = -1; +close_server: + close(*server); + *server = -1; + return false; +} + +static socklen_t prepare_addr(struct sockaddr_storage *addr, int family) +{ + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = family; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + return sizeof(*addr4); + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = family; + addr6->sin6_addr = in6addr_loopback; + return sizeof(*addr6); + default: + fprintf(stderr, "Invalid family %d", family); + return 0; + } +} + +static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr) +{ + return tattr->data_size_out < tattr->data_size_in; +} + +enum type { + UDP, + TCP, + __NR_KIND, +}; + +enum hops { + NO_HOPS, + 
ONE_HOP, +}; + +enum flags { + NONE, + SYN, + ACK, +}; + +enum conn { + KNOWN_CONN, + UNKNOWN_CONN, +}; + +enum result { + ACCEPT, + FORWARD, +}; + +struct test_cfg { + enum type type; + enum result result; + enum conn conn; + enum hops hops; + enum flags flags; +}; + +static int test_str(void *buf, size_t len, const struct test_cfg *test, + int family) +{ + const char *family_str, *type, *conn, *hops, *result, *flags; + + family_str = "IPv4"; + if (family == AF_INET6) + family_str = "IPv6"; + + type = "TCP"; + if (test->type == UDP) + type = "UDP"; + + conn = "known"; + if (test->conn == UNKNOWN_CONN) + conn = "unknown"; + + hops = "no hops"; + if (test->hops == ONE_HOP) + hops = "one hop"; + + result = "accept"; + if (test->result == FORWARD) + result = "forward"; + + flags = "none"; + if (test->flags == SYN) + flags = "SYN"; + else if (test->flags == ACK) + flags = "ACK"; + + return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)", family_str, + type, result, conn, hops, flags); +} + +static struct test_cfg tests[] = { + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN }, + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK }, + { TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK }, + { TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK }, + { UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE }, + { UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE }, + { UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE }, +}; + +static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto) +{ + const uint8_t hlen = + (sizeof(struct guehdr) / sizeof(uint32_t)) + hop_count; + *encap = (encap_headers_t){ + .eth = { .h_proto = htons(ETH_P_IP) }, + .ip = { + .ihl = 5, + .version = 4, + .ttl = IPDEFTTL, + .protocol = IPPROTO_UDP, + .daddr = htonl(ENCAP_IP) + }, + .udp = { + .dest = htons(ENCAP_PORT), + }, + .gue = { + .hlen = hlen, + .proto_ctype = proto + }, + .unigue = { + .hop_count = hop_count + }, + }; +} + +static size_t build_input(const struct test_cfg *test, void *const buf, + const struct tuple *tuple) +{ + in_port_t 
sport = tuple->src.port; + encap_headers_t encap; + struct iphdr ip; + struct ipv6hdr ipv6; + struct tcphdr tcp; + struct udphdr udp; + struct in_addr next_hop; + uint8_t *p = buf; + int proto; + + proto = IPPROTO_IPIP; + if (tuple->family == AF_INET6) + proto = IPPROTO_IPV6; + + encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto); + p = mempcpy(p, &encap, sizeof(encap)); + + if (test->hops == ONE_HOP) { + next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) }; + p = mempcpy(p, &next_hop, sizeof(next_hop)); + } + + proto = IPPROTO_TCP; + if (test->type == UDP) + proto = IPPROTO_UDP; + + switch (tuple->family) { + case AF_INET: + ip = (struct iphdr){ + .ihl = 5, + .version = 4, + .ttl = IPDEFTTL, + .protocol = proto, + .saddr = tuple->src.in_addr.s_addr, + .daddr = tuple->dst.in_addr.s_addr, + }; + p = mempcpy(p, &ip, sizeof(ip)); + break; + case AF_INET6: + ipv6 = (struct ipv6hdr){ + .version = 6, + .hop_limit = IPDEFTTL, + .nexthdr = proto, + .saddr = tuple->src.in6_addr, + .daddr = tuple->dst.in6_addr, + }; + p = mempcpy(p, &ipv6, sizeof(ipv6)); + break; + default: + return 0; + } + + if (test->conn == UNKNOWN_CONN) + sport--; + + switch (test->type) { + case TCP: + tcp = (struct tcphdr){ + .source = sport, + .dest = tuple->dst.port, + }; + if (test->flags == SYN) + tcp.syn = true; + if (test->flags == ACK) + tcp.ack = true; + p = mempcpy(p, &tcp, sizeof(tcp)); + break; + case UDP: + udp = (struct udphdr){ + .source = sport, + .dest = tuple->dst.port, + }; + p = mempcpy(p, &udp, sizeof(udp)); + break; + default: + return 0; + } + + return (void *)p - buf; +} + +static void close_fds(int *fds, int n) +{ + int i; + + for (i = 0; i < n; i++) + if (fds[i] > 0) + close(fds[i]); +} + +void test_cls_redirect(void) +{ + struct test_cls_redirect *skel = NULL; + struct bpf_prog_test_run_attr tattr = {}; + int families[] = { AF_INET, AF_INET6 }; + struct sockaddr_storage ss; + struct sockaddr *addr; + socklen_t slen; + int i, j, err; + + int 
servers[__NR_KIND][ARRAY_SIZE(families)] = {}; + int conns[__NR_KIND][ARRAY_SIZE(families)] = {}; + struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)]; + + skel = test_cls_redirect__open(); + if (CHECK_FAIL(!skel)) + return; + + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); + + if (CHECK_FAIL(test_cls_redirect__load(skel))) + goto cleanup; + + addr = (struct sockaddr *)&ss; + for (i = 0; i < ARRAY_SIZE(families); i++) { + slen = prepare_addr(&ss, families[i]); + if (CHECK_FAIL(!slen)) + goto cleanup; + + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM, + &servers[UDP][i], &conns[UDP][i], + &tuples[UDP][i]))) + goto cleanup; + + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM, + &servers[TCP][i], &conns[TCP][i], + &tuples[TCP][i]))) + goto cleanup; + } + + tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + struct test_cfg *test = &tests[i]; + + for (j = 0; j < ARRAY_SIZE(families); j++) { + struct tuple *tuple = &tuples[test->type][j]; + char input[256]; + char tmp[256]; + + test_str(tmp, sizeof(tmp), test, tuple->family); + if (!test__start_subtest(tmp)) + continue; + + tattr.data_out = tmp; + tattr.data_size_out = sizeof(tmp); + + tattr.data_in = input; + tattr.data_size_in = build_input(test, input, tuple); + if (CHECK_FAIL(!tattr.data_size_in)) + continue; + + err = bpf_prog_test_run_xattr(&tattr); + if (CHECK_FAIL(err)) + continue; + + if (tattr.retval != TC_ACT_REDIRECT) { + PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n", + tattr.retval); + continue; + } + + switch (test->result) { + case ACCEPT: + if (CHECK_FAIL(!was_decapsulated(&tattr))) + continue; + break; + case FORWARD: + if (CHECK_FAIL(was_decapsulated(&tattr))) + continue; + break; + default: + PRINT_FAIL("unknown result %d\n", test->result); + continue; + } + } + } + +cleanup: + test_cls_redirect__destroy(skel); + close_fds((int *)servers, sizeof(servers) / 
sizeof(servers[0][0])); + close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0])); +} diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c new file mode 100644 index 000000000000..17bbf76812ca --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static int verify_ports(int family, int fd, + __u16 expected_local, __u16 expected_peer) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + __u16 port; + + if (getsockname(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected_local) { + log_err("Unexpected local port %d, expected %d", ntohs(port), + expected_local); + return -1; + } + + if (getpeername(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get peer addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected_peer) { + log_err("Unexpected peer port %d, expected %d", ntohs(port), + expected_peer); + return -1; + } + + return 0; +} + +static int run_test(int cgroup_fd, int server_fd, int family, int type) +{ + bool v4 = family == AF_INET; + __u16 expected_local_port = v4 ? 22222 : 22223; + __u16 expected_peer_port = 60000; + struct bpf_prog_load_attr attr = { + .file = v4 ? 
"./connect_force_port4.o" : + "./connect_force_port6.o", + }; + struct bpf_program *prog; + struct bpf_object *obj; + int xlate_fd, fd, err; + __u32 duration = 0; + + err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd); + if (err) { + log_err("Failed to load BPF object"); + return -1; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/connect4" : + "cgroup/connect6"); + if (CHECK(!prog, "find_prog", "connect prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_CONNECT : + BPF_CGROUP_INET6_CONNECT, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/getpeername4" : + "cgroup/getpeername6"); + if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET6_GETPEERNAME, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/getsockname4" : + "cgroup/getsockname6"); + if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? 
+ BPF_CGROUP_INET4_GETSOCKNAME : + BPF_CGROUP_INET6_GETSOCKNAME, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + fd = connect_to_fd(family, type, server_fd); + if (fd < 0) { + err = -1; + goto close_bpf_object; + } + + err = verify_ports(family, fd, expected_local_port, + expected_peer_port); + close(fd); + +close_bpf_object: + bpf_object__close(obj); + return err; +} + +void test_connect_force_port(void) +{ + int server_fd, cgroup_fd; + + cgroup_fd = test__join_cgroup("/connect_force_port"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server_with_port(AF_INET6, SOCK_STREAM, 60124); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server_with_port(AF_INET, SOCK_DGRAM, 60123); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); + close(server_fd); + + server_fd = start_server_with_port(AF_INET6, SOCK_DGRAM, 60124); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 31e177adbdf1..084ed26a7d78 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -392,7 +392,7 @@ static struct core_reloc_test_case test_cases[] = { .input = STRUCT_TO_CHAR_PTR(core_reloc_existence___minimal) { .a = 42, }, - .input_len = sizeof(struct core_reloc_existence), + .input_len = sizeof(struct 
core_reloc_existence___minimal), .output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) { .a_exists = 1, .b_exists = 0, diff --git a/tools/testing/selftests/bpf/prog_tests/enable_stats.c b/tools/testing/selftests/bpf/prog_tests/enable_stats.c new file mode 100644 index 000000000000..2cb2085917e7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/enable_stats.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include "test_enable_stats.skel.h" + +void test_enable_stats(void) +{ + struct test_enable_stats *skel; + int stats_fd, err, prog_fd; + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + int duration = 0; + + skel = test_enable_stats__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + return; + + stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME); + if (CHECK(stats_fd < 0, "get_stats_fd", "failed %d\n", errno)) { + test_enable_stats__destroy(skel); + return; + } + + err = test_enable_stats__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) + goto cleanup; + + test_enable_stats__detach(skel); + + prog_fd = bpf_program__fd(skel->progs.test_enable_stats); + memset(&info, 0, info_len); + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (CHECK(err, "get_prog_info", + "failed to get bpf_prog_info for fd %d\n", prog_fd)) + goto cleanup; + if (CHECK(info.run_time_ns == 0, "check_stats_enabled", + "failed to enable run_time_ns stats\n")) + goto cleanup; + + CHECK(info.run_cnt != skel->bss->count, "check_run_cnt_valid", + "invalid run_cnt stats\n"); + +cleanup: + test_enable_stats__destroy(skel); + close(stats_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index c2642517e1d8..a895bfed55db 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 
/* Copyright (c) 2019 Facebook */ #include <test_progs.h> +#include <network_helpers.h> static void test_fexit_bpf2bpf_common(const char *obj_file, const char *target_obj_file, diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 92563898867c..ea14e3ece812 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> #include <error.h> #include <linux/if.h> #include <linux/if_tun.h> #include <sys/uio.h> +#include "bpf_flow.skel.h" + #ifndef IP_MF #define IP_MF 0x2000 #endif @@ -100,6 +103,7 @@ struct test { #define VLAN_HLEN 4 +static __u32 duration; struct test tests[] = { { .name = "ipv4", @@ -443,17 +447,130 @@ static int ifup(const char *ifname) return 0; } +static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array) +{ + int i, err, map_fd, prog_fd; + struct bpf_program *prog; + char prog_name[32]; + + map_fd = bpf_map__fd(prog_array); + if (map_fd < 0) + return -1; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "flow_dissector/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (!prog) + return -1; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) + return -1; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (err) + return -1; + } + return 0; +} + +static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) +{ + int i, err, keys_fd; + + keys_fd = bpf_map__fd(keys); + if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + /* Keep in sync with 'flags' from eth_get_headlen. 
*/ + __u32 eth_get_headlen_flags = + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; + struct bpf_prog_test_run_attr tattr = {}; + struct bpf_flow_keys flow_keys = {}; + __u32 key = (__u32)(tests[i].keys.sport) << 16 | + tests[i].keys.dport; + + /* For skb-less case we can't pass input flags; run + * only the tests that have a matching set of flags. + */ + + if (tests[i].flags != eth_get_headlen_flags) + continue; + + err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); + CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); + + err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); + CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err); + + CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err); + CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); + + err = bpf_map_delete_elem(keys_fd, &key); + CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); + } +} + +static void test_skb_less_prog_attach(struct bpf_flow *skel, int tap_fd) +{ + int err, prog_fd; + + prog_fd = bpf_program__fd(skel->progs._dissect); + if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) + return; + + err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno)) + return; + + run_tests_skb_less(tap_fd, skel->maps.last_dissection); + + err = bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); + CHECK(err, "bpf_prog_detach", "err %d errno %d\n", err, errno); +} + +static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) +{ + struct bpf_link *link; + int err, net_fd; + + net_fd = open("/proc/self/ns/net", O_RDONLY); + if (CHECK(net_fd < 0, "open(/proc/self/ns/net)", "err %d\n", errno)) + return; + + link = bpf_program__attach_netns(skel->progs._dissect, net_fd); + if (CHECK(IS_ERR(link), "attach_netns", "err %ld\n", PTR_ERR(link))) + goto out_close; + + run_tests_skb_less(tap_fd, skel->maps.last_dissection); + + err = bpf_link__destroy(link); + CHECK(err, "bpf_link__destroy", "err 
%d\n", err); +out_close: + close(net_fd); +} + void test_flow_dissector(void) { int i, err, prog_fd, keys_fd = -1, tap_fd; - struct bpf_object *obj; - __u32 duration = 0; + struct bpf_flow *skel; - err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector", - "jmp_table", "last_dissection", &prog_fd, &keys_fd); - if (CHECK_FAIL(err)) + skel = bpf_flow__open_and_load(); + if (CHECK(!skel, "skel", "failed to open/load skeleton\n")) return; + prog_fd = bpf_program__fd(skel->progs._dissect); + if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) + goto out_destroy_skel; + keys_fd = bpf_map__fd(skel->maps.last_dissection); + if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) + goto out_destroy_skel; + err = init_prog_array(skel->obj, skel->maps.jmp_table); + if (CHECK(err, "init_prog_array", "err %d\n", err)) + goto out_destroy_skel; + for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_flow_keys flow_keys; struct bpf_prog_test_run_attr tattr = { @@ -486,43 +603,17 @@ void test_flow_dissector(void) * via BPF map in this case. */ - err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); - CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno); - tap_fd = create_tap("tap0"); CHECK(tap_fd < 0, "create_tap", "tap_fd %d errno %d\n", tap_fd, errno); err = ifup("tap0"); CHECK(err, "ifup", "err %d errno %d\n", err, errno); - for (i = 0; i < ARRAY_SIZE(tests); i++) { - /* Keep in sync with 'flags' from eth_get_headlen. */ - __u32 eth_get_headlen_flags = - BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; - struct bpf_prog_test_run_attr tattr = {}; - struct bpf_flow_keys flow_keys = {}; - __u32 key = (__u32)(tests[i].keys.sport) << 16 | - tests[i].keys.dport; - - /* For skb-less case we can't pass input flags; run - * only the tests that have a matching set of flags. 
- */ - - if (tests[i].flags != eth_get_headlen_flags) - continue; - - err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); - CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); - - err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); - CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err); - - CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err); - CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); - - err = bpf_map_delete_elem(keys_fd, &key); - CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); - } + /* Test direct prog attachment */ + test_skb_less_prog_attach(skel, tap_fd); + /* Test indirect prog attachment via link */ + test_skb_less_link_create(skel, tap_fd); - bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); - bpf_object__close(obj); + close(tap_fd); +out_destroy_skel: + bpf_flow__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c index dc5ef155ec28..0e8a4d2f023d 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_flow_dissector_load_bytes(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c index 1f51ba66b98b..15cb554a66d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c @@ -11,6 +11,7 @@ #include <fcntl.h> #include <sched.h> #include <stdbool.h> +#include <sys/stat.h> #include <unistd.h> #include <linux/bpf.h> @@ -18,21 +19,30 @@ #include "test_progs.h" -static bool is_attached(int netns) +static int init_net = -1; + +static __u32 query_attached_prog_id(int netns) { - __u32 cnt; + __u32 
prog_ids[1] = {}; + __u32 prog_cnt = ARRAY_SIZE(prog_ids); int err; - err = bpf_prog_query(netns, BPF_FLOW_DISSECTOR, 0, NULL, NULL, &cnt); + err = bpf_prog_query(netns, BPF_FLOW_DISSECTOR, 0, NULL, + prog_ids, &prog_cnt); if (CHECK_FAIL(err)) { perror("bpf_prog_query"); - return true; /* fail-safe */ + return 0; } - return cnt > 0; + return prog_cnt == 1 ? prog_ids[0] : 0; +} + +static bool prog_is_attached(int netns) +{ + return query_attached_prog_id(netns) > 0; } -static int load_prog(void) +static int load_prog(enum bpf_prog_type type) { struct bpf_insn prog[] = { BPF_MOV64_IMM(BPF_REG_0, BPF_OK), @@ -40,61 +50,566 @@ static int load_prog(void) }; int fd; - fd = bpf_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog, - ARRAY_SIZE(prog), "GPL", 0, NULL, 0); + fd = bpf_load_program(type, prog, ARRAY_SIZE(prog), "GPL", 0, NULL, 0); if (CHECK_FAIL(fd < 0)) perror("bpf_load_program"); return fd; } -static void do_flow_dissector_reattach(void) +static __u32 query_prog_id(int prog) { - int prog_fd[2] = { -1, -1 }; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); int err; - prog_fd[0] = load_prog(); - if (prog_fd[0] < 0) - return; + err = bpf_obj_get_info_by_fd(prog, &info, &info_len); + if (CHECK_FAIL(err || info_len != sizeof(info))) { + perror("bpf_obj_get_info_by_fd"); + return 0; + } - prog_fd[1] = load_prog(); - if (prog_fd[1] < 0) - goto out_close; + return info.id; +} + +static int unshare_net(int old_net) +{ + int err, new_net; - err = bpf_prog_attach(prog_fd[0], 0, BPF_FLOW_DISSECTOR, 0); + err = unshare(CLONE_NEWNET); if (CHECK_FAIL(err)) { - perror("bpf_prog_attach-0"); - goto out_close; + perror("unshare(CLONE_NEWNET)"); + return -1; + } + new_net = open("/proc/self/ns/net", O_RDONLY); + if (CHECK_FAIL(new_net < 0)) { + perror("open(/proc/self/ns/net)"); + setns(old_net, CLONE_NEWNET); + return -1; } + return new_net; +} + +static void test_prog_attach_prog_attach(int netns, int prog1, int prog2) +{ + int err; + + err = 
bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); /* Expect success when attaching a different program */ - err = bpf_prog_attach(prog_fd[1], 0, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0); if (CHECK_FAIL(err)) { - perror("bpf_prog_attach-1"); + perror("bpf_prog_attach(prog2) #1"); goto out_detach; } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); /* Expect failure when attaching the same program twice */ - err = bpf_prog_attach(prog_fd[1], 0, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0); if (CHECK_FAIL(!err || errno != EINVAL)) - perror("bpf_prog_attach-2"); + perror("bpf_prog_attach(prog2) #2"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); out_detach: err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); if (CHECK_FAIL(err)) perror("bpf_prog_detach"); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_link_create(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int link1, link2; + + link1 = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure creating link when another link exists */ + errno = 0; + link2 = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link2 != -1 || errno != E2BIG)) + perror("bpf_prog_attach(prog2) expected E2BIG"); + if (link2 != -1) + close(link2); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link1); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_prog_attach_link_create(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + err = 
bpf_prog_attach(prog1, -1, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure creating link when prog attached */ + errno = 0; + link = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link != -1 || errno != EEXIST)) + perror("bpf_link_create(prog2) expected EEXIST"); + if (link != -1) + close(link); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(err)) + perror("bpf_prog_detach"); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_prog_attach(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure attaching prog when link exists */ + errno = 0; + err = bpf_prog_attach(prog2, -1, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(!err || errno != EEXIST)) + perror("bpf_prog_attach(prog2) expected EEXIST"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_prog_detach(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure detaching prog when link exists */ + errno = 0; + err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_prog_detach expected EINVAL"); + 
CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_prog_attach_detach_query(int netns, int prog1, int prog2) +{ + int err; + + err = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach"); + return; + } + + /* Expect no prog attached after successful detach */ + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_close_query(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + /* Expect no prog attached after closing last link FD */ + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_no_old_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect success replacing the prog when old prog not specified */ + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) + perror("bpf_link_update"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_replace_old_prog(int netns, int prog1, int prog2) 
+{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect success F_REPLACE and old prog specified to succeed */ + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = prog1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) + perror("bpf_link_update"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_invalid_opts(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail w/ old prog FD but w/o F_REPLACE*/ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = prog1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) { + perror("bpf_link_update expected EINVAL"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail on old prog FD mismatch */ + errno = 0; + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = prog2; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EPERM)) { + perror("bpf_link_update expected EPERM"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail for invalid old prog FD */ + errno = 0; + 
update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = -1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EBADF)) { + perror("bpf_link_update expected EBADF"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail with invalid flags */ + errno = 0; + update_opts.flags = BPF_F_ALLOW_MULTI; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_link_update expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + +out_close: + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_invalid_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link, prog3; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure when new prog FD is not valid */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, -1, &update_opts); + if (CHECK_FAIL(!err || errno != EBADF)) { + perror("bpf_link_update expected EINVAL"); + goto out_close_link; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + prog3 = load_prog(BPF_PROG_TYPE_SOCKET_FILTER); + if (prog3 < 0) + goto out_close_link; + + /* Expect failure when new prog FD type doesn't match */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog3, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_link_update expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(prog3); +out_close_link: + close(link); 
+ CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_netns_gone(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link, old_net; + + old_net = netns; + netns = unshare_net(old_net); + if (netns < 0) + return; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(netns); + err = setns(old_net, CLONE_NEWNET); + if (CHECK_FAIL(err)) { + perror("setns(CLONE_NEWNET)"); + close(link); + return; + } + + /* Expect failure when netns destroyed */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != ENOLINK)) + perror("bpf_link_update"); + + close(link); +} + +static void test_link_get_info(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + struct bpf_link_info info = {}; + struct stat netns_stat = {}; + __u32 info_len, link_id; + int err, link, old_net; + + old_net = netns; + netns = unshare_net(old_net); + if (netns < 0) + return; + + err = fstat(netns, &netns_stat); + if (CHECK_FAIL(err)) { + perror("stat(netns)"); + goto out_resetns; + } + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + goto out_resetns; + } + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect link info to be sane and match prog and netns details */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id == 0); + 
CHECK_FAIL(info.prog_id != query_prog_id(prog1)); + CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) { + perror("bpf_link_update(prog2)"); + goto out_unlink; + } + + link_id = info.id; + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect no info change after update except in prog id */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id != link_id); + CHECK_FAIL(info.prog_id != query_prog_id(prog2)); + CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + + /* Leave netns link is attached to and close last FD to it */ + err = setns(old_net, CLONE_NEWNET); + if (CHECK_FAIL(err)) { + perror("setns(NEWNET)"); + goto out_unlink; + } + close(netns); + old_net = -1; + netns = -1; + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect netns_ino to change to 0 */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id != link_id); + CHECK_FAIL(info.prog_id != query_prog_id(prog2)); + CHECK_FAIL(info.netns.netns_ino != 0); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + +out_unlink: + close(link); +out_resetns: + if (old_net != -1) + setns(old_net, CLONE_NEWNET); + if (netns != -1) + close(netns); +} + +static void run_tests(int netns) +{ + struct test { + const char *test_name; + void (*test_func)(int netns, int prog1, int prog2); + } tests[] = { + { "prog attach, prog attach", + test_prog_attach_prog_attach }, + { "link create, link create", + 
test_link_create_link_create }, + { "prog attach, link create", + test_prog_attach_link_create }, + { "link create, prog attach", + test_link_create_prog_attach }, + { "link create, prog detach", + test_link_create_prog_detach }, + { "prog attach, detach, query", + test_prog_attach_detach_query }, + { "link create, close, query", + test_link_create_close_query }, + { "link update no old prog", + test_link_update_no_old_prog }, + { "link update with replace old prog", + test_link_update_replace_old_prog }, + { "link update invalid opts", + test_link_update_invalid_opts }, + { "link update invalid prog", + test_link_update_invalid_prog }, + { "link update netns gone", + test_link_update_netns_gone }, + { "link get info", + test_link_get_info }, + }; + int i, progs[2] = { -1, -1 }; + char test_name[80]; + + for (i = 0; i < ARRAY_SIZE(progs); i++) { + progs[i] = load_prog(BPF_PROG_TYPE_FLOW_DISSECTOR); + if (progs[i] < 0) + goto out_close; + } + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + snprintf(test_name, sizeof(test_name), + "flow dissector %s%s", + tests[i].test_name, + netns == init_net ? 
" (init_net)" : ""); + if (test__start_subtest(test_name)) + tests[i].test_func(netns, progs[0], progs[1]); + } out_close: - close(prog_fd[1]); - close(prog_fd[0]); + for (i = 0; i < ARRAY_SIZE(progs); i++) { + if (progs[i] != -1) + CHECK_FAIL(close(progs[i])); + } } void test_flow_dissector_reattach(void) { - int init_net, self_net, err; + int err, new_net, saved_net; - self_net = open("/proc/self/ns/net", O_RDONLY); - if (CHECK_FAIL(self_net < 0)) { + saved_net = open("/proc/self/ns/net", O_RDONLY); + if (CHECK_FAIL(saved_net < 0)) { perror("open(/proc/self/ns/net"); return; } @@ -111,30 +626,29 @@ void test_flow_dissector_reattach(void) goto out_close; } - if (is_attached(init_net)) { + if (prog_is_attached(init_net)) { test__skip(); printf("Can't test with flow dissector attached to init_net\n"); goto out_setns; } /* First run tests in root network namespace */ - do_flow_dissector_reattach(); + run_tests(init_net); /* Then repeat tests in a non-root namespace */ - err = unshare(CLONE_NEWNET); - if (CHECK_FAIL(err)) { - perror("unshare(CLONE_NEWNET)"); + new_net = unshare_net(init_net); + if (new_net < 0) goto out_setns; - } - do_flow_dissector_reattach(); + run_tests(new_net); + close(new_net); out_setns: /* Move back to netns we started in. 
*/ - err = setns(self_net, CLONE_NEWNET); + err = setns(saved_net, CLONE_NEWNET); if (CHECK_FAIL(err)) perror("setns(/proc/self/ns/net)"); out_close: close(init_net); - close(self_net); + close(saved_net); } diff --git a/tools/testing/selftests/bpf/prog_tests/global_data.c b/tools/testing/selftests/bpf/prog_tests/global_data.c index c680926fce73..e3cb62b0a110 100644 --- a/tools/testing/selftests/bpf/prog_tests/global_data.c +++ b/tools/testing/selftests/bpf/prog_tests/global_data.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> static void test_global_data_number(struct bpf_object *obj, __u32 duration) { diff --git a/tools/testing/selftests/bpf/test_hashmap.c b/tools/testing/selftests/bpf/prog_tests/hashmap.c index c490e012c23f..428d488830c6 100644 --- a/tools/testing/selftests/bpf/test_hashmap.c +++ b/tools/testing/selftests/bpf/prog_tests/hashmap.c @@ -5,26 +5,17 @@ * * Copyright (c) 2019 Facebook */ -#include <stdio.h> -#include <errno.h> -#include <linux/err.h> +#include "test_progs.h" #include "bpf/hashmap.h" -#define CHECK(condition, format...) 
({ \ - int __ret = !!(condition); \ - if (__ret) { \ - fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \ - fprintf(stderr, format); \ - } \ - __ret; \ -}) +static int duration = 0; -size_t hash_fn(const void *k, void *ctx) +static size_t hash_fn(const void *k, void *ctx) { return (long)k; } -bool equal_fn(const void *a, const void *b, void *ctx) +static bool equal_fn(const void *a, const void *b, void *ctx) { return (long)a == (long)b; } @@ -49,53 +40,55 @@ static inline size_t exp_cap(size_t sz) #define ELEM_CNT 62 -int test_hashmap_generic(void) +static void test_hashmap_generic(void) { struct hashmap_entry *entry, *tmp; int err, bkt, found_cnt, i; long long found_msk; struct hashmap *map; - fprintf(stderr, "%s: ", __func__); - map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + return; for (i = 0; i < ELEM_CNT; i++) { const void *oldk, *k = (const void *)(long)i; void *oldv, *v = (void *)(long)(1024 + i); err = hashmap__update(map, k, v, &oldk, &oldv); - if (CHECK(err != -ENOENT, "unexpected result: %d\n", err)) - return 1; + if (CHECK(err != -ENOENT, "hashmap__update", + "unexpected result: %d\n", err)) + goto cleanup; if (i % 2) { err = hashmap__add(map, k, v); } else { err = hashmap__set(map, k, v, &oldk, &oldv); - if (CHECK(oldk != NULL || oldv != NULL, + if (CHECK(oldk != NULL || oldv != NULL, "check_kv", "unexpected k/v: %p=%p\n", oldk, oldv)) - return 1; + goto cleanup; } - if (CHECK(err, "failed to add k/v %ld = %ld: %d\n", + if (CHECK(err, "elem_add", "failed to add k/v %ld = %ld: %d\n", (long)k, (long)v, err)) - return 1; + goto cleanup; - if (CHECK(!hashmap__find(map, k, &oldv), + if (CHECK(!hashmap__find(map, k, &oldv), "elem_find", "failed to find key %ld\n", (long)k)) - return 1; - if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) - return 1; + goto cleanup; + if 
(CHECK(oldv != v, "elem_val", + "found value is wrong: %ld\n", (long)oldv)) + goto cleanup; } - if (CHECK(hashmap__size(map) != ELEM_CNT, + if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size", "invalid map size: %zu\n", hashmap__size(map))) - return 1; + goto cleanup; if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap_cap", "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; + goto cleanup; found_msk = 0; hashmap__for_each_entry(map, entry, bkt) { @@ -103,42 +96,47 @@ int test_hashmap_generic(void) long v = (long)entry->value; found_msk |= 1ULL << k; - if (CHECK(v - k != 1024, "invalid k/v pair: %ld = %ld\n", k, v)) - return 1; + if (CHECK(v - k != 1024, "check_kv", + "invalid k/v pair: %ld = %ld\n", k, v)) + goto cleanup; } - if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt", "not all keys iterated: %llx\n", found_msk)) - return 1; + goto cleanup; for (i = 0; i < ELEM_CNT; i++) { const void *oldk, *k = (const void *)(long)i; void *oldv, *v = (void *)(long)(256 + i); err = hashmap__add(map, k, v); - if (CHECK(err != -EEXIST, "unexpected add result: %d\n", err)) - return 1; + if (CHECK(err != -EEXIST, "hashmap__add", + "unexpected add result: %d\n", err)) + goto cleanup; if (i % 2) err = hashmap__update(map, k, v, &oldk, &oldv); else err = hashmap__set(map, k, v, &oldk, &oldv); - if (CHECK(err, "failed to update k/v %ld = %ld: %d\n", - (long)k, (long)v, err)) - return 1; - if (CHECK(!hashmap__find(map, k, &oldv), + if (CHECK(err, "elem_upd", + "failed to update k/v %ld = %ld: %d\n", + (long)k, (long)v, err)) + goto cleanup; + if (CHECK(!hashmap__find(map, k, &oldv), "elem_find", "failed to find key %ld\n", (long)k)) - return 1; - if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) - return 1; + goto cleanup; + if (CHECK(oldv != v, "elem_val", + "found value is wrong: %ld\n", (long)oldv)) + goto cleanup; } - if (CHECK(hashmap__size(map) != ELEM_CNT, 
+ if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size", "invalid updated map size: %zu\n", hashmap__size(map))) - return 1; + goto cleanup; if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap__capacity", "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; + goto cleanup; found_msk = 0; hashmap__for_each_entry_safe(map, entry, tmp, bkt) { @@ -146,20 +144,21 @@ int test_hashmap_generic(void) long v = (long)entry->value; found_msk |= 1ULL << k; - if (CHECK(v - k != 256, + if (CHECK(v - k != 256, "elem_check", "invalid updated k/v pair: %ld = %ld\n", k, v)) - return 1; + goto cleanup; } - if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt", "not all keys iterated after update: %llx\n", found_msk)) - return 1; + goto cleanup; found_cnt = 0; hashmap__for_each_key_entry(map, entry, (void *)0) { found_cnt++; } - if (CHECK(!found_cnt, "didn't find any entries for key 0\n")) - return 1; + if (CHECK(!found_cnt, "found_cnt", + "didn't find any entries for key 0\n")) + goto cleanup; found_msk = 0; found_cnt = 0; @@ -173,30 +172,31 @@ int test_hashmap_generic(void) found_cnt++; found_msk |= 1ULL << (long)k; - if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del", "failed to delete k/v %ld = %ld\n", (long)k, (long)v)) - return 1; - if (CHECK(oldk != k || oldv != v, + goto cleanup; + if (CHECK(oldk != k || oldv != v, "check_old", "invalid deleted k/v: expected %ld = %ld, got %ld = %ld\n", (long)k, (long)v, (long)oldk, (long)oldv)) - return 1; - if (CHECK(hashmap__delete(map, k, &oldk, &oldv), + goto cleanup; + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del", "unexpectedly deleted k/v %ld = %ld\n", (long)oldk, (long)oldv)) - return 1; + goto cleanup; } - if (CHECK(!found_cnt || !found_msk, + if (CHECK(!found_cnt || !found_msk, "found_entries", "didn't delete any key entries\n")) - return 1; - if 
(CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, + goto cleanup; + if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, "elem_cnt", "invalid updated map size (already deleted: %d): %zu\n", found_cnt, hashmap__size(map))) - return 1; + goto cleanup; if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap__capacity", "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; + goto cleanup; hashmap__for_each_entry_safe(map, entry, tmp, bkt) { const void *oldk, *k; @@ -208,53 +208,56 @@ int test_hashmap_generic(void) found_cnt++; found_msk |= 1ULL << (long)k; - if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del", "failed to delete k/v %ld = %ld\n", (long)k, (long)v)) - return 1; - if (CHECK(oldk != k || oldv != v, + goto cleanup; + if (CHECK(oldk != k || oldv != v, "elem_check", "invalid old k/v: expect %ld = %ld, got %ld = %ld\n", (long)k, (long)v, (long)oldk, (long)oldv)) - return 1; - if (CHECK(hashmap__delete(map, k, &oldk, &oldv), + goto cleanup; + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del", "unexpectedly deleted k/v %ld = %ld\n", (long)k, (long)v)) - return 1; + goto cleanup; } if (CHECK(found_cnt != ELEM_CNT || found_msk != (1ULL << ELEM_CNT) - 1, + "found_cnt", "not all keys were deleted: found_cnt:%d, found_msk:%llx\n", found_cnt, found_msk)) - return 1; - if (CHECK(hashmap__size(map) != 0, + goto cleanup; + if (CHECK(hashmap__size(map) != 0, "hashmap__size", "invalid updated map size (already deleted: %d): %zu\n", found_cnt, hashmap__size(map))) - return 1; + goto cleanup; found_cnt = 0; hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected map entries left: %ld = %ld\n", - (long)entry->key, (long)entry->value); - return 1; + CHECK(false, "elem_exists", + "unexpected map entries left: %ld = %ld\n", + (long)entry->key, (long)entry->value); + goto cleanup; } - hashmap__free(map); + hashmap__clear(map); hashmap__for_each_entry(map, 
entry, bkt) { - CHECK(false, "unexpected map entries left: %ld = %ld\n", - (long)entry->key, (long)entry->value); - return 1; + CHECK(false, "elem_exists", + "unexpected map entries left: %ld = %ld\n", + (long)entry->key, (long)entry->value); + goto cleanup; } - fprintf(stderr, "OK\n"); - return 0; +cleanup: + hashmap__free(map); } -size_t collision_hash_fn(const void *k, void *ctx) +static size_t collision_hash_fn(const void *k, void *ctx) { return 0; } -int test_hashmap_multimap(void) +static void test_hashmap_multimap(void) { void *k1 = (void *)0, *k2 = (void *)1; struct hashmap_entry *entry; @@ -262,121 +265,116 @@ int test_hashmap_multimap(void) long found_msk; int err, bkt; - fprintf(stderr, "%s: ", __func__); - /* force collisions */ map = hashmap__new(collision_hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; - + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + return; /* set up multimap: * [0] -> 1, 2, 4; * [1] -> 8, 16, 32; */ err = hashmap__append(map, k1, (void *)1); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k1, (void *)2); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k1, (void *)4); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k2, (void *)8); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k2, (void *)16); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k2, 
(void *)32); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; - if (CHECK(hashmap__size(map) != 6, + if (CHECK(hashmap__size(map) != 6, "hashmap_size", "invalid map size: %zu\n", hashmap__size(map))) - return 1; + goto cleanup; /* verify global iteration still works and sees all values */ found_msk = 0; hashmap__for_each_entry(map, entry, bkt) { found_msk |= (long)entry->value; } - if (CHECK(found_msk != (1 << 6) - 1, + if (CHECK(found_msk != (1 << 6) - 1, "found_msk", "not all keys iterated: %lx\n", found_msk)) - return 1; + goto cleanup; /* iterate values for key 1 */ found_msk = 0; hashmap__for_each_key_entry(map, entry, k1) { found_msk |= (long)entry->value; } - if (CHECK(found_msk != (1 | 2 | 4), + if (CHECK(found_msk != (1 | 2 | 4), "found_msk", "invalid k1 values: %lx\n", found_msk)) - return 1; + goto cleanup; /* iterate values for key 2 */ found_msk = 0; hashmap__for_each_key_entry(map, entry, k2) { found_msk |= (long)entry->value; } - if (CHECK(found_msk != (8 | 16 | 32), + if (CHECK(found_msk != (8 | 16 | 32), "found_msk", "invalid k2 values: %lx\n", found_msk)) - return 1; + goto cleanup; - fprintf(stderr, "OK\n"); - return 0; +cleanup: + hashmap__free(map); } -int test_hashmap_empty() +static void test_hashmap_empty() { struct hashmap_entry *entry; int bkt; struct hashmap *map; void *k = (void *)0; - fprintf(stderr, "%s: ", __func__); - /* force collisions */ map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + goto cleanup; - if (CHECK(hashmap__size(map) != 0, + if (CHECK(hashmap__size(map) != 0, "hashmap__size", "invalid map size: %zu\n", hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != 0, + goto cleanup; + if (CHECK(hashmap__capacity(map) != 0, "hashmap__capacity", "invalid 
map capacity: %zu\n", hashmap__capacity(map))) - return 1; - if (CHECK(hashmap__find(map, k, NULL), "unexpected find\n")) - return 1; - if (CHECK(hashmap__delete(map, k, NULL, NULL), "unexpected delete\n")) - return 1; + goto cleanup; + if (CHECK(hashmap__find(map, k, NULL), "elem_find", + "unexpected find\n")) + goto cleanup; + if (CHECK(hashmap__delete(map, k, NULL, NULL), "elem_del", + "unexpected delete\n")) + goto cleanup; hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected iterated entry\n"); - return 1; + CHECK(false, "elem_found", "unexpected iterated entry\n"); + goto cleanup; } hashmap__for_each_key_entry(map, entry, k) { - CHECK(false, "unexpected key entry\n"); - return 1; + CHECK(false, "key_found", "unexpected key entry\n"); + goto cleanup; } - fprintf(stderr, "OK\n"); - return 0; +cleanup: + hashmap__free(map); } -int main(int argc, char **argv) +void test_hashmap() { - bool failed = false; - - if (test_hashmap_generic()) - failed = true; - if (test_hashmap_multimap()) - failed = true; - if (test_hashmap_empty()) - failed = true; - - return failed; + if (test__start_subtest("generic")) + test_hashmap_generic(); + if (test__start_subtest("multimap")) + test_hashmap_multimap(); + if (test__start_subtest("empty")) + test_hashmap_empty(); } diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index 7507c8f689bc..42c3a3103c26 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> struct meta { int ifindex; diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c index eaf64595be88..c2d373e294bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c +++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c @@ -1,5 +1,6 @@ // 
SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> static void test_l4lb(const char *file) { diff --git a/tools/testing/selftests/bpf/prog_tests/map_lock.c b/tools/testing/selftests/bpf/prog_tests/map_lock.c index 8f91f1881d11..ce17b1ed8709 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/map_lock.c @@ -1,5 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> + +static void *spin_lock_thread(void *arg) +{ + __u32 duration, retval; + int err, prog_fd = *(u32 *) arg; + + err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + CHECK(err || retval, "", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration); + pthread_exit(arg); +} static void *parallel_map_access(void *arg) { diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 6b9dce431d41..43d0b5578f46 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -19,7 +19,7 @@ void test_mmap(void) const size_t map_sz = roundup_page(sizeof(struct map_data)); const int zero = 0, one = 1, two = 2, far = 1500; const long page_size = sysconf(_SC_PAGE_SIZE); - int err, duration = 0, i, data_map_fd, data_map_id, tmp_fd; + int err, duration = 0, i, data_map_fd, data_map_id, tmp_fd, rdmap_fd; struct bpf_map *data_map, *bss_map; void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp1, *tmp2; struct test_mmap__bss *bss_data; @@ -37,6 +37,17 @@ void test_mmap(void) data_map = skel->maps.data_map; data_map_fd = bpf_map__fd(data_map); + rdmap_fd = bpf_map__fd(skel->maps.rdonly_map); + tmp1 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); + if (CHECK(tmp1 != MAP_FAILED, "rdonly_write_mmap", "unexpected success\n")) { + munmap(tmp1, 4096); + goto cleanup; + } + /* now double-check if it's mmap()'able 
at all */ + tmp1 = mmap(NULL, 4096, PROT_READ, MAP_SHARED, rdmap_fd, 0); + if (CHECK(tmp1 == MAP_FAILED, "rdonly_read_mmap", "failed: %d\n", errno)) + goto cleanup; + /* get map's ID */ memset(&map_info, 0, map_info_sz); err = bpf_obj_get_info_by_fd(data_map_fd, &map_info, &map_info_sz); diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 542240e16564..e74dc501b27f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -80,9 +80,6 @@ void test_ns_current_pid_tgid(void) "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid)) goto cleanup; cleanup: - if (!link) { - bpf_link__destroy(link); - link = NULL; - } + bpf_link__destroy(link); bpf_object__close(obj); } diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index 1450ea2dd4cc..a122ce3b360e 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -6,6 +6,11 @@ #include <test_progs.h> #include "bpf/libbpf_internal.h" +/* AddressSanitizer sometimes crashes due to data dereference below, due to + * this being mmap()'ed memory. 
Disable instrumentation with + * no_sanitize_address attribute + */ +__attribute__((no_sanitize_address)) static void on_sample(void *ctx, int cpu, void *data, __u32 size) { int cpu_data = *(int *)data, duration = 0; diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_access.c index a2537dfa899c..44b514fabccd 100644 --- a/tools/testing/selftests/bpf/prog_tests/pkt_access.c +++ b/tools/testing/selftests/bpf/prog_tests/pkt_access.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_pkt_access(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c index 5f7aea605019..939015cd6dba 100644 --- a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c +++ b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_pkt_md_access(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c index 5dd89b941f53..dde2b7ae7bc9 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_prog_run_xattr(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c index faccc66f4e39..f47e7b1cb32c 100644 --- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c +++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> enum { QUEUE, diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c 
b/tools/testing/selftests/bpf/prog_tests/ringbuf.c new file mode 100644 index 000000000000..2bba908dfa63 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <linux/compiler.h> +#include <asm/barrier.h> +#include <test_progs.h> +#include <sys/mman.h> +#include <sys/epoll.h> +#include <time.h> +#include <sched.h> +#include <signal.h> +#include <pthread.h> +#include <sys/sysinfo.h> +#include <linux/perf_event.h> +#include <linux/ring_buffer.h> +#include "test_ringbuf.skel.h" + +#define EDONE 7777 + +static int duration = 0; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +static volatile int sample_cnt; + +static int process_sample(void *ctx, void *data, size_t len) +{ + struct sample *s = data; + + sample_cnt++; + + switch (s->seq) { + case 0: + CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n", + 333L, s->value); + return 0; + case 1: + CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n", + 777L, s->value); + return -EDONE; + default: + /* we don't care about the rest */ + return 0; + } +} + +static struct test_ringbuf *skel; +static struct ring_buffer *ringbuf; + +static void trigger_samples() +{ + skel->bss->dropped = 0; + skel->bss->total = 0; + skel->bss->discarded = 0; + + /* trigger exactly two samples */ + skel->bss->value = 333; + syscall(__NR_getpgid); + skel->bss->value = 777; + syscall(__NR_getpgid); +} + +static void *poll_thread(void *input) +{ + long timeout = (long)input; + + return (void *)(long)ring_buffer__poll(ringbuf, timeout); +} + +void test_ringbuf(void) +{ + const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample); + pthread_t thread; + long bg_ret = -1; + int err; + + skel = test_ringbuf__open_and_load(); + if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + return; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + ringbuf = 
ring_buffer__new(bpf_map__fd(skel->maps.ringbuf), + process_sample, NULL, NULL); + if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) + goto cleanup; + + err = test_ringbuf__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err)) + goto cleanup; + + trigger_samples(); + + /* 2 submitted + 1 discarded records */ + CHECK(skel->bss->avail_data != 3 * rec_sz, + "err_avail_size", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->avail_data); + CHECK(skel->bss->ring_size != 4096, + "err_ring_size", "exp %ld, got %ld\n", + 4096L, skel->bss->ring_size); + CHECK(skel->bss->cons_pos != 0, + "err_cons_pos", "exp %ld, got %ld\n", + 0L, skel->bss->cons_pos); + CHECK(skel->bss->prod_pos != 3 * rec_sz, + "err_prod_pos", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->prod_pos); + + /* poll for samples */ + err = ring_buffer__poll(ringbuf, -1); + + /* -EDONE is used as an indicator that we are done */ + if (CHECK(err != -EDONE, "err_done", "done err: %d\n", err)) + goto cleanup; + + /* we expect extra polling to return nothing */ + err = ring_buffer__poll(ringbuf, 0); + if (CHECK(err != 0, "extra_samples", "poll result: %d\n", err)) + goto cleanup; + + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + /* now validate consumer position is updated and returned */ + trigger_samples(); + CHECK(skel->bss->cons_pos != 3 * rec_sz, + "err_cons_pos", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->cons_pos); + err = ring_buffer__poll(ringbuf, -1); + CHECK(err <= 0, "poll_err", "err %d\n", err); + + /* start poll in background w/ long timeout */ + err = pthread_create(&thread, NULL, poll_thread, (void *)(long)10000); + if (CHECK(err, "bg_poll", "pthread_create failed: %d\n", err)) + goto cleanup; + + /* 
turn off notifications now */ + skel->bss->flags = BPF_RB_NO_WAKEUP; + + /* give background thread a bit of a time */ + usleep(50000); + trigger_samples(); + /* sleeping arbitrarily is bad, but no better way to know that + * epoll_wait() **DID NOT** unblock in background thread + */ + usleep(50000); + /* background poll should still be blocked */ + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) + goto cleanup; + + /* BPF side did everything right */ + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + /* clear flags to return to "adaptive" notification mode */ + skel->bss->flags = 0; + + /* produce new samples, no notification should be triggered, because + * consumer is now behind + */ + trigger_samples(); + + /* background poll should still be blocked */ + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) + goto cleanup; + + /* now force notifications */ + skel->bss->flags = BPF_RB_FORCE_WAKEUP; + sample_cnt = 0; + trigger_samples(); + + /* now we should get a pending notification */ + usleep(50000); + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err, "join_bg", "err %d\n", err)) + goto cleanup; + + if (CHECK(bg_ret != 1, "bg_ret", "epoll_wait result: %ld", bg_ret)) + goto cleanup; + + /* 3 rounds, 2 samples each */ + CHECK(sample_cnt != 6, "wrong_sample_cnt", + "expected to see %d samples, got %d\n", 6, sample_cnt); + + /* BPF side did everything right */ + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, 
"err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + test_ringbuf__detach(skel); +cleanup: + ring_buffer__free(ringbuf); + test_ringbuf__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c new file mode 100644 index 000000000000..78e450609803 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <test_progs.h> +#include <sys/epoll.h> +#include "test_ringbuf_multi.skel.h" + +static int duration = 0; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +static int process_sample(void *ctx, void *data, size_t len) +{ + int ring = (unsigned long)ctx; + struct sample *s = data; + + switch (s->seq) { + case 0: + CHECK(ring != 1, "sample1_ring", "exp %d, got %d\n", 1, ring); + CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n", + 333L, s->value); + break; + case 1: + CHECK(ring != 2, "sample2_ring", "exp %d, got %d\n", 2, ring); + CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n", + 777L, s->value); + break; + default: + CHECK(true, "extra_sample", "unexpected sample seq %d, val %ld\n", + s->seq, s->value); + return -1; + } + + return 0; +} + +void test_ringbuf_multi(void) +{ + struct test_ringbuf_multi *skel; + struct ring_buffer *ringbuf; + int err; + + skel = test_ringbuf_multi__open_and_load(); + if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + return; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf1), + process_sample, (void *)(long)1, NULL); + if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) + goto cleanup; + + err = ring_buffer__add(ringbuf, bpf_map__fd(skel->maps.ringbuf2), + process_sample, (void *)(long)2); + if (CHECK(err, "ringbuf_add", "failed to add another 
ring\n")) + goto cleanup; + + err = test_ringbuf_multi__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err)) + goto cleanup; + + /* trigger few samples, some will be skipped */ + skel->bss->target_ring = 0; + skel->bss->value = 333; + syscall(__NR_getpgid); + + /* skipped, no ringbuf in slot 1 */ + skel->bss->target_ring = 1; + skel->bss->value = 555; + syscall(__NR_getpgid); + + skel->bss->target_ring = 2; + skel->bss->value = 777; + syscall(__NR_getpgid); + + /* poll for samples, should get 2 ringbufs back */ + err = ring_buffer__poll(ringbuf, -1); + if (CHECK(err != 4, "poll_res", "expected 4 records, got %d\n", err)) + goto cleanup; + + /* expect extra polling to return nothing */ + err = ring_buffer__poll(ringbuf, 0); + if (CHECK(err < 0, "extra_samples", "poll result: %d\n", err)) + goto cleanup; + + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->skipped != 1, "err_skipped", "exp %ld, got %ld\n", + 1L, skel->bss->skipped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + +cleanup: + ring_buffer__free(ringbuf); + test_ringbuf_multi__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/signal_pending.c b/tools/testing/selftests/bpf/prog_tests/signal_pending.c index 996e808f43a2..dfcbddcbe4d3 100644 --- a/tools/testing/selftests/bpf/prog_tests/signal_pending.c +++ b/tools/testing/selftests/bpf/prog_tests/signal_pending.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> static void sigalrm_handler(int s) {} static struct sigaction sigalrm_action = { diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index d572e1a2c297..47fa04adc147 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -20,6 +20,7 @@ #define 
CONNECT_PORT 4321 #define TEST_DADDR (0xC0A80203) #define NS_SELF "/proc/self/ns/net" +#define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" static const struct timeval timeo_sec = { .tv_sec = 3 }; static const size_t timeo_optlen = sizeof(timeo_sec); @@ -265,6 +266,7 @@ void test_sk_assign(void) TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), }; int server = -1; + int server_map; int self_net; self_net = open(NS_SELF, O_RDONLY); @@ -278,9 +280,17 @@ void test_sk_assign(void) goto cleanup; } + server_map = bpf_obj_get(SERVER_MAP_PATH); + if (CHECK_FAIL(server_map < 0)) { + perror("Unable to open " SERVER_MAP_PATH); + goto cleanup; + } + for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) { struct test_sk_cfg *test = &tests[i]; const struct sockaddr *addr; + const int zero = 0; + int err; if (!test__start_subtest(test->name)) continue; @@ -288,7 +298,13 @@ void test_sk_assign(void) addr = (const struct sockaddr *)test->addr; server = start_server(addr, test->len, test->type); if (server == -1) - goto cleanup; + goto close; + + err = bpf_map_update_elem(server_map, &zero, &server, BPF_ANY); + if (CHECK_FAIL(err)) { + perror("Unable to update server_map"); + goto close; + } /* connect to unbound ports */ prepare_addr(test->addr, test->family, CONNECT_PORT, @@ -302,7 +318,10 @@ void test_sk_assign(void) close: close(server); + close(server_map); cleanup: + if (CHECK_FAIL(unlink(SERVER_MAP_PATH))) + perror("Unable to unlink " SERVER_MAP_PATH); if (CHECK_FAIL(setns(self_net, CLONE_NEWNET))) perror("Failed to setns("NS_SELF")"); close(self_net); diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index 4538bd08203f..7021b92af313 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_skb_ctx(void) { diff --git 
a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c new file mode 100644 index 000000000000..f302ad84a298 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h> + +void test_skb_helpers(void) +{ + struct __sk_buff skb = { + .wire_len = 100, + .gso_segs = 8, + .gso_size = 10, + }; + struct bpf_prog_test_run_attr tattr = { + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .ctx_in = &skb, + .ctx_size_in = sizeof(skb), + .ctx_out = &skb, + .ctx_size_out = sizeof(skb), + }; + struct bpf_object *obj; + int err; + + err = bpf_prog_load("./test_skb_helpers.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &tattr.prog_fd); + if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + return; + err = bpf_prog_test_run_xattr(&tattr); + CHECK_ATTR(err, "len", "err %d errno %d\n", err, errno); + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index aa43e0bd210c..96e7b7f84c65 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare +#include <error.h> #include "test_progs.h" +#include "test_skmsg_load_helpers.skel.h" #define TCP_REPAIR 19 /* TCP sock is under repair right now */ @@ -70,10 +72,43 @@ out: close(s); } +static void test_skmsg_helpers(enum bpf_map_type map_type) +{ + struct test_skmsg_load_helpers *skel; + int err, map, verdict; + + skel = test_skmsg_load_helpers__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_skmsg_load_helpers__open_and_load"); + return; + } + + verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + map = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(verdict, map, 
BPF_SK_MSG_VERDICT, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach"); + goto out; + } + + err = bpf_prog_detach2(verdict, map, BPF_SK_MSG_VERDICT); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach2"); + goto out; + } +out: + test_skmsg_load_helpers__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP); if (test__start_subtest("sockhash create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKHASH); + if (test__start_subtest("sockmap sk_msg load helpers")) + test_skmsg_helpers(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash sk_msg load helpers")) + test_skmsg_helpers(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/prog_tests/spinlock.c b/tools/testing/selftests/bpf/prog_tests/spinlock.c index 1ae00cd3174e..7577a77a4c4c 100644 --- a/tools/testing/selftests/bpf/prog_tests/spinlock.c +++ b/tools/testing/selftests/bpf/prog_tests/spinlock.c @@ -1,5 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> + +static void *spin_lock_thread(void *arg) +{ + __u32 duration, retval; + int err, prog_fd = *(u32 *) arg; + + err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + CHECK(err || retval, "", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration); + pthread_exit(arg); +} void test_spinlock(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index e56b52ab41da..9013a0c01eed 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> #include "cgroup_helpers.h" +#include "network_helpers.h" struct tcp_rtt_storage { __u32 invoked; @@ -87,34 +88,6 @@ static int verify_sk(int map_fd, int client_fd, const char 
*msg, __u32 invoked, return err; } -static int connect_to_server(int server_fd) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd; - - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create client socket"); - return -1; - } - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { - log_err("Fail to connect to server"); - goto out; - } - - return fd; - -out: - close(fd); - return -1; -} static int run_test(int cgroup_fd, int server_fd) { @@ -145,7 +118,7 @@ static int run_test(int cgroup_fd, int server_fd) goto close_bpf_object; } - client_fd = connect_to_server(server_fd); + client_fd = connect_to_fd(AF_INET, SOCK_STREAM, server_fd); if (client_fd < 0) { err = -1; goto close_bpf_object; @@ -180,103 +153,22 @@ close_bpf_object: return err; } -static int start_server(void) -{ - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - }; - int fd; - - fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0); - if (fd < 0) { - log_err("Failed to create server socket"); - return -1; - } - - if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) { - log_err("Failed to bind socket"); - close(fd); - return -1; - } - - return fd; -} - -static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; -static volatile bool server_done = false; - -static void *server_thread(void *arg) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd = *(int *)arg; - int client_fd; - int err; - - err = listen(fd, 1); - - pthread_mutex_lock(&server_started_mtx); - pthread_cond_signal(&server_started); - pthread_mutex_unlock(&server_started_mtx); - - if (CHECK_FAIL(err < 0)) { - perror("Failed to listed on socket"); - return ERR_PTR(err); - } - - while (true) { - client_fd 
= accept(fd, (struct sockaddr *)&addr, &len); - if (client_fd == -1 && errno == EAGAIN) { - usleep(50); - continue; - } - break; - } - if (CHECK_FAIL(client_fd < 0)) { - perror("Failed to accept client"); - return ERR_PTR(err); - } - - while (!server_done) - usleep(50); - - close(client_fd); - - return NULL; -} - void test_tcp_rtt(void) { int server_fd, cgroup_fd; - pthread_t tid; - void *server_res; cgroup_fd = test__join_cgroup("/tcp_rtt"); if (CHECK_FAIL(cgroup_fd < 0)) return; - server_fd = start_server(); + server_fd = start_server(AF_INET, SOCK_STREAM); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; - if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread, - (void *)&server_fd))) - goto close_server_fd; - - pthread_mutex_lock(&server_started_mtx); - pthread_cond_wait(&server_started, &server_started_mtx); - pthread_mutex_unlock(&server_started_mtx); - CHECK_FAIL(run_test(cgroup_fd, server_fd)); - server_done = true; - CHECK_FAIL(pthread_join(tid, &server_res)); - CHECK_FAIL(IS_ERR(server_res)); - -close_server_fd: close(server_fd); + close_cgroup_fd: close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c index 465b371a561d..2702df2b2343 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_overhead.c +++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c @@ -61,9 +61,10 @@ void test_test_overhead(void) const char *raw_tp_name = "raw_tp/task_rename"; const char *fentry_name = "fentry/__set_task_comm"; const char *fexit_name = "fexit/__set_task_comm"; + const char *fmodret_name = "fmod_ret/__set_task_comm"; const char *kprobe_func = "__set_task_comm"; struct bpf_program *kprobe_prog, *kretprobe_prog, *raw_tp_prog; - struct bpf_program *fentry_prog, *fexit_prog; + struct bpf_program *fentry_prog, *fexit_prog, *fmodret_prog; struct bpf_object *obj; struct bpf_link *link; int err, duration = 0; @@ -96,6 +97,10 @@ void test_test_overhead(void) if 
(CHECK(!fexit_prog, "find_probe", "prog '%s' not found\n", fexit_name)) goto cleanup; + fmodret_prog = bpf_object__find_program_by_title(obj, fmodret_name); + if (CHECK(!fmodret_prog, "find_probe", + "prog '%s' not found\n", fmodret_name)) + goto cleanup; err = bpf_object__load(obj); if (CHECK(err, "obj_load", "err %d\n", err)) @@ -142,6 +147,13 @@ void test_test_overhead(void) goto cleanup; test_run("fexit"); bpf_link__destroy(link); + + /* attach fmod_ret */ + link = bpf_program__attach_trace(fmodret_prog); + if (CHECK(IS_ERR(link), "attach fmod_ret", "err %ld\n", PTR_ERR(link))) + goto cleanup; + test_run("fmod_ret"); + bpf_link__destroy(link); cleanup: prctl(PR_SET_NAME, comm, 0L, 0L, 0L); bpf_object__close(obj); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c index dcb5ecac778e..48921ff74850 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_xdp(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index 3744196d7cba..d5c98f2cb12f 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -1,13 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> -void test_xdp_adjust_tail(void) +void test_xdp_adjust_tail_shrink(void) { - const char *file = "./test_adjust_tail.o"; + const char *file = "./test_xdp_adjust_tail_shrink.o"; + __u32 duration, retval, size, expect_sz; struct bpf_object *obj; - char buf[128]; - __u32 duration, retval, size; int err, prog_fd; + char buf[128]; err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); if (CHECK_FAIL(err)) @@ -20,10 +21,121 @@ void test_xdp_adjust_tail(void) "ipv4", "err %d errno %d retval 
%d size %d\n", err, errno, retval, size); + expect_sz = sizeof(pkt_v6) - 20; /* Test shrink with 20 bytes */ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || retval != XDP_TX || size != 54, - "ipv6", "err %d errno %d retval %d size %d\n", + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + struct bpf_object *obj; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + __u32 duration, retval, size, expect_sz; + int err, prog_fd; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_DROP, + "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + + expect_sz = sizeof(pkt_v6) + 40; /* Test grow with 40 bytes */ + err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6) /* 74 */, + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow2(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/; + struct bpf_object *obj; + int err, cnt, i; + int max_grow; + + struct bpf_prog_test_run_attr tattr = { + .repeat = 1, + .data_in = &buf, + .data_out = &buf, + .data_size_in = 0, /* Per test */ + .data_size_out = 0, /* Per test */ + }; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd); + if (CHECK_ATTR(err, "load", 
"err %d errno %d\n", err, errno)) + return; + + /* Test case-64 */ + memset(buf, 1, sizeof(buf)); + tattr.data_size_in = 64; /* Determine test case via pkt size */ + tattr.data_size_out = 128; /* Limit copy_size */ + /* Kernel side alloc packet memory area that is zero init */ + err = bpf_prog_test_run_xattr(&tattr); + + CHECK_ATTR(errno != ENOSPC /* Due limit copy_size in bpf_test_finish */ + || tattr.retval != XDP_TX + || tattr.data_size_out != 192, /* Expected grow size */ + "case-64", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Extra checks for data contents */ + CHECK_ATTR(tattr.data_size_out != 192 + || buf[0] != 1 || buf[63] != 1 /* 0-63 memset to 1 */ + || buf[64] != 0 || buf[127] != 0 /* 64-127 memset to 0 */ + || buf[128] != 1 || buf[191] != 1, /*128-191 memset to 1 */ + "case-64-data", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Test case-128 */ + memset(buf, 2, sizeof(buf)); + tattr.data_size_in = 128; /* Determine test case via pkt size */ + tattr.data_size_out = sizeof(buf); /* Copy everything */ + err = bpf_prog_test_run_xattr(&tattr); + + max_grow = 4096 - XDP_PACKET_HEADROOM - tailroom; /* 3520 */ + CHECK_ATTR(err + || tattr.retval != XDP_TX + || tattr.data_size_out != max_grow,/* Expect max grow size */ + "case-128", + "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, max_grow); + + /* Extra checks for data content: Count grow size, will contain zeros */ + for (i = 0, cnt = 0; i < sizeof(buf); i++) { + if (buf[i] == 0) + cnt++; + } + CHECK_ATTR((cnt != (max_grow - tattr.data_size_in)) /* Grow increase */ + || tattr.data_size_out != max_grow, /* Total grow size */ + "case-128-data", + "err %d errno %d retval %d size %d grow-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, cnt); + bpf_object__close(obj); } + +void test_xdp_adjust_tail(void) +{ + if 
(test__start_subtest("xdp_adjust_tail_shrink")) + test_xdp_adjust_tail_shrink(); + if (test__start_subtest("xdp_adjust_tail_grow")) + test_xdp_adjust_tail_grow(); + if (test__start_subtest("xdp_adjust_tail_grow2")) + test_xdp_adjust_tail_grow2(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index a0f688c37023..2c6c570b21f8 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> #include <net/if.h> #include "test_xdp.skel.h" #include "test_xdp_bpf2bpf.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c new file mode 100644 index 000000000000..d19dbd668f6a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <uapi/linux/bpf.h> +#include <linux/if_link.h> +#include <test_progs.h> + +#include "test_xdp_devmap_helpers.skel.h" +#include "test_xdp_with_devmap_helpers.skel.h" + +#define IFINDEX_LO 1 + +struct bpf_devmap_val { + u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; +}; + +void test_xdp_with_devmap_helpers(void) +{ + struct test_xdp_with_devmap_helpers *skel; + struct bpf_prog_info info = {}; + struct bpf_devmap_val val = { + .ifindex = IFINDEX_LO, + }; + __u32 len = sizeof(info); + __u32 duration = 0, idx = 0; + int err, dm_fd, map_fd; + + + skel = test_xdp_with_devmap_helpers__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_xdp_with_devmap_helpers__open_and_load"); + return; + } + + /* can not attach program with DEVMAPs that allow programs + * as xdp generic + */ + dm_fd = bpf_program__fd(skel->progs.xdp_redir_prog); + err = 
bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); + CHECK(err == 0, "Generic attach of program with 8-byte devmap", + "should have failed\n"); + + dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm); + map_fd = bpf_map__fd(skel->maps.dm_ports); + err = bpf_obj_get_info_by_fd(dm_fd, &info, &len); + if (CHECK_FAIL(err)) + goto out_close; + + val.bpf_prog.fd = dm_fd; + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + CHECK(err, "Add program to devmap entry", + "err %d errno %d\n", err, errno); + + err = bpf_map_lookup_elem(map_fd, &idx, &val); + CHECK(err, "Read devmap entry", "err %d errno %d\n", err, errno); + CHECK(info.id != val.bpf_prog.id, "Expected program id in devmap entry", + "expected %u read %u\n", info.id, val.bpf_prog.id); + + /* can not attach BPF_XDP_DEVMAP program to a device */ + err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); + CHECK(err == 0, "Attach of BPF_XDP_DEVMAP program", + "should have failed\n"); + + val.ifindex = 1; + val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + CHECK(err == 0, "Add non-BPF_XDP_DEVMAP program to devmap entry", + "should have failed\n"); + +out_close: + test_xdp_with_devmap_helpers__destroy(skel); +} + +void test_neg_xdp_devmap_helpers(void) +{ + struct test_xdp_devmap_helpers *skel; + __u32 duration = 0; + + skel = test_xdp_devmap_helpers__open_and_load(); + if (CHECK(skel, + "Load of XDP program accessing egress ifindex without attach type", + "should have failed\n")) { + test_xdp_devmap_helpers__destroy(skel); + } +} + + +void test_xdp_devmap_attach(void) +{ + if (test__start_subtest("DEVMAP with programs in entries")) + test_xdp_with_devmap_helpers(); + + if (test__start_subtest("Verifier check of DEVMAP programs")) + test_neg_xdp_devmap_helpers(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c index c9404e6b226e..f284f72158ef 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_xdp_noinline(void) { diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 9941f0ba471e..de6de9221518 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -20,20 +20,20 @@ #include <bpf/bpf_endian.h> int _version SEC("version") = 1; -#define PROG(F) SEC(#F) int bpf_func_##F +#define PROG(F) PROG_(F, _##F) +#define PROG_(NUM, NAME) SEC("flow_dissector/"#NUM) int bpf_func##NAME /* These are the identifiers of the BPF programs that will be used in tail * calls. Name is limited to 16 characters, with the terminating character and * bpf_func_ above, we have only 6 to work with, anything after will be cropped. */ -enum { - IP, - IPV6, - IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */ - IPV6FR, /* Fragmentation IPv6 Extension Header */ - MPLS, - VLAN, -}; +#define IP 0 +#define IPV6 1 +#define IPV6OP 2 /* Destination/Hop-by-Hop Options IPv6 Ext. 
Header */ +#define IPV6FR 3 /* Fragmentation IPv6 Extension Header */ +#define MPLS 4 +#define VLAN 5 +#define MAX_PROG 6 #define IP_MF 0x2000 #define IP_OFFSET 0x1FFF @@ -59,7 +59,7 @@ struct frag_hdr { struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); - __uint(max_entries, 8); + __uint(max_entries, MAX_PROG); __uint(key_size, sizeof(__u32)); __uint(value_size, sizeof(__u32)); } jmp_table SEC(".maps"); diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c new file mode 100644 index 000000000000..b57bd6fef208 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + +SEC("iter/bpf_map") +int dump_bpf_map(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u64 seq_num = ctx->meta->seq_num; + struct bpf_map *map = ctx->map; + + if (map == (void *)0) { + BPF_SEQ_PRINTF(seq, " %%%%%% END %%%%%%\n"); + return 0; + } + + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, " id refcnt usercnt locked_vm\n"); + + BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter, + map->usercnt.counter, + map->memory.user->locked_vm.counter); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c new file 
mode 100644 index 000000000000..c8e9ca74c87b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__ipv6_route +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__ipv6_route { + struct bpf_iter_meta *meta; + struct fib6_info *rt; +} __attribute__((preserve_access_index)); + +char _license[] SEC("license") = "GPL"; + +extern bool CONFIG_IPV6_SUBTREES __kconfig __weak; + +#define RTF_GATEWAY 0x0002 +#define IFNAMSIZ 16 +#define fib_nh_gw_family nh_common.nhc_gw_family +#define fib_nh_gw6 nh_common.nhc_gw.ipv6 +#define fib_nh_dev nh_common.nhc_dev + +SEC("iter/ipv6_route") +int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct fib6_info *rt = ctx->rt; + const struct net_device *dev; + struct fib6_nh *fib6_nh; + unsigned int flags; + struct nexthop *nh; + + if (rt == (void *)0) + return 0; + + fib6_nh = &rt->fib6_nh[0]; + flags = rt->fib6_flags; + + /* FIXME: nexthop_is_multipath is not handled here. 
*/ + nh = rt->nh; + if (rt->nh) + fib6_nh = &nh->nh_info->fib6_nh; + + BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); + + if (CONFIG_IPV6_SUBTREES) + BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_src.addr, + rt->fib6_src.plen); + else + BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 00 "); + + if (fib6_nh->fib_nh_gw_family) { + flags |= RTF_GATEWAY; + BPF_SEQ_PRINTF(seq, "%pi6 ", &fib6_nh->fib_nh_gw6); + } else { + BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 "); + } + + dev = fib6_nh->fib_nh_dev; + if (dev) + BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric, + rt->fib6_ref.refs.counter, 0, flags, dev->name); + else + BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x\n", rt->fib6_metric, + rt->fib6_ref.refs.counter, 0, flags); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c new file mode 100644 index 000000000000..e7b8753eac0b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__netlink bpf_iter__netlink___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__netlink +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +#define sk_rmem_alloc sk_backlog.rmem_alloc +#define sk_refcnt __sk_common.skc_refcnt + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__netlink { + struct bpf_iter_meta *meta; + struct netlink_sock *sk; +} __attribute__((preserve_access_index)); + +static inline struct inode *SOCK_INODE(struct socket *socket) +{ + return &container_of(socket, struct socket_alloc, socket)->vfs_inode; +} + 
+SEC("iter/netlink") +int dump_netlink(struct bpf_iter__netlink *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct netlink_sock *nlk = ctx->sk; + unsigned long group, ino; + struct inode *inode; + struct socket *sk; + struct sock *s; + + if (nlk == (void *)0) + return 0; + + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, "sk Eth Pid Groups " + "Rmem Wmem Dump Locks Drops " + "Inode\n"); + + s = &nlk->sk; + BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol); + + if (!nlk->groups) { + group = 0; + } else { + /* FIXME: temporary use bpf_probe_read here, needs + * verifier support to do direct access. + */ + bpf_probe_read(&group, sizeof(group), &nlk->groups[0]); + } + BPF_SEQ_PRINTF(seq, "%-10u %08x %-8d %-8d %-5d %-8d ", + nlk->portid, (u32)group, + s->sk_rmem_alloc.counter, + s->sk_wmem_alloc.refs.counter - 1, + nlk->cb_running, s->sk_refcnt.refs.counter); + + sk = s->sk_socket; + if (!sk) { + ino = 0; + } else { + /* FIXME: container_of inside SOCK_INODE has a forced + * type conversion, and direct access cannot be used + * with current verifier. 
+ */ + inode = SOCK_INODE(sk); + bpf_probe_read(&ino, sizeof(ino), &inode->i_ino); + } + BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c new file mode 100644 index 000000000000..ee754021f98e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + +SEC("iter/task") +int dump_task(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + + if (task == (void *)0) { + BPF_SEQ_PRINTF(seq, " === END ===\n"); + return 0; + } + + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, " tgid gid\n"); + + BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c new file mode 100644 index 000000000000..0f0ec3db20ba --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task_file bpf_iter__task_file___not_used 
+#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task_file +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task_file { + struct bpf_iter_meta *meta; + struct task_struct *task; + __u32 fd; + struct file *file; +} __attribute__((preserve_access_index)); + +SEC("iter/task_file") +int dump_task_file(struct bpf_iter__task_file *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + __u32 fd = ctx->fd; + struct file *file = ctx->file; + + if (task == (void *)0 || file == (void *)0) + return 0; + + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, " tgid gid fd file\n"); + + BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, + (long)file->f_op); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c new file mode 100644 index 000000000000..c71a7c283108 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define START_CHAR 'a' +#include "bpf_iter_test_kern_common.h" diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c new file mode 100644 index 000000000000..8bdc8dc07444 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define START_CHAR 'A' +#include "bpf_iter_test_kern_common.h" diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c new file mode 100644 index 000000000000..13c2c90c835f --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + +SEC("iter/task") +int dump_task(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + int tgid; + + tgid = task->tgid; + bpf_seq_write(seq, &tgid, sizeof(tgid)); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c new file mode 100644 index 000000000000..0aa71b333cf3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + +__u32 map1_id = 0, map2_id = 0; +__u32 map1_accessed = 0, map2_accessed = 0; +__u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0; + +static volatile const __u32 print_len; +static volatile const __u32 ret1; + +SEC("iter/bpf_map") +int dump_bpf_map(struct 
bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct bpf_map *map = ctx->map; + __u64 seq_num; + int i, ret = 0; + + if (map == (void *)0) + return 0; + + /* only dump map1_id and map2_id */ + if (map->id != map1_id && map->id != map2_id) + return 0; + + seq_num = ctx->meta->seq_num; + if (map->id == map1_id) { + map1_seqnum = seq_num; + map1_accessed++; + } + + if (map->id == map2_id) { + if (map2_accessed == 0) { + map2_seqnum1 = seq_num; + if (ret1) + ret = 1; + } else { + map2_seqnum2 = seq_num; + } + map2_accessed++; + } + + /* fill seq_file buffer */ + for (i = 0; i < print_len; i++) + bpf_seq_write(seq, &seq_num, sizeof(seq_num)); + + return ret; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h new file mode 100644 index 000000000000..dee1339e6905 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; +int count = 0; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + +SEC("iter/task") +int dump_task(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + char c; + + if (count < 4) { + c = START_CHAR + count; + bpf_seq_write(seq, &c, sizeof(c)); + count++; + } + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c 
b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c new file mode 100644 index 000000000000..3f757e30d7a0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> + +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> + +#include <sys/types.h> +#include <sys/socket.h> + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +__u16 g_serv_port = 0; + +static inline void set_ip(__u32 *dst, const struct in6_addr *src) +{ + dst[0] = src->in6_u.u6_addr32[0]; + dst[1] = src->in6_u.u6_addr32[1]; + dst[2] = src->in6_u.u6_addr32[2]; + dst[3] = src->in6_u.u6_addr32[3]; +} + +static inline void set_tuple(struct bpf_sock_tuple *tuple, + const struct ipv6hdr *ip6h, + const struct tcphdr *tcph) +{ + set_ip(tuple->ipv6.saddr, &ip6h->daddr); + set_ip(tuple->ipv6.daddr, &ip6h->saddr); + tuple->ipv6.sport = tcph->dest; + tuple->ipv6.dport = tcph->source; +} + +static inline int is_allowed_peer_cg(struct __sk_buff *skb, + const struct ipv6hdr *ip6h, + const struct tcphdr *tcph) +{ + __u64 cgid, acgid, peer_cgid, peer_acgid; + struct bpf_sock_tuple tuple; + size_t tuple_len = sizeof(tuple.ipv6); + struct bpf_sock *peer_sk; + + set_tuple(&tuple, ip6h, tcph); + + peer_sk = bpf_sk_lookup_tcp(skb, &tuple, tuple_len, + BPF_F_CURRENT_NETNS, 0); + if (!peer_sk) + return 0; + + cgid = bpf_skb_cgroup_id(skb); + peer_cgid = bpf_sk_cgroup_id(peer_sk); + + acgid = bpf_skb_ancestor_cgroup_id(skb, 2); + peer_acgid = bpf_sk_ancestor_cgroup_id(peer_sk, 2); + + bpf_sk_release(peer_sk); + + return cgid && cgid == peer_cgid && acgid && acgid == peer_acgid; +} + +SEC("cgroup_skb/ingress") +int ingress_lookup(struct __sk_buff *skb) +{ + __u32 serv_port_key = 0; + struct ipv6hdr ip6h; + struct tcphdr tcph; + + 
if (skb->protocol != bpf_htons(ETH_P_IPV6)) + return 1; + + /* For SYN packets coming to listening socket skb->remote_port will be + * zero, so IPv6/TCP headers are loaded to identify remote peer + * instead. + */ + if (bpf_skb_load_bytes(skb, 0, &ip6h, sizeof(ip6h))) + return 1; + + if (ip6h.nexthdr != IPPROTO_TCP) + return 1; + + if (bpf_skb_load_bytes(skb, sizeof(ip6h), &tcph, sizeof(tcph))) + return 1; + + if (!g_serv_port) + return 0; + + if (tcph.dest != g_serv_port) + return 1; + + return is_allowed_peer_cg(skb, &ip6h, &tcph); +} diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index ad3c498a8150..1ab2c5eba86c 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -8,6 +8,9 @@ #include <linux/in.h> #include <linux/in6.h> #include <sys/socket.h> +#include <netinet/tcp.h> +#include <linux/if.h> +#include <errno.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> @@ -16,6 +19,14 @@ #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 +#ifndef TCP_CA_NAME_MAX +#define TCP_CA_NAME_MAX 16 +#endif + +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + int _version SEC("version") = 1; __attribute__ ((noinline)) @@ -33,6 +44,66 @@ int do_bind(struct bpf_sock_addr *ctx) return 1; } +static __inline int verify_cc(struct bpf_sock_addr *ctx, + char expected[TCP_CA_NAME_MAX]) +{ + char buf[TCP_CA_NAME_MAX]; + int i; + + if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) + return 1; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (buf[i] != expected[i]) + return 1; + if (buf[i] == 0) + break; + } + + return 0; +} + +static __inline int set_cc(struct bpf_sock_addr *ctx) +{ + char reno[TCP_CA_NAME_MAX] = "reno"; + char cubic[TCP_CA_NAME_MAX] = "cubic"; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &reno, sizeof(reno))) + return 1; + if (verify_cc(ctx, reno)) + return 1; + + if (bpf_setsockopt(ctx, 
SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) + return 1; + if (verify_cc(ctx, cubic)) + return 1; + + return 0; +} + +static __inline int bind_to_device(struct bpf_sock_addr *ctx) +{ + char veth1[IFNAMSIZ] = "test_sock_addr1"; + char veth2[IFNAMSIZ] = "test_sock_addr2"; + char missing[IFNAMSIZ] = "nonexistent_dev"; + char del_bind[IFNAMSIZ] = ""; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth1, sizeof(veth1))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth2, sizeof(veth2))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -46,6 +117,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) tuple.ipv4.daddr = bpf_htonl(DST_REWRITE_IP4); tuple.ipv4.dport = bpf_htons(DST_REWRITE_PORT4); + /* Bind to device and unbind it. */ + if (bind_to_device(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) @@ -66,6 +141,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) bpf_sk_release(sk); + /* Rewrite congestion control. */ + if (ctx->type == SOCK_STREAM && set_cc(ctx)) + return 0; + /* Rewrite destination. 
*/ ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4); ctx->user_port = bpf_htons(DST_REWRITE_PORT4); diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c new file mode 100644 index 000000000000..7396308677a3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <string.h> +#include <stdbool.h> + +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +struct svc_addr { + __be32 addr; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} service_mapping SEC(".maps"); + +SEC("cgroup/connect4") +int connect4(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; + struct svc_addr *orig; + + /* Force local address to 127.0.0.1:22222. */ + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(22222); + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + /* Rewire service 1.2.3.4:60000 to backend 127.0.0.1:60123. */ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr = ctx->user_ip4; + orig->port = ctx->user_port; + + ctx->user_ip4 = bpf_htonl(0x7f000001); + ctx->user_port = bpf_htons(60123); + } + return 1; +} + +SEC("cgroup/getsockname4") +int getsockname4(struct bpf_sock_addr *ctx) +{ + /* Expose local server as 1.2.3.4:60000 to client. 
*/ + if (ctx->user_port == bpf_htons(60123)) { + ctx->user_ip4 = bpf_htonl(0x01020304); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername4") +int getpeername4(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service 1.2.3.4:60000 as peer instead of backend. */ + if (ctx->user_port == bpf_htons(60123)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip4 = orig->addr; + ctx->user_port = orig->port; + } + } + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c new file mode 100644 index 000000000000..c1a2b555e9ad --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <string.h> + +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +struct svc_addr { + __be32 addr[4]; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} service_mapping SEC(".maps"); + +SEC("cgroup/connect6") +int connect6(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa = {}; + struct svc_addr *orig; + + /* Force local address to [::1]:22223. */ + sa.sin6_family = AF_INET6; + sa.sin6_port = bpf_htons(22223); + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + /* Rewire service [fc00::1]:60000 to backend [::1]:60124. 
*/ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr[0] = ctx->user_ip6[0]; + orig->addr[1] = ctx->user_ip6[1]; + orig->addr[2] = ctx->user_ip6[2]; + orig->addr[3] = ctx->user_ip6[3]; + orig->port = ctx->user_port; + + ctx->user_ip6[0] = 0; + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60124); + } + return 1; +} + +SEC("cgroup/getsockname6") +int getsockname6(struct bpf_sock_addr *ctx) +{ + /* Expose local server as [fc00::1]:60000 to client. */ + if (ctx->user_port == bpf_htons(60124)) { + ctx->user_ip6[0] = bpf_htonl(0xfc000000); + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername6") +int getpeername6(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service [fc00::1]:60000 as peer instead of backend. 
*/ + if (ctx->user_port == bpf_htons(60124)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip6[0] = orig->addr[0]; + ctx->user_ip6[1] = orig->addr[1]; + ctx->user_ip6[2] = orig->addr[2]; + ctx->user_ip6[3] = orig->addr[3]; + ctx->user_port = orig->port; + } + } + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 6d598cfbdb3e..34d84717c946 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -379,7 +379,7 @@ struct core_reloc_arrays___equiv_zero_sz_arr { struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; /* equivalent to flexible array */ - struct core_reloc_arrays_substruct f[0][2]; + struct core_reloc_arrays_substruct f[][2]; }; struct core_reloc_arrays___fixed_arr { diff --git a/tools/testing/selftests/bpf/progs/perfbuf_bench.c b/tools/testing/selftests/bpf/progs/perfbuf_bench.c new file mode 100644 index 000000000000..e5ab4836a641 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/perfbuf_bench.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <stdint.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(value_size, sizeof(int)); + __uint(key_size, sizeof(int)); +} perfbuf SEC(".maps"); + +const volatile int batch_cnt = 0; + +long sample_val = 42; +long dropped __attribute__((aligned(128))) = 0; + +SEC("fentry/__x64_sys_getpgid") +int bench_perfbuf(void *ctx) +{ + __u64 *sample; + int i; + + for (i = 0; i < batch_cnt; i++) { + if (bpf_perf_event_output(ctx, &perfbuf, BPF_F_CURRENT_CPU, + &sample_val, sizeof(sample_val))) + __sync_add_and_fetch(&dropped, 1); + } + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/ringbuf_bench.c 
b/tools/testing/selftests/bpf/progs/ringbuf_bench.c new file mode 100644 index 000000000000..123607d314d6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/ringbuf_bench.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <stdint.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +const volatile int batch_cnt = 0; +const volatile long use_output = 0; + +long sample_val = 42; +long dropped __attribute__((aligned(128))) = 0; + +const volatile long wakeup_data_size = 0; + +static __always_inline long get_flags() +{ + long sz; + + if (!wakeup_data_size) + return 0; + + sz = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + return sz >= wakeup_data_size ? BPF_RB_FORCE_WAKEUP : BPF_RB_NO_WAKEUP; +} + +SEC("fentry/__x64_sys_getpgid") +int bench_ringbuf(void *ctx) +{ + long *sample, flags; + int i; + + if (!use_output) { + for (i = 0; i < batch_cnt; i++) { + sample = bpf_ringbuf_reserve(&ringbuf, + sizeof(sample_val), 0); + if (!sample) { + __sync_add_and_fetch(&dropped, 1); + } else { + *sample = sample_val; + flags = get_flags(); + bpf_ringbuf_submit(sample, flags); + } + } + } else { + for (i = 0; i < batch_cnt; i++) { + flags = get_flags(); + if (bpf_ringbuf_output(&ringbuf, &sample_val, + sizeof(sample_val), flags)) + __sync_add_and_fetch(&dropped, 1); + } + } + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c new file mode 100644 index 000000000000..e5093796be97 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct inner_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); 
+} inner_map1 SEC(".maps"), + inner_map2 SEC(".maps"); + +struct outer_arr { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 3); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + /* it's possible to use anonymous struct as inner map definition here */ + __array(values, struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + /* changing max_entries to 2 will fail during load + * due to incompatibility with inner_map definition */ + __uint(max_entries, 1); + __type(key, int); + __type(value, int); + }); +} outer_arr SEC(".maps") = { + /* (void *) cast is necessary because we didn't use `struct inner_map` + * in __array(values, ...) + * Actually, a conscious effort is required to screw up initialization + * of inner map slots, which is a great thing! + */ + .values = { (void *)&inner_map1, 0, (void *)&inner_map2 }, +}; + +struct outer_hash { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 5); + __uint(key_size, sizeof(int)); + /* Here everything works flawlessly due to reuse of struct inner_map + * and compiler will complain at the attempt to use non-inner_map + * references below. This is a great experience. 
+ */ + __array(values, struct inner_map); +} outer_hash SEC(".maps") = { + .values = { + [0] = &inner_map2, + [4] = &inner_map1, + }, +}; + +int input = 0; + +SEC("raw_tp/sys_enter") +int handle__sys_enter(void *ctx) +{ + struct inner_map *inner_map; + int key = 0, val; + + inner_map = bpf_map_lookup_elem(&outer_arr, &key); + if (!inner_map) + return 1; + val = input; + bpf_map_update_elem(inner_map, &key, &val, 0); + + inner_map = bpf_map_lookup_elem(&outer_hash, &key); + if (!inner_map) + return 1; + val = input + 1; + bpf_map_update_elem(inner_map, &key, &val, 0); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c new file mode 100644 index 000000000000..f0b72e86bee5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -0,0 +1,1061 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2019, 2020 Cloudflare + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include <linux/bpf.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/pkt_cls.h> +#include <linux/tcp.h> +#include <linux/udp.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#include "test_cls_redirect.h" + +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) + +#define IP_OFFSET_MASK (0x1FFF) +#define IP_MF (0x2000) + +char _license[] SEC("license") = "Dual BSD/GPL"; + +/** + * Destination port and IP used for UDP encapsulation. 
+ */ +static volatile const __be16 ENCAPSULATION_PORT; +static volatile const __be32 ENCAPSULATION_IP; + +typedef struct { + uint64_t processed_packets_total; + uint64_t l3_protocol_packets_total_ipv4; + uint64_t l3_protocol_packets_total_ipv6; + uint64_t l4_protocol_packets_total_tcp; + uint64_t l4_protocol_packets_total_udp; + uint64_t accepted_packets_total_syn; + uint64_t accepted_packets_total_syn_cookies; + uint64_t accepted_packets_total_last_hop; + uint64_t accepted_packets_total_icmp_echo_request; + uint64_t accepted_packets_total_established; + uint64_t forwarded_packets_total_gue; + uint64_t forwarded_packets_total_gre; + + uint64_t errors_total_unknown_l3_proto; + uint64_t errors_total_unknown_l4_proto; + uint64_t errors_total_malformed_ip; + uint64_t errors_total_fragmented_ip; + uint64_t errors_total_malformed_icmp; + uint64_t errors_total_unwanted_icmp; + uint64_t errors_total_malformed_icmp_pkt_too_big; + uint64_t errors_total_malformed_tcp; + uint64_t errors_total_malformed_udp; + uint64_t errors_total_icmp_echo_replies; + uint64_t errors_total_malformed_encapsulation; + uint64_t errors_total_encap_adjust_failed; + uint64_t errors_total_encap_buffer_too_small; + uint64_t errors_total_redirect_loop; +} metrics_t; + +typedef enum { + INVALID = 0, + UNKNOWN, + ECHO_REQUEST, + SYN, + SYN_COOKIE, + ESTABLISHED, +} verdict_t; + +typedef struct { + uint16_t src, dst; +} flow_ports_t; + +_Static_assert( + sizeof(flow_ports_t) != + offsetofend(struct bpf_sock_tuple, ipv4.dport) - + offsetof(struct bpf_sock_tuple, ipv4.sport) - 1, + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); +_Static_assert( + sizeof(flow_ports_t) != + offsetofend(struct bpf_sock_tuple, ipv6.dport) - + offsetof(struct bpf_sock_tuple, ipv6.sport) - 1, + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); + +typedef int ret_t; + +/* This is a bit of a hack. 
We need a return value which allows us to + * indicate that the regular flow of the program should continue, + * while allowing functions to use XDP_PASS and XDP_DROP, etc. + */ +static const ret_t CONTINUE_PROCESSING = -1; + +/* Convenience macro to call functions which return ret_t. + */ +#define MAYBE_RETURN(x) \ + do { \ + ret_t __ret = x; \ + if (__ret != CONTINUE_PROCESSING) \ + return __ret; \ + } while (0) + +/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes), + * or not aligned if the arch supports efficient unaligned access. + * + * Since the verifier ensures that eBPF packet accesses follow these rules, + * we can tell LLVM to emit code as if we always had a larger alignment. + * It will yell at us if we end up on a platform where this is not valid. + */ +typedef uint8_t *net_ptr __attribute__((align_value(8))); + +typedef struct buf { + struct __sk_buff *skb; + net_ptr head; + /* NB: tail mustn't have alignment other than 1, otherwise + * LLVM will go and eliminate code, e.g. when checking packet lengths. + */ + uint8_t *const tail; +} buf_t; + +static size_t buf_off(const buf_t *buf) +{ + /* Clang seems to optimize constructs like + * a - b + c + * if c is known: + * r? = c + * r? -= b + * r? += a + * + * This is a problem if a and b are packet pointers, + * since the verifier allows subtracting two pointers to + * get a scalar, but not a scalar and a pointer. + * + * Use inline asm to break this optimization. + */ + size_t off = (size_t)buf->head; + asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data)); + return off; +} + +static bool buf_copy(buf_t *buf, void *dst, size_t len) +{ + if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) { + return false; + } + + buf->head += len; + return true; +} + +static bool buf_skip(buf_t *buf, const size_t len) +{ + /* Check whether off + len is valid in the non-linear part. 
*/ + if (buf_off(buf) + len > buf->skb->len) { + return false; + } + + buf->head += len; + return true; +} + +/* Returns a pointer to the start of buf, or NULL if len is + * larger than the remaining data. Consumes len bytes on a successful + * call. + * + * If scratch is not NULL, the function will attempt to load non-linear + * data via bpf_skb_load_bytes. On success, scratch is returned. + */ +static void *buf_assign(buf_t *buf, const size_t len, void *scratch) +{ + if (buf->head + len > buf->tail) { + if (scratch == NULL) { + return NULL; + } + + return buf_copy(buf, scratch, len) ? scratch : NULL; + } + + void *ptr = buf->head; + buf->head += len; + return ptr; +} + +static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) +{ + if (ipv4->ihl <= 5) { + return true; + } + + return buf_skip(buf, (ipv4->ihl - 5) * 4); +} + +static bool ipv4_is_fragment(const struct iphdr *ip) +{ + uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); + return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; +} + +static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) +{ + struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch); + if (ipv4 == NULL) { + return NULL; + } + + if (ipv4->ihl < 5) { + return NULL; + } + + if (!pkt_skip_ipv4_options(pkt, ipv4)) { + return NULL; + } + + return ipv4; +} + +/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ +static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) +{ + if (!buf_copy(pkt, ports, sizeof(*ports))) { + return false; + } + + /* Ports in the L4 headers are reversed, since we are parsing an ICMP + * payload which is going towards the eyeball. + */ + uint16_t dst = ports->src; + ports->src = ports->dst; + ports->dst = dst; + return true; +} + +static uint16_t pkt_checksum_fold(uint32_t csum) +{ + /* The highest reasonable value for an IPv4 header + * checksum requires two folds, so we just do that always. 
+ */ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return (uint16_t)~csum; +} + +static void pkt_ipv4_checksum(struct iphdr *iph) +{ + iph->check = 0; + + /* An IP header without options is 20 bytes. Two of those + * are the checksum, which we always set to zero. Hence, + * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7, + * which fits in 32 bit. + */ + _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes"); + uint32_t acc = 0; + uint16_t *ipw = (uint16_t *)iph; + +#pragma clang loop unroll(full) + for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) { + acc += ipw[i]; + } + + iph->check = pkt_checksum_fold(acc); +} + +static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, + const struct ipv6hdr *ipv6, + uint8_t *upper_proto, + bool *is_fragment) +{ + /* We understand five extension headers. + * https://tools.ietf.org/html/rfc8200#section-4.1 states that all + * headers should occur once, except Destination Options, which may + * occur twice. Hence we give up after 6 headers. + */ + struct { + uint8_t next; + uint8_t len; + } exthdr = { + .next = ipv6->nexthdr, + }; + *is_fragment = false; + +#pragma clang loop unroll(full) + for (int i = 0; i < 6; i++) { + switch (exthdr.next) { + case IPPROTO_FRAGMENT: + *is_fragment = true; + /* NB: We don't check that hdrlen == 0 as per spec. */ + /* fallthrough; */ + + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + case IPPROTO_MH: + if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) { + return false; + } + + /* hdrlen is in 8-octet units, and excludes the first 8 octets. */ + if (!buf_skip(pkt, + (exthdr.len + 1) * 8 - sizeof(exthdr))) { + return false; + } + + /* Decode next header */ + break; + + default: + /* The next header is not one of the known extension + * headers, treat it as the upper layer header. + * + * This handles IPPROTO_NONE. 
+ * + * Encapsulating Security Payload (50) and Authentication + * Header (51) also end up here (and will trigger an + * unknown proto error later). They have a custom header + * format and seem too esoteric to care about. + */ + *upper_proto = exthdr.next; + return true; + } + } + + /* We never found an upper layer header. */ + return false; +} + +/* This function has to be inlined, because the verifier otherwise rejects it + * due to returning a pointer to the stack. This is technically correct, since + * scratch is allocated on the stack. However, this usage should be safe since + * it's the caller's stack after all. + */ +static inline __attribute__((__always_inline__)) struct ipv6hdr * +pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, + bool *is_fragment) +{ + struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch); + if (ipv6 == NULL) { + return NULL; + } + + if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) { + return NULL; + } + + return ipv6; +} + +/* Global metrics, per CPU + */ +struct bpf_map_def metrics_map SEC("maps") = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(unsigned int), + .value_size = sizeof(metrics_t), + .max_entries = 1, +}; + +static metrics_t *get_global_metrics(void) +{ + uint64_t key = 0; + return bpf_map_lookup_elem(&metrics_map, &key); +} + +static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) +{ + const int payload_off = + sizeof(*encap) + + sizeof(struct in_addr) * encap->unigue.hop_count; + int32_t encap_overhead = payload_off - sizeof(struct ethhdr); + + // Changing the ethertype if the encapsulated packet is ipv6 + if (encap->gue.proto_ctype == IPPROTO_IPV6) { + encap->eth.h_proto = bpf_htons(ETH_P_IPV6); + } + + if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || + bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC)) + return TC_ACT_SHOT; + + return bpf_redirect(skb->ifindex, 
BPF_F_INGRESS); +} + +static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) +{ + metrics->forwarded_packets_total_gre++; + + const int payload_off = + sizeof(*encap) + + sizeof(struct in_addr) * encap->unigue.hop_count; + int32_t encap_overhead = + payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); + int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; + uint16_t proto = ETH_P_IP; + + /* Loop protection: the inner packet's TTL is decremented as a safeguard + * against any forwarding loop. As the only interesting field is the TTL + * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes + * as they handle the split packets if needed (no need for the data to be + * in the linear section). + */ + if (encap->gue.proto_ctype == IPPROTO_IPV6) { + proto = ETH_P_IPV6; + uint8_t ttl; + int rc; + + rc = bpf_skb_load_bytes( + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), + &ttl, 1); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (ttl == 0) { + metrics->errors_total_redirect_loop++; + return TC_ACT_SHOT; + } + + ttl--; + rc = bpf_skb_store_bytes( + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), + &ttl, 1, 0); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + } else { + uint8_t ttl; + int rc; + + rc = bpf_skb_load_bytes( + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, + 1); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (ttl == 0) { + metrics->errors_total_redirect_loop++; + return TC_ACT_SHOT; + } + + /* IPv4 also has a checksum to patch. While the TTL is only one byte, + * this function only works for 2 and 4 bytes arguments (the result is + * the same). 
+ */ + rc = bpf_l3_csum_replace( + skb, payload_off + offsetof(struct iphdr, check), ttl, + ttl - 1, 2); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + ttl--; + rc = bpf_skb_store_bytes( + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1, + 0); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + } + + if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, + BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || + bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) { + metrics->errors_total_encap_adjust_failed++; + return TC_ACT_SHOT; + } + + if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) { + metrics->errors_total_encap_buffer_too_small++; + return TC_ACT_SHOT; + } + + buf_t pkt = { + .skb = skb, + .head = (uint8_t *)(long)skb->data, + .tail = (uint8_t *)(long)skb->data_end, + }; + + encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL); + if (encap_gre == NULL) { + metrics->errors_total_encap_buffer_too_small++; + return TC_ACT_SHOT; + } + + encap_gre->ip.protocol = IPPROTO_GRE; + encap_gre->ip.daddr = next_hop->s_addr; + encap_gre->ip.saddr = ENCAPSULATION_IP; + encap_gre->ip.tot_len = + bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta); + encap_gre->gre.flags = 0; + encap_gre->gre.protocol = bpf_htons(proto); + pkt_ipv4_checksum((void *)&encap_gre->ip); + + return bpf_redirect(skb->ifindex, 0); +} + +static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) +{ + /* swap L2 addresses */ + /* This assumes that packets are received from a router. + * So just swapping the MAC addresses here will make the packet go back to + * the router, which will send it to the appropriate machine. 
+ */ + unsigned char temp[ETH_ALEN]; + memcpy(temp, encap->eth.h_dest, sizeof(temp)); + memcpy(encap->eth.h_dest, encap->eth.h_source, + sizeof(encap->eth.h_dest)); + memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source)); + + if (encap->unigue.next_hop == encap->unigue.hop_count - 1 && + encap->unigue.last_hop_gre) { + return forward_with_gre(skb, encap, next_hop, metrics); + } + + metrics->forwarded_packets_total_gue++; + uint32_t old_saddr = encap->ip.saddr; + encap->ip.saddr = encap->ip.daddr; + encap->ip.daddr = next_hop->s_addr; + if (encap->unigue.next_hop < encap->unigue.hop_count) { + encap->unigue.next_hop++; + } + + /* Remove ip->saddr, add next_hop->s_addr */ + const uint64_t off = offsetof(typeof(*encap), ip.check); + int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4); + if (ret < 0) { + return TC_ACT_SHOT; + } + + return bpf_redirect(skb->ifindex, 0); +} + +static ret_t skip_next_hops(buf_t *pkt, int n) +{ + switch (n) { + case 1: + if (!buf_skip(pkt, sizeof(struct in_addr))) + return TC_ACT_SHOT; + case 0: + return CONTINUE_PROCESSING; + + default: + return TC_ACT_SHOT; + } +} + +/* Get the next hop from the GLB header. + * + * Sets next_hop->s_addr to 0 if there are no more hops left. + * pkt is positioned just after the variable length GLB header + * iff the call is successful. + */ +static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, + struct in_addr *next_hop) +{ + if (encap->unigue.next_hop > encap->unigue.hop_count) { + return TC_ACT_SHOT; + } + + /* Skip "used" next hops. */ + MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop)); + + if (encap->unigue.next_hop == encap->unigue.hop_count) { + /* No more next hops, we are at the end of the GLB header. */ + next_hop->s_addr = 0; + return CONTINUE_PROCESSING; + } + + if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) { + return TC_ACT_SHOT; + } + + /* Skip the remaining next hops (may be zero). 
*/ + return skip_next_hops(pkt, encap->unigue.hop_count - + encap->unigue.next_hop - 1); +} + +/* Fill a bpf_sock_tuple to be used with the socket lookup functions. + * This is a kludge that lets us work around verifier limitations: + * + * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321) + * + * clang will substitute a constant for sizeof, which allows the verifier + * to track its value. Based on this, it can figure out the constant + * return value, and calling code works while still being "generic" to + * IPv4 and IPv6. + */ +static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, + uint64_t iphlen, uint16_t sport, uint16_t dport) +{ + switch (iphlen) { + case sizeof(struct iphdr): { + struct iphdr *ipv4 = (struct iphdr *)iph; + tuple->ipv4.daddr = ipv4->daddr; + tuple->ipv4.saddr = ipv4->saddr; + tuple->ipv4.sport = sport; + tuple->ipv4.dport = dport; + return sizeof(tuple->ipv4); + } + + case sizeof(struct ipv6hdr): { + struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph; + memcpy(&tuple->ipv6.daddr, &ipv6->daddr, + sizeof(tuple->ipv6.daddr)); + memcpy(&tuple->ipv6.saddr, &ipv6->saddr, + sizeof(tuple->ipv6.saddr)); + tuple->ipv6.sport = sport; + tuple->ipv6.dport = dport; + return sizeof(tuple->ipv6); + } + + default: + return 0; + } +} + +static verdict_t classify_tcp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + void *iph, struct tcphdr *tcp) +{ + struct bpf_sock *sk = + bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); + if (sk == NULL) { + return UNKNOWN; + } + + if (sk->state != BPF_TCP_LISTEN) { + bpf_sk_release(sk); + return ESTABLISHED; + } + + if (iph != NULL && tcp != NULL) { + /* Kludge: we've run out of arguments, but need the length of the ip header. 
*/ + uint64_t iphlen = sizeof(struct iphdr); + if (tuplen == sizeof(tuple->ipv6)) { + iphlen = sizeof(struct ipv6hdr); + } + + if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp, + sizeof(*tcp)) == 0) { + bpf_sk_release(sk); + return SYN_COOKIE; + } + } + + bpf_sk_release(sk); + return UNKNOWN; +} + +static verdict_t classify_udp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen) +{ + struct bpf_sock *sk = + bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); + if (sk == NULL) { + return UNKNOWN; + } + + if (sk->state == BPF_TCP_ESTABLISHED) { + bpf_sk_release(sk); + return ESTABLISHED; + } + + bpf_sk_release(sk); + return UNKNOWN; +} + +static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + metrics_t *metrics) +{ + switch (proto) { + case IPPROTO_TCP: + return classify_tcp(skb, tuple, tuplen, NULL, NULL); + + case IPPROTO_UDP: + return classify_udp(skb, tuple, tuplen); + + default: + metrics->errors_total_malformed_icmp++; + return INVALID; + } +} + +static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) +{ + struct icmphdr icmp; + if (!buf_copy(pkt, &icmp, sizeof(icmp))) { + metrics->errors_total_malformed_icmp++; + return INVALID; + } + + /* We should never receive encapsulated echo replies. */ + if (icmp.type == ICMP_ECHOREPLY) { + metrics->errors_total_icmp_echo_replies++; + return INVALID; + } + + if (icmp.type == ICMP_ECHO) { + return ECHO_REQUEST; + } + + if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) { + metrics->errors_total_unwanted_icmp++; + return INVALID; + } + + struct iphdr _ip4; + const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); + if (ipv4 == NULL) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + /* The source address in the outer IP header is from the entity that + * originated the ICMP message. Use the original IP header to restore + * the correct flow tuple. 
+ */ + struct bpf_sock_tuple tuple; + tuple.ipv4.saddr = ipv4->daddr; + tuple.ipv4.daddr = ipv4->saddr; + + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + return classify_icmp(pkt->skb, ipv4->protocol, &tuple, + sizeof(tuple.ipv4), metrics); +} + +static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) +{ + struct icmp6hdr icmp6; + if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) { + metrics->errors_total_malformed_icmp++; + return INVALID; + } + + /* We should never receive encapsulated echo replies. */ + if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) { + metrics->errors_total_icmp_echo_replies++; + return INVALID; + } + + if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) { + return ECHO_REQUEST; + } + + if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) { + metrics->errors_total_unwanted_icmp++; + return INVALID; + } + + bool is_fragment; + uint8_t l4_proto; + struct ipv6hdr _ipv6; + const struct ipv6hdr *ipv6 = + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); + if (ipv6 == NULL) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + if (is_fragment) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + /* Swap source and dest addresses. 
*/ + struct bpf_sock_tuple tuple; + memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr)); + memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr)); + + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6), + metrics); +} + +static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) +{ + metrics->l4_protocol_packets_total_tcp++; + + struct tcphdr _tcp; + struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp); + if (tcp == NULL) { + metrics->errors_total_malformed_tcp++; + return INVALID; + } + + if (tcp->syn) { + return SYN; + } + + struct bpf_sock_tuple tuple; + uint64_t tuplen = + fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest); + return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp); +} + +static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) +{ + metrics->l4_protocol_packets_total_udp++; + + struct udphdr _udp; + struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp); + if (udph == NULL) { + metrics->errors_total_malformed_udp++; + return INVALID; + } + + struct bpf_sock_tuple tuple; + uint64_t tuplen = + fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest); + return classify_udp(pkt->skb, &tuple, tuplen); +} + +static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) +{ + metrics->l3_protocol_packets_total_ipv4++; + + struct iphdr _ip4; + struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); + if (ipv4 == NULL) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv4->version != 4) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv4_is_fragment(ipv4)) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + switch (ipv4->protocol) { + case IPPROTO_ICMP: + return process_icmpv4(pkt, metrics); + + case IPPROTO_TCP: + return 
process_tcp(pkt, ipv4, sizeof(*ipv4), metrics); + + case IPPROTO_UDP: + return process_udp(pkt, ipv4, sizeof(*ipv4), metrics); + + default: + metrics->errors_total_unknown_l4_proto++; + return INVALID; + } +} + +static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) +{ + metrics->l3_protocol_packets_total_ipv6++; + + uint8_t l4_proto; + bool is_fragment; + struct ipv6hdr _ipv6; + struct ipv6hdr *ipv6 = + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); + if (ipv6 == NULL) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv6->version != 6) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (is_fragment) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + switch (l4_proto) { + case IPPROTO_ICMPV6: + return process_icmpv6(pkt, metrics); + + case IPPROTO_TCP: + return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics); + + case IPPROTO_UDP: + return process_udp(pkt, ipv6, sizeof(*ipv6), metrics); + + default: + metrics->errors_total_unknown_l4_proto++; + return INVALID; + } +} + +SEC("classifier/cls_redirect") +int cls_redirect(struct __sk_buff *skb) +{ + metrics_t *metrics = get_global_metrics(); + if (metrics == NULL) { + return TC_ACT_SHOT; + } + + metrics->processed_packets_total++; + + /* Pass bogus packets as long as we're not sure they're + * destined for us. + */ + if (skb->protocol != bpf_htons(ETH_P_IP)) { + return TC_ACT_OK; + } + + encap_headers_t *encap; + + /* Make sure that all encapsulation headers are available in + * the linear portion of the skb. This makes it easy to manipulate them. + */ + if (bpf_skb_pull_data(skb, sizeof(*encap))) { + return TC_ACT_OK; + } + + buf_t pkt = { + .skb = skb, + .head = (uint8_t *)(long)skb->data, + .tail = (uint8_t *)(long)skb->data_end, + }; + + encap = buf_assign(&pkt, sizeof(*encap), NULL); + if (encap == NULL) { + return TC_ACT_OK; + } + + if (encap->ip.ihl != 5) { + /* We never have any options. 
*/ + return TC_ACT_OK; + } + + if (encap->ip.daddr != ENCAPSULATION_IP || + encap->ip.protocol != IPPROTO_UDP) { + return TC_ACT_OK; + } + + /* TODO Check UDP length? */ + if (encap->udp.dest != ENCAPSULATION_PORT) { + return TC_ACT_OK; + } + + /* We now know that the packet is destined to us, we can + * drop bogus ones. + */ + if (ipv4_is_fragment((void *)&encap->ip)) { + metrics->errors_total_fragmented_ip++; + return TC_ACT_SHOT; + } + + if (encap->gue.variant != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.control != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.flags != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.hlen != + sizeof(encap->unigue) / 4 + encap->unigue.hop_count) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->unigue.version != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->unigue.reserved != 0) { + return TC_ACT_SHOT; + } + + struct in_addr next_hop; + MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop)); + + if (next_hop.s_addr == 0) { + metrics->accepted_packets_total_last_hop++; + return accept_locally(skb, encap); + } + + verdict_t verdict; + switch (encap->gue.proto_ctype) { + case IPPROTO_IPIP: + verdict = process_ipv4(&pkt, metrics); + break; + + case IPPROTO_IPV6: + verdict = process_ipv6(&pkt, metrics); + break; + + default: + metrics->errors_total_unknown_l3_proto++; + return TC_ACT_SHOT; + } + + switch (verdict) { + case INVALID: + /* metrics have already been bumped */ + return TC_ACT_SHOT; + + case UNKNOWN: + return forward_to_next_hop(skb, encap, &next_hop, metrics); + + case ECHO_REQUEST: + metrics->accepted_packets_total_icmp_echo_request++; + break; + + case SYN: + if (encap->unigue.forward_syn) { + return forward_to_next_hop(skb, encap, &next_hop, + metrics); + } + + 
metrics->accepted_packets_total_syn++; + break; + + case SYN_COOKIE: + metrics->accepted_packets_total_syn_cookies++; + break; + + case ESTABLISHED: + metrics->accepted_packets_total_established++; + break; + } + + return accept_locally(skb, encap); +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.h b/tools/testing/selftests/bpf/progs/test_cls_redirect.h new file mode 100644 index 000000000000..76eab0aacba0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright 2019, 2020 Cloudflare */ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> + +struct gre_base_hdr { + uint16_t flags; + uint16_t protocol; +} __attribute__((packed)); + +struct guehdr { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint8_t hlen : 5, control : 1, variant : 2; +#else + uint8_t variant : 2, control : 1, hlen : 5; +#endif + uint8_t proto_ctype; + uint16_t flags; +}; + +struct unigue { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint8_t _r : 2, last_hop_gre : 1, forward_syn : 1, version : 4; +#else + uint8_t version : 4, forward_syn : 1, last_hop_gre : 1, _r : 2; +#endif + uint8_t reserved; + uint8_t next_hop; + uint8_t hop_count; + // Next hops go here +} __attribute__((packed)); + +typedef struct { + struct ethhdr eth; + struct iphdr ip; + struct gre_base_hdr gre; +} __attribute__((packed)) encap_gre_t; + +typedef struct { + struct ethhdr eth; + struct iphdr ip; + struct udphdr udp; + struct guehdr gue; + struct unigue unigue; +} __attribute__((packed)) encap_headers_t; diff --git a/tools/testing/selftests/bpf/progs/test_enable_stats.c b/tools/testing/selftests/bpf/progs/test_enable_stats.c new file mode 100644 index 000000000000..01a002ade529 --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/test_enable_stats.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <stdint.h> +#include <linux/types.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +__u64 count = 0; + +SEC("raw_tracepoint/sys_enter") +int test_enable_stats(void *ctx) +{ + count += 1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c index 6239596cd14e..4eb42cff5fe9 100644 --- a/tools/testing/selftests/bpf/progs/test_mmap.c +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -9,6 +9,14 @@ char _license[] SEC("license") = "GPL"; struct { __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 4096); + __uint(map_flags, BPF_F_MMAPABLE | BPF_F_RDONLY_PROG); + __type(key, __u32); + __type(value, char); +} rdonly_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 512 * 4); /* at least 4 pages of data */ __uint(map_flags, BPF_F_MMAPABLE); __type(key, __u32); diff --git a/tools/testing/selftests/bpf/progs/test_obj_id.c b/tools/testing/selftests/bpf/progs/test_obj_id.c index 98b9de2fafd0..ded71b3ff6b4 100644 --- a/tools/testing/selftests/bpf/progs/test_obj_id.c +++ b/tools/testing/selftests/bpf/progs/test_obj_id.c @@ -3,16 +3,8 @@ */ #include <stddef.h> #include <linux/bpf.h> -#include <linux/pkt_cls.h> #include <bpf/bpf_helpers.h> -/* It is a dumb bpf program such that it must have no - * issue to be loaded since testing the verifier is - * not the focus here. 
- */ - -int _version SEC("version") = 1; - struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); @@ -20,13 +12,13 @@ struct { __type(value, __u64); } test_map_id SEC(".maps"); -SEC("test_obj_id_dummy") -int test_obj_id(struct __sk_buff *skb) +SEC("raw_tp/sys_enter") +int test_obj_id(void *ctx) { __u32 key = 0; __u64 *value; value = bpf_map_lookup_elem(&test_map_id, &key); - return TC_ACT_OK; + return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index abb7344b531f..42403d088abc 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -39,4 +39,10 @@ int BPF_PROG(prog5, struct task_struct *tsk, const char *buf, bool exec) return 0; } +SEC("fmod_ret/__set_task_comm") +int BPF_PROG(prog6, struct task_struct *tsk, const char *buf, bool exec) +{ + return !tsk; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c new file mode 100644 index 000000000000..8ba9959b036b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf SEC(".maps"); + +/* inputs */ +int pid = 0; +long value = 0; +long flags = 0; + +/* outputs */ +long total = 0; +long discarded = 0; +long dropped = 0; + +long avail_data = 0; +long ring_size = 0; +long cons_pos = 0; +long prod_pos = 0; + +/* inner state */ +long seq = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int test_ringbuf(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + int zero = 0; + + if 
(cur_pid != pid) + return 0; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) { + __sync_fetch_and_add(&dropped, 1); + return 1; + } + + sample->pid = pid; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + sample->value = value; + + sample->seq = seq++; + __sync_fetch_and_add(&total, 1); + + if (sample->seq & 1) { + /* copy from reserved sample to a new one... */ + bpf_ringbuf_output(&ringbuf, sample, sizeof(*sample), flags); + /* ...and then discard reserved sample */ + bpf_ringbuf_discard(sample, flags); + __sync_fetch_and_add(&discarded, 1); + } else { + bpf_ringbuf_submit(sample, flags); + } + + avail_data = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + ring_size = bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE); + cons_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS); + prod_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c new file mode 100644 index 000000000000..edf3b6953533 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct ringbuf_map { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf1 SEC(".maps"), + ringbuf2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 4); + __type(key, int); + __array(values, struct ringbuf_map); +} ringbuf_arr SEC(".maps") = { + .values = { + [0] = &ringbuf1, + [2] = &ringbuf2, + }, +}; + +/* inputs */ +int pid = 0; +int target_ring = 0; +long value = 0; + +/* outputs */ +long total = 0; +long dropped = 0; +long skipped = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int 
test_ringbuf(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + void *rb; + int zero = 0; + + if (cur_pid != pid) + return 0; + + rb = bpf_map_lookup_elem(&ringbuf_arr, &target_ring); + if (!rb) { + skipped += 1; + return 1; + } + + sample = bpf_ringbuf_reserve(rb, sizeof(*sample), 0); + if (!sample) { + dropped += 1; + return 1; + } + + sample->pid = pid; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + sample->value = value; + + sample->seq = total; + total += 1; + + bpf_ringbuf_submit(sample, 0); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c index 8f530843b4da..1ecd987005d2 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_assign.c +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -16,6 +16,26 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +/* Pin map under /sys/fs/bpf/tc/globals/<map name> */ +#define PIN_GLOBAL_NS 2 + +/* Must match struct bpf_elf_map layout from iproute2 */ +struct { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; +} server_map SEC("maps") = { + .type = BPF_MAP_TYPE_SOCKMAP, + .size_key = sizeof(int), + .size_value = sizeof(__u64), + .max_elem = 1, + .pinning = PIN_GLOBAL_NS, +}; + int _version SEC("version") = 1; char _license[] SEC("license") = "GPL"; @@ -72,7 +92,9 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; + const int zero = 0; size_t tuple_len; + __be16 dport; int ret; tuple_len = ipv4 ? 
sizeof(tuple->ipv4) : sizeof(tuple->ipv6); @@ -83,32 +105,11 @@ handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) if (sk) goto assign; - if (ipv4) { - if (tuple->ipv4.dport != bpf_htons(4321)) - return TC_ACT_OK; - - ln.ipv4.daddr = bpf_htonl(0x7f000001); - ln.ipv4.dport = bpf_htons(1234); - - sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv4), - BPF_F_CURRENT_NETNS, 0); - } else { - if (tuple->ipv6.dport != bpf_htons(4321)) - return TC_ACT_OK; - - /* Upper parts of daddr are already zero. */ - ln.ipv6.daddr[3] = bpf_htonl(0x1); - ln.ipv6.dport = bpf_htons(1234); - - sk = bpf_sk_lookup_udp(skb, &ln, sizeof(ln.ipv6), - BPF_F_CURRENT_NETNS, 0); - } + dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; + if (dport != bpf_htons(4321)) + return TC_ACT_OK; - /* workaround: We can't do a single socket lookup here, because then - * the compiler will likely spill tuple_len to the stack. This makes it - * lose all bounds information in the verifier, which then rejects the - * call as unsafe. - */ + sk = bpf_map_lookup_elem(&server_map, &zero); if (!sk) return TC_ACT_SHOT; @@ -123,7 +124,9 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; + const int zero = 0; size_t tuple_len; + __be16 dport; int ret; tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); @@ -137,32 +140,11 @@ handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) bpf_sk_release(sk); } - if (ipv4) { - if (tuple->ipv4.dport != bpf_htons(4321)) - return TC_ACT_OK; + dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; + if (dport != bpf_htons(4321)) + return TC_ACT_OK; - ln.ipv4.daddr = bpf_htonl(0x7f000001); - ln.ipv4.dport = bpf_htons(1234); - - sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), - BPF_F_CURRENT_NETNS, 0); - } else { - if (tuple->ipv6.dport != bpf_htons(4321)) - return TC_ACT_OK; - - /* Upper parts of daddr are already zero. 
*/ - ln.ipv6.daddr[3] = bpf_htonl(0x1); - ln.ipv6.dport = bpf_htons(1234); - - sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), - BPF_F_CURRENT_NETNS, 0); - } - - /* workaround: We can't do a single socket lookup here, because then - * the compiler will likely spill tuple_len to the stack. This makes it - * lose all bounds information in the verifier, which then rejects the - * call as unsafe. - */ + sk = bpf_map_lookup_elem(&server_map, &zero); if (!sk) return TC_ACT_SHOT; diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c index d2b38fa6a5b0..e83d0b48d80c 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c @@ -73,6 +73,7 @@ int bpf_sk_lookup_test0(struct __sk_buff *skb) tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + bpf_printk("sk=%d\n", sk ? 1 : 0); if (sk) bpf_sk_release(sk); return sk ? 
TC_ACT_OK : TC_ACT_UNSPEC; diff --git a/tools/testing/selftests/bpf/progs/test_skb_helpers.c b/tools/testing/selftests/bpf/progs/test_skb_helpers.c new file mode 100644 index 000000000000..bb3fbf1a29e3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_skb_helpers.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define TEST_COMM_LEN 16 + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u32); +} cgroup_map SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +SEC("classifier/test_skb_helpers") +int test_skb_helpers(struct __sk_buff *skb) +{ + struct task_struct *task; + char comm[TEST_COMM_LEN]; + __u32 tpid; + + task = (struct task_struct *)bpf_get_current_task(); + bpf_probe_read_kernel(&tpid , sizeof(tpid), &task->tgid); + bpf_probe_read_kernel_str(&comm, sizeof(comm), &task->comm); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c new file mode 100644 index 000000000000..45e8fc75a739 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Isovalent, Inc. 
+#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, __u32); + __type(value, __u64); +} socket_storage SEC(".maps"); + +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + int verdict = SK_PASS; + __u32 pid, tpid; + __u64 *sk_stg; + + pid = bpf_get_current_pid_tgid() >> 32; + sk_stg = bpf_sk_storage_get(&socket_storage, msg->sk, 0, BPF_SK_STORAGE_GET_F_CREATE); + if (!sk_stg) + return SK_DROP; + *sk_stg = pid; + bpf_probe_read_kernel(&tpid , sizeof(tpid), &task->tgid); + if (pid != tpid) + verdict = SK_DROP; + bpf_sk_storage_delete(&socket_storage, (void *)msg->sk); + return verdict; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 9b4d3a68a91a..057036ca1111 100644 --- a/tools/testing/selftests/bpf/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -79,11 +79,18 @@ struct { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); + __uint(max_entries, 2); __type(key, int); __type(value, int); } sock_skb_opts SEC(".maps"); +struct { + __uint(type, TEST_MAP_TYPE); + __uint(max_entries, 20); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} tls_sock_map SEC(".maps"); + SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { @@ -110,8 +117,6 @@ int bpf_prog2(struct __sk_buff *skb) flags = *f; } - bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", - len, flags); #ifdef SOCKMAP return bpf_sk_redirect_map(skb, 
&sock_map, ret, flags); #else @@ -120,6 +125,43 @@ int bpf_prog2(struct __sk_buff *skb) } +SEC("sk_skb3") +int bpf_prog3(struct __sk_buff *skb) +{ + const int one = 1; + int err, *f, ret = SK_PASS; + void *data_end; + char *c; + + err = bpf_skb_pull_data(skb, 19); + if (err) + goto tls_out; + + c = (char *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + + if (c + 18 < data_end) + memcpy(&c[13], "PASS", 4); + f = bpf_map_lookup_elem(&sock_skb_opts, &one); + if (f && *f) { + __u64 flags = 0; + + ret = 0; + flags = *f; +#ifdef SOCKMAP + return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags); +#else + return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); +#endif + } + + f = bpf_map_lookup_elem(&sock_skb_opts, &one); + if (f && *f) + ret = SK_DROP; +tls_out: + return ret; +} + SEC("sockops") int bpf_sockmap(struct bpf_sock_ops *skops) { @@ -143,8 +185,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("passive(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); } break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: @@ -160,8 +200,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("active(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); } break; default: @@ -199,72 +237,6 @@ int bpf_prog4(struct sk_msg_md *msg) } SEC("sk_msg2") -int bpf_prog5(struct sk_msg_md *msg) -{ - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int *start, *end, *start_push, *end_push, *start_pop, *pop; - int *bytes, len1, len2 = 0, len3, len4; - int err1 = -1, err2 = -1; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - start = 
bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - int err; - - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - int err; - - bpf_printk("sk_msg2: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg2: push_data err %i\n", err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg2: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg2: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length pop_data %i->%i\n", - len1 ? 
len1 : 0, len4); - } - - bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", - len1, err1, err2); - return SK_PASS; -} - -SEC("sk_msg3") int bpf_prog6(struct sk_msg_md *msg) { int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; @@ -305,86 +277,7 @@ int bpf_prog6(struct sk_msg_md *msg) #endif } -SEC("sk_msg4") -int bpf_prog7(struct sk_msg_md *msg) -{ - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int len1, len2 = 0, len3, len4; - int err1 = 0, err2 = 0, key = 0; - __u64 flags = 0; - - int err; - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - bpf_printk("sk_msg4: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg4: push_data err %i\n", - err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length push_update %i->%i\n", - len2 ? 
len2 : len1, len3); - } - - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg4: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg4: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length pop_data %i->%i\n", - len1 ? len1 : 0, len4); - } - - - f = bpf_map_lookup_elem(&sock_redir_flags, &zero); - if (f && *f) { - key = 2; - flags = *f; - } - bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", - len1, flags, err1 ? err1 : err2); -#ifdef SOCKMAP - err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); -#else - err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); -#endif - bpf_printk("sk_msg3: err %i\n", err); - return err; -} - -SEC("sk_msg5") +SEC("sk_msg3") int bpf_prog8(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -401,7 +294,7 @@ int bpf_prog8(struct sk_msg_md *msg) } return SK_PASS; } -SEC("sk_msg6") +SEC("sk_msg4") int bpf_prog9(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -419,7 +312,7 @@ int bpf_prog9(struct sk_msg_md *msg) return SK_PASS; } -SEC("sk_msg7") +SEC("sk_msg5") int bpf_prog10(struct sk_msg_md *msg) { int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; @@ -443,7 +336,6 @@ int bpf_prog10(struct sk_msg_md *msg) pop = bpf_map_lookup_elem(&sock_bytes, &five); if (start_pop && pop) bpf_msg_pop_data(msg, *start_pop, *pop, 0); - bpf_printk("return sk drop\n"); return SK_DROP; } diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c index 2d0b0b82a78a..50525235380e 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c @@ -45,7 +45,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx) unsigned long tcp_mem[3] 
= {0, 0, 0}; char value[MAX_VALUE_STR_LEN]; unsigned char i, off = 0; - int ret; + volatile int ret; if (ctx->write) return 0; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c new file mode 100644 index 000000000000..3d66599eee2e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("xdp_adjust_tail_grow") +int _xdp_adjust_tail_grow(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + unsigned int data_len; + int offset = 0; + + /* Data length determine test case */ + data_len = data_end - data; + + if (data_len == 54) { /* sizeof(pkt_v4) */ + offset = 4096; /* test too large offset */ + } else if (data_len == 74) { /* sizeof(pkt_v6) */ + offset = 40; + } else if (data_len == 64) { + offset = 128; + } else if (data_len == 128) { + offset = 4096 - 256 - 320 - data_len; /* Max tail grow 3520 */ + } else { + return XDP_ABORTED; /* No matching test */ + } + + if (bpf_xdp_adjust_tail(xdp, offset)) + return XDP_DROP; + return XDP_TX; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_adjust_tail.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c index b7fc85769bdc..22065a9cfb25 100644 --- a/tools/testing/selftests/bpf/progs/test_adjust_tail.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright (c) 2018 Facebook +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -11,15 +11,15 @@ int _version SEC("version") = 1; -SEC("xdp_adjust_tail") -int _xdp_adjust_tail(struct xdp_md *xdp) 
+SEC("xdp_adjust_tail_shrink") +int _xdp_adjust_tail_shrink(struct xdp_md *xdp) { void *data_end = (void *)(long)xdp->data_end; void *data = (void *)(long)xdp->data; int offset = 0; - if (data_end - data == 54) - offset = 256; + if (data_end - data == 54) /* sizeof(pkt_v4) */ + offset = 256; /* shrink too much */ else offset = 20; if (bpf_xdp_adjust_tail(xdp, 0 - offset)) diff --git a/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c new file mode 100644 index 000000000000..e5c0f131c8a7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* fails to load without expected_attach_type = BPF_XDP_DEVMAP + * because of access to egress_ifindex + */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +SEC("xdp_dm_log") +int xdpdm_devlog(struct xdp_md *ctx) +{ + char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n"; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + unsigned int len = data_end - data; + + bpf_trace_printk(fmt, sizeof(fmt), + ctx->ingress_ifindex, ctx->egress_ifindex, len); + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c new file mode 100644 index 000000000000..deef0e050863 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 4); +} dm_ports SEC(".maps"); + +SEC("xdp_redir") +int xdp_redir_prog(struct xdp_md *ctx) +{ + return bpf_redirect_map(&dm_ports, 1, 0); +} + +/* invalid program on DEVMAP entry; + * 
SEC name means expected attach type not set + */ +SEC("xdp_dummy") +int xdp_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +/* valid program on DEVMAP entry via SEC name; + * has access to egress and ingress ifindex + */ +SEC("xdp_devmap") +int xdp_dummy_dm(struct xdp_md *ctx) +{ + char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n"; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + unsigned int len = data_end - data; + + bpf_trace_printk(fmt, sizeof(fmt), + ctx->ingress_ifindex, ctx->egress_ifindex, len); + + return XDP_PASS; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c new file mode 100644 index 000000000000..8b36b6640e7e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <asm/unistd.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +long hits = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int bench_trigger_tp(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) +{ + if (id == __NR_getpgid) + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("kprobe/__x64_sys_getpgid") +int bench_trigger_kprobe(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("fentry/__x64_sys_getpgid") +int bench_trigger_fentry(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("fmod_ret/__x64_sys_getpgid") +int bench_trigger_fmodret(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return -22; +} diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index c6766b2cff85..6a12a0e01e07 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ 
b/tools/testing/selftests/bpf/test_maps.c @@ -1394,23 +1394,25 @@ static void test_map_rdonly(void) key = 1; value = 1234; - /* Insert key=1 element. */ + /* Try to insert key=1 element. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == -1 && errno == EPERM); - /* Check that key=2 is not found. */ + /* Check that key=1 is not found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == ENOENT); + + close(fd); } -static void test_map_wronly(void) +static void test_map_wronly_hash(void) { int fd, key = 0, value = 0; fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), MAP_SIZE, map_flags | BPF_F_WRONLY); if (fd < 0) { - printf("Failed to create map for read only test '%s'!\n", + printf("Failed to create map for write only test '%s'!\n", strerror(errno)); exit(1); } @@ -1420,9 +1422,49 @@ static void test_map_wronly(void) /* Insert key=1 element. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); - /* Check that key=2 is not found. */ + /* Check that reading elements and keys from the map is not allowed. 
*/ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == EPERM); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); + + close(fd); +} + +static void test_map_wronly_stack_or_queue(enum bpf_map_type map_type) +{ + int fd, value = 0; + + assert(map_type == BPF_MAP_TYPE_QUEUE || + map_type == BPF_MAP_TYPE_STACK); + fd = bpf_create_map(map_type, 0, sizeof(value), MAP_SIZE, + map_flags | BPF_F_WRONLY); + /* Stack/Queue maps do not support BPF_F_NO_PREALLOC */ + if (map_flags & BPF_F_NO_PREALLOC) { + assert(fd < 0 && errno == EINVAL); + return; + } + if (fd < 0) { + printf("Failed to create map '%s'!\n", strerror(errno)); + exit(1); + } + + value = 1234; + assert(bpf_map_update_elem(fd, NULL, &value, BPF_ANY) == 0); + + /* Peek element should fail */ + assert(bpf_map_lookup_elem(fd, NULL, &value) == -1 && errno == EPERM); + + /* Pop element should fail */ + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &value) == -1 && + errno == EPERM); + + close(fd); +} + +static void test_map_wronly(void) +{ + test_map_wronly_hash(); + test_map_wronly_stack_or_queue(BPF_MAP_TYPE_STACK); + test_map_wronly_stack_or_queue(BPF_MAP_TYPE_QUEUE); } static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size, diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index b521e0a512b6..54fa5fa688ce 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -222,23 +222,6 @@ int test__join_cgroup(const char *path) return fd; } -struct ipv4_packet pkt_v4 = { - .eth.h_proto = __bpf_constant_htons(ETH_P_IP), - .iph.ihl = 5, - .iph.protocol = IPPROTO_TCP, - .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), - .tcp.urg_ptr = 123, - .tcp.doff = 5, -}; - -struct ipv6_packet pkt_v6 = { - .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), - .iph.nexthdr = IPPROTO_TCP, - .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), - .tcp.urg_ptr = 123, - .tcp.doff = 5, -}; - int 
bpf_find_map(const char *test, struct bpf_object *obj, const char *name) { struct bpf_map *map; @@ -351,25 +334,13 @@ int extract_build_id(char *build_id, size_t size) len = size; memcpy(build_id, line, len); build_id[len] = '\0'; + free(line); return 0; err: fclose(fp); return -1; } -void *spin_lock_thread(void *arg) -{ - __u32 duration, retval; - int err, prog_fd = *(u32 *) arg; - - err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), - NULL, NULL, &retval, &duration); - CHECK(err || retval, "", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); - pthread_exit(arg); -} - /* extern declarations for test funcs */ #define DEFINE_TEST(name) extern void test_##name(void); #include <prog_tests/tests.h> @@ -420,6 +391,18 @@ static int libbpf_print_fn(enum libbpf_print_level level, return 0; } +static void free_str_set(const struct str_set *set) +{ + int i; + + if (!set) + return; + + for (i = 0; i < set->cnt; i++) + free((void *)set->strs[i]); + free(set->strs); +} + static int parse_str_list(const char *s, struct str_set *set) { char *input, *state = NULL, *next, **tmp, **strs = NULL; @@ -455,67 +438,6 @@ err: return -ENOMEM; } -int parse_num_list(const char *s, struct test_selector *sel) -{ - int i, set_len = 0, new_len, num, start = 0, end = -1; - bool *set = NULL, *tmp, parsing_end = false; - char *next; - - while (s[0]) { - errno = 0; - num = strtol(s, &next, 10); - if (errno) - return -errno; - - if (parsing_end) - end = num; - else - start = num; - - if (!parsing_end && *next == '-') { - s = next + 1; - parsing_end = true; - continue; - } else if (*next == ',') { - parsing_end = false; - s = next + 1; - end = num; - } else if (*next == '\0') { - parsing_end = false; - s = next; - end = num; - } else { - return -EINVAL; - } - - if (start > end) - return -EINVAL; - - if (end + 1 > set_len) { - new_len = end + 1; - tmp = realloc(set, new_len); - if (!tmp) { - free(set); - return -ENOMEM; - } - for (i = set_len; i < start; i++) 
- tmp[i] = false; - set = tmp; - set_len = new_len; - } - for (i = start; i <= end; i++) - set[i] = true; - } - - if (!set) - return -EINVAL; - - sel->num_set = set; - sel->num_set_len = set_len; - - return 0; -} - extern int extra_prog_load_log_flags; static error_t parse_arg(int key, char *arg, struct argp_state *state) @@ -529,13 +451,15 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) if (subtest_str) { *subtest_str = '\0'; if (parse_num_list(subtest_str + 1, - &env->subtest_selector)) { + &env->subtest_selector.num_set, + &env->subtest_selector.num_set_len)) { fprintf(stderr, "Failed to parse subtest numbers.\n"); return -EINVAL; } } - if (parse_num_list(arg, &env->test_selector)) { + if (parse_num_list(arg, &env->test_selector.num_set, + &env->test_selector.num_set_len)) { fprintf(stderr, "Failed to parse test numbers.\n"); return -EINVAL; } @@ -756,11 +680,11 @@ int main(int argc, char **argv) fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); - free(env.test_selector.blacklist.strs); - free(env.test_selector.whitelist.strs); + free_str_set(&env.test_selector.blacklist); + free_str_set(&env.test_selector.whitelist); free(env.test_selector.num_set); - free(env.subtest_selector.blacklist.strs); - free(env.subtest_selector.whitelist.strs); + free_str_set(&env.subtest_selector.blacklist); + free_str_set(&env.subtest_selector.whitelist); free(env.subtest_selector.num_set); return env.fail_cnt ? 
EXIT_FAILURE : EXIT_SUCCESS; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index f4aff6b8284b..f4503c926aca 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -37,6 +37,7 @@ typedef __u16 __sum16; #include "bpf_util.h" #include <bpf/bpf_endian.h> #include "trace_helpers.h" +#include "testing_helpers.h" #include "flow_dissector_load.h" enum verbosity { @@ -87,23 +88,12 @@ extern void test__skip(void); extern void test__fail(void); extern int test__join_cgroup(const char *path); -#define MAGIC_BYTES 123 - -/* ipv4 test vector */ -struct ipv4_packet { - struct ethhdr eth; - struct iphdr iph; - struct tcphdr tcp; -} __packed; -extern struct ipv4_packet pkt_v4; - -/* ipv6 test vector */ -struct ipv6_packet { - struct ethhdr eth; - struct ipv6hdr iph; - struct tcphdr tcp; -} __packed; -extern struct ipv6_packet pkt_v6; +#define PRINT_FAIL(format...) \ + ({ \ + test__fail(); \ + fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__); \ + fprintf(stdout, ##format); \ + }) #define _CHECK(condition, tag, duration, format...) ({ \ int __ret = !!(condition); \ @@ -136,10 +126,6 @@ extern struct ipv6_packet pkt_v6; #define CHECK_ATTR(condition, tag, format...) 
\ _CHECK(condition, tag, tattr.duration, format) -#define MAGIC_VAL 0x1234 -#define NUM_ITER 100000 -#define VIP_NUM 5 - static inline __u64 ptr_to_u64(const void *ptr) { return (__u64) (unsigned long) ptr; @@ -149,7 +135,6 @@ int bpf_find_map(const char *test, struct bpf_object *obj, const char *name); int compare_map_keys(int map1_fd, int map2_fd); int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len); int extract_build_id(char *build_id, size_t size); -void *spin_lock_thread(void *arg); #ifdef __x86_64__ #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 61fd95b89af8..0358814c67dc 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -677,7 +677,7 @@ static int bind4_prog_load(const struct sock_addr_test *test) uint8_t u4_addr8[4]; uint16_t u4_addr16[2]; uint32_t u4_addr32; - } ip4; + } ip4, port; struct sockaddr_in addr4_rw; if (inet_pton(AF_INET, SERV4_IP, (void *)&ip4) != 1) { @@ -685,6 +685,8 @@ static int bind4_prog_load(const struct sock_addr_test *test) return -1; } + port.u4_addr32 = htons(SERV4_PORT); + if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, (struct sockaddr *)&addr4_rw, sizeof(addr4_rw)) == -1) return -1; @@ -696,49 +698,65 @@ static int bind4_prog_load(const struct sock_addr_test *test) /* if (sk.family == AF_INET && */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 24), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 32), /* (sk.type == SOCK_DGRAM || sk.type == SOCK_STREAM) && */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, type)), BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 1), BPF_JMP_A(1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 20), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 28), /* 1st_byte_of_user_ip4 == expected && */ 
BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 18), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 26), /* 2nd_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 16), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 24), /* 3rd_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 14), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 22), /* 4th_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 3), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 12), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 20), /* 1st_half_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 10), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 18), /* 2nd_half_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 8), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 16), - /* whole_user_ip4 == expected) { */ + /* whole_user_ip4 == expected && */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4)), BPF_LD_IMM64(BPF_REG_8, ip4.u4_addr32), /* See [2]. 
*/ + BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 12), + + /* 1st_byte_of_user_port == expected && */ + BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr8[0], 10), + + /* 1st_half_of_user_port == expected && */ + BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr16[0], 8), + + /* user_port == expected) { */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_port)), + BPF_LD_IMM64(BPF_REG_8, port.u4_addr32), /* See [2]. */ BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 4), /* user_ip4 = addr4_rw.sin_addr */ diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 779e11da979c..37695fc8096a 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -54,7 +54,7 @@ static void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" +#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" #define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o" #define CG_PATH "/sockmap" @@ -63,14 +63,12 @@ int s1, s2, c1, c2, p1, p2; int test_cnt; int passed; int failed; -int map_fd[8]; -struct bpf_map *maps[8]; +int map_fd[9]; +struct bpf_map *maps[9]; int prog_fd[11]; int txmsg_pass; -int txmsg_noisy; int txmsg_redir; -int txmsg_redir_noisy; int txmsg_drop; int txmsg_apply; int txmsg_cork; @@ -81,7 +79,10 @@ int txmsg_end_push; int txmsg_start_pop; int txmsg_pop; int txmsg_ingress; -int txmsg_skb; +int txmsg_redir_skb; +int txmsg_ktls_skb; +int txmsg_ktls_skb_drop; +int txmsg_ktls_skb_redir; int ktls; int peek_flag; @@ -89,15 +90,13 @@ static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, {"cgroup", required_argument, NULL, 'c' }, {"rate", required_argument, NULL, 'r' }, - {"verbose", no_argument, NULL, 'v' }, + 
{"verbose", optional_argument, NULL, 'v' }, {"iov_count", required_argument, NULL, 'i' }, {"length", required_argument, NULL, 'l' }, {"test", required_argument, NULL, 't' }, {"data_test", no_argument, NULL, 'd' }, {"txmsg", no_argument, &txmsg_pass, 1 }, - {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, {"txmsg_redir", no_argument, &txmsg_redir, 1 }, - {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, {"txmsg_drop", no_argument, &txmsg_drop, 1 }, {"txmsg_apply", required_argument, NULL, 'a'}, {"txmsg_cork", required_argument, NULL, 'k'}, @@ -108,12 +107,108 @@ static const struct option long_options[] = { {"txmsg_start_pop", required_argument, NULL, 'w'}, {"txmsg_pop", required_argument, NULL, 'x'}, {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, - {"txmsg_skb", no_argument, &txmsg_skb, 1 }, + {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, + {"whitelist", required_argument, NULL, 'n' }, + {"blacklist", required_argument, NULL, 'b' }, {0, 0, NULL, 0 } }; +struct test_env { + const char *type; + const char *subtest; + const char *prepend; + + int test_num; + int subtest_num; + + int succ_cnt; + int fail_cnt; + int fail_last; +}; + +struct test_env env; + +struct sockmap_options { + int verbose; + bool base; + bool sendpage; + bool data_test; + bool drop_expected; + int iov_count; + int iov_length; + int rate; + char *map; + char *whitelist; + char *blacklist; + char *prepend; +}; + +struct _test { + char *title; + void (*tester)(int cg_fd, struct sockmap_options *opt); +}; + +static void test_start(void) +{ + env.subtest_num++; +} + +static void test_fail(void) +{ + env.fail_cnt++; +} + +static void test_pass(void) +{ + env.succ_cnt++; +} + +static void test_reset(void) +{ + txmsg_start = txmsg_end = 0; + txmsg_start_pop = txmsg_pop = 0; + txmsg_start_push = txmsg_end_push = 0; + txmsg_pass = txmsg_drop = txmsg_redir = 0; + txmsg_apply = txmsg_cork = 0; + txmsg_ingress = 
txmsg_redir_skb = 0; + txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; +} + +static int test_start_subtest(const struct _test *t, struct sockmap_options *o) +{ + env.type = o->map; + env.subtest = t->title; + env.prepend = o->prepend; + env.test_num++; + env.subtest_num = 0; + env.fail_last = env.fail_cnt; + test_reset(); + return 0; +} + +static void test_end_subtest(void) +{ + int error = env.fail_cnt - env.fail_last; + int type = strcmp(env.type, BPF_SOCKMAP_FILENAME); + + if (!error) + test_pass(); + + fprintf(stdout, "#%2d/%2d %8s:%s:%s:%s\n", + env.test_num, env.subtest_num, + !type ? "sockmap" : "sockhash", + env.prepend ? : "", + env.subtest, error ? "FAIL" : "OK"); +} + +static void test_print_results(void) +{ + fprintf(stdout, "Pass: %d Fail: %d\n", + env.succ_cnt, env.fail_cnt); +} + static void usage(char *argv[]) { int i; @@ -296,7 +391,7 @@ static int sockmap_init_sockets(int verbose) return errno; } - if (verbose) { + if (verbose > 1) { printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", c1, s1, c2, s2); @@ -311,17 +406,6 @@ struct msg_stats { struct timespec end; }; -struct sockmap_options { - int verbose; - bool base; - bool sendpage; - bool data_test; - bool drop_expected; - int iov_count; - int iov_length; - int rate; -}; - static int msg_loop_sendpage(int fd, int iov_length, int cnt, struct msg_stats *s, struct sockmap_options *opt) @@ -345,14 +429,18 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendfile(fd, fp, NULL, iov_length); + int sent; + + errno = 0; + sent = sendfile(fd, fp, NULL, iov_length); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendpage loop error"); fclose(file); return sent; } else if (drop && sent >= 0) { - printf("sendpage loop error expected: %i\n", sent); + printf("sendpage loop error expected: %i errno %i\n", + 
sent, errno); fclose(file); return -EIO; } @@ -418,14 +506,41 @@ unwind_iov: static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz) { - int i, j, bytes_cnt = 0; + int i, j = 0, bytes_cnt = 0; unsigned char k = 0; for (i = 0; i < msg->msg_iovlen; i++) { unsigned char *d = msg->msg_iov[i].iov_base; - for (j = 0; - j < msg->msg_iov[i].iov_len && size; j++) { + /* Special case test for skb ingress + ktls */ + if (i == 0 && txmsg_ktls_skb) { + if (msg->msg_iov[i].iov_len < 4) + return -EIO; + if (txmsg_ktls_skb_redir) { + if (memcmp(&d[13], "PASS", 4) != 0) { + fprintf(stderr, + "detected redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]); + return -EIO; + } + d[13] = 0; + d[14] = 1; + d[15] = 2; + d[16] = 3; + j = 13; + } else if (txmsg_ktls_skb) { + if (memcmp(d, "PASS", 4) != 0) { + fprintf(stderr, + "detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]); + return -EIO; + } + d[0] = 0; + d[1] = 1; + d[2] = 2; + d[3] = 3; + } + } + + for (; j < msg->msg_iov[i].iov_len && size; j++) { if (d[j] != k++) { fprintf(stderr, "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n", @@ -464,13 +579,18 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, if (tx) { clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendmsg(fd, &msg, flags); + int sent; + + errno = 0; + sent = sendmsg(fd, &msg, flags); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendmsg loop error"); goto out_errno; } else if (drop && sent >= 0) { - printf("send loop error expected: %i\n", sent); + fprintf(stderr, + "sendmsg loop error expected: %i errno %i\n", + sent, errno); errno = -EIO; goto out_errno; } @@ -497,9 +617,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, * paths. 
*/ total_bytes = (float)iov_count * (float)iov_length * (float)cnt; - txmsg_pop_total = txmsg_pop; if (txmsg_apply) - txmsg_pop_total *= (total_bytes / txmsg_apply); + txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply); + else + txmsg_pop_total = txmsg_pop * cnt; total_bytes -= txmsg_pop_total; err = clock_gettime(CLOCK_MONOTONIC, &s->start); if (err < 0) @@ -633,14 +754,18 @@ static int sendmsg_test(struct sockmap_options *opt) rxpid = fork(); if (rxpid == 0) { - if (opt->drop_expected) - exit(0); + iov_buf -= (txmsg_pop - txmsg_start_pop + 1); + if (opt->drop_expected || txmsg_ktls_skb_drop) + _exit(0); + + if (!iov_buf) /* zero bytes sent case */ + _exit(0); if (opt->sendpage) iov_count = 1; err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false, opt); - if (opt->verbose) + if (opt->verbose > 1) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", iov_count, iov_buf, cnt, err); @@ -648,7 +773,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -678,7 +803,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -694,14 +819,14 @@ static int sendmsg_test(struct sockmap_options *opt) if (WIFEXITED(rx_status)) { err = WEXITSTATUS(rx_status); if (err) { - fprintf(stderr, "rx thread exited with err %d. ", err); + fprintf(stderr, "rx thread exited with err %d.\n", err); goto out; } } if (WIFEXITED(tx_status)) { err = WEXITSTATUS(tx_status); if (err) - fprintf(stderr, "tx thread exited with err %d. 
", err); + fprintf(stderr, "tx thread exited with err %d.\n", err); } out: return err; @@ -783,6 +908,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) } enum { + SELFTESTS, PING_PONG, SENDMSG, BASE, @@ -816,8 +942,28 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) return err; } + /* Attach programs to TLS sockmap */ + if (txmsg_ktls_skb) { + err = bpf_prog_attach(prog_fd[0], map_fd[8], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[8], err, strerror(errno)); + return err; + } + + err = bpf_prog_attach(prog_fd[2], map_fd[8], + BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (TLS sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + } + /* Attach to cgroups */ - err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0); + err = bpf_prog_attach(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS, 0); if (err) { fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n", err, strerror(errno)); @@ -833,19 +979,14 @@ run: /* Attach txmsg program to sockmap */ if (txmsg_pass) - tx_prog_fd = prog_fd[3]; - else if (txmsg_noisy) tx_prog_fd = prog_fd[4]; else if (txmsg_redir) tx_prog_fd = prog_fd[5]; - else if (txmsg_redir_noisy) - tx_prog_fd = prog_fd[6]; - else if (txmsg_drop) - tx_prog_fd = prog_fd[9]; - /* apply and cork must be last */ else if (txmsg_apply) - tx_prog_fd = prog_fd[7]; + tx_prog_fd = prog_fd[6]; else if (txmsg_cork) + tx_prog_fd = prog_fd[7]; + else if (txmsg_drop) tx_prog_fd = prog_fd[8]; else tx_prog_fd = 0; @@ -870,7 +1011,7 @@ run: goto out; } - if (txmsg_redir || txmsg_redir_noisy) + if (txmsg_redir) redir_fd = c2; else redir_fd = c1; @@ -1018,7 +1159,35 @@ run: } } - if (txmsg_skb) { + if (txmsg_ktls_skb) { + int ingress = BPF_F_INGRESS; + + i = 0; + err = bpf_map_update_elem(map_fd[8], &i, &p2, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: 
bpf_map_update_elem (c1 sockmap): %d (%s)\n", + err, strerror(errno)); + } + + if (txmsg_ktls_skb_redir) { + i = 1; + err = bpf_map_update_elem(map_fd[7], + &i, &ingress, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", + err, strerror(errno)); + } + } + + if (txmsg_ktls_skb_drop) { + i = 1; + err = bpf_map_update_elem(map_fd[7], &i, &i, BPF_ANY); + } + } + + if (txmsg_redir_skb) { int skb_fd = (test == SENDMSG || test == SENDPAGE) ? p2 : p1; int ingress = BPF_F_INGRESS; @@ -1033,8 +1202,7 @@ run: } i = 3; - err = bpf_map_update_elem(map_fd[0], - &i, &skb_fd, BPF_ANY); + err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY); if (err) { fprintf(stderr, "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", @@ -1068,9 +1236,12 @@ run: fprintf(stderr, "unknown test\n"); out: /* Detatch and zero all the maps */ - bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS); + bpf_prog_detach2(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS); bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER); bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT); + bpf_prog_detach2(prog_fd[0], map_fd[8], BPF_SK_SKB_STREAM_PARSER); + bpf_prog_detach2(prog_fd[2], map_fd[8], BPF_SK_SKB_STREAM_VERDICT); + if (tx_prog_fd >= 0) bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT); @@ -1112,12 +1283,8 @@ static void test_options(char *options) if (txmsg_pass) strncat(options, "pass,", OPTSTRING); - if (txmsg_noisy) - strncat(options, "pass_noisy,", OPTSTRING); if (txmsg_redir) strncat(options, "redir,", OPTSTRING); - if (txmsg_redir_noisy) - strncat(options, "redir_noisy,", OPTSTRING); if (txmsg_drop) strncat(options, "drop,", OPTSTRING); if (txmsg_apply) { @@ -1143,8 +1310,10 @@ static void test_options(char *options) } if (txmsg_ingress) strncat(options, "ingress,", OPTSTRING); - if (txmsg_skb) - strncat(options, "skb,", OPTSTRING); + if (txmsg_redir_skb) + strncat(options, "redir_skb,", OPTSTRING); + if 
(txmsg_ktls_skb) + strncat(options, "ktls_skb,", OPTSTRING); if (ktls) strncat(options, "ktls,", OPTSTRING); if (peek_flag) @@ -1168,416 +1337,317 @@ static int __test_exec(int cgrp, int test, struct sockmap_options *opt) test_options(options); - fprintf(stdout, - "[TEST %i]: (%i, %i, %i, %s, %s): ", - test_cnt, opt->rate, opt->iov_count, opt->iov_length, - test_to_str(test), options); - fflush(stdout); + if (opt->verbose) { + fprintf(stdout, + " [TEST %i]: (%i, %i, %i, %s, %s): ", + test_cnt, opt->rate, opt->iov_count, opt->iov_length, + test_to_str(test), options); + fflush(stdout); + } err = run_options(opt, cgrp, test); - fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED"); + if (opt->verbose) + fprintf(stdout, " %s\n", !err ? "PASS" : "FAILED"); test_cnt++; !err ? passed++ : failed++; free(options); return err; } -static int test_exec(int cgrp, struct sockmap_options *opt) -{ - int err = __test_exec(cgrp, SENDMSG, opt); - - if (err) - goto out; - - err = __test_exec(cgrp, SENDPAGE, opt); -out: - return err; -} - -static int test_loop(int cgrp) -{ - struct sockmap_options opt; - - int err, i, l, r; - - opt.verbose = 0; - opt.base = false; - opt.sendpage = false; - opt.data_test = false; - opt.drop_expected = false; - opt.iov_count = 0; - opt.iov_length = 0; - opt.rate = 0; - - r = 1; - for (i = 1; i < 100; i += 33) { - for (l = 1; l < 100; l += 33) { - opt.rate = r; - opt.iov_count = i; - opt.iov_length = l; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - } - sched_yield(); -out: - return err; -} - -static int test_txmsg(int cgrp) +static void test_exec(int cgrp, struct sockmap_options *opt) { + int type = strcmp(opt->map, BPF_SOCKMAP_FILENAME); int err; - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_ingress = txmsg_skb = 0; - - txmsg_pass = 1; - err = test_loop(cgrp); - txmsg_pass = 0; - if (err) - goto out; - - txmsg_redir = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - if (err) - goto 
out; - - txmsg_drop = 1; - err = test_loop(cgrp); - txmsg_drop = 0; - if (err) - goto out; - - txmsg_redir = 1; - txmsg_ingress = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - txmsg_ingress = 0; - if (err) - goto out; -out: - txmsg_pass = 0; - txmsg_redir = 0; - txmsg_drop = 0; - return err; + if (type == 0) { + test_start(); + err = __test_exec(cgrp, SENDMSG, opt); + if (err) + test_fail(); + } else { + test_start(); + err = __test_exec(cgrp, SENDPAGE, opt); + if (err) + test_fail(); + } } -static int test_send(struct sockmap_options *opt, int cgrp) +static void test_send_one(struct sockmap_options *opt, int cgrp) { - int err; - opt->iov_length = 1; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1; opt->iov_count = 1024; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1024; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); - opt->iov_length = 1; +} + +static void test_send_many(struct sockmap_options *opt, int cgrp) +{ + opt->iov_length = 3; opt->iov_count = 1; opt->rate = 512; - err = test_exec(cgrp, opt); - if (err) - goto out; - - opt->iov_length = 256; - opt->iov_count = 1024; - opt->rate = 2; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->rate = 100; opt->iov_count = 1; opt->iov_length = 5; - err = test_exec(cgrp, opt); - if (err) - goto out; -out: - sched_yield(); - return err; + test_exec(cgrp, opt); } -static int test_mixed(int cgrp) +static void test_send_large(struct sockmap_options *opt, int cgrp) { - struct sockmap_options opt = {0}; - int err; + opt->iov_length = 256; + opt->iov_count = 1024; + opt->rate = 2; + test_exec(cgrp, opt); +} - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_start = txmsg_end = 0; - txmsg_start_push = txmsg_end_push = 0; - 
txmsg_start_pop = txmsg_pop = 0; +static void test_send(struct sockmap_options *opt, int cgrp) +{ + test_send_one(opt, cgrp); + test_send_many(opt, cgrp); + test_send_large(opt, cgrp); + sched_yield(); +} +static void test_txmsg_pass(int cgrp, struct sockmap_options *opt) +{ /* Test small and large iov_count values with pass/redir/apply/cork */ txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_redir(int cgrp, struct sockmap_options *opt) +{ + txmsg_redir = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_drop(int cgrp, struct sockmap_options *opt) +{ + txmsg_drop = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_skb(int cgrp, struct sockmap_options *opt) +{ + bool data = opt->data_test; + int k = ktls; - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + opt->data_test = true; + ktls = 1; + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 0; + txmsg_ktls_skb = 1; txmsg_pass = 1; - txmsg_redir = 0; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 1; - 
txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + /* Using data verification so ensure iov layout is + * expected from test receiver side. e.g. has enough + * bytes to write test code. + */ + opt->iov_length = 100; + opt->iov_count = 1; + opt->rate = 1; + test_exec(cgrp, opt); - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_ktls_skb_drop = 1; + test_exec(cgrp, opt); - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_ktls_skb_drop = 0; + txmsg_ktls_skb_redir = 1; + test_exec(cgrp, opt); - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + opt->data_test = data; + ktls = k; +} + + +/* Test cork with hung data. This tests poor usage patterns where + * cork can leave data on the ring if user program is buggy and + * doesn't flush them somehow. They do take some time however + * because they wait for a timeout. Test pass, redir and cork with + * apply logic. Use cork size of 4097 with send_large to avoid + * aligning cork size with send size. 
+ */ +static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_cork = 4097; + txmsg_apply = 4097; + test_send_large(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; - txmsg_apply = 1024; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_apply = 0; + txmsg_cork = 4097; + test_send_large(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; -out: - return err; + txmsg_apply = 4097; + txmsg_cork = 4097; + test_send_large(opt, cgrp); } -static int test_start_end(int cgrp) +static void test_txmsg_pull(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {0}; - int err, i; + /* Test basic start/end */ + txmsg_start = 1; + txmsg_end = 2; + test_send(opt, cgrp); + + /* Test >4k pull */ + txmsg_start = 4096; + txmsg_end = 9182; + test_send_large(opt, cgrp); - /* Test basic start/end with lots of iov_count and iov_lengths */ + /* Test pull + redirect */ + txmsg_redir = 0; txmsg_start = 1; txmsg_end = 2; - txmsg_start_push = 1; - txmsg_end_push = 2; - txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; + test_send(opt, cgrp); - /* Cut a byte of pushed data but leave reamining in place */ + /* Test pull + cork */ + txmsg_redir = 0; + txmsg_cork = 512; txmsg_start = 1; txmsg_end = 2; - txmsg_start_push = 1; - txmsg_end_push = 3; + test_send_many(opt, cgrp); + + /* Test pull + cork + redirect */ + txmsg_redir = 1; + txmsg_cork = 512; + txmsg_start = 1; + txmsg_end = 2; + test_send_many(opt, cgrp); +} + +static void test_txmsg_pop(int cgrp, struct sockmap_options *opt) +{ + /* Test basic pop */ txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; + txmsg_pop = 2; + test_send_many(opt, cgrp); - /* Test start/end with cork */ - opt.rate = 16; - opt.iov_count = 1; - opt.iov_length = 100; - txmsg_cork = 1600; - - 
txmsg_start_pop = 0; - txmsg_pop = 0; - - for (i = 99; i <= 1600; i += 500) { - txmsg_start = 0; - txmsg_end = i; - txmsg_start_push = 0; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } + /* Test pop with >4k */ + txmsg_start_pop = 4096; + txmsg_pop = 4096; + test_send_large(opt, cgrp); - /* Test pop data in middle of cork */ - for (i = 99; i <= 1600; i += 500) { - txmsg_start_pop = 10; - txmsg_pop = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end with cork but pull data in middle */ - for (i = 199; i <= 1600; i += 500) { - txmsg_start = 100; - txmsg_end = i; - txmsg_start_push = 100; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } + /* Test pop + redirect */ + txmsg_redir = 1; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - /* Test start/end with cork pulling last sg entry */ - txmsg_start = 1500; - txmsg_end = 1600; - txmsg_start_push = 1500; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test pop + cork */ + txmsg_redir = 0; + txmsg_cork = 512; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - /* Test pop with cork pulling last sg entry */ - txmsg_start_pop = 1500; - txmsg_pop = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end pull of single byte in last page */ - txmsg_start = 1111; - txmsg_end = 1112; - txmsg_start_push = 1111; - txmsg_end_push = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test pop + redirect + cork */ + txmsg_redir = 1; + txmsg_cork = 4; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); +} - /* Test pop of single byte in last page */ - txmsg_start_pop = 1111; - txmsg_pop = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_push(int cgrp, struct sockmap_options *opt) +{ + /* Test 
basic push */ + txmsg_start_push = 1; + txmsg_end_push = 1; + test_send(opt, cgrp); - /* Test start/end with end < start */ - txmsg_start = 1111; - txmsg_end = 0; - txmsg_start_push = 1111; - txmsg_end_push = 0; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test push 4kB >4k */ + txmsg_start_push = 4096; + txmsg_end_push = 4096; + test_send_large(opt, cgrp); - /* Test start/end with end > data */ - txmsg_start = 0; - txmsg_end = 1601; - txmsg_start_push = 0; - txmsg_end_push = 1601; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test push + redirect */ + txmsg_redir = 1; + txmsg_start_push = 1; + txmsg_end_push = 2; + test_send_many(opt, cgrp); - /* Test start/end with start > data */ - txmsg_start = 1601; - txmsg_end = 1600; - txmsg_start_push = 1601; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test push + cork */ + txmsg_redir = 0; + txmsg_cork = 512; + txmsg_start_push = 1; + txmsg_end_push = 2; + test_send_many(opt, cgrp); +} - /* Test pop with start > data */ - txmsg_start_pop = 1601; - txmsg_pop = 1; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt) +{ + txmsg_start_push = 1; + txmsg_end_push = 10; + txmsg_start_pop = 5; + txmsg_pop = 4; + test_send_large(opt, cgrp); +} - /* Test pop with pop range > data */ - txmsg_start_pop = 1599; - txmsg_pop = 10; - err = test_exec(cgrp, &opt); -out: - txmsg_start = 0; - txmsg_end = 0; - sched_yield(); - return err; +static void test_txmsg_apply(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(opt, cgrp); + + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(opt, cgrp); + + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(opt, cgrp); + + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1024; + txmsg_cork = 0; + 
test_send_large(opt, cgrp); +} + +static void test_txmsg_cork(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 0; + txmsg_cork = 1; + test_send(opt, cgrp); + + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 1; + test_send(opt, cgrp); } char *map_names[] = { @@ -1589,11 +1659,13 @@ char *map_names[] = { "sock_bytes", "sock_redir_flags", "sock_skb_opts", + "tls_sock_map", }; int prog_attach_type[] = { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_SOCK_OPS, BPF_SK_MSG_VERDICT, BPF_SK_MSG_VERDICT, @@ -1607,6 +1679,7 @@ int prog_attach_type[] = { int prog_type[] = { BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_SK_MSG, @@ -1662,73 +1735,117 @@ static int populate_progs(char *bpf_file) return 0; } -static int __test_suite(int cg_fd, char *bpf_file) +struct _test test[] = { + {"txmsg test passthrough", test_txmsg_pass}, + {"txmsg test redirect", test_txmsg_redir}, + {"txmsg test drop", test_txmsg_drop}, + {"txmsg test ingress redirect", test_txmsg_ingress_redir}, + {"txmsg test skb", test_txmsg_skb}, + {"txmsg test apply", test_txmsg_apply}, + {"txmsg test cork", test_txmsg_cork}, + {"txmsg test hanging corks", test_txmsg_cork_hangs}, + {"txmsg test push_data", test_txmsg_push}, + {"txmsg test pull-data", test_txmsg_pull}, + {"txmsg test pop-data", test_txmsg_pop}, + {"txmsg test push/pop data", test_txmsg_push_pop}, +}; + +static int check_whitelist(struct _test *t, struct sockmap_options *opt) { - int err, cleanup = cg_fd; + char *entry, *ptr; + + if (!opt->whitelist) + return 0; + ptr = strdup(opt->whitelist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} - 
err = populate_progs(bpf_file); +static int check_blacklist(struct _test *t, struct sockmap_options *opt) +{ + char *entry, *ptr; + + if (!opt->blacklist) + return -EINVAL; + ptr = strdup(opt->blacklist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} + +static int __test_selftests(int cg_fd, struct sockmap_options *opt) +{ + int i, err; + + err = populate_progs(opt->map); if (err < 0) { fprintf(stderr, "ERROR: (%i) load bpf failed\n", err); return err; } - if (cg_fd < 0) { - if (setup_cgroup_environment()) { - fprintf(stderr, "ERROR: cgroup env failed\n"); - return -EINVAL; - } + /* Tests basic commands and APIs */ + for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) { + struct _test t = test[i]; - cg_fd = create_and_get_cgroup(CG_PATH); - if (cg_fd < 0) { - fprintf(stderr, - "ERROR: (%i) open cg path failed: %s\n", - cg_fd, optarg); - return cg_fd; - } + if (check_whitelist(&t, opt) != 0) + continue; + if (check_blacklist(&t, opt) == 0) + continue; - if (join_cgroup(CG_PATH)) { - fprintf(stderr, "ERROR: failed to join cgroup\n"); - return -EINVAL; - } + test_start_subtest(&t, opt); + t.tester(cg_fd, opt); + test_end_subtest(); } - /* Tests basic commands and APIs with range of iov values */ - txmsg_start = txmsg_end = txmsg_start_push = txmsg_end_push = 0; - err = test_txmsg(cg_fd); - if (err) - goto out; + return err; +} - /* Tests interesting combinations of APIs used together */ - err = test_mixed(cg_fd); - if (err) - goto out; +static void test_selftests_sockmap(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKMAP_FILENAME; + __test_selftests(cg_fd, opt); +} - /* Tests pull_data API using start/end API */ - err = test_start_end(cg_fd); - if (err) - goto out; +static void test_selftests_sockhash(int cg_fd, struct 
sockmap_options *opt) +{ + opt->map = BPF_SOCKHASH_FILENAME; + __test_selftests(cg_fd, opt); +} -out: - printf("Summary: %i PASSED %i FAILED\n", passed, failed); - if (cleanup < 0) { - cleanup_cgroup_environment(); - close(cg_fd); - } - return err; +static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKHASH_FILENAME; + opt->prepend = "ktls"; + ktls = 1; + __test_selftests(cg_fd, opt); + ktls = 0; } -static int test_suite(int cg_fd) +static int test_selftest(int cg_fd, struct sockmap_options *opt) { - int err; - err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME); - if (err) - goto out; - err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME); -out: - if (cg_fd > -1) - close(cg_fd); - return err; + test_selftests_sockmap(cg_fd, opt); + test_selftests_sockhash(cg_fd, opt); + test_selftests_ktls(cg_fd, opt); + test_print_results(); + return 0; } int main(int argc, char **argv) @@ -1737,12 +1854,10 @@ int main(int argc, char **argv) struct sockmap_options options = {0}; int opt, longindex, err, cg_fd = 0; char *bpf_file = BPF_SOCKMAP_FILENAME; - int test = PING_PONG; - - if (argc < 2) - return test_suite(-1); + int test = SELFTESTS; + bool cg_created = 0; - while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:", + while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:b:", long_options, &longindex)) != -1) { switch (opt) { case 's': @@ -1783,6 +1898,8 @@ int main(int argc, char **argv) break; case 'v': options.verbose = 1; + if (optarg) + options.verbose = atoi(optarg); break; case 'i': iov_count = atoi(optarg); @@ -1809,6 +1926,15 @@ int main(int argc, char **argv) return -1; } break; + case 'n': + options.whitelist = strdup(optarg); + if (!options.whitelist) + return -ENOMEM; + break; + case 'b': + options.blacklist = strdup(optarg); + if (!options.blacklist) + return -ENOMEM; case 0: break; case 'h': @@ -1818,13 +1944,30 @@ int main(int argc, char **argv) } } - if (argc <= 3 && cg_fd) - return test_suite(cg_fd); - if (!cg_fd) 
{ - fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n", - argv[0]); - return -1; + if (setup_cgroup_environment()) { + fprintf(stderr, "ERROR: cgroup env failed\n"); + return -EINVAL; + } + + cg_fd = create_and_get_cgroup(CG_PATH); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, strerror(errno)); + return cg_fd; + } + + if (join_cgroup(CG_PATH)) { + fprintf(stderr, "ERROR: failed to join cgroup\n"); + return -EINVAL; + } + cg_created = 1; + } + + if (test == SELFTESTS) { + err = test_selftest(cg_fd, &options); + goto out; } err = populate_progs(bpf_file); @@ -1843,6 +1986,13 @@ int main(int argc, char **argv) options.rate = rate; err = run_options(&options, cg_fd, test); +out: + if (options.whitelist) + free(options.whitelist); + if (options.blacklist) + free(options.blacklist); + if (cg_created) + cleanup_cgroup_environment(); close(cg_fd); return err; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 87eaa49609a0..78a6bae56ea6 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -50,7 +50,7 @@ #define MAX_INSNS BPF_MAXINSNS #define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 -#define MAX_NR_MAPS 19 +#define MAX_NR_MAPS 20 #define MAX_TEST_RUNS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 @@ -86,6 +86,7 @@ struct bpf_test { int fixup_map_array_small[MAX_FIXUPS]; int fixup_sk_storage_map[MAX_FIXUPS]; int fixup_map_event_output[MAX_FIXUPS]; + int fixup_map_reuseport_array[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; uint32_t insn_processed; @@ -637,6 +638,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, int *fixup_map_array_small = test->fixup_map_array_small; int *fixup_sk_storage_map = test->fixup_sk_storage_map; int *fixup_map_event_output = test->fixup_map_event_output; + int *fixup_map_reuseport_array = 
test->fixup_map_reuseport_array; if (test->fill_helper) { test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); @@ -806,12 +808,28 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, fixup_map_event_output++; } while (*fixup_map_event_output); } + if (*fixup_map_reuseport_array) { + map_fds[19] = __create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + sizeof(u32), sizeof(u64), 1, 0); + do { + prog[*fixup_map_reuseport_array].imm = map_fds[19]; + fixup_map_reuseport_array++; + } while (*fixup_map_reuseport_array); + } } +struct libcap { + struct __user_cap_header_struct hdr; + struct __user_cap_data_struct data[2]; +}; + static int set_admin(bool admin) { cap_t caps; - const cap_value_t cap_val = CAP_SYS_ADMIN; + /* need CAP_BPF, CAP_NET_ADMIN, CAP_PERFMON to load progs */ + const cap_value_t cap_net_admin = CAP_NET_ADMIN; + const cap_value_t cap_sys_admin = CAP_SYS_ADMIN; + struct libcap *cap; int ret = -1; caps = cap_get_proc(); @@ -819,11 +837,26 @@ static int set_admin(bool admin) perror("cap_get_proc"); return -1; } - if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val, + cap = (struct libcap *)caps; + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) { + perror("cap_set_flag clear admin"); + goto out; + } + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin, admin ? 
CAP_SET : CAP_CLEAR)) { - perror("cap_set_flag"); + perror("cap_set_flag set_or_clear net"); goto out; } + /* libcap is likely old and simply ignores CAP_BPF and CAP_PERFMON, + * so update effective bits manually + */ + if (admin) { + cap->data[1].effective |= 1 << (38 /* CAP_PERFMON */ - 32); + cap->data[1].effective |= 1 << (39 /* CAP_BPF */ - 32); + } else { + cap->data[1].effective &= ~(1 << (38 - 32)); + cap->data[1].effective &= ~(1 << (39 - 32)); + } if (cap_set_proc(caps)) { perror("cap_set_proc"); goto out; @@ -943,7 +976,12 @@ static void do_test_single(struct bpf_test *test, bool unpriv, attr.insns = prog; attr.insns_cnt = prog_len; attr.license = "GPL"; - attr.log_level = verbose || expected_ret == VERBOSE_ACCEPT ? 1 : 4; + if (verbose) + attr.log_level = 1; + else if (expected_ret == VERBOSE_ACCEPT) + attr.log_level = 2; + else + attr.log_level = 4; attr.prog_flags = pflags; fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog)); @@ -1052,9 +1090,11 @@ fail_log: static bool is_admin(void) { + cap_flag_value_t net_priv = CAP_CLEAR; + bool perfmon_priv = false; + bool bpf_priv = false; + struct libcap *cap; cap_t caps; - cap_flag_value_t sysadmin = CAP_CLEAR; - const cap_value_t cap_val = CAP_SYS_ADMIN; #ifdef CAP_IS_SUPPORTED if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) { @@ -1067,11 +1107,14 @@ static bool is_admin(void) perror("cap_get_proc"); return false; } - if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin)) - perror("cap_get_flag"); + cap = (struct libcap *)caps; + bpf_priv = cap->data[1].effective & (1 << (39/* CAP_BPF */ - 32)); + perfmon_priv = cap->data[1].effective & (1 << (38/* CAP_PERFMON */ - 32)); + if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv)) + perror("cap_get_flag NET"); if (cap_free(caps)) perror("cap_free"); - return (sysadmin == CAP_SET); + return bpf_priv && perfmon_priv && net_priv == CAP_SET; } static void get_unpriv_disabled() diff --git a/tools/testing/selftests/bpf/testing_helpers.c 
b/tools/testing/selftests/bpf/testing_helpers.c new file mode 100644 index 000000000000..0af6337a8962 --- /dev/null +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (C) 2020 Facebook, Inc. */ +#include <stdlib.h> +#include <errno.h> +#include "testing_helpers.h" + +int parse_num_list(const char *s, bool **num_set, int *num_set_len) +{ + int i, set_len = 0, new_len, num, start = 0, end = -1; + bool *set = NULL, *tmp, parsing_end = false; + char *next; + + while (s[0]) { + errno = 0; + num = strtol(s, &next, 10); + if (errno) + return -errno; + + if (parsing_end) + end = num; + else + start = num; + + if (!parsing_end && *next == '-') { + s = next + 1; + parsing_end = true; + continue; + } else if (*next == ',') { + parsing_end = false; + s = next + 1; + end = num; + } else if (*next == '\0') { + parsing_end = false; + s = next; + end = num; + } else { + return -EINVAL; + } + + if (start > end) + return -EINVAL; + + if (end + 1 > set_len) { + new_len = end + 1; + tmp = realloc(set, new_len); + if (!tmp) { + free(set); + return -ENOMEM; + } + for (i = set_len; i < start; i++) + tmp[i] = false; + set = tmp; + set_len = new_len; + } + for (i = start; i <= end; i++) + set[i] = true; + } + + if (!set) + return -EINVAL; + + *num_set = set; + *num_set_len = set_len; + + return 0; +} diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h new file mode 100644 index 000000000000..923b51762759 --- /dev/null +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (C) 2020 Facebook, Inc. 
*/ +#include <stdbool.h> + +int parse_num_list(const char *s, bool **set, int *set_len); diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c index e0fad1548737..d781bc86e100 100644 --- a/tools/testing/selftests/bpf/verifier/and.c +++ b/tools/testing/selftests/bpf/verifier/and.c @@ -15,7 +15,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -44,7 +44,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index f3c33e128709..1c4b1939f5a8 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -117,7 +117,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -137,7 +137,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R0 unbounded memory access, make sure to bounds check any such access", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index a253a064e6e0..4d6645f2874c 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -20,7 +20,7 @@ BPF_EXIT_INSN(), }, 
.fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, }, { @@ -146,7 +146,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT }, { @@ -238,7 +238,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* r1 = [0x00, 0xff] */ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), @@ -253,10 +253,6 @@ * [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff] */ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), - /* r1 = 0 or - * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] - */ - BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), /* error on OOB pointer computation */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* exit */ @@ -265,8 +261,10 @@ }, .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ - .errstr = "value 72057594021150720 makes map_value pointer be out of bounds", - .result = REJECT + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", + .result_unpriv = REJECT, + .errstr = "value -4294967168 makes map_value pointer be out of bounds", + .result = REJECT, }, { "bounds check after truncation of boundary-crossing range (2)", @@ -276,7 +274,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* r1 = [0x00, 0xff] */ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), @@ -293,10 +291,6 @@ * 
[0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff] */ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), - /* r1 = 0 or - * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] - */ - BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), /* error on OOB pointer computation */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* exit */ @@ -305,8 +299,10 @@ }, .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ - .errstr = "value 72057594021150720 makes map_value pointer be out of bounds", - .result = REJECT + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", + .result_unpriv = REJECT, + .errstr = "value -4294967168 makes map_value pointer be out of bounds", + .result = REJECT, }, { "bounds check after wrapping 32-bit addition", @@ -358,7 +354,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT }, { @@ -539,3 +535,25 @@ }, .result = ACCEPT }, +{ + "assigning 32bit bounds to 64bit for wA = 0, wB = wA", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_MOV32_IMM(BPF_REG_9, 0), + BPF_MOV32_REG(BPF_REG_2, BPF_REG_9), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_8, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, +}, diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 2d752c4f8d9d..94258c6b5235 100644 --- 
a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -105,7 +105,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_hash_8b = { 16 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "calls: overlapping caller/callee", @@ -315,7 +315,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = POINTER_VALUE, @@ -346,7 +346,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ -1064,7 +1064,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .errstr = "R0 !read_ok", .result = REJECT, @@ -1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), 
}, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/const_or.c b/tools/testing/selftests/bpf/verifier/const_or.c index 84446dfc7c1d..6c214c58e8d4 100644 --- a/tools/testing/selftests/bpf/verifier/const_or.c +++ b/tools/testing/selftests/bpf/verifier/const_or.c @@ -6,7 +6,7 @@ BPF_MOV64_IMM(BPF_REG_2, 34), BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 13), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -20,7 +20,7 @@ BPF_MOV64_IMM(BPF_REG_2, 34), BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 24), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid stack type R1 off=-48 access_size=58", @@ -36,7 +36,7 @@ BPF_MOV64_IMM(BPF_REG_4, 13), BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -51,7 +51,7 @@ BPF_MOV64_IMM(BPF_REG_4, 24), BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid stack type R1 off=-48 access_size=58", diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 50a8a63be4ac..5cf361d8eb1c 100644 --- 
a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index b9fb28e8e224..988f46a1a4c7 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -68,7 +68,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array 
range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 7", @@ -220,7 +220,7 @@ }, .fixup_map_array_small = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 19", diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c index 130553e19eca..99f8f582c02b 100644 --- a/tools/testing/selftests/bpf/verifier/event_output.c +++ b/tools/testing/selftests/bpf/verifier/event_output.c @@ -92,3 +92,27 @@ .result = ACCEPT, .retval = 1, }, +{ + "perfevent for cgroup dev", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sysctl", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sockopt", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 67ab12410050..87c4e7900083 100644 --- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -36,7 +36,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + 
BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid indirect read from stack off -64+0 size 64", @@ -55,7 +55,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -84,7 +84,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -112,7 +112,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -132,7 +132,7 @@ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 3), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -152,7 +152,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -171,7 +171,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -190,7 +190,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -208,7 +208,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, 
BPF_REG_1, -128), BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -233,7 +233,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -259,7 +259,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -286,7 +286,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -313,12 +313,12 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 4 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -468,7 +468,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "R1 type=inv expected=fp", @@ -481,7 +481,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "R1 type=inv expected=fp", @@ -495,7 +495,7 @@ 
BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -513,7 +513,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -534,7 +534,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -554,7 +554,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -580,7 +580,7 @@ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_EXIT_INSN(), }, @@ -607,7 +607,7 @@ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 32), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 32), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_EXIT_INSN(), }, diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c index 7572e403ddb9..1c7882ddfa63 100644 --- a/tools/testing/selftests/bpf/verifier/helper_value_access.c +++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c @@ -10,7 +10,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), BPF_MOV64_IMM(BPF_REG_3, 0), - 
BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -29,7 +29,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -67,7 +67,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -87,7 +87,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -109,7 +109,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -129,7 +129,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -170,7 +170,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -191,7 +191,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -212,7 +212,7 
@@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -235,7 +235,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -256,7 +256,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -280,7 +280,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -300,7 +300,7 @@ sizeof(struct test_val) - offsetof(struct test_val, foo) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -322,7 +322,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -344,7 +344,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -368,7 +368,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + 
BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -390,7 +390,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -415,7 +415,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -433,7 +433,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -458,7 +458,7 @@ sizeof(struct test_val) - offsetof(struct test_val, foo) + 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -926,7 +926,7 @@ }, .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, - .errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R2 unbounded memory access, make sure to bounds check any such access", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 02151f8c940f..6dc8003ffc70 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -31,14 +31,14 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, .fixup_map_array_48b = { 1 }, .result = VERBOSE_ACCEPT, .errstr = - "26: 
(85) call bpf_probe_read#4\ + "26: (85) call bpf_probe_read_kernel#113\ last_idx 26 first_idx 20\ regs=4 stack=0 before 25\ regs=4 stack=0 before 24\ @@ -91,7 +91,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -99,7 +99,7 @@ .result = VERBOSE_ACCEPT, .flags = BPF_F_TEST_STATE_FREQ, .errstr = - "26: (85) call bpf_probe_read#4\ + "26: (85) call bpf_probe_read_kernel#113\ last_idx 26 first_idx 22\ regs=4 stack=0 before 25\ regs=4 stack=0 before 24\ diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c index da7a4b37cb98..fc4e301260f6 100644 --- a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c +++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c @@ -1,34 +1,4 @@ { - "prevent map lookup in sockmap", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_sockmap = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 15 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_SOCK_OPS, -}, -{ - "prevent map lookup in sockhash", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_sockhash = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_SOCK_OPS, -}, -{ "prevent map lookup in stack trace", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), diff --git 
a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 604b46151736..056e0273bf12 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -821,3 +821,36 @@ .result = REJECT, .errstr = "invalid mem access", }, +{ + "reference tracking: branch tracking valid pointer null comparison", + .insns = { + BPF_SK_LOOKUP(sk_lookup_tcp), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, +}, +{ + "reference tracking: branch tracking valid pointer value comparison", + .insns = { + BPF_SK_LOOKUP(sk_lookup_tcp), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 1234, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "Unreleased reference", + .result = REJECT, +}, diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index 9ed192e14f5f..b1aac2641498 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -222,7 +222,7 @@ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, state)), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, rx_queue_mapping)), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -516,3 +516,118 @@ .prog_type = BPF_PROG_TYPE_XDP, .result = ACCEPT, }, +{ + "bpf_map_lookup_elem(sockmap, &key)", 
+ .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = REJECT, + .errstr = "Unreleased reference id=2 alloc_insn=5", +}, +{ + "bpf_map_lookup_elem(sockhash, &key)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = REJECT, + .errstr = "Unreleased reference id=2 alloc_insn=5", +}, +{ + "bpf_map_lookup_elem(sockmap, &key); sk->type [fullsock field]; bpf_sk_release(sk)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = ACCEPT, +}, +{ + "bpf_map_lookup_elem(sockhash, &key); sk->type [fullsock field]; bpf_sk_release(sk)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + 
BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, reuseport_array, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_reuseport_array = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, sockmap, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, sockhash, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/value_or_null.c b/tools/testing/selftests/bpf/verifier/value_or_null.c index 860d4a71cd83..3ecb70a3d939 100644 --- a/tools/testing/selftests/bpf/verifier/value_or_null.c +++ b/tools/testing/selftests/bpf/verifier/value_or_null.c @@ -150,3 +150,22 @@ .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, +{ + "map lookup and null branch prediction", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), +
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_10, 10), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 4 }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index a53d99cebd9f..97ee658e1242 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -50,7 +50,7 @@ .fixup_map_array_48b = { 8 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R0 min value is outside of the array range", + .errstr_unpriv = "R0 min value is outside of the allowed memory range", .retval = 1, }, { @@ -325,7 +325,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result_unpriv = REJECT, .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", }, @@ -601,7 +601,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R1 max value is outside of the array range", + .errstr = "R1 max value is outside of the allowed memory range", .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -726,7 +726,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "map access: value_ptr -= known scalar, 2", diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh index 26044e397157..b32ba5fec59d 
100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh @@ -107,7 +107,7 @@ ingress_flow_action_drop_test() RET=0 - devlink_trap_drop_test ingress_flow_action_drop acl_drops $swp2 101 + devlink_trap_drop_test ingress_flow_action_drop $swp2 101 log_test "ingress_flow_action_drop" @@ -132,7 +132,7 @@ egress_flow_action_drop_test() RET=0 - devlink_trap_drop_test egress_flow_action_drop acl_drops $swp2 102 + devlink_trap_drop_test egress_flow_action_drop $swp2 102 log_test "egress_flow_action_drop" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh new file mode 100755 index 000000000000..a37273473c1b --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh @@ -0,0 +1,688 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap control trap functionality over mlxsw. Each registered +# control packet trap is tested to make sure it is triggered under the right +# conditions. 
+# +# +---------------------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/24 | +# | | 2001:db8:1::1/64 | +# | | | +# | | default via 192.0.2.2 | +# | | default via 2001:db8:1::2 | +# +----|----------------------------+ +# | +# +----|----------------------------------------------------------------------+ +# | SW | | +# | + $rp1 | +# | 192.0.2.2/24 | +# | 2001:db8:1::2/64 | +# | | +# | 2001:db8:2::2/64 | +# | 198.51.100.2/24 | +# | + $rp2 | +# | | | +# +----|----------------------------------------------------------------------+ +# | +# +----|----------------------------+ +# | | default via 198.51.100.2 | +# | | default via 2001:db8:2::2 | +# | | | +# | | 2001:db8:2::1/64 | +# | | 198.51.100.1/24 | +# | + $h2 | +# | H2 (vrf) | +# +---------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + stp_test + lacp_test + lldp_test + igmp_query_test + igmp_v1_report_test + igmp_v2_report_test + igmp_v3_report_test + igmp_v2_leave_test + mld_query_test + mld_v1_report_test + mld_v2_report_test + mld_v1_done_test + ipv4_dhcp_test + ipv6_dhcp_test + arp_request_test + arp_response_test + ipv6_neigh_solicit_test + ipv6_neigh_advert_test + ipv4_bfd_test + ipv6_bfd_test + ipv4_ospf_test + ipv6_ospf_test + ipv4_bgp_test + ipv6_bgp_test + ipv4_vrrp_test + ipv6_vrrp_test + ipv4_pim_test + ipv6_pim_test + uc_loopback_test + local_route_test + external_route_test + ipv6_uc_dip_link_local_scope_test + ipv4_router_alert_test + ipv6_router_alert_test + ipv6_dip_all_nodes_test + ipv6_dip_all_routers_test + ipv6_router_solicit_test + ipv6_router_advert_test + ipv6_redirect_test + ptp_event_test + ptp_general_test + flow_action_sample_test + flow_action_trap_test +" +NUM_NETIFS=4 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64 + + ip -4 route add default vrf v$h1 nexthop via 192.0.2.2 + ip -6 route add default vrf v$h1 nexthop via 2001:db8:1::2 +} + +h1_destroy() 
+{ + ip -6 route del default vrf v$h1 nexthop via 2001:db8:1::2 + ip -4 route del default vrf v$h1 nexthop via 192.0.2.2 + + simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64 +} + +h2_create() +{ + simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64 + + ip -4 route add default vrf v$h2 nexthop via 198.51.100.2 + ip -6 route add default vrf v$h2 nexthop via 2001:db8:2::2 +} + +h2_destroy() +{ + ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2 + ip -4 route del default vrf v$h2 nexthop via 198.51.100.2 + + simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64 +} + +router_create() +{ + ip link set dev $rp1 up + ip link set dev $rp2 up + + __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 + __addr_add_del $rp2 add 198.51.100.2/24 2001:db8:2::2/64 +} + +router_destroy() +{ + __addr_add_del $rp2 del 198.51.100.2/24 2001:db8:2::2/64 + __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64 + + ip link set dev $rp2 down + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + router_create +} + +cleanup() +{ + pre_cleanup + + router_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +stp_test() +{ + devlink_trap_stats_test "STP" "stp" $MZ $h1 -c 1 -t bpdu -q +} + +lacp_payload_get() +{ + local source_mac=$1; shift + local p + + p=$(: + )"01:80:C2:00:00:02:"$( : ETH daddr + )"$source_mac:"$( : ETH saddr + )"88:09:"$( : ETH type + ) + echo $p +} + +lacp_test() +{ + local h1mac=$(mac_get $h1) + + devlink_trap_stats_test "LACP" "lacp" $MZ $h1 -c 1 \ + $(lacp_payload_get $h1mac) -p 100 -q +} + +lldp_payload_get() +{ + local source_mac=$1; shift + local p + + p=$(: + )"01:80:C2:00:00:0E:"$( : ETH daddr + )"$source_mac:"$( : ETH saddr + )"88:CC:"$( : ETH type + ) + echo $p +} + +lldp_test() +{ + local h1mac=$(mac_get $h1) + + devlink_trap_stats_test "LLDP" "lldp" $MZ $h1 -c 1 \ + $(lldp_payload_get $h1mac) 
-p 100 -q +} + +igmp_query_test() +{ + # IGMP (IP Protocol 2) Membership Query (Type 0x11) + devlink_trap_stats_test "IGMP Membership Query" "igmp_query" \ + $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=11 -p 100 -q +} + +igmp_v1_report_test() +{ + # IGMP (IP Protocol 2) Version 1 Membership Report (Type 0x12) + devlink_trap_stats_test "IGMP Version 1 Membership Report" \ + "igmp_v1_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=12 -p 100 -q +} + +igmp_v2_report_test() +{ + # IGMP (IP Protocol 2) Version 2 Membership Report (Type 0x16) + devlink_trap_stats_test "IGMP Version 2 Membership Report" \ + "igmp_v2_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=16 -p 100 -q +} + +igmp_v3_report_test() +{ + # IGMP (IP Protocol 2) Version 3 Membership Report (Type 0x22) + devlink_trap_stats_test "IGMP Version 3 Membership Report" \ + "igmp_v3_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=22 -p 100 -q +} + +igmp_v2_leave_test() +{ + # IGMP (IP Protocol 2) Version 2 Leave Group (Type 0x17) + devlink_trap_stats_test "IGMP Version 2 Leave Group" \ + "igmp_v2_leave" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:02 \ + -A 192.0.2.1 -B 224.0.0.2 -t ip proto=2,p=17 -p 100 -q +} + +mld_payload_get() +{ + local type=$1; shift + local p + + type=$(printf "%x" $type) + p=$(: + )"3A:"$( : Next Header - ICMPv6 + )"00:"$( : Hdr Ext Len + )"00:00:00:00:00:00:"$( : Options and Padding + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"00:"$( : ICMPv6.checksum + ) + echo $p +} + +mld_query_test() +{ + # MLD Multicast Listener Query (Type 130) + devlink_trap_stats_test "MLD Multicast Listener Query" "mld_query" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 130) -p 100 -q +} + +mld_v1_report_test() +{ + # MLD Version 1 Multicast Listener Report (Type 131) +
devlink_trap_stats_test "MLD Version 1 Multicast Listener Report" \ + "mld_v1_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 131) -p 100 -q +} + +mld_v2_report_test() +{ + # MLD Version 2 Multicast Listener Report (Type 143) + devlink_trap_stats_test "MLD Version 2 Multicast Listener Report" \ + "mld_v2_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 143) -p 100 -q +} + +mld_v1_done_test() +{ + # MLD Version 1 Multicast Listener Done (Type 132) + devlink_trap_stats_test "MLD Version 1 Multicast Listener Done" \ + "mld_v1_done" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 132) -p 100 -q +} + +ipv4_dhcp_test() +{ + devlink_trap_stats_test "IPv4 DHCP Port 67" "ipv4_dhcp" \ + $MZ $h1 -c 1 -a own -b bcast -A 0.0.0.0 -B 255.255.255.255 \ + -t udp sp=68,dp=67 -p 100 -q + + devlink_trap_stats_test "IPv4 DHCP Port 68" "ipv4_dhcp" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -A 192.0.2.1 \ + -B 255.255.255.255 -t udp sp=67,dp=68 -p 100 -q +} + +ipv6_dhcp_test() +{ + devlink_trap_stats_test "IPv6 DHCP Port 547" "ipv6_dhcp" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1:2 -t udp sp=546,dp=547 \ + -p 100 -q + + devlink_trap_stats_test "IPv6 DHCP Port 546" "ipv6_dhcp" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1:2 -t udp sp=547,dp=546 \ + -p 100 -q +} + +arp_request_test() +{ + devlink_trap_stats_test "ARP Request" "arp_request" \ + $MZ $h1 -c 1 -a own -b bcast -t arp request -p 100 -q +} + +arp_response_test() +{ + devlink_trap_stats_test "ARP Response" "arp_response" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -t arp reply -p 100 -q +} + +icmpv6_header_get() +{ + local type=$1; shift + local p + + type=$(printf "%x" $type) + p=$(: + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"00:"$( : ICMPv6.checksum + ) + echo $p +} + +ipv6_neigh_solicit_test() +{ + devlink_trap_stats_test "IPv6 Neighbour Solicitation" \ + "ipv6_neigh_solicit" $MZ $h1 -6 
-c 1 \ + -A fe80::1 -B ff02::1:ff00:02 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 135) -p 100 -q +} + +ipv6_neigh_advert_test() +{ + devlink_trap_stats_test "IPv6 Neighbour Advertisement" \ + "ipv6_neigh_advert" $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 136) -p 100 -q +} + +ipv4_bfd_test() +{ + devlink_trap_stats_test "IPv4 BFD Control - Port 3784" "ipv4_bfd" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3784 -p 100 -q + + devlink_trap_stats_test "IPv4 BFD Echo - Port 3785" "ipv4_bfd" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3785 -p 100 -q +} + +ipv6_bfd_test() +{ + devlink_trap_stats_test "IPv6 BFD Control - Port 3784" "ipv6_bfd" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t udp sp=49153,dp=3784 -p 100 -q + + devlink_trap_stats_test "IPv6 BFD Echo - Port 3785" "ipv6_bfd" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t udp sp=49153,dp=3785 -p 100 -q +} + +ipv4_ospf_test() +{ + devlink_trap_stats_test "IPv4 OSPF - Multicast" "ipv4_ospf" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:05 \ + -A 192.0.2.1 -B 224.0.0.5 -t ip proto=89 -p 100 -q + + devlink_trap_stats_test "IPv4 OSPF - Unicast" "ipv4_ospf" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t ip proto=89 -p 100 -q +} + +ipv6_ospf_test() +{ + devlink_trap_stats_test "IPv6 OSPF - Multicast" "ipv6_ospf" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:05 \ + -A fe80::1 -B ff02::5 -t ip next=89 -p 100 -q + + devlink_trap_stats_test "IPv6 OSPF - Unicast" "ipv6_ospf" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 -t ip next=89 -p 100 -q +} + +ipv4_bgp_test() +{ + devlink_trap_stats_test "IPv4 BGP" "ipv4_bgp" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t tcp 
sp=54321,dp=179,flags=rst \ + -p 100 -q +} + +ipv6_bgp_test() +{ + devlink_trap_stats_test "IPv6 BGP" "ipv6_bgp" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t tcp sp=54321,dp=179,flags=rst -p 100 -q +} + +ipv4_vrrp_test() +{ + devlink_trap_stats_test "IPv4 VRRP" "ipv4_vrrp" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:12 \ + -A 192.0.2.1 -B 224.0.0.18 -t ip proto=112 -p 100 -q +} + +ipv6_vrrp_test() +{ + devlink_trap_stats_test "IPv6 VRRP" "ipv6_vrrp" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:12 \ + -A fe80::1 -B ff02::12 -t ip next=112 -p 100 -q +} + +ipv4_pim_test() +{ + devlink_trap_stats_test "IPv4 PIM - Multicast" "ipv4_pim" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:0d \ + -A 192.0.2.1 -B 224.0.0.13 -t ip proto=103 -p 100 -q + + devlink_trap_stats_test "IPv4 PIM - Unicast" "ipv4_pim" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t ip proto=103 -p 100 -q +} + +ipv6_pim_test() +{ + devlink_trap_stats_test "IPv6 PIM - Multicast" "ipv6_pim" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:0d \ + -A fe80::1 -B ff02::d -t ip next=103 -p 100 -q + + devlink_trap_stats_test "IPv6 PIM - Unicast" "ipv6_pim" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 -t ip next=103 -p 100 -q +} + +uc_loopback_test() +{ + # Add neighbours to the fake destination IPs, so that the packets are + # routed in the device and not trapped due to an unresolved neighbour + # exception. 
+ ip -4 neigh add 192.0.2.3 lladdr 00:11:22:33:44:55 nud permanent \ + dev $rp1 + ip -6 neigh add 2001:db8:1::3 lladdr 00:11:22:33:44:55 nud permanent \ + dev $rp1 + + devlink_trap_stats_test "IPv4 Unicast Loopback" "uc_loopback" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.3 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 Unicast Loopback" "uc_loopback" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::3 -t udp sp=54321,dp=12345 \ + -p 100 -q + + ip -6 neigh del 2001:db8:1::3 dev $rp1 + ip -4 neigh del 192.0.2.3 dev $rp1 +} + +local_route_test() +{ + # Use a fake source IP to prevent the trap from being triggered twice + # when the router sends back a port unreachable message. + devlink_trap_stats_test "IPv4 Local Route" "local_route" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.3 -B 192.0.2.2 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 Local Route" "local_route" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::3 -B 2001:db8:1::2 -t udp sp=54321,sp=12345 \ + -p 100 -q +} + +external_route_test() +{ + # Add a dummy device through which the incoming packets should be + # routed. + ip link add name dummy10 up type dummy + ip address add 203.0.113.1/24 dev dummy10 + ip -6 address add 2001:db8:10::1/64 dev dummy10 + + devlink_trap_stats_test "IPv4 External Route" "external_route" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 203.0.113.2 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 External Route" "external_route" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:10::2 -t udp sp=54321,sp=12345 \ + -p 100 -q + + ip -6 address del 2001:db8:10::1/64 dev dummy10 + ip address del 203.0.113.1/24 dev dummy10 + ip link del dev dummy10 +} + +ipv6_uc_dip_link_local_scope_test() +{ + # Add a dummy link-local prefix route to allow the packet to be routed. 
+ ip -6 route add fe80:1::/64 dev $rp2 + + devlink_trap_stats_test \ + "IPv6 Unicast Destination IP With Link-Local Scope" \ + "ipv6_uc_dip_link_local_scope" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B fe80:1::2 -t udp sp=54321,sp=12345 \ + -p 100 -q + + ip -6 route del fe80:1::/64 dev $rp2 +} + +ipv4_router_alert_get() +{ + local p + + # https://en.wikipedia.org/wiki/IPv4#Options + p=$(: + )"94:"$( : Option Number + )"04:"$( : Option Length + )"00:00:"$( : Option Data + ) + echo $p +} + +ipv4_router_alert_test() +{ + devlink_trap_stats_test "IPv4 Router Alert" "ipv4_router_alert" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.3 \ + -t ip option=$(ipv4_router_alert_get) -p 100 -q +} + +ipv6_router_alert_get() +{ + local p + + # https://en.wikipedia.org/wiki/IPv6_packet#Hop-by-hop_options_and_destination_options + # https://tools.ietf.org/html/rfc2711#section-2.1 + p=$(: + )"11:"$( : Next Header - UDP + )"00:"$( : Hdr Ext Len + )"05:02:00:00:00:00:"$( : Option Data + ) + echo $p +} + +ipv6_router_alert_test() +{ + devlink_trap_stats_test "IPv6 Router Alert" "ipv6_router_alert" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::3 \ + -t ip next=0,payload=$(ipv6_router_alert_get) -p 100 -q +} + +ipv6_dip_all_nodes_test() +{ + devlink_trap_stats_test "IPv6 Destination IP \"All Nodes Address\"" \ + "ipv6_dip_all_nodes" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \ + -A 2001:db8:1::1 -B ff02::1 -t udp sp=12345,dp=54321 -p 100 -q +} + +ipv6_dip_all_routers_test() +{ + devlink_trap_stats_test "IPv6 Destination IP \"All Routers Address\"" \ + "ipv6_dip_all_routers" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \ + -A 2001:db8:1::1 -B ff02::2 -t udp sp=12345,dp=54321 -p 100 -q +} + +ipv6_router_solicit_test() +{ + devlink_trap_stats_test "IPv6 Router Solicitation" \ + "ipv6_router_solicit" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \ + -A fe80::1 -B ff02::2 \ + -t ip 
hop=1,next=58,payload=$(icmpv6_header_get 133) -p 100 -q +} + +ipv6_router_advert_test() +{ + devlink_trap_stats_test "IPv6 Router Advertisement" \ + "ipv6_router_advert" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \ + -A fe80::1 -B ff02::1 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 134) -p 100 -q +} + +ipv6_redirect_test() +{ + devlink_trap_stats_test "IPv6 Redirect Message" \ + "ipv6_redirect" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 137) -p 100 -q +} + +ptp_event_test() +{ + # PTP is only supported on Spectrum-1, for now. + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return + + # PTP Sync (0) + devlink_trap_stats_test "PTP Time-Critical Event Message" "ptp_event" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \ + -A 192.0.2.1 -B 224.0.1.129 \ + -t udp sp=12345,dp=319,payload=10 -p 100 -q +} + +ptp_general_test() +{ + # PTP is only supported on Spectrum-1, for now. + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return + + # PTP Announce (b) + devlink_trap_stats_test "PTP General Message" "ptp_general" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \ + -A 192.0.2.1 -B 224.0.1.129 \ + -t udp sp=12345,dp=320,payload=1b -p 100 -q +} + +flow_action_sample_test() +{ + # Install a filter that samples every incoming packet. + tc qdisc add dev $rp1 clsact + tc filter add dev $rp1 ingress proto all pref 1 handle 101 matchall \ + skip_sw action sample rate 1 group 1 + + devlink_trap_stats_test "Flow Sampling" "flow_action_sample" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q + + tc filter del dev $rp1 ingress proto all pref 1 handle 101 matchall + tc qdisc del dev $rp1 clsact +} + +flow_action_trap_test() +{ + # Install a filter that traps a specific flow. 
+ tc qdisc add dev $rp1 clsact + tc filter add dev $rp1 ingress proto ip pref 1 handle 101 flower \ + skip_sw ip_proto udp src_port 12345 dst_port 54321 action trap + + devlink_trap_stats_test "Flow Trapping (Logging)" "flow_action_trap" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q + + tc filter del dev $rp1 ingress proto ip pref 1 handle 101 flower + tc qdisc del dev $rp1 clsact +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh index e7aecb065409..a4c2812e9807 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh @@ -96,7 +96,6 @@ source_mac_is_multicast_test() { local trap_name="source_mac_is_multicast" local smac=01:02:03:04:05:06 - local group_name="l2_drops" local mz_pid tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \ @@ -107,7 +106,7 @@ source_mac_is_multicast_test() RET=0 - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 log_test "Source MAC is multicast" @@ -118,7 +117,6 @@ __vlan_tag_mismatch_test() { local trap_name="vlan_tag_mismatch" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local opt=$1; shift local mz_pid @@ -132,7 +130,7 @@ __vlan_tag_mismatch_test() $MZ $h1 "$opt" -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Add PVID and make sure packets are no longer dropped. bridge vlan add vid 1 dev $swp1 pvid untagged master @@ -140,7 +138,7 @@ __vlan_tag_mismatch_test() devlink_trap_stats_idle_test $trap_name check_err $? 
"Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -179,7 +177,6 @@ ingress_vlan_filter_test() { local trap_name="ingress_vlan_filter" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local mz_pid local vid=10 @@ -193,7 +190,7 @@ ingress_vlan_filter_test() $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Add the VLAN on the bridge port and make sure packets are no longer # dropped. @@ -202,7 +199,7 @@ ingress_vlan_filter_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -222,7 +219,6 @@ __ingress_stp_filter_test() { local trap_name="ingress_spanning_tree_filter" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local state=$1; shift local mz_pid local vid=20 @@ -237,7 +233,7 @@ __ingress_stp_filter_test() $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Change STP state to forwarding and make sure packets are no longer # dropped. @@ -246,7 +242,7 @@ __ingress_stp_filter_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? 
"Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -292,7 +288,6 @@ port_list_is_empty_uc_test() { local trap_name="port_list_is_empty" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local mz_pid # Disable unicast flooding on both ports, so that packets cannot egress @@ -308,7 +303,7 @@ port_list_is_empty_uc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Allow packets to be flooded to one port. ip link set dev $swp2 type bridge_slave flood on @@ -316,7 +311,7 @@ port_list_is_empty_uc_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -335,7 +330,6 @@ port_list_is_empty_mc_test() { local trap_name="port_list_is_empty" local dmac=01:00:5e:00:00:01 - local group_name="l2_drops" local dip=239.0.0.1 local mz_pid @@ -354,7 +348,7 @@ port_list_is_empty_mc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Allow packets to be flooded to one port. ip link set dev $swp2 type bridge_slave mcast_flood on @@ -362,7 +356,7 @@ port_list_is_empty_mc_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? 
"Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -387,7 +381,6 @@ port_loopback_filter_uc_test() { local trap_name="port_loopback_filter" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local mz_pid # Make sure packets can only egress the input port. @@ -401,7 +394,7 @@ port_loopback_filter_uc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 101 + devlink_trap_drop_test $trap_name $swp2 101 # Allow packets to be flooded. ip link set dev $swp2 type bridge_slave flood on @@ -409,7 +402,7 @@ port_loopback_filter_uc_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh index 616f47d86a61..f5abb1ebd392 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh @@ -161,7 +161,6 @@ ping_check() non_ip_test() { local trap_name="non_ip" - local group_name="l3_drops" local mz_pid RET=0 @@ -176,7 +175,7 @@ non_ip_test() 00:00 de:ad:be:ef" & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Non IP" @@ -190,7 +189,6 @@ __uc_dip_over_mc_dmac_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="uc_dip_over_mc_dmac" - local group_name="l3_drops" local dmac=01:02:03:04:05:06 local mz_pid @@ -206,7 +204,7 @@ __uc_dip_over_mc_dmac_test() -B $dip -d 1msec -q & mz_pid=$! 
- devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Unicast destination IP over multicast destination MAC: $desc" @@ -227,7 +225,6 @@ __sip_is_loopback_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="sip_is_loopback_address" - local group_name="l3_drops" local mz_pid RET=0 @@ -242,7 +239,7 @@ __sip_is_loopback_test() -b $rp1mac -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Source IP is loopback address: $desc" @@ -262,7 +259,6 @@ __dip_is_loopback_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="dip_is_loopback_address" - local group_name="l3_drops" local mz_pid RET=0 @@ -277,7 +273,7 @@ __dip_is_loopback_test() -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Destination IP is loopback address: $desc" @@ -298,7 +294,6 @@ __sip_is_mc_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="sip_is_mc" - local group_name="l3_drops" local mz_pid RET=0 @@ -313,7 +308,7 @@ __sip_is_mc_test() -b $rp1mac -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Source IP is multicast: $desc" @@ -329,7 +324,6 @@ sip_is_mc_test() ipv4_sip_is_limited_bc_test() { local trap_name="ipv4_sip_is_limited_bc" - local group_name="l3_drops" local sip=255.255.255.255 local mz_pid @@ -345,7 +339,7 @@ ipv4_sip_is_limited_bc_test() -B $h2_ipv4 -d 1msec -q & mz_pid=$! 
- devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IPv4 source IP is limited broadcast" @@ -382,7 +376,6 @@ __ipv4_header_corrupted_test() local ihl=$1; shift local checksum=$1; shift local trap_name="ip_header_corrupted" - local group_name="l3_drops" local payload local mz_pid @@ -399,7 +392,7 @@ __ipv4_header_corrupted_test() $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IP header corrupted: $desc: IPv4" @@ -429,7 +422,6 @@ __ipv6_header_corrupted_test() local desc=$1; shift local ipver=$1; shift local trap_name="ip_header_corrupted" - local group_name="l3_drops" local payload local mz_pid @@ -446,7 +438,7 @@ __ipv6_header_corrupted_test() $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IP header corrupted: $desc: IPv6" @@ -469,7 +461,6 @@ ip_header_corrupted_test() ipv6_mc_dip_reserved_scope_test() { local trap_name="ipv6_mc_dip_reserved_scope" - local group_name="l3_drops" local dip=FF00:: local mz_pid @@ -485,7 +476,7 @@ ipv6_mc_dip_reserved_scope_test() "33:33:00:00:00:00" -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IPv6 multicast destination IP reserved scope" @@ -495,7 +486,6 @@ ipv6_mc_dip_reserved_scope_test() ipv6_mc_dip_interface_local_scope_test() { local trap_name="ipv6_mc_dip_interface_local_scope" - local group_name="l3_drops" local dip=FF01:: local mz_pid @@ -511,7 +501,7 @@ ipv6_mc_dip_interface_local_scope_test() "33:33:00:00:00:00" -B $dip -d 1msec -q & mz_pid=$! 
- devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IPv6 multicast destination IP interface-local scope" @@ -526,7 +516,6 @@ __blackhole_route_test() local dip=$1; shift local ip_proto=${1:-"icmp"}; shift local trap_name="blackhole_route" - local group_name="l3_drops" local mz_pid RET=0 @@ -542,7 +531,7 @@ __blackhole_route_test() -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 101 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Blackhole route: IPv$flags" devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 @@ -558,7 +547,6 @@ blackhole_route_test() irif_disabled_test() { local trap_name="irif_disabled" - local group_name="l3_drops" local t0_packets t0_bytes local t1_packets t1_bytes local mz_pid @@ -613,7 +601,6 @@ irif_disabled_test() erif_disabled_test() { local trap_name="erif_disabled" - local group_name="l3_drops" local t0_packets t0_bytes local t1_packets t1_bytes local mz_pid diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh index 2bc6df42d597..1fedfc9da434 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh @@ -169,7 +169,6 @@ trap_action_check() mtu_value_is_too_small_test() { local trap_name="mtu_value_is_too_small" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -191,7 +190,7 @@ mtu_value_is_too_small_test() -B 198.51.100.1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets_hitting "dev $h1 ingress" 101 check_err $? 
"Packets were not received to h1" @@ -208,7 +207,6 @@ __ttl_value_is_too_small_test() { local ttl_val=$1; shift local trap_name="ttl_value_is_too_small" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -227,7 +225,7 @@ __ttl_value_is_too_small_test() -b $rp1mac -B 198.51.100.1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets_hitting "dev $h1 ingress" 101 check_err $? "Packets were not received to h1" @@ -271,7 +269,6 @@ __mc_reverse_path_forwarding_test() local proto=$1; shift local flags=${1:-""}; shift local trap_name="mc_reverse_path_forwarding" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -292,7 +289,7 @@ __mc_reverse_path_forwarding_test() mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $rp2 egress" 101 0 check_err $? "Packets were not dropped" @@ -322,7 +319,6 @@ __reject_route_test() local unreachable=$1; shift local flags=${1:-""}; shift local trap_name="reject_route" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -341,7 +337,7 @@ __reject_route_test() -B $dst_ip -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets_hitting "dev $h1 ingress" 101 check_err $? 
"ICMP packet was not received to h1" @@ -370,7 +366,6 @@ __host_miss_test() local desc=$1; shift local dip=$1; shift local trap_name="unresolved_neigh" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -405,7 +400,6 @@ __invalid_nexthop_test() local subnet=$1; shift local via_add=$1; shift local trap_name="unresolved_neigh" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -494,7 +488,6 @@ vrf_without_routes_destroy() ipv4_lpm_miss_test() { local trap_name="ipv4_lpm_miss" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -511,7 +504,7 @@ ipv4_lpm_miss_test() -B 203.0.113.1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name log_test "LPM miss: IPv4" @@ -522,7 +515,6 @@ ipv4_lpm_miss_test() ipv6_lpm_miss_test() { local trap_name="ipv6_lpm_miss" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -539,7 +531,7 @@ ipv6_lpm_miss_test() -B 2001:db8::1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name log_test "LPM miss: IPv6" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh index 039629bb92a3..8817851da7a9 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh @@ -140,7 +140,6 @@ ecn_payload_get() ecn_decap_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local ecn_desc=$1; shift local outer_tos=$1; shift @@ -161,7 +160,7 @@ ecn_decap_test() mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? 
"Packets were not dropped" @@ -200,7 +199,6 @@ ipip_payload_get() no_matching_tunnel_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local sip=$1; shift local mz_pid @@ -218,7 +216,7 @@ no_matching_tunnel_test() -A $sip -B 192.0.2.65 -t ip len=48,proto=47,p=$payload -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? "Packets were not dropped" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh index e11a416323cf..10e0f3dbc930 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh @@ -159,7 +159,6 @@ ecn_payload_get() ecn_decap_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local ecn_desc=$1; shift local outer_tos=$1; shift @@ -177,7 +176,7 @@ ecn_decap_test() -t udp sp=12345,dp=$VXPORT,tos=$outer_tos,p=$payload -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? "Packets were not dropped" @@ -228,7 +227,6 @@ short_payload_get() corrupted_packet_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local payload_get=$1; shift local mz_pid @@ -246,7 +244,7 @@ corrupted_packet_test() -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? 
"Packets were not dropped" @@ -297,7 +295,6 @@ mc_smac_payload_get() overlay_smac_is_mc_test() { local trap_name="overlay_smac_is_mc" - local group_name="tunnel_drops" local mz_pid RET=0 @@ -314,7 +311,7 @@ overlay_smac_is_mc_test() -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp1 101 + devlink_trap_drop_test $trap_name $swp1 101 log_test "Overlay source MAC is multicast" diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh index 24dd8ed48580..b025daea062d 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh @@ -300,7 +300,7 @@ test_uc_aware() local i for ((i = 0; i < attempts; ++i)); do - if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 0.1; then + if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 1; then ((passes++)) fi diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh index 58f3a05f08af..7d9e73a43a49 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh @@ -15,7 +15,7 @@ source mlxsw_lib.sh SB_POOL_ING=0 SB_POOL_EGR_CPU=10 -SB_ITC_CPU_IP=3 +SB_ITC_CPU_IP=2 SB_ITC_CPU_ARP=2 SB_ITC=0 diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh index 68c80d0ec1ec..9241250c5921 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_restrictions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -7,6 +7,10 @@ ALL_TESTS=" shared_block_drop_test egress_redirect_test multi_mirror_test + matchall_sample_egress_test + matchall_mirror_behind_flower_ingress_test + matchall_sample_behind_flower_ingress_test + matchall_mirror_behind_flower_egress_test " NUM_NETIFS=2 @@ -155,6 +159,134 @@ 
multi_mirror_test() log_test "multi mirror" } +matchall_sample_egress_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have matchall with sample action + # bound on egress + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress protocol all pref 1 handle 101 \ + matchall skip_sw action sample rate 100 group 1 + check_err $? "Failed to add rule with sample action on ingress" + + tc filter del dev $swp1 ingress protocol all pref 1 handle 101 matchall + + tc filter add dev $swp1 egress protocol all pref 1 handle 101 \ + matchall skip_sw action sample rate 100 group 1 + check_fail $? "Incorrect success to add rule with sample action on egress" + + tc qdisc del dev $swp1 clsact + + log_test "matchall sample egress" +} + +matchall_behind_flower_ingress_test() +{ + local action=$1 + local action_args=$2 + + RET=0 + + # On ingress, all matchall-mirror and matchall-sample + # rules have to be in front of the flower rules + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + + tc filter add dev $swp1 ingress protocol all pref 9 handle 102 \ + matchall skip_sw action $action_args + check_err $? "Failed to add matchall rule in front of a flower rule" + + tc filter del dev $swp1 ingress protocol all pref 9 handle 102 matchall + + tc filter add dev $swp1 ingress protocol all pref 11 handle 102 \ + matchall skip_sw action $action_args + check_fail $? "Incorrect success to add matchall rule behind a flower rule" + + tc filter del dev $swp1 ingress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 ingress protocol all pref 9 handle 102 \ + matchall skip_sw action $action_args + + tc filter add dev $swp1 ingress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? 
"Failed to add flower rule behind a matchall rule" + + tc filter del dev $swp1 ingress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 ingress protocol ip pref 8 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_fail $? "Incorrect success to add flower rule in front of a matchall rule" + + tc qdisc del dev $swp1 clsact + + log_test "matchall $action flower ingress" +} + +matchall_mirror_behind_flower_ingress_test() +{ + matchall_behind_flower_ingress_test "mirror" "mirred egress mirror dev $swp2" +} + +matchall_sample_behind_flower_ingress_test() +{ + matchall_behind_flower_ingress_test "sample" "sample rate 100 group 1" +} + +matchall_behind_flower_egress_test() +{ + local action=$1 + local action_args=$2 + + RET=0 + + # On egress, all matchall-mirror rules have to be behind the flower rules + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 egress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + + tc filter add dev $swp1 egress protocol all pref 11 handle 102 \ + matchall skip_sw action $action_args + check_err $? "Failed to add matchall rule behind a flower rule" + + tc filter del dev $swp1 egress protocol all pref 11 handle 102 matchall + + tc filter add dev $swp1 egress protocol all pref 9 handle 102 \ + matchall skip_sw action $action_args + check_fail $? "Incorrect success to add matchall rule in front of a flower rule" + + tc filter del dev $swp1 egress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 egress protocol all pref 11 handle 102 \ + matchall skip_sw action $action_args + + tc filter add dev $swp1 egress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add flower rule in front of a matchall rule" + + tc filter del dev $swp1 egress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 egress protocol ip pref 12 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_fail $?
"Incorrect success to add flower rule behind a matchall rule" + + tc qdisc del dev $swp1 clsact + + log_test "matchall $action flower egress" +} + +matchall_mirror_behind_flower_egress_test() +{ + matchall_behind_flower_egress_test "mirror" "mirred egress mirror dev $swp2" +} + setup_prepare() { swp1=${NETIFS[p1]} diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 9f9741444549..de4b32fc4223 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -146,11 +146,39 @@ regions_test() check_region_snapshot_count dummy post-first-request 3 + devlink region dump $DL_HANDLE/dummy snapshot 25 >> /dev/null + check_err $? "Failed to dump snapshot with id 25" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 0 len 1 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (1 byte)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len 128 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (128 bytes)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len $((1<<32)) >> /dev/null + check_err $? "Failed to read snapshot with id 25 (oversized)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr $((1<<32)) len 128 >> /dev/null 2>&1 + check_fail $? "Bad read of snapshot with id 25 did not fail" + devlink region del $DL_HANDLE/dummy snapshot 25 check_err $? "Failed to delete snapshot with id 25" check_region_snapshot_count dummy post-second-delete 2 + sid=$(devlink -j region new $DL_HANDLE/dummy | jq '.[][][][]') + check_err $? "Failed to create a new snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 3 + + devlink region dump $DL_HANDLE/dummy snapshot $sid >> /dev/null + check_err $? "Failed to dump a snapshot with id allocated by the kernel" + + devlink region del $DL_HANDLE/dummy snapshot $sid + check_err $?
"Failed to delete snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 2 + log_test "regions test" } diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh index dbd1e014ba17..da49ad2761b5 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh @@ -264,6 +264,8 @@ trap_policer_test() local packets_t0 local packets_t1 + RET=0 + if [ $(devlink_trap_policers_num_get) -eq 0 ]; then check_err 1 "Failed to dump policers" fi @@ -328,6 +330,8 @@ trap_group_check_policer() trap_policer_bind_test() { + RET=0 + devlink trap group set $DEVLINK_DEV group l2_drops policer 1 check_err $? "Failed to bind a valid policer" if [ $(devlink_trap_group_policer_get "l2_drops") -ne 1 ]; then diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore index c078ece12ff0..94b02a18f230 100644 --- a/tools/testing/selftests/exec/.gitignore +++ b/tools/testing/selftests/exec/.gitignore @@ -9,3 +9,4 @@ execveat.ephemeral execveat.denatured /recursion-depth xxxxxxxx* +pipe diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 33339e31e365..4453b8f8def3 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -3,8 +3,9 @@ CFLAGS = -Wall CFLAGS += -Wno-nonnull CFLAGS += -D_GNU_SOURCE +TEST_PROGS := binfmt_script TEST_GEN_PROGS := execveat -TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir +TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir pipe # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile diff --git a/tools/testing/selftests/exec/binfmt_script b/tools/testing/selftests/exec/binfmt_script new file mode 100755 index 000000000000..05f94a741c7a --- /dev/null +++ 
b/tools/testing/selftests/exec/binfmt_script @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Test that truncation of bprm->buf doesn't cause unexpected execs paths, along +# with various other pathological cases. +import os, subprocess + +# Relevant commits +# +# b5372fe5dc84 ("exec: load_script: Do not exec truncated interpreter path") +# 6eb3c3d0a52d ("exec: increase BINPRM_BUF_SIZE to 256") + +# BINPRM_BUF_SIZE +SIZE=256 + +NAME_MAX=int(subprocess.check_output(["getconf", "NAME_MAX", "."])) + +test_num=0 + +code='''#!/usr/bin/perl +print "Executed interpreter! Args:\n"; +print "0 : '$0'\n"; +$counter = 1; +foreach my $a (@ARGV) { + print "$counter : '$a'\n"; + $counter++; +} +''' + +## +# test - produce a binfmt_script hashbang line for testing +# +# @size: bytes for bprm->buf line, including hashbang but not newline +# @good: whether this script is expected to execute correctly +# @hashbang: the special 2 bytes for running binfmt_script +# @leading: any leading whitespace before the executable path +# @root: start of executable pathname +# @target: end of executable pathname +# @arg: bytes following the executable pathname +# @fill: character to fill between @root and @target to reach @size bytes +# @newline: character to use as newline, not counted towards @size +# ... +def test(name, size, good=True, leading="", root="./", target="/perl", + fill="A", arg="", newline="\n", hashbang="#!"): + global test_num, tests, NAME_MAX + test_num += 1 + if test_num > tests: + raise ValueError("more binfmt_script tests than expected! 
(want %d, expected %d)" + % (test_num, tests)) + + middle = "" + remaining = size - len(hashbang) - len(leading) - len(root) - len(target) - len(arg) + # The middle of the pathname must not exceed NAME_MAX + while remaining >= NAME_MAX: + middle += fill * (NAME_MAX - 1) + middle += '/' + remaining -= NAME_MAX + middle += fill * remaining + + dirpath = root + middle + binary = dirpath + target + if len(target): + os.makedirs(dirpath, mode=0o755, exist_ok=True) + open(binary, "w").write(code) + os.chmod(binary, 0o755) + + buf=hashbang + leading + root + middle + target + arg + newline + if len(newline) > 0: + buf += 'echo this is not really perl\n' + + script = "binfmt_script-%s" % (name) + open(script, "w").write(buf) + os.chmod(script, 0o755) + + proc = subprocess.Popen(["./%s" % (script)], shell=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout = proc.communicate()[0] + + if proc.returncode == 0 and b'Executed interpreter' in stdout: + if good: + print("ok %d - binfmt_script %s (successful good exec)" + % (test_num, name)) + else: + print("not ok %d - binfmt_script %s succeeded when it should have failed" + % (test_num, name)) + else: + if good: + print("not ok %d - binfmt_script %s failed when it should have succeeded (rc:%d)" + % (test_num, name, proc.returncode)) + else: + print("ok %d - binfmt_script %s (correctly failed bad exec)" + % (test_num, name)) + + # Clean up crazy binaries + os.unlink(script) + if len(target): + elements = binary.split('/') + os.unlink(binary) + elements.pop() + while len(elements) > 1: + os.rmdir("/".join(elements)) + elements.pop() + +tests=27 +print("TAP version 13") +print("1..%d" % (tests)) + +### FAIL (8 tests) + +# Entire path is well past the BINPRM_BUF_SIZE. +test(name="too-big", size=SIZE+80, good=False) +# Path is right at max size, making it impossible to tell if it was truncated. +test(name="exact", size=SIZE, good=False) +# Same as above, but with leading whitespace.
+test(name="exact-space", size=SIZE, good=False, leading=" ") +# Huge buffer of only whitespace. +test(name="whitespace-too-big", size=SIZE+71, good=False, root="", + fill=" ", target="") +# A good path, but it gets truncated due to leading whitespace. +test(name="truncated", size=SIZE+17, good=False, leading=" " * 19) +# Entirely empty except for #! +test(name="empty", size=2, good=False, root="", + fill="", target="", newline="") +# Within size, but entirely spaces +test(name="spaces", size=SIZE-1, good=False, root="", fill=" ", + target="", newline="") +# Newline before binary. +test(name="newline-prefix", size=SIZE-1, good=False, leading="\n", + root="", fill=" ", target="") + +### ok (19 tests) + +# The original test case that was broken by commit: +# 8099b047ecc4 ("exec: load_script: don't blindly truncate shebang string") +test(name="test.pl", size=439, leading=" ", + root="./nix/store/bwav8kz8b3y471wjsybgzw84mrh4js9-perl-5.28.1/bin", + arg=" -I/nix/store/x6yyav38jgr924nkna62q3pkp0dgmzlx-perl5.28.1-File-Slurp-9999.25/lib/perl5/site_perl -I/nix/store/ha8v67sl8dac92r9z07vzr4gv1y9nwqz-perl5.28.1-Net-DBus-1.1.0/lib/perl5/site_perl -I/nix/store/dcrkvnjmwh69ljsvpbdjjdnqgwx90a9d-perl5.28.1-XML-Parser-2.44/lib/perl5/site_perl -I/nix/store/rmji88k2zz7h4zg97385bygcydrf2q8h-perl5.28.1-XML-Twig-3.52/lib/perl5/site_perl") +# One byte under size, leaving newline visible. +test(name="one-under", size=SIZE-1) +# Two bytes under size, leaving newline visible. +test(name="two-under", size=SIZE-2) +# Exact size, but trailing whitespace visible instead of newline +test(name="exact-trunc-whitespace", size=SIZE, arg=" ") +# Exact size, but trailing space and first arg char visible instead of newline. +test(name="exact-trunc-arg", size=SIZE, arg=" f") +# One byte under, with confirmed non-truncated arg since newline now visible. +test(name="one-under-full-arg", size=SIZE-1, arg=" f") +# Short read buffer by one byte.
+test(name="one-under-no-nl", size=SIZE-1, newline="") +# Short read buffer by half buffer size. +test(name="half-under-no-nl", size=int(SIZE/2), newline="") +# One byte under with whitespace arg. leaving newline visible. +test(name="one-under-trunc-arg", size=SIZE-1, arg=" ") +# One byte under with whitespace leading. leaving newline visible. +test(name="one-under-leading", size=SIZE-1, leading=" ") +# One byte under with whitespace leading and as arg. leaving newline visible. +test(name="one-under-leading-trunc-arg", size=SIZE-1, leading=" ", arg=" ") +# Same as above, but with 2 bytes under +test(name="two-under-no-nl", size=SIZE-2, newline="") +test(name="two-under-trunc-arg", size=SIZE-2, arg=" ") +test(name="two-under-leading", size=SIZE-2, leading=" ") +test(name="two-under-leading-trunc-arg", size=SIZE-2, leading=" ", arg=" ") +# Same as above, but with buffer half filled +test(name="half-under-no-nl", size=int(SIZE/2), newline="") +test(name="half-under-trunc-arg", size=int(SIZE/2), arg=" ") +test(name="half-under-leading", size=int(SIZE/2), leading=" ") +test(name="half-under-leading-trunc-arg", size=int(SIZE/2), leading=" ", arg=" ") + +if test_num != tests: + raise ValueError("fewer binfmt_script tests than expected! (ran %d, expected %d)" + % (test_num, tests)) diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c index cbb6efbdb786..67bf7254a48f 100644 --- a/tools/testing/selftests/exec/execveat.c +++ b/tools/testing/selftests/exec/execveat.c @@ -5,7 +5,9 @@ * Selftests for execveat(2).
*/ +#ifndef _GNU_SOURCE #define _GNU_SOURCE /* to get O_PATH, AT_EMPTY_PATH */ +#endif #include <sys/sendfile.h> #include <sys/stat.h> #include <sys/syscall.h> @@ -311,6 +313,10 @@ static int run_tests(void) fail += check_execveat_fail(AT_FDCWD, fullname_symlink, AT_SYMLINK_NOFOLLOW, ELOOP); + /* Non-regular file failure */ + fail += check_execveat_fail(dot_dfd, "pipe", 0, EACCES); + unlink("pipe"); + /* Shell script wrapping executable file: */ /* dfd + path */ fail += check_execveat(subdir_dfd, "../script", 0); @@ -384,6 +390,8 @@ static void prerequisites(void) fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755); write(fd, script, strlen(script)); close(fd); + + mkfifo("pipe", 0755); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc b/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc index 021c03fd885d..23465823532b 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc @@ -14,6 +14,8 @@ if [ ! -f set_event ]; then exit_unsupported fi +[ -f error_log ] || exit_unsupported + ftrace_errlog_check 'event filter parse error' '((sig >= 10 && sig < 15) || dsig ^== 17) && comm != bash' 'events/signal/signal_generate/filter' exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index 61a3c7e2634d..697c77ef2e2b 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -119,12 +119,14 @@ yield() { ping $LOCALHOST -c 1 || sleep .001 || usleep 1 || sleep 1 } +# Since probe event command may include backslash, explicitly use printf "%s" +# to NOT interpret it. 
ftrace_errlog_check() { # err-prefix command-with-error-pos-by-^ command-file - pos=$(echo -n "${2%^*}" | wc -c) # error position - command=$(echo "$2" | tr -d ^) + pos=$(printf "%s" "${2%^*}" | wc -c) # error position + command=$(printf "%s" "$2" | tr -d ^) echo "Test command: $command" echo > error_log - (! echo "$command" >> "$3" ) 2> /dev/null + (! printf "%s" "$command" >> "$3" ) 2> /dev/null grep "$1: error:" -A 3 error_log N=$(tail -n 1 error_log | wc -c) # " Command: " and "^\n" => 13 diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index ef1e9bafb098..eb0f4ab4e070 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -91,7 +91,9 @@ esac if grep -q "Create/append/" README && grep -q "imm-value" README; then echo 'p:kprobes/testevent _do_fork' > kprobe_events check_error '^r:kprobes/testevent do_exit' # DIFF_PROBE_TYPE -echo 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events + +# Explicitly use printf "%s" to not interpret \1 +printf "%s" 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events check_error 'p:kprobes/testevent _do_fork ^bcd=\1' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\1:u8' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\"foo"' # DIFF_ARG_TYPE diff --git a/tools/testing/selftests/gen_kselftest_tar.sh b/tools/testing/selftests/gen_kselftest_tar.sh index 8b2b6088540d..4a974bc03385 100755 --- a/tools/testing/selftests/gen_kselftest_tar.sh +++ b/tools/testing/selftests/gen_kselftest_tar.sh @@ -49,6 +49,11 @@ main() # directory ./kselftest_install.sh "$install_dir" (cd "$install_work"; tar $copts "$dest"/kselftest${ext} $install_name) + + # Don't put the message at the actual end as people may be parsing the + # "archive created" line in their scripts. 
+ echo -e "\nConsider using 'make gen_tar' instead of this script\n" + echo "Kselftest archive kselftest${ext} created!" # clean up top-level install work directory diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 2bb8c81fc0b4..c9f03ef93338 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -168,9 +168,17 @@ #define __TEST_IMPL(test_name, _signal) \ static void test_name(struct __test_metadata *_metadata); \ + static inline void wrapper_##test_name( \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ + { \ + test_name(_metadata); \ + } \ static struct __test_metadata _##test_name##_object = \ - { .name = "global." #test_name, \ - .fn = &test_name, .termsig = _signal, \ + { .name = #test_name, \ + .fn = &wrapper_##test_name, \ + .fixture = &_fixture_global, \ + .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ static void __attribute__((constructor)) _register_##test_name(void) \ { \ @@ -212,10 +220,13 @@ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN(). */ #define FIXTURE(fixture_name) \ + FIXTURE_VARIANT(fixture_name); \ + static struct __fixture_metadata _##fixture_name##_fixture_object = \ + { .name = #fixture_name, }; \ static void __attribute__((constructor)) \ _register_##fixture_name##_data(void) \ { \ - __fixture_count++; \ + __register_fixture(&_##fixture_name##_fixture_object); \ } \ FIXTURE_DATA(fixture_name) @@ -241,7 +252,10 @@ #define FIXTURE_SETUP(fixture_name) \ void fixture_name##_setup( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) + /** * FIXTURE_TEARDOWN(fixture_name) * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. 
@@ -264,6 +278,59 @@ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) /** + * FIXTURE_VARIANT(fixture_name) - Optionally called once per fixture + * to declare fixture variant + * + * @fixture_name: fixture name + * + * .. code-block:: c + * + * FIXTURE_VARIANT(datatype name) { + * type property1; + * ... + * }; + * + * Defines type of constant parameters provided to FIXTURE_SETUP() and TEST_F() + * as *variant*. Variants allow the same tests to be run with different + * arguments. + */ +#define FIXTURE_VARIANT(fixture_name) struct _fixture_variant_##fixture_name + +/** + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) - Called once per fixture + * variant to setup and register the data + * + * @fixture_name: fixture name + * @variant_name: name of the parameter set + * + * .. code-block:: c + * + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) { + * .property1 = val1, + * ... + * }; + * + * Defines a variant of the test fixture, provided to FIXTURE_SETUP() and + * TEST_F() as *variant*. Tests of each fixture will be run once for each + * variant.
+ */ +#define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \ + extern FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant; \ + static struct __fixture_variant_metadata \ + _##fixture_name##_##variant_name##_object = \ + { .name = #variant_name, \ + .data = &_##fixture_name##_##variant_name##_variant}; \ + static void __attribute__((constructor)) \ + _register_##fixture_name##_##variant_name(void) \ + { \ + __register_fixture_variant(&_##fixture_name##_fixture_object, \ + &_##fixture_name##_##variant_name##_object); \ + } \ + FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant = + +/** * TEST_F(fixture_name, test_name) - Emits test registration and helpers for * fixture-based test cases * @@ -293,24 +360,27 @@ #define __TEST_F_IMPL(fixture_name, test_name, signal, tmout) \ static void fixture_name##_##test_name( \ struct __test_metadata *_metadata, \ - FIXTURE_DATA(fixture_name) *self); \ + FIXTURE_DATA(fixture_name) *self, \ + const FIXTURE_VARIANT(fixture_name) *variant); \ static inline void wrapper_##fixture_name##_##test_name( \ - struct __test_metadata *_metadata) \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ { \ /* fixture data is alloced, setup, and torn down per call. */ \ FIXTURE_DATA(fixture_name) self; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ - fixture_name##_setup(_metadata, &self); \ + fixture_name##_setup(_metadata, &self, variant->data); \ /* Let setup failure terminate early. */ \ if (!_metadata->passed) \ return; \ - fixture_name##_##test_name(_metadata, &self); \ + fixture_name##_##test_name(_metadata, &self, variant->data); \ fixture_name##_teardown(_metadata, &self); \ } \ static struct __test_metadata \ _##fixture_name##_##test_name##_object = { \ - .name = #fixture_name "." 
#test_name, \ + .name = #test_name, \ .fn = &wrapper_##fixture_name##_##test_name, \ + .fixture = &_##fixture_name##_fixture_object, \ .termsig = signal, \ .timeout = tmout, \ }; \ @@ -321,7 +391,9 @@ } \ static void fixture_name##_##test_name( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) /** * TEST_HARNESS_MAIN - Simple wrapper to run the test harness @@ -631,11 +703,74 @@ } \ } while (0); OPTIONAL_HANDLER(_assert) +/* List helpers */ +#define __LIST_APPEND(head, item) \ +{ \ + /* Circular linked list where only prev is circular. */ \ + if (head == NULL) { \ + head = item; \ + item->next = NULL; \ + item->prev = item; \ + return; \ + } \ + if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \ + item->next = NULL; \ + item->prev = head->prev; \ + item->prev->next = item; \ + head->prev = item; \ + } else { \ + item->next = head; \ + item->next->prev = item; \ + item->prev = item; \ + head = item; \ + } \ +} + +struct __test_metadata; +struct __fixture_variant_metadata; + +/* Contains all the information about a fixture. 
*/ +struct __fixture_metadata { + const char *name; + struct __test_metadata *tests; + struct __fixture_variant_metadata *variant; + struct __fixture_metadata *prev, *next; +} _fixture_global __attribute__((unused)) = { + .name = "global", + .prev = &_fixture_global, +}; + +static struct __fixture_metadata *__fixture_list = &_fixture_global; +static int __constructor_order; + +#define _CONSTRUCTOR_ORDER_FORWARD 1 +#define _CONSTRUCTOR_ORDER_BACKWARD -1 + +static inline void __register_fixture(struct __fixture_metadata *f) +{ + __LIST_APPEND(__fixture_list, f); +} + +struct __fixture_variant_metadata { + const char *name; + const void *data; + struct __fixture_variant_metadata *prev, *next; +}; + +static inline void +__register_fixture_variant(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant) +{ + __LIST_APPEND(f->variant, variant); +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; - void (*fn)(struct __test_metadata *); + void (*fn)(struct __test_metadata *, + struct __fixture_variant_metadata *); pid_t pid; /* pid of test when being run */ + struct __fixture_metadata *fixture; int termsig; int passed; int trigger; /* extra handler after the evaluation */ @@ -646,15 +781,6 @@ struct __test_metadata { struct __test_metadata *prev, *next; }; -/* Storage for the (global) tests to be run. */ -static struct __test_metadata *__test_list; -static unsigned int __test_count; -static unsigned int __fixture_count; -static int __constructor_order; - -#define _CONSTRUCTOR_ORDER_FORWARD 1 -#define _CONSTRUCTOR_ORDER_BACKWARD -1 - /* * Since constructors are called in reverse order, reverse the test * list so tests are run in source declaration order. @@ -666,25 +792,7 @@ static int __constructor_order; */ static inline void __register_test(struct __test_metadata *t) { - __test_count++; - /* Circular linked list where only prev is circular. 
*/ - if (__test_list == NULL) { - __test_list = t; - t->next = NULL; - t->prev = t; - return; - } - if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { - t->next = NULL; - t->prev = __test_list->prev; - t->prev->next = t; - __test_list->prev = t; - } else { - t->next = __test_list; - t->next->prev = t; - t->prev = t; - __test_list = t; - } + __LIST_APPEND(t->fixture->tests, t); } static inline int __bail(int for_realz, bool no_print, __u8 step) @@ -790,43 +898,67 @@ void __wait_for_test(struct __test_metadata *t) } } -void __run_test(struct __test_metadata *t) +void __run_test(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant, + struct __test_metadata *t) { + /* reset test struct */ t->passed = 1; t->trigger = 0; - printf("[ RUN ] %s\n", t->name); + t->step = 0; + t->no_print = 0; + + printf("[ RUN ] %s%s%s.%s\n", + f->name, variant->name[0] ? "." : "", variant->name, t->name); t->pid = fork(); if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; } else if (t->pid == 0) { - t->fn(t); + t->fn(t, variant); /* return the step that failed or 0 */ _exit(t->passed ? 0 : t->step); } else { __wait_for_test(t); } - printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name); + printf("[ %4s ] %s%s%s.%s\n", (t->passed ? "OK" : "FAIL"), + f->name, variant->name[0] ? "." : "", variant->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, char __attribute__((unused)) **argv) { + struct __fixture_variant_metadata no_variant = { .name = "", }; + struct __fixture_variant_metadata *v; + struct __fixture_metadata *f; struct __test_metadata *t; int ret = 0; + unsigned int case_count = 0, test_count = 0; unsigned int count = 0; unsigned int pass_count = 0; + for (f = __fixture_list; f; f = f->next) { + for (v = f->variant ?: &no_variant; v; v = v->next) { + case_count++; + for (t = f->tests; t; t = t->next) + test_count++; + } + } + /* TODO(wad) add optional arguments similar to gtest. 
*/ printf("[==========] Running %u tests from %u test cases.\n", - __test_count, __fixture_count + 1); - for (t = __test_list; t; t = t->next) { - count++; - __run_test(t); - if (t->passed) - pass_count++; - else - ret = 1; + test_count, case_count); + for (f = __fixture_list; f; f = f->next) { + for (v = f->variant ?: &no_variant; v; v = v->next) { + for (t = f->tests; t; t = t->next) { + count++; + __run_test(f, v, t); + if (t->passed) + pass_count++; + else + ret = 1; + } + } } printf("[==========] %u / %u tests passed.\n", pass_count, count); printf("[ %s ]\n", (ret ? "FAILED" : "PASSED")); diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index a9b2b48947ff..f159718f90c0 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -7,10 +7,10 @@ /x86_64/hyperv_cpuid /x86_64/mmio_warning_test /x86_64/platform_info_test -/x86_64/set_memory_region_test /x86_64/set_sregs_test /x86_64/smm_test /x86_64/state_test +/x86_64/vmx_preemption_timer_test /x86_64/svm_vmcall_test /x86_64/sync_regs_test /x86_64/vmx_close_while_nested_test @@ -22,4 +22,5 @@ /demand_paging_test /dirty_log_test /kvm_create_max_vcpus +/set_memory_region_test /steal_time diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 42f4f49f2a48..b4ff112e5c7e 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -43,10 +43,10 @@ TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test -TEST_GEN_PROGS_x86_64 += x86_64/set_memory_region_test TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test TEST_GEN_PROGS_x86_64 += 
x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test @@ -59,12 +59,14 @@ TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus +TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time TEST_GEN_PROGS_aarch64 += clear_dirty_log_test TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus +TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time TEST_GEN_PROGS_s390x = s390x/memop @@ -73,6 +75,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus +TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 92e184a422ee..919e161dd289 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -10,6 +10,7 @@ #include "test_util.h" #include "asm/kvm.h" +#include "linux/list.h" #include "linux/kvm.h" #include <sys/ioctl.h> @@ -113,6 +114,7 @@ int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, uint32_t data_memslot, uint32_t pgd_memslot); @@ -256,6 +258,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm); 
unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); unsigned int vm_get_max_gfn(struct kvm_vm *vm); +int vm_get_fd(struct kvm_vm *vm); unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); @@ -311,13 +314,30 @@ void ucall_uninit(struct kvm_vm *vm); void ucall(uint64_t cmd, int nargs, ...); uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc); +#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ + ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) #define GUEST_DONE() ucall(UCALL_DONE, 0) -#define GUEST_ASSERT(_condition) do { \ - if (!(_condition)) \ - ucall(UCALL_ABORT, 2, \ - "Failed guest assert: " \ - #_condition, __LINE__); \ +#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \ + if (!(_condition)) \ + ucall(UCALL_ABORT, 2 + _nargs, \ + "Failed guest assert: " \ + #_condition, __LINE__, _args); \ } while (0) +#define GUEST_ASSERT(_condition) \ + __GUEST_ASSERT((_condition), 0, 0) + +#define GUEST_ASSERT_1(_condition, arg1) \ + __GUEST_ASSERT((_condition), 1, (arg1)) + +#define GUEST_ASSERT_2(_condition, arg1, arg2) \ + __GUEST_ASSERT((_condition), 2, (arg1), (arg2)) + +#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \ + __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3)) + +#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \ + __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4)) + #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 7428513a4c68..82b7fe16a824 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -79,13 +79,16 @@ static inline uint64_t get_desc64_base(const struct desc64 
*desc) static inline uint64_t rdtsc(void) { uint32_t eax, edx; - + uint64_t tsc_val; /* * The lfence is to wait (on Intel CPUs) until all previous - * instructions have been executed. + * instructions have been executed. If software requires RDTSC to be + * executed prior to execution of any subsequent instruction, it can + * execute LFENCE immediately after RDTSC */ - __asm__ __volatile__("lfence; rdtsc" : "=a"(eax), "=d"(edx)); - return ((uint64_t)edx) << 32 | eax; + __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx)); + tsc_val = ((uint64_t)edx) << 32 | eax; + return tsc_val; } static inline uint64_t rdtscp(uint32_t *aux) diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index cd037917fece..674151d24fcf 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -35,4 +35,14 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); void nested_svm_check_supported(void); +static inline bool cpu_has_svm(void) +{ + u32 eax = 0x80000001, ecx; + + asm("cpuid" : + "=a" (eax), "=c" (ecx) : "0" (eax) : "ebx", "edx"); + + return ecx & CPUID_SVM; +} + #endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 3d27069b9ed9..ccff3e6e2704 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -575,6 +575,33 @@ struct vmx_pages { void *eptp; }; +union vmx_basic { + u64 val; + struct { + u32 revision; + u32 size:13, + reserved1:3, + width:1, + dual:1, + type:4, + insouts:1, + ctrl:1, + vm_entry_exception_ctrl:1, + reserved2:7; + }; +}; + +union vmx_ctrl_msr { + u64 val; + struct { + u32 set, clr; + }; +}; + +union vmx_basic basic; +union vmx_ctrl_msr ctrl_pin_rev; +union 
vmx_ctrl_msr ctrl_exit_rev; + struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); bool prepare_for_vmx_operation(struct vmx_pages *vmx); void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9622431069bc..c9cede5c7d0d 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -161,6 +161,9 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); + INIT_LIST_HEAD(&vm->vcpus); + INIT_LIST_HEAD(&vm->userspace_mem_regions); + vm->mode = mode; vm->type = 0; @@ -258,8 +261,7 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm) if (vmp->has_irqchip) vm_create_irqchip(vmp); - for (region = vmp->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vmp->userspace_mem_regions, list) { int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" @@ -319,8 +321,7 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vm->userspace_mem_regions, list) { uint64_t existing_start = region->region.guest_phys_addr; uint64_t existing_end = region->region.guest_phys_addr + region->region.memory_size - 1; @@ -378,11 +379,11 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, */ struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpup; + struct vcpu *vcpu; - for (vcpup = vm->vcpu_head; vcpup; vcpup = vcpup->next) { - if (vcpup->id == vcpuid) - return vcpup; + list_for_each_entry(vcpu, &vm->vcpus, list) { + if (vcpu->id == vcpuid) + return 
vcpu; } return NULL; @@ -392,18 +393,16 @@ struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) * VM VCPU Remove * * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID + * vcpu - VCPU to remove * * Output Args: None * * Return: None, TEST_ASSERT failures for all error conditions * - * Within the VM specified by vm, removes the VCPU given by vcpuid. + * Removes a vCPU from a VM and frees its resources. */ -static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) +static void vm_vcpu_rm(struct vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; ret = munmap(vcpu->state, sizeof(*vcpu->state)); @@ -413,21 +412,17 @@ static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " "errno: %i", ret, errno); - if (vcpu->next) - vcpu->next->prev = vcpu->prev; - if (vcpu->prev) - vcpu->prev->next = vcpu->next; - else - vm->vcpu_head = vcpu->next; + list_del(&vcpu->list); free(vcpu); } void kvm_vm_release(struct kvm_vm *vmp) { + struct vcpu *vcpu, *tmp; int ret; - while (vmp->vcpu_head) - vm_vcpu_rm(vmp, vmp->vcpu_head->id); + list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) + vm_vcpu_rm(vcpu); ret = close(vmp->fd); TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" @@ -438,35 +433,38 @@ void kvm_vm_release(struct kvm_vm *vmp) " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); } +static void __vm_mem_region_delete(struct kvm_vm *vm, + struct userspace_mem_region *region) +{ + int ret; + + list_del(®ion->list); + + region->region.memory_size = 0; + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " + "rc: %i errno: %i", ret, errno); + + sparsebit_free(®ion->unused_phy_pages); + ret = munmap(region->mmap_start, region->mmap_size); + TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno); + + free(region); +} + /* * Destroys and frees the VM pointed to by vmp. 
*/ void kvm_vm_free(struct kvm_vm *vmp) { - int ret; + struct userspace_mem_region *region, *tmp; if (vmp == NULL) return; /* Free userspace_mem_regions. */ - while (vmp->userspace_mem_region_head) { - struct userspace_mem_region *region - = vmp->userspace_mem_region_head; - - region->region.memory_size = 0; - ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, - ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " - "rc: %i errno: %i", ret, errno); - - vmp->userspace_mem_region_head = region->next; - sparsebit_free(®ion->unused_phy_pages); - ret = munmap(region->mmap_start, region->mmap_size); - TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", - ret, errno); - - free(region); - } + list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) + __vm_mem_region_delete(vmp, region); /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); @@ -612,12 +610,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, (uint64_t) region->region.memory_size); /* Confirm no region with the requested slot already exists. */ - for (region = vm->userspace_mem_region_head; region; - region = region->next) { - if (region->region.slot == slot) - break; - } - if (region != NULL) + list_for_each_entry(region, &vm->userspace_mem_regions, list) { + if (region->region.slot != slot) + continue; + TEST_FAIL("A mem region with the requested slot " "already exists.\n" " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" @@ -626,6 +622,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->region.slot, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); + } /* Allocate and initialize new mem region structure. */ region = calloc(1, sizeof(*region)); @@ -686,10 +683,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, guest_paddr, (uint64_t) region->region.memory_size); /* Add to linked-list of memory regions. 
*/ - if (vm->userspace_mem_region_head) - vm->userspace_mem_region_head->prev = region; - region->next = vm->userspace_mem_region_head; - vm->userspace_mem_region_head = region; + list_add(®ion->list, &vm->userspace_mem_regions); } /* @@ -712,20 +706,17 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vm->userspace_mem_regions, list) { if (region->region.slot == memslot) - break; - } - if (region == NULL) { - fprintf(stderr, "No mem region with the requested slot found,\n" - " requested slot: %u\n", memslot); - fputs("---- vm dump ----\n", stderr); - vm_dump(stderr, vm, 2); - TEST_FAIL("Mem region not found"); + return region; } - return region; + fprintf(stderr, "No mem region with the requested slot found,\n" + " requested slot: %u\n", memslot); + fputs("---- vm dump ----\n", stderr); + vm_dump(stderr, vm, 2); + TEST_FAIL("Mem region not found"); + return NULL; } /* @@ -789,6 +780,24 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) } /* + * VM Memory Region Delete + * + * Input Args: + * vm - Virtual Machine + * slot - Slot of the memory region to delete + * + * Output Args: None + * + * Return: None + * + * Delete a memory region. + */ +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) +{ + __vm_mem_region_delete(vm, memslot2region(vm, slot)); +} + +/* * VCPU mmap Size * * Input Args: None @@ -863,10 +872,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) "vcpu id: %u errno: %i", vcpuid, errno); /* Add to linked-list of VCPUs. 
*/ - if (vm->vcpu_head) - vm->vcpu_head->prev = vcpu; - vcpu->next = vm->vcpu_head; - vm->vcpu_head = vcpu; + list_add(&vcpu->list, &vm->vcpus); } /* @@ -1059,8 +1065,8 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { if ((gpa >= region->region.guest_phys_addr) && (gpa <= (region->region.guest_phys_addr + region->region.memory_size - 1))) @@ -1092,8 +1098,8 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { if ((hva >= region->host_mem) && (hva <= (region->host_mem + region->region.memory_size - 1))) @@ -1529,8 +1535,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); fprintf(stream, "%*sMem Regions:\n", indent, ""); - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vm->userspace_mem_regions, list) { fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " "host_virt: %p\n", indent + 2, "", (uint64_t) region->region.guest_phys_addr, @@ -1549,7 +1554,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump(stream, vm, indent + 4); } fprintf(stream, "%*sVCPUs:\n", indent, ""); - for (vcpu = vm->vcpu_head; vcpu; vcpu = vcpu->next) + list_for_each_entry(vcpu, &vm->vcpus, list) vcpu_dump(stream, vm, vcpu->id, indent + 2); } @@ -1743,6 +1748,11 @@ unsigned int vm_get_max_gfn(struct kvm_vm *vm) return vm->max_gfn; } +int vm_get_fd(struct kvm_vm *vm) +{ + return 
vm->fd; +} + static unsigned int vm_calc_num_pages(unsigned int num_pages, unsigned int page_shift, unsigned int new_page_shift, diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index ca56a0133127..2ef446520748 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -13,7 +13,6 @@ #define KVM_DEV_PATH "/dev/kvm" struct userspace_mem_region { - struct userspace_mem_region *next, *prev; struct kvm_userspace_memory_region region; struct sparsebit *unused_phy_pages; int fd; @@ -21,10 +20,11 @@ struct userspace_mem_region { void *host_mem; void *mmap_start; size_t mmap_size; + struct list_head list; }; struct vcpu { - struct vcpu *next, *prev; + struct list_head list; uint32_t id; int fd; struct kvm_run *state; @@ -41,8 +41,8 @@ struct kvm_vm { unsigned int pa_bits; unsigned int va_bits; uint64_t max_gfn; - struct vcpu *vcpu_head; - struct userspace_mem_region *userspace_mem_region_head; + struct list_head vcpus; + struct list_head userspace_mem_regions; struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; bool has_irqchip; diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 8d94961bd046..a88c5d665725 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -233,7 +233,10 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { - struct vcpu *vcpu = vm->vcpu_head; + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + + if (!vcpu) + return; fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c new file mode 100644 index 000000000000..b3ece55a2da6 --- /dev/null +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <pthread.h> +#include <sched.h> +#include <semaphore.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <linux/compiler.h> + +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +#define VCPU_ID 0 + +/* + * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a + * 2MB sized and aligned region so that the initial region corresponds to + * exactly one large page. + */ +#define MEM_REGION_SIZE 0x200000 + +#ifdef __x86_64__ +/* + * Somewhat arbitrary location and slot, intended to not overlap anything. + */ +#define MEM_REGION_GPA 0xc0000000 +#define MEM_REGION_SLOT 10 + +static const uint64_t MMIO_VAL = 0xbeefull; + +extern const uint64_t final_rip_start; +extern const uint64_t final_rip_end; + +static sem_t vcpu_ready; + +static inline uint64_t guest_spin_on_val(uint64_t spin_val) +{ + uint64_t val; + + do { + val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA)); + } while (val == spin_val); + + GUEST_SYNC(0); + return val; +} + +static void *vcpu_worker(void *data) +{ + struct kvm_vm *vm = data; + struct kvm_run *run; + struct ucall uc; + uint64_t cmd; + + /* + * Loop until the guest is done. 
Re-enter the guest on all MMIO exits, + * which will occur if the guest attempts to access a memslot after it + * has been deleted or while it is being moved . + */ + run = vcpu_state(vm, VCPU_ID); + + while (1) { + vcpu_run(vm, VCPU_ID); + + if (run->exit_reason == KVM_EXIT_IO) { + cmd = get_ucall(vm, VCPU_ID, &uc); + if (cmd != UCALL_SYNC) + break; + + sem_post(&vcpu_ready); + continue; + } + + if (run->exit_reason != KVM_EXIT_MMIO) + break; + + TEST_ASSERT(!run->mmio.is_write, "Unexpected exit mmio write"); + TEST_ASSERT(run->mmio.len == 8, + "Unexpected exit mmio size = %u", run->mmio.len); + + TEST_ASSERT(run->mmio.phys_addr == MEM_REGION_GPA, + "Unexpected exit mmio address = 0x%llx", + run->mmio.phys_addr); + memcpy(run->mmio.data, &MMIO_VAL, 8); + } + + if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT) + TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2]); + + return NULL; +} + +static void wait_for_vcpu(void) +{ + struct timespec ts; + + TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), + "clock_gettime() failed: %d\n", errno); + + ts.tv_sec += 2; + TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), + "sem_timedwait() failed: %d\n", errno); + + /* Wait for the vCPU thread to reenter the guest. */ + usleep(100000); +} + +static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code) +{ + struct kvm_vm *vm; + uint64_t *hva; + uint64_t gpa; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / getpagesize(), 0); + + /* + * Allocate and map two pages so that the GPA accessed by guest_code() + * stays valid across the memslot move. 
+ */ + gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT); + TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); + + virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0); + + /* Ditto for the host mapping so that both pages can be zeroed. */ + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + memset(hva, 0, 2 * 4096); + + pthread_create(vcpu_thread, NULL, vcpu_worker, vm); + + /* Ensure the guest thread is spun up. */ + wait_for_vcpu(); + + return vm; +} + + +static void guest_code_move_memory_region(void) +{ + uint64_t val; + + GUEST_SYNC(0); + + /* + * Spin until the memory region is moved to a misaligned address. This + * may or may not trigger MMIO, as the window where the memslot is + * invalid is quite small. + */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val); + + /* Spin until the memory region is realigned. */ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_1(val == 1, val); + + GUEST_DONE(); +} + +static void test_move_memory_region(void) +{ + pthread_t vcpu_thread; + struct kvm_vm *vm; + uint64_t *hva; + + vm = spawn_vm(&vcpu_thread, guest_code_move_memory_region); + + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + + /* + * Shift the region's base GPA. The guest should not see "2" as the + * hva->gpa translation is misaligned, i.e. the guest is accessing a + * different host pfn. + */ + vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096); + WRITE_ONCE(*hva, 2); + + /* + * The guest _might_ see an invalid memslot and trigger MMIO, but it's + * a tiny window. Spin and defer the sync until the memslot is + * restored and guest behavior is once again deterministic. + */ + usleep(100000); + + /* + * Note, value in memory needs to be changed *before* restoring the + * memslot, else the guest could race the update and see "2". + */ + WRITE_ONCE(*hva, 1); + + /* Restore the original base, the guest should see "1". 
*/ + vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA); + wait_for_vcpu(); + /* Defered sync from when the memslot was misaligned (above). */ + wait_for_vcpu(); + + pthread_join(vcpu_thread, NULL); + + kvm_vm_free(vm); +} + +static void guest_code_delete_memory_region(void) +{ + uint64_t val; + + GUEST_SYNC(0); + + /* Spin until the memory region is deleted. */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == MMIO_VAL, val); + + /* Spin until the memory region is recreated. */ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_1(val == 0, val); + + /* Spin until the memory region is deleted. */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == MMIO_VAL, val); + + asm("1:\n\t" + ".pushsection .rodata\n\t" + ".global final_rip_start\n\t" + "final_rip_start: .quad 1b\n\t" + ".popsection"); + + /* Spin indefinitely (until the code memslot is deleted). */ + guest_spin_on_val(MMIO_VAL); + + asm("1:\n\t" + ".pushsection .rodata\n\t" + ".global final_rip_end\n\t" + "final_rip_end: .quad 1b\n\t" + ".popsection"); + + GUEST_ASSERT_1(0, 0); +} + +static void test_delete_memory_region(void) +{ + pthread_t vcpu_thread; + struct kvm_regs regs; + struct kvm_run *run; + struct kvm_vm *vm; + + vm = spawn_vm(&vcpu_thread, guest_code_delete_memory_region); + + /* Delete the memory region, the guest should not die. */ + vm_mem_region_delete(vm, MEM_REGION_SLOT); + wait_for_vcpu(); + + /* Recreate the memory region. The guest should see "0". */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / getpagesize(), 0); + wait_for_vcpu(); + + /* Delete the region again so that there's only one memslot left. */ + vm_mem_region_delete(vm, MEM_REGION_SLOT); + wait_for_vcpu(); + + /* + * Delete the primary memslot. This should cause an emulation error or + * shutdown due to the page tables getting nuked. 
+ */ + vm_mem_region_delete(vm, 0); + + pthread_join(vcpu_thread, NULL); + + run = vcpu_state(vm, VCPU_ID); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN || + run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Unexpected exit reason = %d", run->exit_reason); + + vcpu_regs_get(vm, VCPU_ID, ®s); + + /* + * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already, + * so the instruction pointer would point to the reset vector. + */ + if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR) + TEST_ASSERT(regs.rip >= final_rip_start && + regs.rip < final_rip_end, + "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx\n", + final_rip_start, final_rip_end, regs.rip); + + kvm_vm_free(vm); +} + +static void test_zero_memory_regions(void) +{ + struct kvm_run *run; + struct kvm_vm *vm; + + pr_info("Testing KVM_RUN with zero added memory regions\n"); + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + + TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64), + "KVM_SET_NR_MMU_PAGES failed, errno = %d\n", errno); + vcpu_run(vm, VCPU_ID); + + run = vcpu_state(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Unexpected exit_reason = %u\n", run->exit_reason); + + kvm_vm_free(vm); +} +#endif /* __x86_64__ */ + +/* + * Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any + * tentative to add further slots should fail. 
+ */ +static void test_add_max_memory_regions(void) +{ + int ret; + struct kvm_vm *vm; + uint32_t max_mem_slots; + uint32_t slot; + uint64_t guest_addr = 0x0; + uint64_t mem_reg_npages; + void *mem; + + max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + TEST_ASSERT(max_mem_slots > 0, + "KVM_CAP_NR_MEMSLOTS should be greater than 0"); + pr_info("Allowed number of memory slots: %i\n", max_mem_slots); + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + + mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE); + + /* Check it can be added memory slots up to the maximum allowed */ + pr_info("Adding slots 0..%i, each memory region with %dK size\n", + (max_mem_slots - 1), MEM_REGION_SIZE >> 10); + for (slot = 0; slot < max_mem_slots; slot++) { + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_addr, slot, mem_reg_npages, + 0); + guest_addr += MEM_REGION_SIZE; + } + + /* Check it cannot be added memory slots beyond the limit */ + mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); + + ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION, + &(struct kvm_userspace_memory_region) {slot, 0, guest_addr, + MEM_REGION_SIZE, (uint64_t) mem}); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Adding one more memory slot should fail with EINVAL"); + + munmap(mem, MEM_REGION_SIZE); + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ +#ifdef __x86_64__ + int i, loops; +#endif + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + +#ifdef __x86_64__ + /* + * FIXME: the zero-memslot test fails on aarch64 and s390x because + * KVM_RUN fails with ENOEXEC or EFAULT. 
+ */ + test_zero_memory_regions(); +#endif + + test_add_max_memory_regions(); + +#ifdef __x86_64__ + if (argc > 1) + loops = atoi(argv[1]); + else + loops = 10; + + pr_info("Testing MOVE of in-use region, %d loops\n", loops); + for (i = 0; i < loops; i++) + test_move_memory_region(); + + pr_info("Testing DELETE of in-use region, %d loops\n", loops); + for (i = 0; i < loops; i++) + test_delete_memory_region(); +#endif + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 83323f3d7ca0..4a7967cca281 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -26,18 +26,18 @@ static void guest_code(void) { } -static int smt_possible(void) +static bool smt_possible(void) { char buf[16]; FILE *f; - bool res = 1; + bool res = true; f = fopen("/sys/devices/system/cpu/smt/control", "r"); if (f) { if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) { if (!strncmp(buf, "forceoff", 8) || !strncmp(buf, "notsupported", 12)) - res = 0; + res = false; } fclose(f); } @@ -46,29 +46,31 @@ static int smt_possible(void) } static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, - int evmcs_enabled) + bool evmcs_enabled) { int i; + int nent = 9; + u32 test_val; - if (!evmcs_enabled) - TEST_ASSERT(hv_cpuid_entries->nent == 6, - "KVM_GET_SUPPORTED_HV_CPUID should return 6 entries" - " when Enlightened VMCS is disabled (returned %d)", - hv_cpuid_entries->nent); - else - TEST_ASSERT(hv_cpuid_entries->nent == 7, - "KVM_GET_SUPPORTED_HV_CPUID should return 7 entries" - " when Enlightened VMCS is enabled (returned %d)", - hv_cpuid_entries->nent); + if (evmcs_enabled) + nent += 1; /* 0x4000000A */ + + TEST_ASSERT(hv_cpuid_entries->nent == nent, + "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" + " with evmcs=%d (returned %d)", + nent, evmcs_enabled, hv_cpuid_entries->nent); for (i = 0; i < hv_cpuid_entries->nent; i++) { struct 
kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; TEST_ASSERT((entry->function >= 0x40000000) && - (entry->function <= 0x4000000A), + (entry->function <= 0x40000082), "function %x is our of supported range", entry->function); + TEST_ASSERT(evmcs_enabled || (entry->function != 0x4000000A), + "0x4000000A leaf should not be reported"); + TEST_ASSERT(entry->index == 0, ".index field should be zero"); @@ -78,12 +80,23 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, TEST_ASSERT(!entry->padding[0] && !entry->padding[1] && !entry->padding[2], "padding should be zero"); - if (entry->function == 0x40000004) { - int nononarchcs = !!(entry->eax & (1UL << 18)); + switch (entry->function) { + case 0x40000000: + test_val = 0x40000082; - TEST_ASSERT(nononarchcs == !smt_possible(), + TEST_ASSERT(entry->eax == test_val, + "Wrong max leaf report in 0x40000000.EAX: %x" + " (evmcs=%d)", + entry->eax, evmcs_enabled + ); + break; + case 0x40000004: + test_val = entry->eax & (1UL << 18); + + TEST_ASSERT(!!test_val == !smt_possible(), "NoNonArchitecturalCoreSharing bit" " doesn't reflect SMT setting"); + break; } /* @@ -133,8 +146,9 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm) int main(int argc, char *argv[]) { struct kvm_vm *vm; - int rv; + int rv, stage; struct kvm_cpuid2 *hv_cpuid_entries; + bool evmcs_enabled; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); @@ -145,36 +159,31 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); } - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - - test_hv_cpuid_e2big(vm); - - hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); - if (!hv_cpuid_entries) - return 1; - - test_hv_cpuid(hv_cpuid_entries, 0); - - free(hv_cpuid_entries); + for (stage = 0; stage < 3; stage++) { + evmcs_enabled = false; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + switch (stage) { + case 0: + test_hv_cpuid_e2big(vm); + continue; + case 1: + break; + case 2: + if 
(!kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { + print_skip("Enlightened VMCS is unsupported"); + continue; + } + vcpu_enable_evmcs(vm, VCPU_ID); + evmcs_enabled = true; + break; + } - if (!kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { - print_skip("Enlightened VMCS is unsupported"); - goto vm_free; + hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); + test_hv_cpuid(hv_cpuid_entries, evmcs_enabled); + free(hv_cpuid_entries); + kvm_vm_free(vm); } - vcpu_enable_evmcs(vm, VCPU_ID); - - hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); - if (!hv_cpuid_entries) - return 1; - - test_hv_cpuid(hv_cpuid_entries, 1); - - free(hv_cpuid_entries); - -vm_free: - kvm_vm_free(vm); - return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/set_memory_region_test.c b/tools/testing/selftests/kvm/x86_64/set_memory_region_test.c deleted file mode 100644 index c6691cff4e19..000000000000 --- a/tools/testing/selftests/kvm/x86_64/set_memory_region_test.c +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE /* for program_invocation_short_name */ -#include <fcntl.h> -#include <pthread.h> -#include <sched.h> -#include <signal.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/ioctl.h> - -#include <linux/compiler.h> - -#include <test_util.h> -#include <kvm_util.h> -#include <processor.h> - -#define VCPU_ID 0 - -/* - * Somewhat arbitrary location and slot, intended to not overlap anything. The - * location and size are specifically 2mb sized/aligned so that the initial - * region corresponds to exactly one large page. 
- */ -#define MEM_REGION_GPA 0xc0000000 -#define MEM_REGION_SIZE 0x200000 -#define MEM_REGION_SLOT 10 - -static void guest_code(void) -{ - uint64_t val; - - do { - val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA)); - } while (!val); - - if (val != 1) - ucall(UCALL_ABORT, 1, val); - - GUEST_DONE(); -} - -static void *vcpu_worker(void *data) -{ - struct kvm_vm *vm = data; - struct kvm_run *run; - struct ucall uc; - uint64_t cmd; - - /* - * Loop until the guest is done. Re-enter the guest on all MMIO exits, - * which will occur if the guest attempts to access a memslot while it - * is being moved. - */ - run = vcpu_state(vm, VCPU_ID); - do { - vcpu_run(vm, VCPU_ID); - } while (run->exit_reason == KVM_EXIT_MMIO); - - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason = %d", run->exit_reason); - - cmd = get_ucall(vm, VCPU_ID, &uc); - TEST_ASSERT(cmd == UCALL_DONE, "Unexpected val in guest = %lu", uc.args[0]); - return NULL; -} - -static void test_move_memory_region(void) -{ - pthread_t vcpu_thread; - struct kvm_vm *vm; - uint64_t *hva; - uint64_t gpa; - - vm = vm_create_default(VCPU_ID, 0, guest_code); - - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP, - MEM_REGION_GPA, MEM_REGION_SLOT, - MEM_REGION_SIZE / getpagesize(), 0); - - /* - * Allocate and map two pages so that the GPA accessed by guest_code() - * stays valid across the memslot move. - */ - gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT); - TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); - - virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0); - - /* Ditto for the host mapping so that both pages can be zeroed. */ - hva = addr_gpa2hva(vm, MEM_REGION_GPA); - memset(hva, 0, 2 * 4096); - - pthread_create(&vcpu_thread, NULL, vcpu_worker, vm); - - /* Ensure the guest thread is spun up. */ - usleep(100000); - - /* - * Shift the region's base GPA. 
The guest should not see "2" as the - * hva->gpa translation is misaligned, i.e. the guest is accessing a - * different host pfn. - */ - vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096); - WRITE_ONCE(*hva, 2); - - usleep(100000); - - /* - * Note, value in memory needs to be changed *before* restoring the - * memslot, else the guest could race the update and see "2". - */ - WRITE_ONCE(*hva, 1); - - /* Restore the original base, the guest should see "1". */ - vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA); - - pthread_join(vcpu_thread, NULL); - - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - int i, loops; - - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - - if (argc > 1) - loops = atoi(argv[1]); - else - loops = 10; - - for (i = 0; i < loops; i++) - test_move_memory_region(); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 8230b6bc6b8f..6f8f478b3ceb 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -17,6 +17,7 @@ #include "kvm_util.h" #include "vmx.h" +#include "svm_util.h" #define VCPU_ID 1 @@ -58,7 +59,7 @@ void self_smi(void) APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); } -void guest_code(struct vmx_pages *vmx_pages) +void guest_code(void *arg) { uint64_t apicbase = rdmsr(MSR_IA32_APICBASE); @@ -72,8 +73,11 @@ void guest_code(struct vmx_pages *vmx_pages) sync_with_host(4); - if (vmx_pages) { - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + if (arg) { + if (cpu_has_svm()) + generic_svm_setup(arg, NULL, NULL); + else + GUEST_ASSERT(prepare_for_vmx_operation(arg)); sync_with_host(5); @@ -87,7 +91,7 @@ void guest_code(struct vmx_pages *vmx_pages) int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva = 0; + vm_vaddr_t nested_gva = 0; struct kvm_regs regs; struct kvm_vm *vm; @@ -114,8 +118,11 @@ int main(int argc, char *argv[]) vcpu_set_msr(vm, 
VCPU_ID, MSR_IA32_SMBASE, SMRAM_GPA); if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + if (kvm_get_supported_cpuid_entry(0x80000001)->ecx & CPUID_SVM) + vcpu_alloc_svm(vm, &nested_gva); + else + vcpu_alloc_vmx(vm, &nested_gva); + vcpu_args_set(vm, VCPU_ID, 1, nested_gva); } else { pr_info("will skip SMM test with VMX enabled\n"); vcpu_args_set(vm, VCPU_ID, 1, 0); diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 5b1a016edf55..d43b6f99b66c 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -18,14 +18,46 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #define VCPU_ID 5 +#define L2_GUEST_STACK_SIZE 256 -void l2_guest_code(void) +void svm_l2_guest_code(void) { + GUEST_SYNC(4); + /* Exit to L1 */ + vmcall(); GUEST_SYNC(6); + /* Done, exit to L1 and never come back. */ + vmcall(); +} - /* Exit to L1 */ +static void svm_l1_guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + GUEST_ASSERT(svm->vmcb_gpa); + /* Prepare for L2 execution. */ + generic_svm_setup(svm, svm_l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(3); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(5); + vmcb->save.rip += 3; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(7); +} + +void vmx_l2_guest_code(void) +{ + GUEST_SYNC(6); + + /* Exit to L1 */ vmcall(); /* L1 has now set up a shadow VMCS for us. 
*/ @@ -42,10 +74,9 @@ void l2_guest_code(void) vmcall(); } -void l1_guest_code(struct vmx_pages *vmx_pages) +static void vmx_l1_guest_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; GUEST_ASSERT(vmx_pages->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); @@ -56,7 +87,7 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_SYNC(4); GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - prepare_vmcs(vmx_pages, l2_guest_code, + prepare_vmcs(vmx_pages, vmx_l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); GUEST_SYNC(5); @@ -106,20 +137,24 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(vmresume()); } -void guest_code(struct vmx_pages *vmx_pages) +static void __attribute__((__flatten__)) guest_code(void *arg) { GUEST_SYNC(1); GUEST_SYNC(2); - if (vmx_pages) - l1_guest_code(vmx_pages); + if (arg) { + if (cpu_has_svm()) + svm_l1_guest_code(arg); + else + vmx_l1_guest_code(arg); + } GUEST_DONE(); } int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva = 0; + vm_vaddr_t nested_gva = 0; struct kvm_regs regs1, regs2; struct kvm_vm *vm; @@ -136,8 +171,11 @@ int main(int argc, char *argv[]) vcpu_regs_get(vm, VCPU_ID, ®s1); if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + if (kvm_get_supported_cpuid_entry(0x80000001)->ecx & CPUID_SVM) + vcpu_alloc_svm(vm, &nested_gva); + else + vcpu_alloc_vmx(vm, &nested_gva); + vcpu_args_set(vm, VCPU_ID, 1, nested_gva); } else { pr_info("will skip nested state checks\n"); vcpu_args_set(vm, VCPU_ID, 1, 0); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c new file mode 100644 index 000000000000..cc72b6188ca7 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -0,0 +1,255 @@ +// 
SPDX-License-Identifier: GPL-2.0-only +/* + * VMX-preemption timer test + * + * Copyright (C) 2020, Google, LLC. + * + * Test to ensure the VM-Enter after migration doesn't + * incorrectly restarts the timer with the full timer + * value instead of partially decayed timer value + * + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define VCPU_ID 5 +#define PREEMPTION_TIMER_VALUE 100000000ull +#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull + +u32 vmx_pt_rate; +bool l2_save_restore_done; +static u64 l2_vmx_pt_start; +volatile u64 l2_vmx_pt_finish; + +void l2_guest_code(void) +{ + u64 vmx_pt_delta; + + vmcall(); + l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; + + /* + * Wait until the 1st threshold has passed + */ + do { + l2_vmx_pt_finish = rdtsc(); + vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >> + vmx_pt_rate; + } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1); + + /* + * Force L2 through Save and Restore cycle + */ + GUEST_SYNC(1); + + l2_save_restore_done = 1; + + /* + * Now wait for the preemption timer to fire and + * exit to L1 + */ + while ((l2_vmx_pt_finish = rdtsc())) + ; +} + +void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + u64 l1_vmx_pt_start; + u64 l1_vmx_pt_finish; + u64 l1_tsc_deadline, l2_tsc_deadline; + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * Check for Preemption timer support + */ + basic.val = rdmsr(MSR_IA32_VMX_BASIC); + ctrl_pin_rev.val = rdmsr(basic.ctrl ? 
MSR_IA32_VMX_TRUE_PINBASED_CTLS + : MSR_IA32_VMX_PINBASED_CTLS); + ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS + : MSR_IA32_VMX_EXIT_CTLS); + + if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) || + !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) + return; + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN)); + + /* + * Turn on PIN control and resume the guest + */ + GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL, + vmreadz(PIN_BASED_VM_EXEC_CONTROL) | + PIN_BASED_VMX_PREEMPTION_TIMER)); + + GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE, + PREEMPTION_TIMER_VALUE)); + + vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F; + + l2_save_restore_done = 0; + + l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; + + GUEST_ASSERT(!vmresume()); + + l1_vmx_pt_finish = rdtsc(); + + /* + * Ensure exit from L2 happens after L2 goes through + * save and restore + */ + GUEST_ASSERT(l2_save_restore_done); + + /* + * Ensure the exit from L2 is due to preemption timer expiry + */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER); + + l1_tsc_deadline = l1_vmx_pt_start + + (PREEMPTION_TIMER_VALUE << vmx_pt_rate); + + l2_tsc_deadline = l2_vmx_pt_start + + (PREEMPTION_TIMER_VALUE << vmx_pt_rate); + + /* + * Sync with the host and pass the l1|l2 pt_expiry_finish times and + * tsc deadlines so that host can verify they are as expected + */ + GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline, + l2_vmx_pt_finish, l2_tsc_deadline); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + if (vmx_pages) + l1_guest_code(vmx_pages); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + + struct kvm_regs regs1, regs2; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + /* + * AMD currently does not implement any VMX 
features, so for now we + * just early out. + */ + nested_vmx_check_supported(); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + vcpu_regs_get(vm, VCPU_ID, ®s1); + + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + } else { + pr_info("will skip vmx preemption timer checks\n"); + goto done; + } + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + /* + * If this stage 2 then we should verify the vmx pt expiry + * is as expected. + * From L1's perspective verify Preemption timer hasn't + * expired too early. + * From L2's perspective verify Preemption timer hasn't + * expired too late. 
+ */ + if (stage == 2) { + + pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n", + stage, uc.args[2], uc.args[3]); + + pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n", + stage, uc.args[4], uc.args[5]); + + TEST_ASSERT(uc.args[2] >= uc.args[3], + "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)", + stage, uc.args[2], uc.args[3]); + + TEST_ASSERT(uc.args[4] < uc.args[5], + "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)", + stage, uc.args[4], uc.args[5]); + } + + state = vcpu_save_state(vm, VCPU_ID); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + free(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index 14a77ea4a8da..b80ee3f6e265 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -2,3 +2,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m CONFIG_TEST_STRSCPY=m +CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 3f386eb9e7d7..895ec992b2f1 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -16,6 +16,7 @@ TEST_PROGS += altnames.sh icmp_redirect.sh ip6_gre_headroom.sh TEST_PROGS += route_localnet.sh TEST_PROGS += reuseaddr_ports_exhausted.sh TEST_PROGS += txtimestamp.sh +TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest 
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 6560ed796ac4..dee567f7576a 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal ipv4_torture" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -146,35 +146,36 @@ setup() create_ns remote IP="ip -netns me" + BRIDGE="bridge -netns me" set -e $IP li add veth1 type veth peer name veth2 $IP li set veth1 up $IP addr add 172.16.1.1/24 dev veth1 - $IP -6 addr add 2001:db8:91::1/64 dev veth1 + $IP -6 addr add 2001:db8:91::1/64 dev veth1 nodad $IP li add veth3 type veth peer name veth4 $IP li set veth3 up $IP addr add 172.16.2.1/24 dev veth3 - $IP -6 addr add 2001:db8:92::1/64 dev veth3 + $IP -6 addr add 2001:db8:92::1/64 dev veth3 nodad $IP li set veth2 netns peer up ip -netns peer addr add 172.16.1.2/24 dev veth2 - ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 + ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 nodad $IP li set veth4 netns peer up ip -netns peer addr add 172.16.2.2/24 dev veth4 - ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 + ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 nodad ip -netns remote li add veth5 type veth peer name veth6 ip -netns remote li set veth5 up ip -netns remote addr add dev veth5 172.16.101.1/24 - ip -netns remote addr add dev veth5 2001:db8:101::1/64 + ip -netns remote -6 
addr add dev veth5 2001:db8:101::1/64 nodad ip -netns remote ro add 172.16.0.0/22 via 172.16.101.2 ip -netns remote -6 ro add 2001:db8:90::/40 via 2001:db8:101::2 ip -netns remote li set veth6 netns peer up ip -netns peer addr add dev veth6 172.16.101.2/24 - ip -netns peer addr add dev veth6 2001:db8:101::2/64 + ip -netns peer -6 addr add dev veth6 2001:db8:101::2/64 nodad set +e } @@ -248,11 +249,247 @@ check_route6() local expected="$2" local out - out=$($IP -6 route ls match ${pfx} 2>/dev/null) + out=$($IP -6 route ls match ${pfx} 2>/dev/null | sed -e 's/pref medium//') check_output "${out}" "${expected}" } +check_large_grp() +{ + local ipv=$1 + local ecmp=$2 + local grpnum=100 + local nhidstart=100 + local grpidstart=1000 + local iter=0 + local nhidstr="" + local grpidstr="" + local grpstr="" + local ipstr="" + + if [ $ipv -eq 4 ]; then + ipstr="172.16.1." + else + ipstr="2001:db8:91::" + fi + + # + # Create $grpnum groups with specified $ecmp and dump them + # + + # create nexthops with different gateways + iter=2 + while [ $iter -le $(($ecmp + 1)) ] + do + nhidstr="$(($nhidstart + $iter))" + run_cmd "$IP nexthop add id $nhidstr via $ipstr$iter dev veth1" + check_nexthop "id $nhidstr" "id $nhidstr via $ipstr$iter dev veth1 scope link" + + if [ $iter -le $ecmp ]; then + grpstr+="$nhidstr/" + else + grpstr+="$nhidstr" + fi + ((iter++)) + done + + # create duplicate large ecmp groups + iter=0 + while [ $iter -le $grpnum ] + do + grpidstr="$(($grpidstart + $iter))" + run_cmd "$IP nexthop add id $grpidstr group $grpstr" + check_nexthop "id $grpidstr" "id $grpidstr group $grpstr" + ((iter++)) + done + + # dump large groups + run_cmd "$IP nexthop list" + log_test $? 0 "Dump large (x$ecmp) ecmp groups" +} + +start_ip_monitor() +{ + local mtype=$1 + + # start the monitor in the background + tmpfile=`mktemp /var/run/nexthoptestXXX` + mpid=`($IP monitor $mtype > $tmpfile & echo $!) 
2>/dev/null` + sleep 0.2 + echo "$mpid $tmpfile" +} + +stop_ip_monitor() +{ + local mpid=$1 + local tmpfile=$2 + local el=$3 + + # check the monitor results + kill $mpid + lines=`wc -l $tmpfile | cut "-d " -f1` + test $lines -eq $el + rc=$? + rm -rf $tmpfile + + return $rc +} + +check_nexthop_fdb_support() +{ + $IP nexthop help 2>&1 | grep -q fdb + if [ $? -ne 0 ]; then + echo "SKIP: iproute2 too old, missing fdb nexthop support" + return $ksft_skip + fi +} + +ipv6_fdb_grp_fcnal() +{ + local rc + + echo + echo "IPv6 fdb groups functional" + echo "--------------------------" + + check_nexthop_fdb_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # create group with multiple nexthops + run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 fdb" + run_cmd "$IP nexthop add id 62 via 2001:db8:91::3 fdb" + run_cmd "$IP nexthop add id 102 group 61/62 fdb" + check_nexthop "id 102" "id 102 group 61/62 fdb" + log_test $? 0 "Fdb Nexthop group with multiple nexthops" + + ## get nexthop group + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" "id 102 group 61/62 fdb" + log_test $? 0 "Get Fdb nexthop group by id" + + # fdb nexthop group can only contain fdb nexthops + run_cmd "$IP nexthop add id 63 via 2001:db8:91::4" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::5" + run_cmd "$IP nexthop add id 103 group 63/64 fdb" + log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" + + # Non fdb nexthop group can not contain fdb nexthops + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 fdb" + run_cmd "$IP nexthop add id 66 via 2001:db8:91::6 fdb" + run_cmd "$IP nexthop add id 104 group 65/66" + log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" + + # fdb nexthop cannot have blackhole + run_cmd "$IP nexthop add id 67 blackhole fdb" + log_test $? 2 "Fdb Nexthop with blackhole" + + # fdb nexthop with oif + run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 dev veth1 fdb" + log_test $? 
2 "Fdb Nexthop with oif" + + # fdb nexthop with onlink + run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 onlink fdb" + log_test $? 2 "Fdb Nexthop with onlink" + + # fdb nexthop with encap + run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with encap" + + run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100" + run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" + log_test $? 0 "Fdb mac add with nexthop group" + + ## fdb nexthops can only reference nexthop groups and not nexthops + run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 61 self" + log_test $? 255 "Fdb mac add with nexthop" + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 66" + log_test $? 2 "Route add with fdb nexthop" + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 103" + log_test $? 2 "Route add with fdb nexthop group" + + run_cmd "$IP nexthop del id 102" + log_test $? 0 "Fdb nexthop delete" + + $IP link del dev vx10 +} + +ipv4_fdb_grp_fcnal() +{ + local rc + + echo + echo "IPv4 fdb groups functional" + echo "--------------------------" + + check_nexthop_fdb_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # create group with multiple nexthops + run_cmd "$IP nexthop add id 12 via 172.16.1.2 fdb" + run_cmd "$IP nexthop add id 13 via 172.16.1.3 fdb" + run_cmd "$IP nexthop add id 102 group 12/13 fdb" + check_nexthop "id 102" "id 102 group 12/13 fdb" + log_test $? 0 "Fdb Nexthop group with multiple nexthops" + + # get nexthop group + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" "id 102 group 12/13 fdb" + log_test $? 0 "Get Fdb nexthop group by id" + + # fdb nexthop group can only contain fdb nexthops + run_cmd "$IP nexthop add id 14 via 172.16.1.2" + run_cmd "$IP nexthop add id 15 via 172.16.1.3" + run_cmd "$IP nexthop add id 103 group 14/15 fdb" + log_test $? 
2 "Fdb Nexthop group with non-fdb nexthops" + + # Non fdb nexthop group can not contain fdb nexthops + run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb" + run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb" + run_cmd "$IP nexthop add id 104 group 14/15" + log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" + + # fdb nexthop cannot have blackhole + run_cmd "$IP nexthop add id 18 blackhole fdb" + log_test $? 2 "Fdb Nexthop with blackhole" + + # fdb nexthop with oif + run_cmd "$IP nexthop add id 16 via 172.16.1.2 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with oif" + + # fdb nexthop with onlink + run_cmd "$IP nexthop add id 16 via 172.16.1.2 onlink fdb" + log_test $? 2 "Fdb Nexthop with onlink" + + # fdb nexthop with encap + run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with encap" + + run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100" + run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" + log_test $? 0 "Fdb mac add with nexthop group" + + # fdb nexthops can only reference nexthop groups and not nexthops + run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self" + log_test $? 255 "Fdb mac add with nexthop" + + run_cmd "$IP ro add 172.16.0.0/22 nhid 15" + log_test $? 2 "Route add with fdb nexthop" + + run_cmd "$IP ro add 172.16.0.0/22 nhid 103" + log_test $? 2 "Route add with fdb nexthop group" + + run_cmd "$IP nexthop del id 102" + log_test $? 
0 "Fdb nexthop delete" + + $IP link del dev vx10 +} + ################################################################################ # basic operations (add, delete, replace) on nexthops and nexthop groups # @@ -423,8 +660,6 @@ ipv6_fcnal_runtime() echo "IPv6 functional runtime" echo "-----------------------" - sleep 5 - # # IPv6 - the basics # @@ -481,12 +716,12 @@ ipv6_fcnal_runtime() run_cmd "$IP -6 nexthop add id 85 dev veth1" run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 85" log_test $? 0 "IPv6 route with device only nexthop" - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024 pref medium" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024" run_cmd "$IP nexthop add id 123 group 81/85" run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 123" log_test $? 0 "IPv6 multipath route with nexthop mix - dev only + gw" - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1 pref medium" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1" # # IPv6 route with v4 nexthop - not allowed @@ -519,6 +754,75 @@ ipv6_fcnal_runtime() # route with src address and using nexthop - not allowed } +ipv6_large_grp() +{ + local ecmp=32 + + echo + echo "IPv6 large groups (x$ecmp)" + echo "---------------------" + + check_large_grp 6 $ecmp + + $IP nexthop flush >/dev/null 2>&1 +} + +ipv6_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 2001:db8:91::2 dev veth1 + done >/dev/null 2>&1 +} + +ipv6_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv6_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv6 runtime torture" + echo "--------------------" + if [ ! 
-x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 2001:db8:101::1 nhid 102" + run_cmd "$IP route add 2001:db8:101::2 nhid 102" + + ipv6_del_add_loop1 & + pid1=$! + ipv6_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 2001:db8:101::1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv6 torture test" +} + + ipv4_fcnal() { local rc @@ -866,6 +1170,11 @@ ipv4_fcnal_runtime() $IP neigh sh | grep 'dev veth1' fi + run_cmd "$IP ro del 172.16.101.1/32 via inet6 ${lladdr} dev veth1" + run_cmd "$IP -4 ro add default via inet6 ${lladdr} dev veth1" + run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1" + log_test $? 0 "IPv4 default route with IPv6 gateway" + # # MPLS as an example of LWT encap # @@ -880,6 +1189,241 @@ ipv4_fcnal_runtime() log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check" } +ipv4_large_grp() +{ + local ecmp=32 + + echo + echo "IPv4 large groups (x$ecmp)" + echo "---------------------" + + check_large_grp 4 $ecmp + + $IP nexthop flush >/dev/null 2>&1 +} + +sysctl_nexthop_compat_mode_check() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local lprefix=$1 + + IPE="ip netns exec me" + + $IPE sysctl -q $sysctlname 2>&1 >/dev/null + if [ $? -ne 0 ]; then + echo "SKIP: kernel lacks nexthop compat mode sysctl control" + return $ksft_skip + fi + + out=$($IPE sysctl $sysctlname 2>/dev/null) + log_test $? 
0 "$lprefix default nexthop compat mode check" + check_output "${out}" "$sysctlname = 1" +} + +sysctl_nexthop_compat_mode_set() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local mode=$1 + local lprefix=$2 + + IPE="ip netns exec me" + + out=$($IPE sysctl -w $sysctlname=$mode) + log_test $? 0 "$lprefix set compat mode - $mode" + check_output "${out}" "net.ipv4.nexthop_compat_mode = $mode" +} + +ipv6_compat_mode() +{ + local rc + + echo + echo "IPv6 nexthop api compat mode test" + echo "--------------------------------" + + sysctl_nexthop_compat_mode_check "IPv6" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should contain expanded nexthops + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv6 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1" + log_test $? 0 "IPv6 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 3 + + log_test $? 
0 "IPv6 compat mode on - nexthop change" + + # set compat mode off + sysctl_nexthop_compat_mode_set 0 "IPv6" + + run_cmd "$IP -6 ro del 2001:db8:101::1/128 nhid 122" + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should not contain expanded nexthops + stop_ip_monitor $ipmout 1 + log_test $? 0 "IPv6 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024" + log_test $? 0 "IPv6 compat mode off - route dump" + + # change in nexthop group should not generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop delete" + + # set compat mode back on + sysctl_nexthop_compat_mode_set 1 "IPv6" +} + +ipv4_compat_mode() +{ + local rc + + echo + echo "IPv4 nexthop api compat mode" + echo "----------------------------" + + sysctl_nexthop_compat_mode_check "IPv4" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 122 group 21/22" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 3 + + # route add notification should contain expanded nexthops + log_test $? 
0 "IPv4 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1" + log_test $? 0 "IPv4 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 23 via 172.16.1.3 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/23" + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv4 compat mode on - nexthop change" + + sysctl_nexthop_compat_mode_set 0 "IPv4" + + # cleanup + run_cmd "$IP ro del 172.16.101.1/32 nhid 122" + + ipmout=$(start_ip_monitor route) + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 1 + # route add notification should not contain expanded nexthops + log_test $? 0 "IPv4 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122" + log_test $? 0 "IPv4 compat mode off - route dump" + + # change in nexthop group should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/22" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv4 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 
0 "IPv4 compat mode off - nexthop delete" + + sysctl_nexthop_compat_mode_set 1 "IPv4" +} + +ipv4_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 172.16.1.2 dev veth1 + done >/dev/null 2>&1 +} + +ipv4_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv4_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv4 runtime torture" + echo "--------------------" + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 101 via 172.16.2.2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 172.16.101.1 nhid 102" + run_cmd "$IP route add 172.16.101.2 nhid 102" + + ipv4_del_add_loop1 & + pid1=$! + ipv4_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 172.16.101.1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 172.16.101.2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 -B 172.16.101.2 -A 172.16.1.1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv4 torture test" +} + basic() { echo diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index 155d48bd4d9e..f0e6be4c09e9 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -365,7 +365,9 @@ devlink_trap_group_stats_idle_test() devlink_trap_exception_test() { local trap_name=$1; shift - local group_name=$1; shift + local group_name + + group_name=$(devlink_trap_group_get $trap_name) devlink_trap_stats_idle_test $trap_name check_fail $? 
"Trap stats idle when packets should have been trapped" @@ -377,9 +379,11 @@ devlink_trap_exception_test() devlink_trap_drop_test() { local trap_name=$1; shift - local group_name=$1; shift local dev=$1; shift local handle=$1; shift + local group_name + + group_name=$(devlink_trap_group_get $trap_name) # This is the common part of all the tests. It checks that stats are # initially idle, then non-idle after changing the trap action and @@ -390,7 +394,6 @@ devlink_trap_drop_test() devlink_trap_group_stats_idle_test $group_name check_err $? "Trap group stats not idle with initial drop action" - devlink_trap_action_set $trap_name "trap" devlink_trap_stats_idle_test $trap_name check_fail $? "Trap stats idle after setting action to trap" @@ -420,6 +423,29 @@ devlink_trap_drop_cleanup() tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower } +devlink_trap_stats_test() +{ + local test_name=$1; shift + local trap_name=$1; shift + local send_one="$@" + local t0_packets + local t1_packets + + RET=0 + + t0_packets=$(devlink_trap_rx_packets_get $trap_name) + + $send_one && sleep 1 + + t1_packets=$(devlink_trap_rx_packets_get $trap_name) + + if [[ $t1_packets -eq $t0_packets ]]; then + check_err 1 "Trap stats did not increase" + fi + + log_test "$test_name" +} + devlink_trap_policers_num_get() { devlink -j -p trap policer show | jq '.[]["'$DEVLINK_DEV'"] | length' diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh index 00797597fcf5..c33bfd7ba214 100644 --- a/tools/testing/selftests/net/forwarding/mirror_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh @@ -29,11 +29,9 @@ mirror_test() local pref=$1; shift local expect=$1; shift - local ping_timeout=$((PING_TIMEOUT * 5)) local t0=$(tc_rule_stats_get $dev $pref) - ip vrf exec $vrf_name \ - ${PING} ${sip:+-I $sip} $dip -c 10 -i 0.5 -w $ping_timeout \ - &> /dev/null + $MZ $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \ 
+ -c 10 -d 100ms -t icmp type=8 sleep 0.5 local t1=$(tc_rule_stats_get $dev $pref) local delta=$((t1 - t0)) diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh index b50081855913..55eeacf59241 100755 --- a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -20,10 +20,14 @@ ALL_TESTS=" ping_ipv4 + ping_ipv6 test_ip_dsfield test_ip_dscp test_ip_ecn test_ip_dscp_ecn + test_ip6_dsfield + test_ip6_dscp + test_ip6_ecn " NUM_NETIFS=4 @@ -107,6 +111,11 @@ ping_ipv4() ping_test $h1 192.0.2.2 } +ping_ipv6() +{ + ping6_test $h1 2001:db8:1::2 +} + do_test_pedit_dsfield_common() { local pedit_locus=$1; shift @@ -123,7 +132,12 @@ do_test_pedit_dsfield_common() local pkts pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \ tc_rule_handle_stats_get "dev $h2 ingress" 101) - check_err $? "Expected to get 10 packets, but got $pkts." + check_err $? "Expected to get 10 packets on test probe, but got $pkts." + + pkts=$(tc_rule_handle_stats_get "$pedit_locus" 101) + ((pkts >= 10)) + check_err $? "Expected to get 10 packets on pedit rule, but got $pkts." 
+ log_test "$pedit_locus pedit $pedit_action" } @@ -228,6 +242,63 @@ test_ip_dscp_ecn() do_test_ip_dscp_ecn "dev $swp2 egress" } +do_test_ip6_dsfield() +{ + local locus=$1; shift + local dsfield + + for dsfield in 0 1 2 3 128 252 253 254 255; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $dsfield" \ + ipv6 "ip_tos $dsfield" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_dsfield() +{ + do_test_ip6_dsfield "dev $swp1 ingress" + do_test_ip6_dsfield "dev $swp2 egress" +} + +do_test_ip6_dscp() +{ + local locus=$1; shift + local dscp + + for dscp in 0 1 2 3 32 61 62 63; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $((dscp << 2)) retain 0xfc" \ + ipv6 "ip_tos $(((dscp << 2) | 1))" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_dscp() +{ + do_test_ip6_dscp "dev $swp1 ingress" + do_test_ip6_dscp "dev $swp2 egress" +} + +do_test_ip6_ecn() +{ + local locus=$1; shift + local ecn + + for ecn in 0 1 2 3; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $ecn retain 0x3" \ + ipv6 "ip_tos $((124 | $ecn))" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_ecn() +{ + do_test_ip6_ecn "dev $swp1 ingress" + do_test_ip6_ecn "dev $swp2 egress" +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index 813d02d1939d..d9eca227136b 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -2,7 +2,8 @@ # SPDX-License-Identifier: GPL-2.0 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ - mirred_egress_mirror_test gact_trap_test" + mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ + gact_trap_test" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -50,6 +51,9 @@ switch_destroy() mirred_egress_test() { local action=$1 + local protocol=$2 + local classifier=$3 + local classifier_args=$4 RET=0 @@ -62,9 
+66,9 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_fail $? "Matched without redirect rule inserted" - tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ - $tcflags dst_ip 192.0.2.2 action mirred egress $action \ - dev $swp2 + tc filter add dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier $tcflags $classifier_args \ + action mirred egress $action dev $swp2 $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ -t ip -q @@ -72,10 +76,11 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_err $? "Did not match incoming $action packet" - tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + tc filter del dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower - log_test "mirred egress $action ($tcflags)" + log_test "mirred egress $classifier $action ($tcflags)" } gact_drop_and_ok_test() @@ -187,12 +192,17 @@ cleanup() mirred_egress_redirect_test() { - mirred_egress_test "redirect" + mirred_egress_test "redirect" "ip" "flower" "dst_ip 192.0.2.2" } mirred_egress_mirror_test() { - mirred_egress_test "mirror" + mirred_egress_test "mirror" "ip" "flower" "dst_ip 192.0.2.2" +} + +matchall_mirred_egress_mirror_test() +{ + mirred_egress_test "mirror" "all" "matchall" "" } trap cleanup EXIT diff --git a/tools/testing/selftests/net/ip_defrag.c b/tools/testing/selftests/net/ip_defrag.c index c0c9ecb891e1..f9ed749fd8c7 100644 --- a/tools/testing/selftests/net/ip_defrag.c +++ b/tools/testing/selftests/net/ip_defrag.c @@ -192,9 +192,9 @@ static void send_fragment(int fd_raw, struct sockaddr *addr, socklen_t alen, } res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen); - if (res < 0) + if (res < 0 && errno != EPERM) error(1, errno, "send_fragment"); - if (res != frag_len) + if (res >= 0 && res != frag_len) error(1, 0, "send_fragment: %d vs %d", res, frag_len); frag_counter++; @@ -313,9 
+313,9 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr, iphdr->ip_len = htons(frag_len); } res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen); - if (res < 0) + if (res < 0 && errno != EPERM) error(1, errno, "sendto overlap: %d", frag_len); - if (res != frag_len) + if (res >= 0 && res != frag_len) error(1, 0, "sendto overlap: %d vs %d", (int)res, frag_len); frag_counter++; } diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 71a62e7e35b1..77c09cd339c3 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -67,6 +67,10 @@ # Same as pmtu_ipv4_vxlan4, but using a generic UDP IPv4/IPv6 # encapsulation (GUE) over IPv4/IPv6, instead of VXLAN # +# - pmtu_ipv{4,6}_ipv{4,6}_exception +# Same as pmtu_ipv4_vxlan4, but using an IPv4/IPv6 tunnel over IPv4/IPv6, +# instead of VXLAN +# # - pmtu_vti4_exception # Set up vti tunnel on top of veth, with xfrm states and policies, in two # namespaces with matching endpoints. 
Check that route exception is not @@ -151,6 +155,10 @@ tests=" pmtu_ipv6_gue4_exception IPv6 over gue4: PMTU exceptions 1 pmtu_ipv4_gue6_exception IPv4 over gue6: PMTU exceptions 1 pmtu_ipv6_gue6_exception IPv6 over gue6: PMTU exceptions 1 + pmtu_ipv4_ipv4_exception IPv4 over IPv4: PMTU exceptions 1 + pmtu_ipv6_ipv4_exception IPv6 over IPv4: PMTU exceptions 1 + pmtu_ipv4_ipv6_exception IPv4 over IPv6: PMTU exceptions 1 + pmtu_ipv6_ipv6_exception IPv6 over IPv6: PMTU exceptions 1 pmtu_vti6_exception vti6: PMTU exceptions 0 pmtu_vti4_exception vti4: PMTU exceptions 0 pmtu_vti4_default_mtu vti4: default MTU assignment 0 @@ -363,6 +371,62 @@ setup_gue66() { setup_fou_or_gue 6 6 gue } +setup_ipvX_over_ipvY() { + inner=${1} + outer=${2} + + if [ "${outer}" -eq 4 ]; then + a_addr="${prefix4}.${a_r1}.1" + b_addr="${prefix4}.${b_r1}.1" + if [ "${inner}" -eq 4 ]; then + type="ipip" + mode="ipip" + else + type="sit" + mode="ip6ip" + fi + else + a_addr="${prefix6}:${a_r1}::1" + b_addr="${prefix6}:${b_r1}::1" + type="ip6tnl" + if [ "${inner}" -eq 4 ]; then + mode="ipip6" + else + mode="ip6ip6" + fi + fi + + run_cmd ${ns_a} ip link add ip_a type ${type} local ${a_addr} remote ${b_addr} mode ${mode} || return 2 + run_cmd ${ns_b} ip link add ip_b type ${type} local ${b_addr} remote ${a_addr} mode ${mode} + + run_cmd ${ns_a} ip link set ip_a up + run_cmd ${ns_b} ip link set ip_b up + + if [ "${inner}" = "4" ]; then + run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ip_a + run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ip_b + else + run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ip_a + run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ip_b + fi +} + +setup_ip4ip4() { + setup_ipvX_over_ipvY 4 4 +} + +setup_ip6ip4() { + setup_ipvX_over_ipvY 6 4 +} + +setup_ip4ip6() { + setup_ipvX_over_ipvY 4 6 +} + +setup_ip6ip6() { + setup_ipvX_over_ipvY 6 6 +} + setup_namespaces() { for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do 
ip netns add ${n} || return 1 @@ -908,6 +972,64 @@ test_pmtu_ipv6_gue6_exception() { test_pmtu_ipvX_over_fouY_or_gueY 6 6 gue } +test_pmtu_ipvX_over_ipvY_exception() { + inner=${1} + outer=${2} + ll_mtu=4000 + + setup namespaces routing ip${inner}ip${outer} || return 2 + + trace "${ns_a}" ip_a "${ns_b}" ip_b \ + "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B + + if [ ${inner} -eq 4 ]; then + ping=ping + dst=${tunnel4_b_addr} + else + ping=${ping6} + dst=${tunnel6_b_addr} + fi + + if [ ${outer} -eq 4 ]; then + # IPv4 header + exp_mtu=$((${ll_mtu} - 20)) + else + # IPv6 header Option 4 + exp_mtu=$((${ll_mtu} - 40 - 8)) + fi + + # Create route exception by exceeding link layer MTU + mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000)) + mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000)) + mtu "${ns_b}" veth_B-R1 ${ll_mtu} + mtu "${ns_r1}" veth_R1-B ${ll_mtu} + + mtu "${ns_a}" ip_a $((${ll_mtu} + 1000)) || return + mtu "${ns_b}" ip_b $((${ll_mtu} + 1000)) || return + run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} + + # Check that exception was created + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" + check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ip${inner}ip${outer} interface" +} + +test_pmtu_ipv4_ipv4_exception() { + test_pmtu_ipvX_over_ipvY_exception 4 4 +} + +test_pmtu_ipv6_ipv4_exception() { + test_pmtu_ipvX_over_ipvY_exception 6 4 +} + +test_pmtu_ipv4_ipv6_exception() { + test_pmtu_ipvX_over_ipvY_exception 4 6 +} + +test_pmtu_ipv6_ipv6_exception() { + test_pmtu_ipvX_over_ipvY_exception 6 6 +} + test_pmtu_vti4_exception() { setup namespaces veth vti4 xfrm4 || return 2 trace "${ns_a}" veth_a "${ns_b}" veth_b \ diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 0ea44d975b6c..c5282e62df75 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -101,6 +101,21 @@ FIXTURE(tls) bool notls; }; 
+FIXTURE_VARIANT(tls) +{ + unsigned int tls_version; +}; + +FIXTURE_VARIANT_ADD(tls, 12) +{ + .tls_version = TLS_1_2_VERSION, +}; + +FIXTURE_VARIANT_ADD(tls, 13) +{ + .tls_version = TLS_1_3_VERSION, +}; + FIXTURE_SETUP(tls) { struct tls12_crypto_info_aes_gcm_128 tls12; @@ -112,7 +127,7 @@ FIXTURE_SETUP(tls) len = sizeof(addr); memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; addr.sin_family = AF_INET; @@ -733,7 +748,7 @@ TEST_F(tls, bidir) struct tls12_crypto_info_aes_gcm_128 tls12; memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12, @@ -1258,78 +1273,4 @@ TEST(keysizes) { close(cfd); } -TEST(tls12) { - int fd, cfd; - bool notls; - - struct tls12_crypto_info_aes_gcm_128 tls12; - struct sockaddr_in addr; - socklen_t len; - int sfd, ret; - - notls = false; - len = sizeof(addr); - - memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_2_VERSION; - tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; - - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(INADDR_ANY); - addr.sin_port = 0; - - fd = socket(AF_INET, SOCK_STREAM, 0); - sfd = socket(AF_INET, SOCK_STREAM, 0); - - ret = bind(sfd, &addr, sizeof(addr)); - ASSERT_EQ(ret, 0); - ret = listen(sfd, 10); - ASSERT_EQ(ret, 0); - - ret = getsockname(sfd, &addr, &len); - ASSERT_EQ(ret, 0); - - ret = connect(fd, &addr, sizeof(addr)); - ASSERT_EQ(ret, 0); - - ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")); - if (ret != 0) { - notls = true; - printf("Failure setting TCP_ULP, testing without tls\n"); - } - - if (!notls) { - ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - cfd = accept(sfd, &addr, &len); - ASSERT_GE(cfd, 0); - - if (!notls) { - ret = setsockopt(cfd, 
IPPROTO_TCP, TCP_ULP, "tls", - sizeof("tls")); - ASSERT_EQ(ret, 0); - - ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - close(sfd); - - char const *test_str = "test_read"; - int send_len = 10; - char buf[10]; - - send_len = strlen(test_str) + 1; - EXPECT_EQ(send(fd, test_str, send_len, 0), send_len); - EXPECT_NE(recv(cfd, buf, send_len, 0), -1); - EXPECT_EQ(memcmp(buf, test_str, send_len), 0); - - close(fd); - close(cfd); -} - TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/vrf-xfrm-tests.sh b/tools/testing/selftests/net/vrf-xfrm-tests.sh new file mode 100755 index 000000000000..184da81f554f --- /dev/null +++ b/tools/testing/selftests/net/vrf-xfrm-tests.sh @@ -0,0 +1,436 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Various combinations of VRF with xfrms and qdisc. + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +PAUSE_ON_FAIL=no +VERBOSE=0 +ret=0 + +HOST1_4=192.168.1.1 +HOST2_4=192.168.1.2 +HOST1_6=2001:db8:1::1 +HOST2_6=2001:db8:1::2 + +XFRM1_4=10.0.1.1 +XFRM2_4=10.0.1.2 +XFRM1_6=fc00:1000::1 +XFRM2_6=fc00:1000::2 +IF_ID=123 + +VRF=red +TABLE=300 + +AUTH_1=0xd94fcfea65fddf21dc6e0d24a0253508 +AUTH_2=0xdc6e0d24a0253508d94fcfea65fddf21 +ENC_1=0xfc46c20f8048be9725930ff3fb07ac2a91f0347dffeacf62 +ENC_2=0x3fb07ac2a91f0347dffeacf62fc46c20f8048be9725930ff +SPI_1=0x02122b77 +SPI_2=0x2b770212 + +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + +################################################################################ +# +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +run_cmd_host1() +{ + local cmd="$*" + 
local out + local rc + + if [ "$VERBOSE" = "1" ]; then + printf " COMMAND: %s\n" "$cmd" # cmd is data, never the format string + fi + + out=$(eval ip netns exec host1 $cmd 2>&1) + rc=$? + if [ "$VERBOSE" = "1" ]; then + if [ -n "$out" ]; then + echo + echo " $out" + fi + echo + fi + + return $rc +} + +################################################################################ +# create namespaces for hosts and sws + +create_vrf() +{ + local ns=$1 + local vrf=$2 + local table=$3 + + if [ -n "${ns}" ]; then + ns="-netns ${ns}" + fi + + ip ${ns} link add ${vrf} type vrf table ${table} + ip ${ns} link set ${vrf} up + ip ${ns} route add vrf ${vrf} unreachable default metric 8192 + ip ${ns} -6 route add vrf ${vrf} unreachable default metric 8192 + + ip ${ns} addr add 127.0.0.1/8 dev ${vrf} + ip ${ns} -6 addr add ::1 dev ${vrf} nodad + + ip ${ns} ru del pref 0 + ip ${ns} ru add pref 32765 from all lookup local + ip ${ns} -6 ru del pref 0 + ip ${ns} -6 ru add pref 32765 from all lookup local +} + +create_ns() +{ + local ns=$1 + local addr=$2 + local addr6=$3 + + [ -z "${addr}" ] && addr="-" + [ -z "${addr6}" ] && addr6="-" + + ip netns add ${ns} + + ip -netns ${ns} link set lo up + if [ "${addr}" != "-" ]; then + ip -netns ${ns} addr add dev lo ${addr} + fi + if [ "${addr6}" != "-" ]; then + ip -netns ${ns} -6 addr add dev lo ${addr6} + fi + + ip -netns ${ns} ro add unreachable default metric 8192 + ip -netns ${ns} -6 ro add unreachable default metric 8192 + + ip netns exec ${ns} sysctl -qw net.ipv4.ip_forward=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.forwarding=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.forwarding=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.accept_dad=0 +} + +# create veth pair to connect namespaces and apply addresses. 
+connect_ns() +{ + local ns1=$1 + local ns1_dev=$2 + local ns1_addr=$3 + local ns1_addr6=$4 + local ns2=$5 + local ns2_dev=$6 + local ns2_addr=$7 + local ns2_addr6=$8 + local ns1arg + local ns2arg + + if [ -n "${ns1}" ]; then + ns1arg="-netns ${ns1}" + fi + if [ -n "${ns2}" ]; then + ns2arg="-netns ${ns2}" + fi + + ip ${ns1arg} li add ${ns1_dev} type veth peer name tmp + ip ${ns1arg} li set ${ns1_dev} up + ip ${ns1arg} li set tmp netns ${ns2} name ${ns2_dev} + ip ${ns2arg} li set ${ns2_dev} up + + if [ "${ns1_addr}" != "-" ]; then + ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr} + ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr} + fi + + if [ "${ns1_addr6}" != "-" ]; then + ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr6} nodad + ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr6} nodad + fi +} + +################################################################################ + +cleanup() +{ + ip netns del host1 + ip netns del host2 +} + +setup() +{ + create_ns "host1" + create_ns "host2" + + connect_ns "host1" eth0 ${HOST1_4}/24 ${HOST1_6}/64 \ + "host2" eth0 ${HOST2_4}/24 ${HOST2_6}/64 + + create_vrf "host1" ${VRF} ${TABLE} + ip -netns host1 link set dev eth0 master ${VRF} +} + +cleanup_xfrm() +{ + for ns in host1 host2 + do + for x in state policy + do + ip -netns ${ns} xfrm ${x} flush + ip -6 -netns ${ns} xfrm ${x} flush + done + done +} + +setup_xfrm() +{ + local h1_4=$1 + local h2_4=$2 + local h1_6=$3 + local h2_6=$4 + local devarg="$5" + + # + # policy + # + + # host1 - IPv4 out + ip -netns host1 xfrm policy add \ + src ${h1_4} dst ${h2_4} ${devarg} dir out \ + tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel + + # host2 - IPv4 in + ip -netns host2 xfrm policy add \ + src ${h1_4} dst ${h2_4} dir in \ + tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel + + # host1 - IPv4 in + ip -netns host1 xfrm policy add \ + src ${h2_4} dst ${h1_4} ${devarg} dir in \ + tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel + + # host2 - IPv4 out + ip 
-netns host2 xfrm policy add \ + src ${h2_4} dst ${h1_4} dir out \ + tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel + + + # host1 - IPv6 out + ip -6 -netns host1 xfrm policy add \ + src ${h1_6} dst ${h2_6} ${devarg} dir out \ + tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel + + # host2 - IPv6 in + ip -6 -netns host2 xfrm policy add \ + src ${h1_6} dst ${h2_6} dir in \ + tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel + + # host1 - IPv6 in + ip -6 -netns host1 xfrm policy add \ + src ${h2_6} dst ${h1_6} ${devarg} dir in \ + tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel + + # host2 - IPv6 out + ip -6 -netns host2 xfrm policy add \ + src ${h2_6} dst ${h1_6} dir out \ + tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel + + # + # state + # + ip -netns host1 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_4} dst ${h2_4} ${devarg} + + ip -netns host2 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_4} dst ${h2_4} + + + ip -netns host1 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_4} dst ${h1_4} ${devarg} + + ip -netns host2 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_4} dst ${h1_4} + + + ip -6 -netns host1 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 
'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_6} dst ${h2_6} ${devarg} + + ip -6 -netns host2 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_6} dst ${h2_6} + + + ip -6 -netns host1 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_6} dst ${h1_6} ${devarg} + + ip -6 -netns host2 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_6} dst ${h1_6} +} + +cleanup_xfrm_dev() +{ + ip -netns host1 li del xfrm0 + ip -netns host2 addr del ${XFRM2_4}/24 dev eth0 + ip -netns host2 addr del ${XFRM2_6}/64 dev eth0 +} + +setup_xfrm_dev() +{ + local vrfarg="vrf ${VRF}" + + ip -netns host1 li add type xfrm dev eth0 if_id ${IF_ID} + ip -netns host1 li set xfrm0 ${vrfarg} up + ip -netns host1 addr add ${XFRM1_4}/24 dev xfrm0 + ip -netns host1 addr add ${XFRM1_6}/64 dev xfrm0 + + ip -netns host2 addr add ${XFRM2_4}/24 dev eth0 + ip -netns host2 addr add ${XFRM2_6}/64 dev eth0 + + setup_xfrm ${XFRM1_4} ${XFRM2_4} ${XFRM1_6} ${XFRM2_6} "if_id ${IF_ID}" +} + +run_tests() +{ + cleanup_xfrm + + # no IPsec + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + log_test $? 0 "IPv4 no xfrm policy" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 no xfrm policy" + + # xfrm without VRF in sel + setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + log_test $? 
0 "IPv4 xfrm policy based on address" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 xfrm policy based on address" + cleanup_xfrm + + # xfrm with VRF in sel + # Known failure: ipv4 resets the flow oif after the lookup. Fix is + # not straightforward. + # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev ${VRF}" + # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + # log_test $? 0 "IPv4 xfrm policy with VRF in selector" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 xfrm policy with VRF in selector" + cleanup_xfrm + + # xfrm with enslaved device in sel + # Known failures: combined with the above, __xfrm{4,6}_selector_match + # needs to consider both l3mdev and enslaved device index. + # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev eth0" + # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + # log_test $? 0 "IPv4 xfrm policy with enslaved device in selector" + # run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + # log_test $? 0 "IPv6 xfrm policy with enslaved device in selector" + # cleanup_xfrm + + # xfrm device + setup_xfrm_dev + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${XFRM2_4} + log_test $? 0 "IPv4 xfrm policy with xfrm device" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${XFRM2_6} + log_test $? 
0 "IPv6 xfrm policy with xfrm device" + cleanup_xfrm_dev +} + +################################################################################ +# usage + +usage() +{ + cat <<EOF +usage: ${0##*/} OPTS + + -p Pause on fail + -v verbose mode (show commands and output) + +done +EOF +} + +################################################################################ +# main + +while getopts :pv o +do + case $o in + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=$(($VERBOSE + 1));; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +cleanup 2>/dev/null +setup + +echo +echo "No qdisc on VRF device" +run_tests + +run_cmd_host1 tc qdisc add dev ${VRF} root netem delay 100ms +echo +echo "netem qdisc on VRF device" +run_tests + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret diff --git a/tools/testing/selftests/ntb/ntb_test.sh b/tools/testing/selftests/ntb/ntb_test.sh index 9c60337317c6..020137b61407 100755 --- a/tools/testing/selftests/ntb/ntb_test.sh +++ b/tools/testing/selftests/ntb/ntb_test.sh @@ -241,7 +241,7 @@ function get_files_count() split_remote $LOC if [[ "$REMOTE" == "" ]]; then - echo $(ls -1 "$LOC"/${NAME}* 2>/dev/null | wc -l) + echo $(ls -1 "$VPATH"/${NAME}* 2>/dev/null | wc -l) else echo $(ssh "$REMOTE" "ls -1 \"$VPATH\"/${NAME}* | \ wc -l" 2> /dev/null) diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 2d4db5afb142..973198a3ec3d 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -5,3 +5,4 @@ pidfd_test pidfd_wait pidfd_fdinfo_test pidfd_getfd_test +pidfd_setns_test diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 75a545861375..f4a2f28f926b 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g -I../../../../usr/include/ -pthread -TEST_GEN_PROGS := 
pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait pidfd_getfd_test +TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ + pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test include ../lib.mk diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config new file mode 100644 index 000000000000..bb11de90c0c9 --- /dev/null +++ b/tools/testing/selftests/pidfd/config @@ -0,0 +1,6 @@ +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CGROUPS=y diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c new file mode 100644 index 000000000000..133ec5b6cda8 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/types.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <linux/kcmp.h> + +#include "pidfd.h" +#include "../clone3/clone3_selftests.h" +#include "../kselftest.h" +#include "../kselftest_harness.h" + +enum { + PIDFD_NS_USER, + PIDFD_NS_MNT, + PIDFD_NS_PID, + PIDFD_NS_UTS, + PIDFD_NS_IPC, + PIDFD_NS_NET, + PIDFD_NS_CGROUP, + PIDFD_NS_PIDCLD, + PIDFD_NS_MAX +}; + +const struct ns_info { + const char *name; + int flag; +} ns_info[] = { + [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, }, + [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, }, + [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, }, + [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, }, + [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, }, + [PIDFD_NS_NET] = { "net", CLONE_NEWNET, }, + [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, }, + [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, }, +}; + +FIXTURE(current_nsset) +{ + 
pid_t pid; + int pidfd; + int nsfds[PIDFD_NS_MAX]; + + pid_t child_pid_exited; + int child_pidfd_exited; + + pid_t child_pid1; + int child_pidfd1; + int child_nsfds1[PIDFD_NS_MAX]; + + pid_t child_pid2; + int child_pidfd2; + int child_nsfds2[PIDFD_NS_MAX]; +}; + +static int sys_waitid(int which, pid_t pid, int options) +{ + return syscall(__NR_waitid, which, pid, NULL, options, NULL); +} + +pid_t create_child(int *pidfd, unsigned flags) +{ + struct clone_args args = { + .flags = CLONE_PIDFD | flags, + .exit_signal = SIGCHLD, + .pidfd = ptr_to_u64(pidfd), + }; + + return sys_clone3(&args, sizeof(struct clone_args)); +} + +FIXTURE_SETUP(current_nsset) +{ + int i, proc_fd, ret; + + for (i = 0; i < PIDFD_NS_MAX; i++) { + self->nsfds[i] = -EBADF; + self->child_nsfds1[i] = -EBADF; + self->child_nsfds2[i] = -EBADF; + } + + proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC); + ASSERT_GE(proc_fd, 0) { + TH_LOG("%m - Failed to open /proc/self/ns"); + } + + self->pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); + if (self->nsfds[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->pid); + } + } + } + + self->pidfd = sys_pidfd_open(self->pid, 0); + EXPECT_GT(self->pidfd, 0) { + TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + } + + /* Create task that exits right away. */ + self->child_pid_exited = create_child(&self->child_pidfd_exited, + CLONE_NEWUSER | CLONE_NEWNET); + EXPECT_GT(self->child_pid_exited, 0); + + if (self->child_pid_exited == 0) + _exit(EXIT_SUCCESS); + + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0); + + self->pidfd = sys_pidfd_open(self->pid, 0); + EXPECT_GE(self->pidfd, 0) { + TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + } + + /* Create tasks that will be stopped. 
*/ + self->child_pid1 = create_child(&self->child_pidfd1, + CLONE_NEWUSER | CLONE_NEWNS | + CLONE_NEWCGROUP | CLONE_NEWIPC | + CLONE_NEWUTS | CLONE_NEWPID | + CLONE_NEWNET); + EXPECT_GE(self->child_pid1, 0); + + if (self->child_pid1 == 0) { + pause(); + _exit(EXIT_SUCCESS); + } + + self->child_pid2 = create_child(&self->child_pidfd2, + CLONE_NEWUSER | CLONE_NEWNS | + CLONE_NEWCGROUP | CLONE_NEWIPC | + CLONE_NEWUTS | CLONE_NEWPID | + CLONE_NEWNET); + EXPECT_GE(self->child_pid2, 0); + + if (self->child_pid2 == 0) { + pause(); + _exit(EXIT_SUCCESS); + } + + for (i = 0; i < PIDFD_NS_MAX; i++) { + char p[100]; + + const struct ns_info *info = &ns_info[i]; + + self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); + if (self->nsfds[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->pid); + } + } + + ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s", + self->child_pid1, info->name); + EXPECT_GT(ret, 0); + EXPECT_LT(ret, sizeof(p)); + + self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC); + if (self->child_nsfds1[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->child_pid1); + } + } + + ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s", + self->child_pid2, info->name); + EXPECT_GT(ret, 0); + EXPECT_LT(ret, sizeof(p)); + + self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC); + if (self->child_nsfds2[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->child_pid2); /* report the child whose ns path was opened */ + } + } + } + + close(proc_fd); +} + +FIXTURE_TEARDOWN(current_nsset) +{ + int i; + + ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1, + SIGKILL, NULL, 0), 0); + ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2, + SIGKILL, NULL, 0), 0); + + for (i = 0; i < PIDFD_NS_MAX; i++) { + if (self->nsfds[i] >= 0) + close(self->nsfds[i]); + if (self->child_nsfds1[i] >= 0) + close(self->child_nsfds1[i]); + if 
(self->child_nsfds2[i] >= 0) + close(self->child_nsfds2[i]); + } + + if (self->child_pidfd1 >= 0) + EXPECT_EQ(0, close(self->child_pidfd1)); + if (self->child_pidfd2 >= 0) + EXPECT_EQ(0, close(self->child_pidfd2)); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0); +} + +static int preserve_ns(const int pid, const char *ns) +{ + int ret; + char path[50]; + + ret = snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns); + if (ret < 0 || (size_t)ret >= sizeof(path)) + return -EIO; + + return open(path, O_RDONLY | O_CLOEXEC); +} + +static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns) +{ + int ns_fd2 = -EBADF; + int ret = -1; + struct stat ns_st1, ns_st2; + + ret = fstat(ns_fd1, &ns_st1); + if (ret < 0) + return -1; + + ns_fd2 = preserve_ns(pid2, ns); + if (ns_fd2 < 0) + return -1; + + ret = fstat(ns_fd2, &ns_st2); + close(ns_fd2); + if (ret < 0) + return -1; + + /* processes are in the same namespace */ + if ((ns_st1.st_dev == ns_st2.st_dev) && + (ns_st1.st_ino == ns_st2.st_ino)) + return 1; + + /* processes are in different namespaces */ + return 0; +} + +/* Test that we can't pass garbage to the kernel. */ +TEST_F(current_nsset, invalid_flags) +{ + ASSERT_NE(setns(self->pidfd, 0), 0); + EXPECT_EQ(errno, EINVAL); + + ASSERT_NE(setns(self->pidfd, -1), 0); + EXPECT_EQ(errno, EINVAL); + + ASSERT_NE(setns(self->pidfd, CLONE_VM), 0); + EXPECT_EQ(errno, EINVAL); + + ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0); + EXPECT_EQ(errno, EINVAL); +} + +/* Test that we can't attach to a task that has already exited. 
*/
+TEST_F(current_nsset, pidfd_exited_child)
+{
+	int i;
+	pid_t pid;
+
+	ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET),
+		  0);
+	EXPECT_EQ(errno, ESRCH);
+
+	pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		/* Verify that we haven't changed any namespaces. */
+		if (self->nsfds[i] >= 0)
+			ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1);
+	}
+}
+
+/* Attach to child 1's namespaces one at a time through its pidfd. */
+TEST_F(current_nsset, pidfd_incremental_setns)
+{
+	int i;
+	pid_t pid;
+
+	pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		int nsfd;
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		if (info->flag) {
+			ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) {
+				TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d",
+				       info->name, self->child_pid1,
+				       self->child_pidfd1);
+			}
+		}
+
+		/* Verify that we have changed to the correct namespaces. */
+		if (info->flag == CLONE_NEWPID)
+			nsfd = self->nsfds[i];
+		else
+			nsfd = self->child_nsfds1[i];
+		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+			TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d",
+			       info->name, self->child_pid1,
+			       self->child_pidfd1);
+		}
+		TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d",
+		       info->name, self->child_pid1, self->child_pidfd1);
+	}
+}
+
+/* Attach to child 1's namespaces one at a time through /proc ns fds. */
+TEST_F(current_nsset, nsfd_incremental_setns)
+{
+	int i;
+	pid_t pid;
+
+	pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		int nsfd;
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		if (info->flag) {
+			ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) {
+				TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d",
+				       info->name, self->child_pid1,
+				       self->child_nsfds1[i]);
+			}
+		}
+
+		/* Verify that we have changed to the correct namespaces. */
+		if (info->flag == CLONE_NEWPID)
+			nsfd = self->nsfds[i];
+		else
+			nsfd = self->child_nsfds1[i];
+		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+			TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d",
+			       info->name, self->child_pid1,
+			       self->child_nsfds1[i]);
+		}
+		TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d",
+		       info->name, self->child_pid1, self->child_nsfds1[i]);
+	}
+}
+
+/* Attach to all of child 1's namespaces with a single setns() call. */
+TEST_F(current_nsset, pidfd_one_shot_setns)
+{
+	unsigned flags = 0;
+	int i;
+	pid_t pid;
+
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		flags |= info->flag;
+		if (info->flag) /* Entries without a flag add nothing to the set. */
+			TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
+			       info->name, self->child_pid1);
+	}
+
+	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
+		TH_LOG("%m - Failed to setns to namespaces of %d",
+		       self->child_pid1);
+	}
+
+	pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		int nsfd;
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		/* Verify that we have changed to the correct namespaces. */
+		if (info->flag == CLONE_NEWPID)
+			nsfd = self->nsfds[i];
+		else
+			nsfd = self->child_nsfds1[i];
+		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+			TH_LOG("setns failed to place us correctly into %s namespace of %d",
+			       info->name, self->child_pid1);
+		}
+		TH_LOG("Managed to correctly setns to %s namespace of %d",
+		       info->name, self->child_pid1);
+	}
+}
+
+/*
+ * After joining child 1's namespaces, attaching to any of child 2's
+ * namespaces must fail, both via its pidfd and via its /proc ns fds.
+ */
+TEST_F(current_nsset, no_foul_play)
+{
+	unsigned flags = 0;
+	int i;
+
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		flags |= info->flag;
+		if (info->flag) /* No use logging pid_for_children. */
+			TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
+			       info->name, self->child_pid1);
+	}
+
+	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
+		TH_LOG("%m - Failed to setns to namespaces of %d via pidfd %d",
+		       self->child_pid1, self->child_pidfd1);
+	}
+
+	/*
+	 * Can't setns to a user namespace outside of our hierarchy since we
+	 * don't have caps in there and didn't create it. That means that under
+	 * no circumstances should we be able to setns to any of the other
+	 * ones since they aren't owned by our user namespace.
+	 */
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+
+		if (self->child_nsfds2[i] < 0 || !info->flag)
+			continue;
+
+		ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) {
+			TH_LOG("Managed to setns to %s namespace of %d via pidfd %d",
+			       info->name, self->child_pid2,
+			       self->child_pidfd2);
+		}
+		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d",
+		       info->name, self->child_pid2,
+		       self->child_pidfd2);
+
+		ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) {
+			TH_LOG("Managed to setns to %s namespace of %d via nsfd %d",
+			       info->name, self->child_pid2,
+			       self->child_nsfds2[i]);
+		}
+		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d",
+		       info->name, self->child_pid2,
+		       self->child_nsfds2[i]);
+	}
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index 644770c3b754..0830e63818c1 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -19,6 +19,7 @@ SUB_DIRS = alignment \
 	   copyloops \
 	   dscr \
 	   mm \
+	   nx-gzip \
 	   pmu \
 	   signal \
 	   primitives \
diff --git a/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
new file mode 100644
index 000000000000..5a7118495cb3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
@@ -0,0 +1 @@
+SUBSYSTEM=="nxgzip",
KERNEL=="nx-gzip", MODE="0666" diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile new file mode 100644 index 000000000000..640fad6cc2c7 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -0,0 +1,8 @@ +CFLAGS = -O3 -m64 -I./include + +TEST_GEN_FILES := gzfht_test gunz_test +TEST_PROGS := nx-gzip-test.sh + +include ../../lib.mk + +$(TEST_GEN_FILES): gzip_vas.c diff --git a/tools/testing/selftests/powerpc/nx-gzip/README b/tools/testing/selftests/powerpc/nx-gzip/README new file mode 100644 index 000000000000..9809dbaa1905 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/README @@ -0,0 +1,45 @@ +Test the nx-gzip function: +========================= + +Verify that the following device exists: + /dev/crypto/nx-gzip +If you get a permission error, run as sudo or set the device permissions: + sudo chmod go+rw /dev/crypto/nx-gzip +However, chmod may not survive across boots. You may create a udev file such +as: + /etc/udev/rules.d/99-nx-gzip.rules + + +To manually build and run: +$ gcc -O3 -I./include -o gzfht_test gzfht_test.c gzip_vas.c +$ gcc -O3 -I./include -o gunz_test gunz_test.c gzip_vas.c + + +Compress any file using Fixed Huffman mode. Output will have a .nx.gz suffix: +$ ./gzfht_test gzip_vas.c +file gzip_vas.c read, 6413 bytes +compressed 6413 to 3124 bytes total, crc32 checksum = abd15e8a + + +Uncompress the previous output. Output will have a .nx.gunzip suffix: +$ ./gunz_test gzip_vas.c.nx.gz +gzHeader FLG 0 +00 00 00 00 04 03 +gzHeader MTIME, XFL, OS ignored +computed checksum abd15e8a isize 0000190d +stored checksum abd15e8a isize 0000190d +decomp is complete: fclose + + +Compare two files: +$ sha1sum gzip_vas.c.nx.gz.nx.gunzip gzip_vas.c +bf43e3c0c3651f5f22b6f9784cd9b1eeab4120b6 gzip_vas.c.nx.gz.nx.gunzip +bf43e3c0c3651f5f22b6f9784cd9b1eeab4120b6 gzip_vas.c + + +Note that the programs here are intended for testing the nx-gzip hardware function.
+They are not intended for demonstrating performance or compression ratio. +Being simplistic, these selftests allocate the entire set of source +and target pages in memory, so they need enough memory to work. +For more information and source code, see: +https://github.com/libnxz/power-gzip diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c new file mode 100644 index 000000000000..6ee0fded0391 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c @@ -0,0 +1,1028 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* P9 gunzip sample code for demonstrating the P9 NX hardware + * interface. Not intended for production use or for performance or + * compression ratio measurements. Note also that /dev/crypto/nx-gzip, + * VAS and skiboot support are required. + * + * Copyright 2020 IBM Corp. + * + * Author: Bulent Abali <abali@us.ibm.com> + * + * https://github.com/libnxz/power-gzip for zlib api and other utils + * Definitions of acronyms used here.
See
+ * P9 NX Gzip Accelerator User's Manual for details:
+ * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf
+ *
+ * adler/crc: 32 bit checksums appended to stream tail
+ * ce: completion extension
+ * cpb: coprocessor parameter block (metadata)
+ * crb: coprocessor request block (command)
+ * csb: coprocessor status block (status)
+ * dht: dynamic huffman table
+ * dde: data descriptor element (address, length)
+ * ddl: list of ddes
+ * dh/fh: dynamic and fixed huffman types
+ * fc: coprocessor function code
+ * histlen: history/dictionary length
+ * history: sliding window of up to 32KB of data
+ * lzcount: Deflate LZ symbol counts
+ * rembytecnt: remaining byte count
+ * sfbt: source final block type; last block's type during decomp
+ * spbc: source processed byte count
+ * subc: source unprocessed bit count
+ * tebc: target ending bit count; valid bits in the last byte
+ * tpbc: target processed byte count
+ * vas: virtual accelerator switch; the user mode interface
+ */
+
+#define _ISOC11_SOURCE // For aligned_alloc()
+#define _DEFAULT_SOURCE // For endian.h
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <endian.h>
+/* NOTE(review): bits/ headers are libc-internal; confirm <endian.h> alone is not enough. */
+#include <bits/endian.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include "nxu.h"
+#include "nx.h"
+#include "crb.h"
+
+int nx_dbg;
+FILE *nx_gzip_log;
+
+#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
+#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
+
+#define GETINPC(X) fgetc(X)
+#define FNAME_MAX 1024
+
+/*
+ * fifo queue management
+ *
+ * A fifo is described by three integers: cur (offset of the first used
+ * byte), used (number of used bytes) and len (buffer size). When
+ * cur + used exceeds len the used region wraps around to offset 0.
+ * "first" is the part that starts at cur (used) or at cur + used (free);
+ * "last" is the part that begins at the start of the buffer (used) or
+ * right after the wrapped used bytes (free).
+ */
+#define fifo_used_bytes(used) (used)
+#define fifo_free_bytes(used, len) ((len)-(used))
+/* amount of free bytes in the first and last parts */
+#define fifo_free_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
+				  ? (len)-((cur)+(used)) : 0)
+#define fifo_free_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
+				  ? (cur) : (len)-(used))
+/* amount of used bytes in the first and last parts */
+#define fifo_used_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
+				  ? (used) : (len)-(cur))
+#define fifo_used_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
+				  ? 0 : ((used)+(cur))-(len))
+/* first and last free parts start here */
+#define fifo_free_first_offset(cur, used)      ((cur)+(used))
+#define fifo_free_last_offset(cur, used, len)  \
+				  fifo_used_last_bytes(cur, used, len)
+/* first and last used parts start here */
+#define fifo_used_first_offset(cur)            (cur)
+#define fifo_used_last_offset(cur)             (0)
+
+/* Buffer geometry in bytes: 16MB fifos, 64KB page, 128B line, 32KB window. */
+const int fifo_in_len = 1<<24;
+const int fifo_out_len = 1<<24;
+const int page_sz = 1<<16;
+const int line_sz = 1<<7;
+const int window_max = 1<<15;
+
+/*
+ * Adds an (address, len) pair to the list of ddes (ddl) and updates
+ * the base dde. ddl[0] is the only dde in a direct dde which
+ * contains a single (addr,len) pair. For more pairs, ddl[0] becomes
+ * the indirect (base) dde that points to a list of direct ddes.
+ * See Section 6.4 of the NX-gzip user manual for DDE description.
+ * Addr=NULL, len=0 clears the ddl[0]. Returns the total number of
+ * bytes in ddl. Caller is responsible for allocating the array of
+ * nx_dde_t *ddl. If N addresses are required in the scatter-gather
+ * list, the ddl array must have N+1 entries minimum.
+ */
+static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr,
+					uint32_t len)
+{
+	uint32_t ddecnt;
+	uint32_t bytes;
+
+	if (addr == NULL && len == 0) {
+		clearp_dde(ddl);
+		return 0;
+	}
+
+	/* Argument order must follow the format: line, function, addr, len. */
+	NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, __func__,
+			addr, len));
+
+	/* Number of ddes in the dde list ; == 0 when it is a direct dde */
+	ddecnt = getpnn(ddl, dde_count);
+	bytes = getp32(ddl, ddebc);
+
+	if (ddecnt == 0 && bytes == 0) {
+		/* First dde is unused; make it a direct dde */
+		bytes = len;
+		putp32(ddl, ddebc, bytes);
+		putp64(ddl, ddead, (uint64_t) addr);
+	} else if (ddecnt == 0) {
+		/* Converting direct to indirect dde
+		 * ddl[0] becomes head dde of ddl
+		 * copy direct to indirect first.
+		 */
+		ddl[1] = ddl[0];
+
+		/* Add the new dde next */
+		clear_dde(ddl[2]);
+		put32(ddl[2], ddebc, len);
+		put64(ddl[2], ddead, (uint64_t) addr);
+
+		/* Ddl head points to 2 direct ddes */
+		ddecnt = 2;
+		putpnn(ddl, dde_count, ddecnt);
+		bytes = bytes + len;
+		putp32(ddl, ddebc, bytes);
+		/* Pointer to the first direct dde */
+		putp64(ddl, ddead, (uint64_t) &ddl[1]);
+	} else {
+		/* Append a dde to an existing indirect ddl */
+		++ddecnt;
+		clear_dde(ddl[ddecnt]);
+		put64(ddl[ddecnt], ddead, (uint64_t) addr);
+		put32(ddl[ddecnt], ddebc, len);
+
+		putpnn(ddl, dde_count, ddecnt);
+		bytes = bytes + len;
+		putp32(ddl, ddebc, bytes); /* byte sum of all dde */
+	}
+	return bytes;
+}
+
+/*
+ * Touch specified number of pages represented in number bytes
+ * beginning from the first buffer in a dde list.
+ * Do not touch the pages past buf_sz-th byte's page.
+ *
+ * Set buf_sz = 0 to touch all pages described by the ddep.
+ */
+static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz,
+				int wr)
+{
+	uint32_t indirect_count;
+	uint32_t buf_len;
+	long total;
+	uint64_t buf_addr;
+	struct nx_dde_t *dde_list;
+	uint32_t i;	/* same type as indirect_count, which bounds the loop */
+
+	assert(!!ddep);
+
+	indirect_count = getpnn(ddep, dde_count);
+
+	NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__,
+			indirect_count));
+	NXPRT(fprintf(stderr, "0x%lx\n", buf_sz));
+
+	if (indirect_count == 0) {
+		/* Direct dde */
+		buf_len = getp32(ddep, ddebc);
+		buf_addr = getp64(ddep, ddead);
+
+		NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
+				buf_len, (void *)buf_addr));
+
+		if (buf_sz == 0)
+			nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+		else
+			nxu_touch_pages((void *)buf_addr, NX_MIN(buf_len,
+					buf_sz), page_sz, wr);
+
+		return ERR_NX_OK;
+	}
+
+	/* Indirect dde */
+	if (indirect_count > MAX_DDE_COUNT)
+		return ERR_NX_EXCESSIVE_DDE;
+
+	/* First address of the list */
+	dde_list = (struct nx_dde_t *) getp64(ddep, ddead);
+
+	if (buf_sz == 0)
+		buf_sz = getp32(ddep, ddebc);
+
+	total = 0;
+	for (i = 0; i < indirect_count; i++) {
+		buf_len = get32(dde_list[i], ddebc);
+		buf_addr = get64(dde_list[i], ddead);
+		total += buf_len;
+
+		NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ",
+				buf_len, (void *)buf_addr));
+		NXPRT(fprintf(stderr, "0x%lx\n", total));
+
+		/* Touching fewer pages than encoded in the ddebc */
+		if (total > buf_sz) {
+			/*
+			 * This buffer straddles the buf_sz limit. Only its
+			 * leading part lies inside the limit: drop the
+			 * overshoot (total - buf_sz) rather than touching an
+			 * overshoot-sized prefix.
+			 */
+			long excess = total - buf_sz;
+
+			buf_len = (excess < (long)buf_len) ?
+					(uint32_t)(buf_len - excess) : 0;
+			if (buf_len > 0)
+				nxu_touch_pages((void *)buf_addr, buf_len,
+						page_sz, wr);
+			NXPRT(fprintf(stderr, "touch loop break len 0x%x ",
+					buf_len));
+			NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr));
+			break;
+		}
+		nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+	}
+	return ERR_NX_OK;
+}
+
+/*
+ * Src and dst buffers are supplied in scatter gather lists.
+ * NX function code and other parameters supplied in cmdp.
+ */
+static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst,
+			 struct nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+	uint64_t csbaddr;
+
+	/* Start from a zeroed status block so stale status is never read back. */
+	memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
+
+	/* The head ddes are copied by value into the request block. */
+	cmdp->crb.source_dde = *src;
+	cmdp->crb.target_dde = *dst;
+
+	/* Status, output byte count in tpbc */
+	csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
+	put64(cmdp->crb, csb_address, csbaddr);
+
+	/* NX reports input bytes in spbc; cleared */
+	cmdp->cpb.out_spbc_comp_wrap = 0;
+	cmdp->cpb.out_spbc_comp_with_count = 0;
+	cmdp->cpb.out_spbc_decomp = 0;
+
+	/* Clear output */
+	put32(cmdp->cpb, out_crc, INIT_CRC);
+	put32(cmdp->cpb, out_adler, INIT_ADLER);
+
+	/*
+	 * Submit the crb, the job descriptor, to the accelerator.
+	 * The return value of nxu_submit_job() is passed through unchanged.
+	 */
+	return nxu_submit_job(cmdp, handle);
+}
+
+int decompress_file(int argc, char **argv, void *devhandle)
+{
+	FILE *inpf = NULL;
+	FILE *outf = NULL;
+
+	int c, expect, i, cc, rc = 0;
+	char gzfname[FNAME_MAX];
+
+	/* Queuing, file ops, byte counting */
+	char *fifo_in, *fifo_out;
+	int used_in, cur_in, used_out, cur_out, read_sz, n;
+	int first_free, last_free, first_used, last_used;
+	int first_offset, last_offset;
+	int write_sz, free_space, source_sz;
+	int source_sz_estimate, target_sz_estimate;
+	uint64_t last_comp_ratio = 0; /* 1000 max */
+	uint64_t total_out = 0;
+	int is_final, is_eof;
+
+	/* nx hardware */
+	int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
+	int history_len = 0;
+	struct nx_gzip_crb_cpb_t cmd, *cmdp;
+	struct nx_dde_t *ddl_in;
+	struct nx_dde_t dde_in[6] __aligned(128);
+	struct nx_dde_t *ddl_out;
+	struct nx_dde_t dde_out[6] __aligned(128);
+	int pgfault_retries;
+
+	/* when using mmap'ed files */
+	off_t input_file_offset;
+
+	if (argc > 2) {
+		fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
+		fprintf(stderr, " writes to stdout or <fname>.nx.gunzip\n");
+		return -1;
+	}
+
+	if (argc == 1) {
+		inpf = stdin;
+		outf = stdout;
+	} else if (argc == 2) {
+		char w[1024];
+		char *wp;
+
+		inpf =
fopen(argv[1], "r"); + if (inpf == NULL) { + perror(argv[1]); + return -1; + } + + /* Make a new file name to write to. Ignoring '.gz' */ + wp = (NULL != (wp = strrchr(argv[1], '/'))) ? (wp+1) : argv[1]; + strcpy(w, wp); + strcat(w, ".nx.gunzip"); + + outf = fopen(w, "w"); + if (outf == NULL) { + perror(w); + return -1; + } + } + + /* Decode the gzip header */ + c = GETINPC(inpf); expect = 0x1f; /* ID1 */ + if (c != expect) + goto err1; + + c = GETINPC(inpf); expect = 0x8b; /* ID2 */ + if (c != expect) + goto err1; + + c = GETINPC(inpf); expect = 0x08; /* CM */ + if (c != expect) + goto err1; + + int flg = GETINPC(inpf); /* FLG */ + + if (flg & 0xE0 || flg & 0x4 || flg == EOF) + goto err2; + + fprintf(stderr, "gzHeader FLG %x\n", flg); + + /* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this + * sample code. + */ + for (i = 0; i < 6; i++) { + char tmp[10]; + + tmp[i] = GETINPC(inpf); + if (tmp[i] == EOF) + goto err3; + fprintf(stderr, "%02x ", tmp[i]); + if (i == 5) + fprintf(stderr, "\n"); + } + fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n"); + + /* FNAME */ + if (flg & 0x8) { + int k = 0; + + do { + c = GETINPC(inpf); + if (c == EOF || k >= FNAME_MAX) + goto err3; + gzfname[k++] = c; + } while (c); + fprintf(stderr, "gzHeader FNAME: %s\n", gzfname); + } + + /* FHCRC */ + if (flg & 0x2) { + c = GETINPC(inpf); + if (c == EOF) + goto err3; + c = GETINPC(inpf); + if (c == EOF) + goto err3; + fprintf(stderr, "gzHeader FHCRC: ignored\n"); + } + + used_in = cur_in = used_out = cur_out = 0; + is_final = is_eof = 0; + + /* Allocate one page larger to prevent page faults due to NX + * overfetching. + * Either do this (char*)(uintptr_t)aligned_alloc or use + * -std=c11 flag to make the int-to-pointer warning go away. 
+ */ + assert((fifo_in = (char *)(uintptr_t)aligned_alloc(line_sz, + fifo_in_len + page_sz)) != NULL); + assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz, + fifo_out_len + page_sz + line_sz)) != NULL); + /* Leave unused space due to history rounding rules */ + fifo_out = fifo_out + line_sz; + nxu_touch_pages(fifo_out, fifo_out_len, page_sz, 1); + + ddl_in = &dde_in[0]; + ddl_out = &dde_out[0]; + cmdp = &cmd; + memset(&cmdp->crb, 0, sizeof(cmdp->crb)); + +read_state: + + /* Read from .gz file */ + + NXPRT(fprintf(stderr, "read_state:\n")); + + if (is_eof != 0) + goto write_state; + + /* We read in to fifo_in in two steps: first: read in to from + * cur_in to the end of the buffer. last: if free space wrapped + * around, read from fifo_in offset 0 to offset cur_in. + */ + + /* Reset fifo head to reduce unnecessary wrap arounds */ + cur_in = (used_in == 0) ? 0 : cur_in; + + /* Free space total is reduced by a gap */ + free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len) + - line_sz); + + /* Free space may wrap around as first and last */ + first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len); + last_free = fifo_free_last_bytes(cur_in, used_in, fifo_in_len); + + /* Start offsets of the free memory */ + first_offset = fifo_free_first_offset(cur_in, used_in); + last_offset = fifo_free_last_offset(cur_in, used_in, fifo_in_len); + + /* Reduce read_sz because of the line_sz gap */ + read_sz = NX_MIN(free_space, first_free); + n = 0; + if (read_sz > 0) { + /* Read in to offset cur_in + used_in */ + n = fread(fifo_in + first_offset, 1, read_sz, inpf); + used_in = used_in + n; + free_space = free_space - n; + assert(n <= read_sz); + if (n != read_sz) { + /* Either EOF or error; exit the read loop */ + is_eof = 1; + goto write_state; + } + } + + /* If free space wrapped around */ + if (last_free > 0) { + /* Reduce read_sz because of the line_sz gap */ + read_sz = NX_MIN(free_space, last_free); + n = 0; + if (read_sz > 0) { + n = fread(fifo_in + 
last_offset, 1, read_sz, inpf); + used_in = used_in + n; /* Increase used space */ + free_space = free_space - n; /* Decrease free space */ + assert(n <= read_sz); + if (n != read_sz) { + /* Either EOF or error; exit the read loop */ + is_eof = 1; + goto write_state; + } + } + } + + /* At this point we have used_in bytes in fifo_in with the + * data head starting at cur_in and possibly wrapping around. + */ + +write_state: + + /* Write decompressed data to output file */ + + NXPRT(fprintf(stderr, "write_state:\n")); + + if (used_out == 0) + goto decomp_state; + + /* If fifo_out has data waiting, write it out to the file to + * make free target space for the accelerator used bytes in + * the first and last parts of fifo_out. + */ + + first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len); + last_used = fifo_used_last_bytes(cur_out, used_out, fifo_out_len); + + write_sz = first_used; + + n = 0; + if (write_sz > 0) { + n = fwrite(fifo_out + cur_out, 1, write_sz, outf); + used_out = used_out - n; + /* Move head of the fifo */ + cur_out = (cur_out + n) % fifo_out_len; + assert(n <= write_sz); + if (n != write_sz) { + fprintf(stderr, "error: write\n"); + rc = -1; + goto err5; + } + } + + if (last_used > 0) { /* If more data available in the last part */ + write_sz = last_used; /* Keep it here for later */ + n = 0; + if (write_sz > 0) { + n = fwrite(fifo_out, 1, write_sz, outf); + used_out = used_out - n; + cur_out = (cur_out + n) % fifo_out_len; + assert(n <= write_sz); + if (n != write_sz) { + fprintf(stderr, "error: write\n"); + rc = -1; + goto err5; + } + } + } + +decomp_state: + + /* NX decompresses input data */ + + NXPRT(fprintf(stderr, "decomp_state:\n")); + + if (is_final) + goto finish_state; + + /* Address/len lists */ + clearp_dde(ddl_in); + clearp_dde(ddl_out); + + /* FC, CRC, HistLen, Table 6-6 */ + if (resuming) { + /* Resuming a partially decompressed input. 
+ * The key to resume is supplying the 32KB + * dictionary (history) to NX, which is basically + * the last 32KB of output produced. + */ + fc = GZIP_FC_DECOMPRESS_RESUME; + + cmdp->cpb.in_crc = cmdp->cpb.out_crc; + cmdp->cpb.in_adler = cmdp->cpb.out_adler; + + /* Round up the history size to quadword. Section 2.10 */ + history_len = (history_len + 15) / 16; + putnn(cmdp->cpb, in_histlen, history_len); + history_len = history_len * 16; /* bytes */ + + if (history_len > 0) { + /* Chain in the history buffer to the DDE list */ + if (cur_out >= history_len) { + nx_append_dde(ddl_in, fifo_out + + (cur_out - history_len), + history_len); + } else { + nx_append_dde(ddl_in, fifo_out + + ((fifo_out_len + cur_out) + - history_len), + history_len - cur_out); + /* Up to 32KB history wraps around fifo_out */ + nx_append_dde(ddl_in, fifo_out, cur_out); + } + + } + } else { + /* First decompress job */ + fc = GZIP_FC_DECOMPRESS; + + history_len = 0; + /* Writing 0 clears out subc as well */ + cmdp->cpb.in_histlen = 0; + total_out = 0; + + put32(cmdp->cpb, in_crc, INIT_CRC); + put32(cmdp->cpb, in_adler, INIT_ADLER); + put32(cmdp->cpb, out_crc, INIT_CRC); + put32(cmdp->cpb, out_adler, INIT_ADLER); + + /* Assuming 10% compression ratio initially; use the + * most recently measured compression ratio as a + * heuristic to estimate the input and output + * sizes. If we give too much input, the target buffer + * overflows and NX cycles are wasted, and then we + * must retry with smaller input size. 1000 is 100%. 
+ */ + last_comp_ratio = 100UL; + } + cmdp->crb.gzip_fc = 0; + putnn(cmdp->crb, gzip_fc, fc); + + /* + * NX source buffers + */ + first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len); + last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len); + + if (first_used > 0) + nx_append_dde(ddl_in, fifo_in + cur_in, first_used); + + if (last_used > 0) + nx_append_dde(ddl_in, fifo_in, last_used); + + /* + * NX target buffers + */ + first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len); + last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len); + + /* Reduce output free space amount not to overwrite the history */ + int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len) + - (1<<16)); + + NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max, + target_max)); + + first_free = NX_MIN(target_max, first_free); + if (first_free > 0) { + first_offset = fifo_free_first_offset(cur_out, used_out); + nx_append_dde(ddl_out, fifo_out + first_offset, first_free); + } + + if (last_free > 0) { + last_free = NX_MIN(target_max - first_free, last_free); + if (last_free > 0) { + last_offset = fifo_free_last_offset(cur_out, used_out, + fifo_out_len); + nx_append_dde(ddl_out, fifo_out + last_offset, + last_free); + } + } + + /* Target buffer size is used to limit the source data size + * based on previous measurements of compression ratio. + */ + + /* source_sz includes history */ + source_sz = getp32(ddl_in, ddebc); + assert(source_sz > history_len); + source_sz = source_sz - history_len; + + /* Estimating how much source is needed to 3/4 fill a + * target_max size target buffer. If we overshoot, then NX + * must repeat the job with smaller input and we waste + * bandwidth. If we undershoot then we use more NX calls than + * necessary. + */ + + source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL) + / 4000; + + if (source_sz_estimate < source_sz) { + /* Target might be small, therefore limiting the + * source data. 
+ */ + source_sz = source_sz_estimate; + target_sz_estimate = target_max; + } else { + /* Source file might be small, therefore limiting target + * touch pages to a smaller value to save processor cycles. + */ + target_sz_estimate = ((uint64_t)source_sz * 1000UL) + / (last_comp_ratio + 1); + target_sz_estimate = NX_MIN(2 * target_sz_estimate, + target_max); + } + + source_sz = source_sz + history_len; + + /* Some NX condition codes require submitting the NX job again. + * Kernel doesn't handle NX page faults. Expects user code to + * touch pages. + */ + pgfault_retries = NX_MAX_FAULTS; + +restart_nx: + + putp32(ddl_in, ddebc, source_sz); + + /* Fault in pages */ + nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), page_sz, 1); + nx_touch_pages_dde(ddl_in, 0, page_sz, 0); + nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1); + + /* Send job to NX */ + cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle); + + switch (cc) { + + case ERR_NX_TRANSLATION: + + /* We touched the pages ahead of time. In the most common case + * we shouldn't be here. But may be some pages were paged out. + * Kernel should have placed the faulting address to fsaddr. 
+ */ + NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n", + (void *)cmdp->crb.csb.fsaddr)); + + if (pgfault_retries == NX_MAX_FAULTS) { + /* Try once with exact number of pages */ + --pgfault_retries; + goto restart_nx; + } else if (pgfault_retries > 0) { + /* If still faulting try fewer input pages + * assuming memory outage + */ + if (source_sz > page_sz) + source_sz = NX_MAX(source_sz / 2, page_sz); + --pgfault_retries; + goto restart_nx; + } else { + fprintf(stderr, "cannot make progress; too many "); + fprintf(stderr, "page fault retries cc= %d\n", cc); + rc = -1; + goto err5; + } + + case ERR_NX_DATA_LENGTH: + + NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; ")); + NXPRT(fprintf(stderr, "stream may have trailing data\n")); + + /* Not an error in the most common case; it just says + * there is trailing data that we must examine. + * + * CC=3 CE(1)=0 CE(0)=1 indicates partial completion + * Fig.6-7 and Table 6-8. + */ + nx_ce = get_csb_ce_ms3b(cmdp->crb.csb); + + if (!csb_ce_termination(nx_ce) && + csb_ce_partial_completion(nx_ce)) { + /* Check CPB for more information + * spbc and tpbc are valid + */ + sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */ + subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */ + spbc = get32(cmdp->cpb, out_spbc_decomp); + tpbc = get32(cmdp->crb.csb, tpbc); + assert(target_max >= tpbc); + + goto ok_cc3; /* not an error */ + } else { + /* History length error when CE(1)=1 CE(0)=0. */ + rc = -1; + fprintf(stderr, "history length error cc= %d\n", cc); + goto err5; + } + + case ERR_NX_TARGET_SPACE: + + /* Target buffer not large enough; retry smaller input + * data; give at least 1 byte. SPBC/TPBC are not valid. 
+ */ + assert(source_sz > history_len); + source_sz = ((source_sz - history_len + 2) / 2) + history_len; + NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with ")); + NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n", + source_sz, history_len)); + goto restart_nx; + + case ERR_NX_OK: + + /* This should not happen for gzip formatted data; + * we need trailing crc and isize + */ + fprintf(stderr, "ERR_NX_OK\n"); + spbc = get32(cmdp->cpb, out_spbc_decomp); + tpbc = get32(cmdp->crb.csb, tpbc); + assert(target_max >= tpbc); + assert(spbc >= history_len); + source_sz = spbc - history_len; + goto offsets_state; + + default: + fprintf(stderr, "error: cc= %d\n", cc); + rc = -1; + goto err5; + } + +ok_cc3: + + NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt)); + + assert(spbc > history_len); + source_sz = spbc - history_len; + + /* Table 6-4: Source Final Block Type (SFBT) describes the + * last processed deflate block and clues the software how to + * resume the next job. SUBC indicates how many input bits NX + * consumed but did not process. SPBC indicates how many + * bytes of source were given to the accelerator including + * history bytes. + */ + + switch (sfbt) { + int dhtlen; + + case 0x0: /* Deflate final EOB received */ + + /* Calculating the checksum start position. */ + + source_sz = source_sz - subc / 8; + is_final = 1; + break; + + /* Resume decompression cases are below. Basically + * indicates where NX has suspended and how to resume + * the input stream. + */ + + case 0x8: /* Within a literal block; use rembytecount */ + case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */ + + /* Supply the partially processed source byte again */ + source_sz = source_sz - ((subc + 7) / 8); + + /* SUBC LS 3bits: number of bits in the first source byte need + * to be processed. 
+ * 000 means all 8 bits; Table 6-3 + * Clear subc, histlen, sfbt, rembytecnt, dhtlen + */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb, + out_rembytecnt)); + break; + + case 0xA: /* Within a FH block; */ + case 0xB: /* Within a FH block; bfinal=1 */ + + source_sz = source_sz - ((subc + 7) / 8); + + /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + break; + + case 0xC: /* Within a DH block; */ + case 0xD: /* Within a DH block; bfinal=1 */ + + source_sz = source_sz - ((subc + 7) / 8); + + /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + + dhtlen = getnn(cmdp->cpb, out_dhtlen); + putnn(cmdp->cpb, in_dhtlen, dhtlen); + assert(dhtlen >= 42); + + /* Round up to a qword */ + dhtlen = (dhtlen + 127) / 128; + + while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */ + --dhtlen; + cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen]; + } + break; + + case 0xE: /* Within a block header; bfinal=0; */ + /* Also given if source data exactly ends (SUBC=0) with + * EOB code with BFINAL=0. Means the next byte will + * contain a block header. + */ + case 0xF: /* within a block header with BFINAL=1. 
*/ + + source_sz = source_sz - ((subc + 7) / 8); + + /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + + /* Engine did not process any data */ + if (is_eof && (source_sz == 0)) + is_final = 1; + } + +offsets_state: + + /* Adjust the source and target buffer offsets and lengths */ + + NXPRT(fprintf(stderr, "offsets_state:\n")); + + /* Delete input data from fifo_in */ + used_in = used_in - source_sz; + cur_in = (cur_in + source_sz) % fifo_in_len; + input_file_offset = input_file_offset + source_sz; + + /* Add output data to fifo_out */ + used_out = used_out + tpbc; + + assert(used_out <= fifo_out_len); + + total_out = total_out + tpbc; + + /* Deflate history is 32KB max. No need to supply more + * than 32KB on a resume. + */ + history_len = (total_out > window_max) ? window_max : total_out; + + /* To estimate expected expansion in the next NX job; 500 means 50%. + * Deflate best case is around 1 to 1000. + */ + last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1)) + / ((uint64_t)tpbc + 1); + last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1); + NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n", + last_comp_ratio, source_sz, spbc, tpbc)); + + resuming = 1; + +finish_state: + + NXPRT(fprintf(stderr, "finish_state:\n")); + + if (is_final) { + if (used_out) + goto write_state; /* More data to write out */ + else if (used_in < 8) { + /* Need at least 8 more bytes containing gzip crc + * and isize. 
+ */ + rc = -1; + goto err4; + } else { + /* Compare checksums and exit */ + int i; + unsigned char tail[8]; + uint32_t cksum, isize; + + for (i = 0; i < 8; i++) + tail[i] = fifo_in[(cur_in + i) % fifo_in_len]; + fprintf(stderr, "computed checksum %08x isize %08x\n", + cmdp->cpb.out_crc, (uint32_t) (total_out + % (1ULL<<32))); + cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8 + | (uint32_t) tail[2]<<16 + | (uint32_t) tail[3]<<24); + isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8 + | (uint32_t) tail[6]<<16 + | (uint32_t) tail[7]<<24); + fprintf(stderr, "stored checksum %08x isize %08x\n", + cksum, isize); + + if (cksum == cmdp->cpb.out_crc && isize == (uint32_t) + (total_out % (1ULL<<32))) { + rc = 0; goto ok1; + } else { + rc = -1; goto err4; + } + } + } else + goto read_state; + + return -1; + +err1: + fprintf(stderr, "error: not a gzip file, expect %x, read %x\n", + expect, c); + return -1; + +err2: + fprintf(stderr, "error: the FLG byte is wrong or not being handled\n"); + return -1; + +err3: + fprintf(stderr, "error: gzip header\n"); + return -1; + +err4: + fprintf(stderr, "error: checksum missing or mismatch\n"); + +err5: +ok1: + fprintf(stderr, "decomp is complete: fclose\n"); + fclose(outf); + + return rc; +} + + +int main(int argc, char **argv) +{ + int rc; + struct sigaction act; + void *handle; + + nx_dbg = 0; + nx_gzip_log = NULL; + act.sa_handler = 0; + act.sa_sigaction = nxu_sigsegv_handler; + act.sa_flags = SA_SIGINFO; + act.sa_restorer = 0; + sigemptyset(&act.sa_mask); + sigaction(SIGSEGV, &act, NULL); + + handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0); + if (!handle) { + fprintf(stderr, "Unable to init NX, errno %d\n", errno); + exit(-1); + } + + rc = decompress_file(argc, argv, handle); + + nx_function_end(handle); + + return rc; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c new file mode 100644 index 000000000000..7496a83f9c9d --- /dev/null +++ 
b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* P9 gzip sample code for demonstrating the P9 NX hardware interface. + * Not intended for productive uses or for performance or compression + * ratio measurements. For simplicity of demonstration, this sample + * code compresses in to fixed Huffman blocks only (Deflate btype=1) + * and has very simple memory management. Dynamic Huffman blocks + * (Deflate btype=2) are more involved as detailed in the user guide. + * Note also that /dev/crypto/gzip, VAS and skiboot support are + * required. + * + * Copyright 2020 IBM Corp. + * + * https://github.com/libnxz/power-gzip for zlib api and other utils + * + * Author: Bulent Abali <abali@us.ibm.com> + * + * Definitions of acronyms used here. See + * P9 NX Gzip Accelerator User's Manual for details: + * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf + * + * adler/crc: 32 bit checksums appended to stream tail + * ce: completion extension + * cpb: coprocessor parameter block (metadata) + * crb: coprocessor request block (command) + * csb: coprocessor status block (status) + * dht: dynamic huffman table + * dde: data descriptor element (address, length) + * ddl: list of ddes + * dh/fh: dynamic and fixed huffman types + * fc: coprocessor function code + * histlen: history/dictionary length + * history: sliding window of up to 32KB of data + * lzcount: Deflate LZ symbol counts + * rembytecnt: remaining byte count + * sfbt: source final block type; last block's type during decomp + * spbc: source processed byte count + * subc: source unprocessed bit count + * tebc: target ending bit count; valid bits in the last byte + * tpbc: target processed byte count + * vas: virtual accelerator switch; the user mode interface + */ + +#define _ISOC11_SOURCE // For aligned_alloc() +#define _DEFAULT_SOURCE // For endian.h + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include 
<unistd.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/fcntl.h> +#include <sys/mman.h> +#include <endian.h> +#include <bits/endian.h> +#include <sys/ioctl.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include "nxu.h" +#include "nx.h" + +int nx_dbg; +FILE *nx_gzip_log; + +#define NX_MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +#define FNAME_MAX 1024 +#define FEXT ".nx.gz" + +/* + * LZ counts returned in the user supplied nx_gzip_crb_cpb_t structure. + */ +static int compress_fht_sample(char *src, uint32_t srclen, char *dst, + uint32_t dstlen, int with_count, + struct nx_gzip_crb_cpb_t *cmdp, void *handle) +{ + uint32_t fc; + + assert(!!cmdp); + + put32(cmdp->crb, gzip_fc, 0); /* clear */ + fc = (with_count) ? GZIP_FC_COMPRESS_RESUME_FHT_COUNT : + GZIP_FC_COMPRESS_RESUME_FHT; + putnn(cmdp->crb, gzip_fc, fc); + putnn(cmdp->cpb, in_histlen, 0); /* resuming with no history */ + memset((void *) &cmdp->crb.csb, 0, sizeof(cmdp->crb.csb)); + + /* Section 6.6 programming notes; spbc may be in two different + * places depending on FC. 
+ */ + if (!with_count) + put32(cmdp->cpb, out_spbc_comp, 0); + else + put32(cmdp->cpb, out_spbc_comp_with_count, 0); + + /* Figure 6-3 6-4; CSB location */ + put64(cmdp->crb, csb_address, 0); + put64(cmdp->crb, csb_address, + (uint64_t) &cmdp->crb.csb & csb_address_mask); + + /* Source direct dde (scatter-gather list) */ + clear_dde(cmdp->crb.source_dde); + putnn(cmdp->crb.source_dde, dde_count, 0); + put32(cmdp->crb.source_dde, ddebc, srclen); + put64(cmdp->crb.source_dde, ddead, (uint64_t) src); + + /* Target direct dde (scatter-gather list) */ + clear_dde(cmdp->crb.target_dde); + putnn(cmdp->crb.target_dde, dde_count, 0); + put32(cmdp->crb.target_dde, ddebc, dstlen); + put64(cmdp->crb.target_dde, ddead, (uint64_t) dst); + + /* Submit the crb, the job descriptor, to the accelerator */ + return nxu_submit_job(cmdp, handle); +} + +/* + * Prepares a blank no filename no timestamp gzip header and returns + * the number of bytes written to buf. + * Gzip specification at https://tools.ietf.org/html/rfc1952 + */ +int gzip_header_blank(char *buf) +{ + int i = 0; + + buf[i++] = 0x1f; /* ID1 */ + buf[i++] = 0x8b; /* ID2 */ + buf[i++] = 0x08; /* CM */ + buf[i++] = 0x00; /* FLG */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x04; /* XFL 4=fastest */ + buf[i++] = 0x03; /* OS UNIX */ + + return i; +} + +/* Caller must free the allocated buffer return nonzero on error. 
*/
+int read_alloc_input_file(char *fname, char **buf, size_t *bufsize)
+{
+	struct stat statbuf;
+	FILE *fp;
+	char *p;
+	size_t num_bytes;
+
+	if (stat(fname, &statbuf)) {
+		perror(fname);
+		return(-1);
+	}
+	fp = fopen(fname, "r");
+	if (fp == NULL) {
+		perror(fname);
+		return(-1);
+	}
+	/* Allocate outside assert() so the call is not compiled out
+	 * when NDEBUG is defined.
+	 */
+	p = (char *) malloc(statbuf.st_size);
+	if (p == NULL) {
+		perror(fname);
+		fclose(fp);
+		return(-1);
+	}
+	num_bytes = fread(p, 1, statbuf.st_size, fp);
+	if (ferror(fp) || (num_bytes != statbuf.st_size)) {
+		/* Report first, then release, so errno is still intact */
+		perror(fname);
+		free(p);
+		fclose(fp);
+		return(-1);
+	}
+	/* The whole file is in p now; the stream is no longer needed */
+	fclose(fp);
+	*buf = p;
+	*bufsize = num_bytes;
+	return 0;
+}
+
+/*
+ * Writes bufsize bytes from buf to the file fname (opened with "w").
+ * Returns nonzero on error
+ */
+int write_output_file(char *fname, char *buf, size_t bufsize)
+{
+	FILE *fp;
+	size_t num_bytes;
+
+	fp = fopen(fname, "w");
+	if (fp == NULL) {
+		perror(fname);
+		return(-1);
+	}
+	num_bytes = fwrite(buf, 1, bufsize, fp);
+	if (ferror(fp) || (num_bytes != bufsize)) {
+		perror(fname);
+		fclose(fp);
+		return(-1);
+	}
+	/* stdio buffers the data, so a late write failure is only
+	 * reported by fclose(); treat it as a write error too.
+	 */
+	if (fclose(fp)) {
+		perror(fname);
+		return(-1);
+	}
+	return 0;
+}
+
+/*
+ * Z_SYNC_FLUSH as described in zlib.h.
+ * Appends an empty stored block after the compressed data. When
+ * tebc > 0 the byte before buf is partially filled: only its low
+ * tebc bits are kept and the block header is merged into it.
+ * final is written as the BFINAL bit of the appended block.
+ * Returns number of appended bytes
+ */
+int append_sync_flush(char *buf, int tebc, int final)
+{
+	uint64_t flush;
+	int shift = (tebc & 0x7);
+
+	if (tebc > 0) {
+		/* Last byte is partially full */
+		buf = buf - 1;
+		*buf = *buf & (unsigned char) ((1<<tebc)-1);
+	} else
+		*buf = 0;
+	flush = ((0x1ULL & final) << shift) | *buf;
+	shift = shift + 3; /* BFINAL and BTYPE written */
+	shift = (shift <= 8) ? 8 : 16;
+	flush |= (0xFFFF0000ULL) << shift; /* Zero length block */
+	shift = shift + 32;
+	/* Emit the assembled bits one byte at a time, low byte first */
+	while (shift > 0) {
+		*buf++ = (unsigned char) (flush & 0xffULL);
+		flush = flush >> 8;
+		shift = shift - 8;
+	}
+	return(((tebc > 5) || (tebc == 0)) ? 5 : 4);
+}
+
+/*
+ * Final deflate block bit. This call assumes the block
+ * beginning is byte aligned.
+ */
+static void set_bfinal(void *buf, int bfinal)
+{
+	char *b = buf;
+
+	if (bfinal)
+		*b = *b | (unsigned char) 0x01;
+	else
+		*b = *b & (unsigned char) 0xfe;
+}
+
+/*
+ * Compresses the file named by argv[1] in chunks through the NX
+ * accelerator and writes the gzip stream to argv[1] with FEXT
+ * appended. handle is the value returned by nx_function_begin().
+ * Exits the process on usage, allocation, NX or write errors;
+ * returns 0 on success.
+ */
+int compress_file(int argc, char **argv, void *handle)
+{
+	char *inbuf, *outbuf, *srcbuf, *dstbuf;
+	char outname[FNAME_MAX];
+	uint32_t srclen, dstlen;
+	uint32_t flushlen, chunk;
+	size_t inlen, outlen, dsttotlen, srctotlen;
+	/* crc is only assigned inside the chunk loop; give it a defined
+	 * value for the case where the loop never runs (empty input).
+	 */
+	uint32_t crc = 0;
+	uint32_t spbc, tpbc, tebc;
+	uint32_t isize;
+	int lzcounts = 0;
+	int cc;
+	int num_hdr_bytes;
+	struct nx_gzip_crb_cpb_t *cmdp;
+	uint32_t pagelen = 65536;
+	int fault_tries = NX_MAX_FAULTS;
+
+	if (argc != 2) {
+		fprintf(stderr, "usage: %s <fname>\n", argv[0]);
+		exit(-1);
+	}
+
+	cmdp = (void *)(uintptr_t)
+		aligned_alloc(sizeof(struct nx_gzip_crb_cpb_t),
+			      sizeof(struct nx_gzip_crb_cpb_t));
+	if (cmdp == NULL) {
+		perror("aligned_alloc");
+		exit(-1);
+	}
+
+	if (read_alloc_input_file(argv[1], &inbuf, &inlen))
+		exit(-1);
+	fprintf(stderr, "file %s read, %zu bytes\n", argv[1], inlen);
+
+	/* Generous output buffer for header/trailer */
+	outlen = 2 * inlen + 1024;
+
+	/* Allocate outside assert() so the call survives NDEBUG */
+	outbuf = (char *)malloc(outlen);
+	if (outbuf == NULL) {
+		perror("malloc");
+		exit(-1);
+	}
+	nxu_touch_pages(outbuf, outlen, pagelen, 1);
+
+	/* Compress piecemeal in smallish chunks */
+	chunk = 1<<22;
+
+	/* Write the gzip header to the stream */
+	num_hdr_bytes = gzip_header_blank(outbuf);
+	dstbuf = outbuf + num_hdr_bytes;
+	outlen = outlen - num_hdr_bytes;
+	dsttotlen = num_hdr_bytes;
+
+	srcbuf = inbuf;
+	srctotlen = 0;
+
+	/* Init the CRB, the coprocessor request block */
+	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
+
+	/* Initial gzip crc32 */
+	put32(cmdp->cpb, in_crc, 0);
+
+	while (inlen > 0) {
+
+		/* Submit chunk size source data per job */
+		srclen = NX_MIN(chunk, inlen);
+		/* Supply large target in case data expands */
+		dstlen = NX_MIN(2*srclen, outlen);
+
+		/* Page faults are handled by the user code */
+
+		/* Fault-in pages; an improved code wouldn't touch so
+		 * many pages but would try to estimate the
+		 * compression ratio and adjust both the src and dst
+		 * touch amounts.
+		 */
+		nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), pagelen,
+				1);
+		nxu_touch_pages(srcbuf, srclen, pagelen, 0);
+		nxu_touch_pages(dstbuf, dstlen, pagelen, 1);
+
+		cc = compress_fht_sample(
+			srcbuf, srclen,
+			dstbuf, dstlen,
+			lzcounts, cmdp, handle);
+
+		if (cc != ERR_NX_OK && cc != ERR_NX_TPBC_GT_SPBC &&
+		    cc != ERR_NX_TRANSLATION) {
+			fprintf(stderr, "nx error: cc= %d\n", cc);
+			exit(-1);
+		}
+
+		/* Page faults are handled by the user code */
+		if (cc == ERR_NX_TRANSLATION) {
+			NXPRT(fprintf(stderr, "page fault: cc= %d, ", cc));
+			NXPRT(fprintf(stderr, "try= %d, fsa= %08llx\n",
+			      fault_tries,
+			      (unsigned long long) cmdp->crb.csb.fsaddr));
+			fault_tries--;
+			if (fault_tries > 0) {
+				/* Resubmit the same chunk; pages are
+				 * touched again at the top of the loop.
+				 */
+				continue;
+			} else {
+				fprintf(stderr, "error: cannot progress; ");
+				fprintf(stderr, "too many faults\n");
+				exit(-1);
+			}
+		}
+
+		fault_tries = NX_MAX_FAULTS; /* Reset for the next chunk */
+
+		inlen     = inlen - srclen;
+		srcbuf    = srcbuf + srclen;
+		srctotlen = srctotlen + srclen;
+
+		/* Two possible locations for spbc depending on the function
+		 * code.
+		 */
+		spbc = (!lzcounts) ? get32(cmdp->cpb, out_spbc_comp) :
+			get32(cmdp->cpb, out_spbc_comp_with_count);
+		assert(spbc == srclen);
+
+		/* Target byte count */
+		tpbc = get32(cmdp->crb.csb, tpbc);
+		/* Target ending bit count */
+		tebc = getnn(cmdp->cpb, out_tebc);
+		NXPRT(fprintf(stderr, "compressed chunk %d ", spbc));
+		NXPRT(fprintf(stderr, "to %d bytes, tebc= %d\n", tpbc, tebc));
+
+		if (inlen > 0) { /* More chunks to go */
+			set_bfinal(dstbuf, 0);
+			dstbuf    = dstbuf + tpbc;
+			dsttotlen = dsttotlen + tpbc;
+			outlen    = outlen - tpbc;
+			/* Round up to the next byte with a flush
+			 * block; do not set the BFINAL bit.
+			 */
+			flushlen  = append_sync_flush(dstbuf, tebc, 0);
+			dsttotlen = dsttotlen + flushlen;
+			outlen    = outlen - flushlen;
+			dstbuf    = dstbuf + flushlen;
+			NXPRT(fprintf(stderr, "added sync_flush %d bytes\n",
+			      flushlen));
+		} else {  /* Done */
+			/* Set the BFINAL bit of the last block per Deflate
+			 * specification.
+			 */
+			set_bfinal(dstbuf, 1);
+			dstbuf    = dstbuf + tpbc;
+			dsttotlen = dsttotlen + tpbc;
+			outlen    = outlen - tpbc;
+		}
+
+		/* Resuming crc32 for the next chunk */
+		crc = get32(cmdp->cpb, out_crc);
+		put32(cmdp->cpb, in_crc, crc);
+		crc = be32toh(crc);
+	}
+
+	/* Append crc32 and ISIZE to the end */
+	memcpy(dstbuf, &crc, 4);
+	/* ISIZE is the input length modulo 2^32, least significant byte
+	 * first (RFC 1952). Copying the first 4 bytes of the size_t
+	 * would pick the high half on a big-endian 64-bit host.
+	 */
+	isize = htole32((uint32_t) srctotlen);
+	memcpy(dstbuf+4, &isize, 4);
+	dsttotlen = dsttotlen + 8;
+	outlen    = outlen - 8;
+
+	assert(FNAME_MAX > (strlen(argv[1]) + strlen(FEXT)));
+	strcpy(outname, argv[1]);
+	strcat(outname, FEXT);
+	if (write_output_file(outname, outbuf, dsttotlen)) {
+		fprintf(stderr, "write error: %s\n", outname);
+		exit(-1);
+	}
+
+	fprintf(stderr, "compressed %zu to %zu bytes total, ", srctotlen,
+		dsttotlen);
+	fprintf(stderr, "crc32 checksum = %08x\n", crc);
+
+	/* free(NULL) is a no-op, so no guards are needed */
+	free(inbuf);
+	free(outbuf);
+	free(cmdp);
+
+	return 0;
+}
+
+/*
+ * Installs nxu_sigsegv_handler for SIGSEGV, opens the NX gzip
+ * function, compresses the file given on the command line and
+ * returns compress_file()'s result.
+ */
+int main(int argc, char **argv)
+{
+	int rc;
+	struct sigaction act;
+	void *handle;
+
+	nx_dbg = 0;
+	nx_gzip_log = NULL;
+	act.sa_handler = 0;
+	act.sa_sigaction = nxu_sigsegv_handler;
+	act.sa_flags = SA_SIGINFO;
+	act.sa_restorer = 0;
+	sigemptyset(&act.sa_mask);
+	sigaction(SIGSEGV, &act, NULL);
+
+	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
+	if (!handle) {
+		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
+		exit(-1);
+	}
+
+	rc = compress_file(argc, argv, handle);
+
+	nx_function_end(handle);
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c
new file mode 100644
index 000000000000..c055885da40a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c
@@ -0,0 +1,316 @@
+// 
SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Copyright 2020 IBM Corp.
+ *
+ * Author: Bulent Abali <abali@us.ibm.com>
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <endian.h>
+#include <bits/endian.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include "vas-api.h"
+#include "nx.h"
+#include "copy-paste.h"
+#include "nxu.h"
+#include "nx_dbg.h"
+#include <sys/platform/ppc.h>
+
+#define barrier()
+#define hwsync() ({ asm volatile("sync" ::: "memory"); })
+
+#ifndef NX_NO_CPU_PRI
+#define cpu_pri_default() ({ asm volatile ("or 2, 2, 2"); })
+#define cpu_pri_low() ({ asm volatile ("or 31, 31, 31"); })
+#else
+#define cpu_pri_default()
+#define cpu_pri_low()
+#endif
+
+/* Set by nxu_sigsegv_handler(); polled and cleared by the job loop */
+void *nx_fault_storage_address;
+
+struct nx_handle {
+	int fd;			/* open device node */
+	int function;		/* NX_FUNC_* selected by the caller */
+	void *paste_addr;	/* mapping base + 0x400 */
+};
+
+/*
+ * Opens devname, opens a VAS send window on it and maps the paste
+ * region. On success fills handle->fd and handle->paste_addr and
+ * returns 0; the descriptor stays open and is owned by the handle
+ * until nx_function_end(). On failure returns a negative errno value
+ * and leaves no descriptor open.
+ */
+static int open_device_nodes(char *devname, int pri, struct nx_handle *handle)
+{
+	int rc, fd, err;
+	void *addr;
+	struct vas_tx_win_open_attr txattr;
+
+	fd = open(devname, O_RDWR);
+	if (fd < 0) {
+		/* Capture errno before fprintf() has a chance to change it */
+		err = errno;
+		fprintf(stderr, " open device name %s\n", devname);
+		return -err;
+	}
+
+	memset(&txattr, 0, sizeof(txattr));
+	txattr.version = 1;
+	txattr.vas_id = pri;
+	rc = ioctl(fd, VAS_TX_WIN_OPEN, (unsigned long)&txattr);
+	if (rc < 0) {
+		err = errno;
+		fprintf(stderr, "ioctl() n %d, error %d\n", rc, err);
+		rc = -err;
+		goto out;
+	}
+
+	addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0ULL);
+	if (addr == MAP_FAILED) {
+		err = errno;
+		fprintf(stderr, "mmap() failed, errno %d\n", err);
+		rc = -err;
+		goto out;
+	}
+	handle->fd = fd;
+	handle->paste_addr = (void *)((char *)addr + 0x400);
+
+	/* Success: do not fall through to close(); nx_function_end()
+	 * closes handle->fd.
+	 */
+	return 0;
+out:
+	close(fd);
+	return rc;
+}
+
+/*
+ * Allocates a handle for the requested NX function and opens the
+ * accelerator device. Only NX_FUNC_COMP_GZIP is accepted.
+ * Returns the handle, or NULL with errno set on failure.
+ */
+void *nx_function_begin(int function, int pri)
+{
+	int rc;
+	char *devname = "/dev/crypto/nx-gzip";
+	struct nx_handle *nxhandle;
+
+	if (function != NX_FUNC_COMP_GZIP) {
+		fprintf(stderr, " NX_FUNC_COMP_GZIP not found\n");
+		/* Set errno last so the message above cannot clobber it */
+		errno = EINVAL;
+		return NULL;
+	}
+
+
+	nxhandle = malloc(sizeof(*nxhandle));
+	if (!nxhandle) {
+		fprintf(stderr, " No memory\n");
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	nxhandle->function = function;
+	rc = open_device_nodes(devname, pri, nxhandle);
+	if (rc < 0) {
+		fprintf(stderr, " open_device_nodes failed\n");
+		/* The handle is not returned, so release it here */
+		free(nxhandle);
+		errno = -rc;
+		return NULL;
+	}
+
+	return nxhandle;
+}
+
+/*
+ * Unmaps the paste region, closes the device and frees the handle
+ * obtained from nx_function_begin(). The descriptor and the handle
+ * are released even when munmap() fails.
+ * Returns the munmap() result.
+ */
+int nx_function_end(void *handle)
+{
+	int rc;
+	struct nx_handle *nxhandle = handle;
+
+	/* paste_addr is 0x400 past the start of the 4096-byte mapping */
+	rc = munmap((char *)nxhandle->paste_addr - 0x400, 4096);
+	if (rc < 0)
+		fprintf(stderr, "munmap() failed, errno %d\n", errno);
+
+	close(nxhandle->fd);
+	free(nxhandle);
+
+	return rc;
+}
+
+static int nx_wait_for_csb(struct nx_gzip_crb_cpb_t *cmdp)
+{
+	long poll = 0;
+	uint64_t t;
+
+	/* Save power and let other threads use the h/w. top may show
+	 * 100% but only because OS doesn't know we slowed this
+	 * h/w thread while polling. We're letting other threads have
+	 * higher throughput on the core.
+	 */
+	cpu_pri_low();
+
+#define CSB_MAX_POLL 200000000UL
+#define USLEEP_TH     300000UL
+
+	t = __ppc_get_timebase();
+
+	while (getnn(cmdp->crb.csb, csb_v) == 0) {
+		++poll;
+		hwsync();
+
+		cpu_pri_low();
+
+		/* usleep(0) takes around 29000 ticks ~60 us.
+		 * 300000 is spinning for about 600 us then
+		 * start sleeping.
+		 */
+		if ((__ppc_get_timebase() - t) > USLEEP_TH) {
+			cpu_pri_default();
+			usleep(1);
+		}
+
+		if (poll > CSB_MAX_POLL)
+			break;
+
+		/* Fault address from signal handler */
+		if (nx_fault_storage_address) {
+			cpu_pri_default();
+			return -EAGAIN;
+		}
+
+	}
+
+	cpu_pri_default();
+
+	/* hw has updated csb and output buffer */
+	hwsync();
+
+	/* Check CSB flags. 
*/
+	if (getnn(cmdp->crb.csb, csb_v) == 0) {
+		fprintf(stderr, "CSB still not valid after %d polls.\n",
+			(int) poll);
+		prt_err("CSB still not valid after %d polls, giving up.\n",
+			(int) poll);
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+/*
+ * Copies the CRB and pastes it to the accelerator, retrying up to
+ * 5000 times, then waits for the CSB to become valid.
+ * Returns 0 once a job has completed, or a negative errno value:
+ * the nx_wait_for_csb() error, or -EBUSY when the last paste attempt
+ * was rejected.
+ */
+static int nxu_run_job(struct nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+	int i, ret, retries, paste_rc;
+	/* Counts slow-path paste failures; lives outside the loop so the
+	 * message below really is printed only once per 100 failures.
+	 */
+	unsigned int pr = 0;
+	struct nx_handle *nxhandle = handle;
+
+	assert(handle != NULL);
+	i = 0;
+	retries = 5000;
+	/* Reported if the loop ends without a successful paste */
+	ret = -EBUSY;
+	while (i++ < retries) {
+		hwsync();
+		vas_copy(&cmdp->crb, 0);
+		paste_rc = vas_paste(nxhandle->paste_addr, 0);
+		hwsync();
+
+		NXPRT(fprintf(stderr, "Paste attempt %d/%d returns 0x%x\n",
+			      i, retries, paste_rc));
+
+		if ((paste_rc == 2) || (paste_rc == 3)) {
+
+			ret = nx_wait_for_csb(cmdp);
+			if (!ret) {
+				goto out;
+			} else if (ret == -EAGAIN) {
+				long x;
+
+				prt_err("Touching address %p, 0x%lx\n",
+					 nx_fault_storage_address,
+					 *(long *) nx_fault_storage_address);
+				/* Read and write back the faulting word
+				 * before resubmitting the job.
+				 */
+				x = *(long *) nx_fault_storage_address;
+				*(long *) nx_fault_storage_address = x;
+				nx_fault_storage_address = 0;
+				continue;
+			} else {
+				prt_err("wait_for_csb() returns %d\n", ret);
+				break;
+			}
+		} else {
+			/* Keep the raw paste status out of the return
+			 * value: it may be 0, which callers read as
+			 * success.
+			 */
+			ret = -EBUSY;
+			if (i < 10) {
+				/* spin for few ticks */
+#define SPIN_TH 500UL
+				uint64_t fail_spin;
+
+				fail_spin = __ppc_get_timebase();
+				while ((__ppc_get_timebase() - fail_spin) <
+					 SPIN_TH)
+					;
+			} else {
+				/* sleep */
+				if (pr++ % 100 == 0) {
+					prt_err("Paste attempt %d/", i);
+					prt_err("%d, failed pid= %d\n", retries,
+						getpid());
+				}
+				usleep(1);
+			}
+			continue;
+		}
+	}
+
+out:
+	cpu_pri_default();
+
+	return ret;
+}
+
+/*
+ * Runs one job and returns its completion code: the csb_cc field of
+ * the CSB when the job ran, otherwise the negative error from
+ * nxu_run_job().
+ */
+int nxu_submit_job(struct nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+	int cc;
+
+	cc = nxu_run_job(cmdp, handle);
+
+	if (!cc)
+		cc = getnn(cmdp->crb.csb, csb_cc);      /* CC Table 6-8 */
+
+	return cc;
+}
+
+
+/*
+ * SIGSEGV handler: records the faulting address for the job loop.
+ * NOTE(review): fprintf() is not async-signal-safe; tolerated here
+ * as test diagnostics only.
+ */
+void nxu_sigsegv_handler(int sig, siginfo_t *info, void *ctx)
+{
+	fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(),
+		sig, info->si_code, info->si_addr);
+
+	nx_fault_storage_address = info->si_addr;
+}
+
+/*
+ * Fault in pages 
prior to NX job submission. wr=1 may be required to + * touch writeable pages. System zero pages do not fault-in the page as + * intended. Typically set wr=1 for NX target pages and set wr=0 for NX + * source pages. + */ +int nxu_touch_pages(void *buf, long buf_len, long page_len, int wr) +{ + char *begin = buf; + char *end = (char *) buf + buf_len - 1; + volatile char t; + + assert(buf_len >= 0 && !!buf); + + NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf, + (buf + buf_len), buf_len, wr)); + + if (buf_len <= 0 || buf == NULL) + return -1; + + do { + t = *begin; + if (wr) + *begin = t; + begin = begin + page_len; + } while (begin < end); + + /* When buf_sz is small or buf tail is in another page */ + t = *end; + if (wr) + *end = t; + + return 0; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h b/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h new file mode 100644 index 000000000000..0db2d6485037 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +/* From asm-compat.h */ +#define __stringify_in_c(...) #__VA_ARGS__ +#define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " + +/* + * Macros taken from arch/powerpc/include/asm/ppc-opcode.h and other + * header files. + */ +#define ___PPC_RA(a) (((a) & 0x1f) << 16) +#define ___PPC_RB(b) (((b) & 0x1f) << 11) + +#define PPC_INST_COPY 0x7c20060c +#define PPC_INST_PASTE 0x7c20070d + +#define PPC_COPY(a, b) stringify_in_c(.long PPC_INST_COPY | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_PASTE(a, b) stringify_in_c(.long PPC_INST_PASTE | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define CR0_SHIFT 28 +#define CR0_MASK 0xF +/* + * Copy/paste instructions: + * + * copy RA,RB + * Copy contents of address (RA) + effective_address(RB) + * to internal copy-buffer. 
+ * + * paste RA,RB + * Paste contents of internal copy-buffer to the address + * (RA) + effective_address(RB) + */ +static inline int vas_copy(void *crb, int offset) +{ + asm volatile(PPC_COPY(%0, %1)";" + : + : "b" (offset), "b" (crb) + : "memory"); + + return 0; +} + +static inline int vas_paste(void *paste_address, int offset) +{ + __u32 cr; + + cr = 0; + asm volatile(PPC_PASTE(%1, %2)";" + "mfocrf %0, 0x80;" + : "=r" (cr) + : "b" (offset), "b" (paste_address) + : "memory", "cr0"); + + return (cr >> CR0_SHIFT) & CR0_MASK; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/crb.h b/tools/testing/selftests/powerpc/nx-gzip/include/crb.h new file mode 100644 index 000000000000..ab101085fa7e --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/crb.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __CRB_H +#define __CRB_H +#include <linux/types.h> +#include "nx.h" + +/* CCW 842 CI/FC masks + * NX P8 workbook, section 4.3.1, figure 4-6 + * "CI/FC Boundary by NX CT type" + */ +#define CCW_CI_842 (0x00003ff8) +#define CCW_FC_842 (0x00000007) + +/* Chapter 6.5.8 Coprocessor-Completion Block (CCB) */ + +#define CCB_VALUE (0x3fffffffffffffff) +#define CCB_ADDRESS (0xfffffffffffffff8) +#define CCB_CM (0x0000000000000007) +#define CCB_CM0 (0x0000000000000004) +#define CCB_CM12 (0x0000000000000003) + +#define CCB_CM0_ALL_COMPLETIONS (0x0) +#define CCB_CM0_LAST_IN_CHAIN (0x4) +#define CCB_CM12_STORE (0x0) +#define CCB_CM12_INTERRUPT (0x1) + +#define CCB_SIZE (0x10) +#define CCB_ALIGN CCB_SIZE + +struct coprocessor_completion_block { + __be64 value; + __be64 address; +} __aligned(CCB_ALIGN); + + +/* Chapter 6.5.7 Coprocessor-Status Block (CSB) */ + +#define CSB_V (0x80) +#define CSB_F (0x04) +#define CSB_CH (0x03) +#define CSB_CE_INCOMPLETE (0x80) +#define CSB_CE_TERMINATION (0x40) +#define CSB_CE_TPBC (0x20) + +#define CSB_CC_SUCCESS (0) +#define CSB_CC_INVALID_ALIGN (1) +#define CSB_CC_OPERAND_OVERLAP (2) +#define 
CSB_CC_DATA_LENGTH (3) +#define CSB_CC_TRANSLATION (5) +#define CSB_CC_PROTECTION (6) +#define CSB_CC_RD_EXTERNAL (7) +#define CSB_CC_INVALID_OPERAND (8) +#define CSB_CC_PRIVILEGE (9) +#define CSB_CC_INTERNAL (10) +#define CSB_CC_WR_EXTERNAL (12) +#define CSB_CC_NOSPC (13) +#define CSB_CC_EXCESSIVE_DDE (14) +#define CSB_CC_WR_TRANSLATION (15) +#define CSB_CC_WR_PROTECTION (16) +#define CSB_CC_UNKNOWN_CODE (17) +#define CSB_CC_ABORT (18) +#define CSB_CC_TRANSPORT (20) +#define CSB_CC_SEGMENTED_DDL (31) +#define CSB_CC_PROGRESS_POINT (32) +#define CSB_CC_DDE_OVERFLOW (33) +#define CSB_CC_SESSION (34) +#define CSB_CC_PROVISION (36) +#define CSB_CC_CHAIN (37) +#define CSB_CC_SEQUENCE (38) +#define CSB_CC_HW (39) + +#define CSB_SIZE (0x10) +#define CSB_ALIGN CSB_SIZE + +struct coprocessor_status_block { + __u8 flags; + __u8 cs; + __u8 cc; + __u8 ce; + __be32 count; + __be64 address; +} __aligned(CSB_ALIGN); + + +/* Chapter 6.5.10 Data-Descriptor List (DDL) + * each list contains one or more Data-Descriptor Entries (DDE) + */ + +#define DDE_P (0x8000) + +#define DDE_SIZE (0x10) +#define DDE_ALIGN DDE_SIZE + +struct data_descriptor_entry { + __be16 flags; + __u8 count; + __u8 index; + __be32 length; + __be64 address; +} __aligned(DDE_ALIGN); + + +/* Chapter 6.5.2 Coprocessor-Request Block (CRB) */ + +#define CRB_SIZE (0x80) +#define CRB_ALIGN (0x100) /* Errata: requires 256 alignment */ + + +/* Coprocessor Status Block field + * ADDRESS address of CSB + * C CCB is valid + * AT 0 = addrs are virtual, 1 = addrs are phys + * M enable perf monitor + */ +#define CRB_CSB_ADDRESS (0xfffffffffffffff0) +#define CRB_CSB_C (0x0000000000000008) +#define CRB_CSB_AT (0x0000000000000002) +#define CRB_CSB_M (0x0000000000000001) + +struct coprocessor_request_block { + __be32 ccw; + __be32 flags; + __be64 csb_addr; + + struct data_descriptor_entry source; + struct data_descriptor_entry target; + + struct coprocessor_completion_block ccb; + + __u8 reserved[48]; + + struct 
coprocessor_status_block csb; +} __aligned(CRB_ALIGN); + +#define crb_csb_addr(c) __be64_to_cpu(c->csb_addr) +#define crb_nx_fault_addr(c) __be64_to_cpu(c->stamp.nx.fault_storage_addr) +#define crb_nx_flags(c) c->stamp.nx.flags +#define crb_nx_fault_status(c) c->stamp.nx.fault_status +#define crb_nx_pswid(c) c->stamp.nx.pswid + + +/* RFC02167 Initiate Coprocessor Instructions document + * Chapter 8.2.1.1.1 RS + * Chapter 8.2.3 Coprocessor Directive + * Chapter 8.2.4 Execution + * + * The CCW must be converted to BE before passing to icswx() + */ + +#define CCW_PS (0xff000000) +#define CCW_CT (0x00ff0000) +#define CCW_CD (0x0000ffff) +#define CCW_CL (0x0000c000) + +#endif diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nx.h b/tools/testing/selftests/powerpc/nx-gzip/include/nx.h new file mode 100644 index 000000000000..1abe23fc29e8 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/nx.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2020 IBM Corp. 
+ * + */ +#ifndef _NX_H +#define _NX_H + +#include <stdbool.h> + +#define NX_FUNC_COMP_842 1 +#define NX_FUNC_COMP_GZIP 2 + +#ifndef __aligned +#define __aligned(x) __attribute__((aligned(x))) +#endif + +struct nx842_func_args { + bool use_crc; + bool decompress; /* true decompress; false compress */ + bool move_data; + int timeout; /* seconds */ +}; + +struct nxbuf_t { + int len; + char *buf; +}; + +/* @function should be EFT (aka 842), GZIP etc */ +void *nx_function_begin(int function, int pri); + +int nx_function(void *handle, struct nxbuf_t *in, struct nxbuf_t *out, + void *arg); + +int nx_function_end(void *handle); + +#endif /* _NX_H */ diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h b/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h new file mode 100644 index 000000000000..16464e19c47f --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2020 IBM Corporation + * + */ + +#ifndef _NXU_DBG_H_ +#define _NXU_DBG_H_ + +#include <sys/file.h> +#include <stdint.h> +#include <stdio.h> +#include <time.h> +#include <pthread.h> + +extern FILE * nx_gzip_log; +extern int nx_gzip_trace; +extern unsigned int nx_gzip_inflate_impl; +extern unsigned int nx_gzip_deflate_impl; +extern unsigned int nx_gzip_inflate_flags; +extern unsigned int nx_gzip_deflate_flags; + +extern int nx_dbg; +pthread_mutex_t mutex_log; + +#define nx_gzip_trace_enabled() (nx_gzip_trace & 0x1) +#define nx_gzip_hw_trace_enabled() (nx_gzip_trace & 0x2) +#define nx_gzip_sw_trace_enabled() (nx_gzip_trace & 0x4) +#define nx_gzip_gather_statistics() (nx_gzip_trace & 0x8) +#define nx_gzip_per_stream_stat() (nx_gzip_trace & 0x10) + +#define prt(fmt, ...) 
do { \ + pthread_mutex_lock(&mutex_log); \ + flock(nx_gzip_log->_fileno, LOCK_EX); \ + time_t t; struct tm *m; time(&t); m = localtime(&t); \ + fprintf(nx_gzip_log, "[%04d/%02d/%02d %02d:%02d:%02d] " \ + "pid %d: " fmt, \ + (int)m->tm_year + 1900, (int)m->tm_mon+1, (int)m->tm_mday, \ + (int)m->tm_hour, (int)m->tm_min, (int)m->tm_sec, \ + (int)getpid(), ## __VA_ARGS__); \ + fflush(nx_gzip_log); \ + flock(nx_gzip_log->_fileno, LOCK_UN); \ + pthread_mutex_unlock(&mutex_log); \ +} while (0) + +/* Use in case of an error */ +#define prt_err(fmt, ...) do { if (nx_dbg >= 0) { \ + prt("%s:%u: Error: "fmt, \ + __FILE__, __LINE__, ## __VA_ARGS__); \ +}} while (0) + +/* Use in case of an warning */ +#define prt_warn(fmt, ...) do { if (nx_dbg >= 1) { \ + prt("%s:%u: Warning: "fmt, \ + __FILE__, __LINE__, ## __VA_ARGS__); \ +}} while (0) + +/* Informational printouts */ +#define prt_info(fmt, ...) do { if (nx_dbg >= 2) { \ + prt("Info: "fmt, ## __VA_ARGS__); \ +}} while (0) + +/* Trace zlib wrapper code */ +#define prt_trace(fmt, ...) do { if (nx_gzip_trace_enabled()) { \ + prt("### "fmt, ## __VA_ARGS__); \ +}} while (0) + +/* Trace statistics */ +#define prt_stat(fmt, ...) do { if (nx_gzip_gather_statistics()) { \ + prt("### "fmt, ## __VA_ARGS__); \ +}} while (0) + +/* Trace zlib hardware implementation */ +#define hw_trace(fmt, ...) do { \ + if (nx_gzip_hw_trace_enabled()) \ + fprintf(nx_gzip_log, "hhh " fmt, ## __VA_ARGS__); \ + } while (0) + +/* Trace zlib software implementation */ +#define sw_trace(fmt, ...) 
do { \ + if (nx_gzip_sw_trace_enabled()) \ + fprintf(nx_gzip_log, "sss " fmt, ## __VA_ARGS__); \ + } while (0) + + +/** + * str_to_num - Convert string into number and copy with endings like + * KiB for kilobyte + * MiB for megabyte + * GiB for gigabyte + */ +uint64_t str_to_num(char *str); +void nx_lib_debug(int onoff); + +#endif /* _NXU_DBG_H_ */ diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h b/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h new file mode 100644 index 000000000000..20a4e883e0d3 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h @@ -0,0 +1,650 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Hardware interface of the NX-GZIP compression accelerator + * + * Copyright (C) IBM Corporation, 2020 + * + * Author: Bulent Abali <abali@us.ibm.com> + * + */ + +#ifndef _NXU_H +#define _NXU_H + +#include <stdint.h> +#include <endian.h> +#include "nx.h" + +/* deflate */ +#define LLSZ 286 +#define DSZ 30 + +/* nx */ +#define DHTSZ 18 +#define DHT_MAXSZ 288 +#define MAX_DDE_COUNT 256 + +/* util */ +#ifdef NXDBG +#define NXPRT(X) X +#else +#define NXPRT(X) +#endif + +#ifdef NXTIMER +#include <sys/platform/ppc.h> +#define NX_CLK(X) X +#define nx_get_time() __ppc_get_timebase() +#define nx_get_freq() __ppc_get_timebase_freq() +#else +#define NX_CLK(X) +#define nx_get_time() (-1) +#define nx_get_freq() (-1) +#endif + +#define NX_MAX_FAULTS 500 + +/* + * Definitions of acronyms used here. 
See + * P9 NX Gzip Accelerator User's Manual for details: + * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf + * + * adler/crc: 32 bit checksums appended to stream tail + * ce: completion extension + * cpb: coprocessor parameter block (metadata) + * crb: coprocessor request block (command) + * csb: coprocessor status block (status) + * dht: dynamic huffman table + * dde: data descriptor element (address, length) + * ddl: list of ddes + * dh/fh: dynamic and fixed huffman types + * fc: coprocessor function code + * histlen: history/dictionary length + * history: sliding window of up to 32KB of data + * lzcount: Deflate LZ symbol counts + * rembytecnt: remaining byte count + * sfbt: source final block type; last block's type during decomp + * spbc: source processed byte count + * subc: source unprocessed bit count + * tebc: target ending bit count; valid bits in the last byte + * tpbc: target processed byte count + * vas: virtual accelerator switch; the user mode interface + */ + +union nx_qw_t { + uint32_t word[4]; + uint64_t dword[2]; +} __aligned(16); + +/* + * Note: NX registers with fewer than 32 bits are declared by + * convention as uint32_t variables in unions. If *_offset and *_mask + * are defined for a variable, then use get_ put_ macros to + * conveniently access the register fields for endian conversions. + */ + +struct nx_dde_t { + /* Data Descriptor Element, Section 6.4 */ + union { + uint32_t dde_count; + /* When dde_count == 0 ddead is a pointer to a data buffer; + * ddebc is the buffer length bytes. + * When dde_count > 0 dde is an indirect dde; ddead is a + * pointer to a contiguous list of direct ddes; ddebc is the + * total length of all data pointed to by the list of direct + * ddes. Note that only one level of indirection is permitted. + * See Section 6.4 of the user manual for additional details. 
+ */ + }; + uint32_t ddebc; /* dde byte count */ + uint64_t ddead; /* dde address */ +} __aligned(16); + +struct nx_csb_t { + /* Coprocessor Status Block, Section 6.6 */ + union { + uint32_t csb_v; + /* Valid bit. v must be set to 0 by the program + * before submitting the coprocessor command. + * Software can poll for the v bit + */ + + uint32_t csb_f; + /* 16B CSB size. Written to 0 by DMA when it writes the CPB */ + + uint32_t csb_cs; + /* cs completion sequence; unused */ + + uint32_t csb_cc; + /* cc completion code; cc != 0 exception occurred */ + + uint32_t csb_ce; + /* ce completion extension */ + + }; + uint32_t tpbc; + /* target processed byte count TPBC */ + + uint64_t fsaddr; + /* Section 6.12.1 CSB NonZero error summary. FSA Failing storage + * address. Address where error occurred. When available, written + * to A field of CSB + */ +} __aligned(16); + +struct nx_ccb_t { + /* Coprocessor Completion Block, Section 6.7 */ + + uint32_t reserved[3]; + union { + /* When crb.c==0 (no ccb defined) it is reserved; + * When crb.c==1 (ccb defined) it is cm + */ + + uint32_t ccb_cm; + /* Signal interrupt of crb.c==1 and cm==1 */ + + uint32_t word; + /* generic access to the 32bit word */ + }; +} __aligned(16); + +struct vas_stamped_crb_t { + /* + * CRB operand of the paste coprocessor instruction is stamped + * in quadword 4 with the information shown here as its written + * in to the receive FIFO of the coprocessor + */ + + union { + uint32_t vas_buf_num; + /* Verification only vas buffer number which correlates to + * the low order bits of the atag in the paste command + */ + + uint32_t send_wc_id; + /* Pointer to Send Window Context that provides for NX address + * translation information, such as MSR and LPCR bits, job + * completion interrupt RA, PSWID, and job utilization counter. + */ + + }; + union { + uint32_t recv_wc_id; + /* Pointer to Receive Window Context. NX uses this to return + * credits to a Receive FIFO as entries are dequeued. 
+ */ + + }; + uint32_t reserved2; + union { + uint32_t vas_invalid; + /* Invalid bit. If this bit is 1 the CRB is discarded by + * NX upon fetching from the receive FIFO. If this bit is 0 + * the CRB is processed normally. The bit is stamped to 0 + * by VAS and may be written to 1 by hypervisor while + * the CRB is in the receive FIFO (in memory). + */ + + }; +}; + +struct nx_stamped_fault_crb_t { + /* + * A CRB that has a translation fault is stamped by NX in quadword 4 + * and pasted to the Fault Send Window in VAS. + */ + uint64_t fsa; + union { + uint32_t nxsf_t; + uint32_t nxsf_fs; + }; + uint32_t pswid; +}; + +union stamped_crb_t { + struct vas_stamped_crb_t vas; + struct nx_stamped_fault_crb_t nx; +}; + +struct nx_gzip_cpb_t { + /* + * Coprocessor Parameter Block In/Out are used to pass metadata + * to/from accelerator. Tables 6.5 and 6.6 of the user manual. + */ + + /* CPBInput */ + + struct { + union { + union nx_qw_t qw0; + struct { + uint32_t in_adler; /* bits 0:31 */ + uint32_t in_crc; /* bits 32:63 */ + union { + uint32_t in_histlen; /* bits 64:75 */ + uint32_t in_subc; /* bits 93:95 */ + }; + union { + /* bits 108:111 */ + uint32_t in_sfbt; + /* bits 112:127 */ + uint32_t in_rembytecnt; + /* bits 116:127 */ + uint32_t in_dhtlen; + }; + }; + }; + union { + union nx_qw_t in_dht[DHTSZ]; /* qw[1:18] */ + char in_dht_char[DHT_MAXSZ]; /* byte access */ + }; + union nx_qw_t reserved[5]; /* qw[19:23] */ + }; + + /* CPBOutput */ + + volatile struct { + union { + union nx_qw_t qw24; + struct { + uint32_t out_adler; /* bits 0:31 qw[24] */ + uint32_t out_crc; /* bits 32:63 qw[24] */ + union { + /* bits 77:79 qw[24] */ + uint32_t out_tebc; + /* bits 80:95 qw[24] */ + uint32_t out_subc; + }; + union { + /* bits 108:111 qw[24] */ + uint32_t out_sfbt; + /* bits 112:127 qw[24] */ + uint32_t out_rembytecnt; + /* bits 116:127 qw[24] */ + uint32_t out_dhtlen; + }; + }; + }; + union { + union nx_qw_t qw25[79]; /* qw[25:103] */ + /* qw[25] compress no lzcounts or wrap */ + 
uint32_t out_spbc_comp_wrap; + uint32_t out_spbc_wrap; /* qw[25] wrap */ + /* qw[25] compress no lzcounts */ + uint32_t out_spbc_comp; + /* 286 LL and 30 D symbol counts */ + uint32_t out_lzcount[LLSZ+DSZ]; + struct { + union nx_qw_t out_dht[DHTSZ]; /* qw[25:42] */ + /* qw[43] decompress */ + uint32_t out_spbc_decomp; + }; + }; + /* qw[104] compress with lzcounts */ + uint32_t out_spbc_comp_with_count; + }; +} __aligned(128); + +struct nx_gzip_crb_t { + union { /* byte[0:3] */ + uint32_t gzip_fc; /* bits[24-31] */ + }; + uint32_t reserved1; /* byte[4:7] */ + union { + uint64_t csb_address; /* byte[8:15] */ + struct { + uint32_t reserved2; + union { + uint32_t crb_c; + /* c==0 no ccb defined */ + + uint32_t crb_at; + /* at==0 address type is ignored; + * all addrs effective assumed. + */ + + }; + }; + }; + struct nx_dde_t source_dde; /* byte[16:31] */ + struct nx_dde_t target_dde; /* byte[32:47] */ + volatile struct nx_ccb_t ccb; /* byte[48:63] */ + volatile union { + /* byte[64:239] shift csb by 128 bytes out of the crb; csb was + * in crb earlier; JReilly says csb written with partial inject + */ + union nx_qw_t reserved64[11]; + union stamped_crb_t stamp; /* byte[64:79] */ + }; + volatile struct nx_csb_t csb; +} __aligned(128); + +struct nx_gzip_crb_cpb_t { + struct nx_gzip_crb_t crb; + struct nx_gzip_cpb_t cpb; +} __aligned(2048); + + +/* + * NX hardware convention has the msb bit on the left numbered 0. + * The defines below has *_offset defined as the right most bit + * position of a field. x of size_mask(x) is the field width in bits. + */ + +#define size_mask(x) ((1U<<(x))-1) + +/* + * Offsets and Widths within the containing 32 bits of the various NX + * gzip hardware registers. 
Use the getnn/putnn macros to access + * these regs + */ + +#define dde_count_mask size_mask(8) +#define dde_count_offset 23 + +/* CSB */ + +#define csb_v_mask size_mask(1) +#define csb_v_offset 0 +#define csb_f_mask size_mask(1) +#define csb_f_offset 6 +#define csb_cs_mask size_mask(8) +#define csb_cs_offset 15 +#define csb_cc_mask size_mask(8) +#define csb_cc_offset 23 +#define csb_ce_mask size_mask(8) +#define csb_ce_offset 31 + +/* CCB */ + +#define ccb_cm_mask size_mask(3) +#define ccb_cm_offset 31 + +/* VAS stamped CRB fields */ + +#define vas_buf_num_mask size_mask(6) +#define vas_buf_num_offset 5 +#define send_wc_id_mask size_mask(16) +#define send_wc_id_offset 31 +#define recv_wc_id_mask size_mask(16) +#define recv_wc_id_offset 31 +#define vas_invalid_mask size_mask(1) +#define vas_invalid_offset 31 + +/* NX stamped fault CRB fields */ + +#define nxsf_t_mask size_mask(1) +#define nxsf_t_offset 23 +#define nxsf_fs_mask size_mask(8) +#define nxsf_fs_offset 31 + +/* CPB input */ + +#define in_histlen_mask size_mask(12) +#define in_histlen_offset 11 +#define in_dhtlen_mask size_mask(12) +#define in_dhtlen_offset 31 +#define in_subc_mask size_mask(3) +#define in_subc_offset 31 +#define in_sfbt_mask size_mask(4) +#define in_sfbt_offset 15 +#define in_rembytecnt_mask size_mask(16) +#define in_rembytecnt_offset 31 + +/* CPB output */ + +#define out_tebc_mask size_mask(3) +#define out_tebc_offset 15 +#define out_subc_mask size_mask(16) +#define out_subc_offset 31 +#define out_sfbt_mask size_mask(4) +#define out_sfbt_offset 15 +#define out_rembytecnt_mask size_mask(16) +#define out_rembytecnt_offset 31 +#define out_dhtlen_mask size_mask(12) +#define out_dhtlen_offset 31 + +/* CRB */ + +#define gzip_fc_mask size_mask(8) +#define gzip_fc_offset 31 +#define crb_c_mask size_mask(1) +#define crb_c_offset 28 +#define crb_at_mask size_mask(1) +#define crb_at_offset 30 +#define csb_address_mask ~(15UL) /* mask off bottom 4b */ + +/* + * Access macros for the registers. 
Do not access registers directly + * because of the endian conversion. P9 processor may run either as + * Little or Big endian. However the NX coprocessor regs are always + * big endian. + * Use the 32 and 64b macros to access respective + * register sizes. + * Use nn forms for the register fields shorter than 32 bits. + */ + +#define getnn(ST, REG) ((be32toh(ST.REG) >> (31-REG##_offset)) \ + & REG##_mask) +#define getpnn(ST, REG) ((be32toh((ST)->REG) >> (31-REG##_offset)) \ + & REG##_mask) +#define get32(ST, REG) (be32toh(ST.REG)) +#define getp32(ST, REG) (be32toh((ST)->REG)) +#define get64(ST, REG) (be64toh(ST.REG)) +#define getp64(ST, REG) (be64toh((ST)->REG)) + +#define unget32(ST, REG) (get32(ST, REG) & ~((REG##_mask) \ + << (31-REG##_offset))) +/* get 32bits less the REG field */ + +#define ungetp32(ST, REG) (getp32(ST, REG) & ~((REG##_mask) \ + << (31-REG##_offset))) +/* get 32bits less the REG field */ + +#define clear_regs(ST) memset((void *)(&(ST)), 0, sizeof(ST)) +#define clear_dde(ST) do { ST.dde_count = ST.ddebc = 0; ST.ddead = 0; \ + } while (0) +#define clearp_dde(ST) do { (ST)->dde_count = (ST)->ddebc = 0; \ + (ST)->ddead = 0; \ + } while (0) +#define clear_struct(ST) memset((void *)(&(ST)), 0, sizeof(ST)) +#define putnn(ST, REG, X) (ST.REG = htobe32(unget32(ST, REG) | (((X) \ + & REG##_mask) << (31-REG##_offset)))) +#define putpnn(ST, REG, X) ((ST)->REG = htobe32(ungetp32(ST, REG) \ + | (((X) & REG##_mask) << (31-REG##_offset)))) + +#define put32(ST, REG, X) (ST.REG = htobe32(X)) +#define putp32(ST, REG, X) ((ST)->REG = htobe32(X)) +#define put64(ST, REG, X) (ST.REG = htobe64(X)) +#define putp64(ST, REG, X) ((ST)->REG = htobe64(X)) + +/* + * Completion extension ce(0) ce(1) ce(2). Bits ce(3-7) + * unused. Section 6.6 Figure 6.7. 
+ */ + +#define get_csb_ce(ST) ((uint32_t)getnn(ST, csb_ce)) +#define get_csb_ce_ms3b(ST) (get_csb_ce(ST) >> 5) +#define put_csb_ce_ms3b(ST, X) putnn(ST, csb_ce, ((uint32_t)(X) << 5)) + +#define CSB_CE_PARTIAL 0x4 +#define CSB_CE_TERMINATE 0x2 +#define CSB_CE_TPBC_VALID 0x1 + +#define csb_ce_termination(X) (!!((X) & CSB_CE_TERMINATE)) +/* termination, output buffers may be modified, SPBC/TPBC invalid Fig.6-7 */ + +#define csb_ce_check_completion(X) (!csb_ce_termination(X)) +/* if not terminated then check full or partial completion */ + +#define csb_ce_partial_completion(X) (!!((X) & CSB_CE_PARTIAL)) +#define csb_ce_full_completion(X) (!csb_ce_partial_completion(X)) +#define csb_ce_tpbc_valid(X) (!!((X) & CSB_CE_TPBC_VALID)) +/* TPBC indicates successfully stored data count */ + +#define csb_ce_default_err(X) csb_ce_termination(X) +/* most error CEs have CE(0)=0 and CE(1)=1 */ + +#define csb_ce_cc3_partial(X) csb_ce_partial_completion(X) +/* some CC=3 are partially completed, Table 6-8 */ + +#define csb_ce_cc64(X) ((((X) & (CSB_CE_PARTIAL \ + | CSB_CE_TERMINATE)) == 0)) +/* Compression: when TPBC>SPBC then CC=64 Table 6-8; target didn't + * compress smaller than source. + */ + +/* Decompress SFBT combinations Tables 5-3, 6-4, 6-6 */ + +#define SFBT_BFINAL 0x1 +#define SFBT_LIT 0x4 +#define SFBT_FHT 0x5 +#define SFBT_DHT 0x6 +#define SFBT_HDR 0x7 + +/* + * NX gzip function codes. Table 6.2. + * Bits 0:4 are the FC. Bit 5 is used by the DMA controller to + * select one of the two Byte Count Limits.
+ */ + +#define GZIP_FC_LIMIT_MASK 0x01 +#define GZIP_FC_COMPRESS_FHT 0x00 +#define GZIP_FC_COMPRESS_DHT 0x02 +#define GZIP_FC_COMPRESS_FHT_COUNT 0x04 +#define GZIP_FC_COMPRESS_DHT_COUNT 0x06 +#define GZIP_FC_COMPRESS_RESUME_FHT 0x08 +#define GZIP_FC_COMPRESS_RESUME_DHT 0x0a +#define GZIP_FC_COMPRESS_RESUME_FHT_COUNT 0x0c +#define GZIP_FC_COMPRESS_RESUME_DHT_COUNT 0x0e +#define GZIP_FC_DECOMPRESS 0x10 +#define GZIP_FC_DECOMPRESS_SINGLE_BLK_N_SUSPEND 0x12 +#define GZIP_FC_DECOMPRESS_RESUME 0x14 +#define GZIP_FC_DECOMPRESS_RESUME_SINGLE_BLK_N_SUSPEND 0x16 +#define GZIP_FC_WRAP 0x1e + +#define fc_is_compress(fc) (((fc) & 0x10) == 0) +#define fc_has_count(fc) (fc_is_compress(fc) && (((fc) & 0x4) != 0)) + +/* CSB.CC Error codes */ + +#define ERR_NX_OK 0 +#define ERR_NX_ALIGNMENT 1 +#define ERR_NX_OPOVERLAP 2 +#define ERR_NX_DATA_LENGTH 3 +#define ERR_NX_TRANSLATION 5 +#define ERR_NX_PROTECTION 6 +#define ERR_NX_EXTERNAL_UE7 7 +#define ERR_NX_INVALID_OP 8 +#define ERR_NX_PRIVILEGE 9 +#define ERR_NX_INTERNAL_UE 10 +#define ERR_NX_EXTERN_UE_WR 12 +#define ERR_NX_TARGET_SPACE 13 +#define ERR_NX_EXCESSIVE_DDE 14 +#define ERR_NX_TRANSL_WR 15 +#define ERR_NX_PROTECT_WR 16 +#define ERR_NX_SUBFUNCTION 17 +#define ERR_NX_FUNC_ABORT 18 +#define ERR_NX_BYTE_MAX 19 +#define ERR_NX_CORRUPT_CRB 20 +#define ERR_NX_INVALID_CRB 21 +#define ERR_NX_INVALID_DDE 30 +#define ERR_NX_SEGMENTED_DDL 31 +#define ERR_NX_DDE_OVERFLOW 33 +#define ERR_NX_TPBC_GT_SPBC 64 +#define ERR_NX_MISSING_CODE 66 +#define ERR_NX_INVALID_DIST 67 +#define ERR_NX_INVALID_DHT 68 +#define ERR_NX_EXTERNAL_UE90 90 +#define ERR_NX_WDOG_TIMER 224 +#define ERR_NX_AT_FAULT 250 +#define ERR_NX_INTR_SERVER 252 +#define ERR_NX_UE253 253 +#define ERR_NX_NO_HW 254 +#define ERR_NX_HUNG_OP 255 +#define ERR_NX_END 256 + +/* initial values for non-resume operations */ +#define INIT_CRC 0 /* crc32(0L, Z_NULL, 0) */ +#define INIT_ADLER 1 /* adler32(0L, Z_NULL, 0) adler is initialized to 1 */ + +/* prototypes */ +int 
nxu_submit_job(struct nx_gzip_crb_cpb_t *c, void *handle); + +extern void nxu_sigsegv_handler(int sig, siginfo_t *info, void *ctx); +extern int nxu_touch_pages(void *buf, long buf_len, long page_len, int wr); + +/* caller supplies a print buffer 4*sizeof(crb) */ + +char *nx_crb_str(struct nx_gzip_crb_t *crb, char *prbuf); +char *nx_cpb_str(struct nx_gzip_cpb_t *cpb, char *prbuf); +char *nx_prt_hex(void *cp, int sz, char *prbuf); +char *nx_lzcount_str(struct nx_gzip_cpb_t *cpb, char *prbuf); +char *nx_strerror(int e); + +#ifdef NX_SIM +#include <stdio.h> +int nx_sim_init(void *ctx); +int nx_sim_end(void *ctx); +int nxu_run_sim_job(struct nx_gzip_crb_cpb_t *c, void *ctx); +#endif /* NX_SIM */ + +/* Deflate stream manipulation */ + +#define set_final_bit(x) (x |= (unsigned char)1) +#define clr_final_bit(x) (x &= ~(unsigned char)1) + +#define append_empty_fh_blk(p, b) do { *(p) = (2 | (1&(b))); *((p)+1) = 0; \ + } while (0) +/* append 10 bits 0000001b 00...... ; + * assumes appending starts on a byte boundary; b is the final bit. + */ + + +#ifdef NX_842 + +/* 842 Engine */ + +struct nx_eft_crb_t { + union { /* byte[0:3] */ + uint32_t eft_fc; /* bits[29-31] */ + }; + uint32_t reserved1; /* byte[4:7] */ + union { + uint64_t csb_address; /* byte[8:15] */ + struct { + uint32_t reserved2; + union { + uint32_t crb_c; + /* c==0 no ccb defined */ + + uint32_t crb_at; + /* at==0 address type is ignored; + * all addrs effective assumed. 
+ */ + + }; + }; + }; + struct nx_dde_t source_dde; /* byte[16:31] */ + struct nx_dde_t target_dde; /* byte[32:47] */ + struct nx_ccb_t ccb; /* byte[48:63] */ + union { + union nx_qw_t reserved64[3]; /* byte[64:96] */ + }; + struct nx_csb_t csb; +} __aligned(128); + +/* 842 CRB */ + +#define EFT_FC_MASK size_mask(3) +#define EFT_FC_OFFSET 31 +#define EFT_FC_COMPRESS 0x0 +#define EFT_FC_COMPRESS_WITH_CRC 0x1 +#define EFT_FC_DECOMPRESS 0x2 +#define EFT_FC_DECOMPRESS_WITH_CRC 0x3 +#define EFT_FC_BLK_DATA_MOVE 0x4 +#endif /* NX_842 */ + +#endif /* _NXU_H */ diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h b/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h new file mode 120000 index 000000000000..77fb4c7236d0 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/uapi/asm/vas-api.h
\ No newline at end of file diff --git a/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh b/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh new file mode 100755 index 000000000000..c7b46c5fd7b3 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "Can't access /dev/crypto/nx-gzip, skipping" + echo "skip: $0" + exit 4 +fi + +set -e + +function cleanup +{ + rm -f nx-tempfile* +} + +trap cleanup EXIT + +function test_sizes +{ + local n=$1 + local fname="nx-tempfile.$n" + + for size in 4K 64K 1M 64M + do + echo "Testing $size ($n) ..." + dd if=/dev/urandom of=$fname bs=$size count=1 + ./gzfht_test $fname + ./gunz_test ${fname}.nx.gz + done +} + +echo "Doing basic test of different sizes ..." +test_sizes 0 + +echo "Running tests in parallel ..." +for i in {1..16} +do + test_sizes $i & +done + +wait + +echo "OK" + +exit 0 diff --git a/tools/testing/selftests/powerpc/pmu/.gitignore b/tools/testing/selftests/powerpc/pmu/.gitignore index ff7896903d7b..f69b1e2641a1 100644 --- a/tools/testing/selftests/powerpc/pmu/.gitignore +++ b/tools/testing/selftests/powerpc/pmu/.gitignore @@ -2,3 +2,4 @@ count_instructions l3_bank_test per_event_excludes +count_stcx_fail diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index 19046db995fe..904672fb78dd 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -2,7 +2,7 @@ noarg: $(MAKE) -C ../ -TEST_GEN_PROGS := count_instructions l3_bank_test per_event_excludes +TEST_GEN_PROGS := count_instructions count_stcx_fail l3_bank_test per_event_excludes EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. 
@@ -13,8 +13,12 @@ all: $(TEST_GEN_PROGS) ebb $(TEST_GEN_PROGS): $(EXTRA_SOURCES) # loop.S can only be built 64-bit +$(OUTPUT)/count_instructions: CFLAGS += -m64 $(OUTPUT)/count_instructions: loop.S count_instructions.c $(EXTRA_SOURCES) - $(CC) $(CFLAGS) -m64 -o $@ $^ + +$(OUTPUT)/count_stcx_fail: CFLAGS += -m64 +$(OUTPUT)/count_stcx_fail: loop.S $(EXTRA_SOURCES) + $(OUTPUT)/per_event_excludes: ../utils.c diff --git a/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c new file mode 100644 index 000000000000..7b4ac4537702 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c @@ -0,0 +1,161 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corp. + * Licensed under GPLv2. + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdbool.h> +#include <string.h> +#include <sys/prctl.h> + +#include "event.h" +#include "utils.h" +#include "lib.h" + +extern void thirty_two_instruction_loop_with_ll_sc(u64 loops, u64 *ll_sc_target); + +static void setup_event(struct event *e, u64 config, int type, char *name) +{ + event_init_opts(e, config, type, name); + + e->attr.disabled = 1; + e->attr.exclude_kernel = 1; + e->attr.exclude_hv = 1; + e->attr.exclude_idle = 1; +} + +static int do_count_loop(struct event *events, u64 instructions, + u64 overhead, bool report) +{ + s64 difference, expected; + double percentage; + u64 dummy; + + prctl(PR_TASK_PERF_EVENTS_ENABLE); + + /* Run for 1M instructions */ + thirty_two_instruction_loop_with_ll_sc(instructions >> 5, &dummy); + + prctl(PR_TASK_PERF_EVENTS_DISABLE); + + event_read(&events[0]); + event_read(&events[1]); + event_read(&events[2]); + + expected = instructions + overhead + (events[2].result.value * 10); + difference = events[0].result.value - expected; + percentage = (double)difference / events[0].result.value * 100; + + if (report) { + printf("-----\n"); + event_report(&events[0]); + event_report(&events[1]); + event_report(&events[2]); + + 
printf("Looped for %llu instructions, overhead %llu\n", instructions, overhead); + printf("Expected %llu\n", expected); + printf("Actual %llu\n", events[0].result.value); + printf("Delta %lld, %f%%\n", difference, percentage); + } + + event_reset(&events[0]); + event_reset(&events[1]); + event_reset(&events[2]); + + if (difference < 0) + difference = -difference; + + /* Tolerate a difference below 0.0001 % */ + difference *= 10000 * 100; + if (difference / events[0].result.value) + return -1; + + return 0; +} + +/* Count how many instructions it takes to do a null loop */ +static u64 determine_overhead(struct event *events) +{ + u64 current, overhead; + int i; + + do_count_loop(events, 0, 0, false); + overhead = events[0].result.value; + + for (i = 0; i < 100; i++) { + do_count_loop(events, 0, 0, false); + current = events[0].result.value; + if (current < overhead) { + printf("Replacing overhead %llu with %llu\n", overhead, current); + overhead = current; + } + } + + return overhead; +} + +#define PM_MRK_STCX_FAIL 0x03e158 +#define PM_STCX_FAIL 0x01e058 + +static int test_body(void) +{ + struct event events[3]; + u64 overhead; + + setup_event(&events[0], PERF_COUNT_HW_INSTRUCTIONS, PERF_TYPE_HARDWARE, "instructions"); + setup_event(&events[1], PERF_COUNT_HW_CPU_CYCLES, PERF_TYPE_HARDWARE, "cycles"); + setup_event(&events[2], PM_STCX_FAIL, PERF_TYPE_RAW, "stcx_fail"); + + if (event_open(&events[0])) { + perror("perf_event_open"); + return -1; + } + + if (event_open_with_group(&events[1], events[0].fd)) { + perror("perf_event_open"); + return -1; + } + + if (event_open_with_group(&events[2], events[0].fd)) { + perror("perf_event_open"); + return -1; + } + + overhead = determine_overhead(events); + printf("Overhead of null loop: %llu instructions\n", overhead); + + /* Run for 1Mi instructions */ + FAIL_IF(do_count_loop(events, 1000000, overhead, true)); + + /* Run for 10Mi instructions */ + FAIL_IF(do_count_loop(events, 10000000, overhead, true)); + + /* Run for 100Mi 
instructions */ + FAIL_IF(do_count_loop(events, 100000000, overhead, true)); + + /* Run for 1Bi instructions */ + FAIL_IF(do_count_loop(events, 1000000000, overhead, true)); + + /* Run for 16Bi instructions */ + FAIL_IF(do_count_loop(events, 16000000000, overhead, true)); + + /* Run for 64Bi instructions */ + FAIL_IF(do_count_loop(events, 64000000000, overhead, true)); + + event_close(&events[0]); + event_close(&events[1]); + + return 0; +} + +static int count_ll_sc(void) +{ + return eat_cpu(test_body); +} + +int main(void) +{ + return test_harness(count_ll_sc, "count_ll_sc"); +} diff --git a/tools/testing/selftests/powerpc/pmu/ebb/trace.h b/tools/testing/selftests/powerpc/pmu/ebb/trace.h index 7c0fb5d2bdb1..da2a3be5441f 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/trace.h +++ b/tools/testing/selftests/powerpc/pmu/ebb/trace.h @@ -18,7 +18,7 @@ struct trace_entry { u8 type; u8 length; - u8 data[0]; + u8 data[]; }; struct trace_buffer @@ -26,7 +26,7 @@ struct trace_buffer u64 size; bool overflow; void *tail; - u8 data[0]; + u8 data[]; }; struct trace_buffer *trace_buffer_allocate(u64 size); diff --git a/tools/testing/selftests/powerpc/pmu/loop.S b/tools/testing/selftests/powerpc/pmu/loop.S index 8cc9b5e2c9de..c52ba09b6fed 100644 --- a/tools/testing/selftests/powerpc/pmu/loop.S +++ b/tools/testing/selftests/powerpc/pmu/loop.S @@ -41,3 +41,38 @@ FUNC_START(thirty_two_instruction_loop) subi r3,r3,1 b FUNC_NAME(thirty_two_instruction_loop) FUNC_END(thirty_two_instruction_loop) + +FUNC_START(thirty_two_instruction_loop_with_ll_sc) + cmpdi r3,0 + beqlr + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 5 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 +1: ldarx r6,0,r4 # 10 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 15 + addi r5,r5,1 + addi r5,r5,1 + stdcx. 
r6,0,r4 + bne- 1b + addi r5,r5,1 # 20 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 25 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 30 + subi r3,r3,1 + b FUNC_NAME(thirty_two_instruction_loop_with_ll_sc) +FUNC_END(thirty_two_instruction_loop_with_ll_sc) diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index 932a032bf036..d6ae54663aed 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso +TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm diff --git a/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c b/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c new file mode 100644 index 000000000000..e3972264615b --- /dev/null +++ b/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test that a syscall does not get restarted twice, handled by trap_norestart() + * + * Based on Al's description, and a test for the bug fixed in this commit: + * + * commit 9a81c16b527528ad307843be5571111aa8d35a80 + * Author: Al Viro <viro@zeniv.linux.org.uk> + * Date: Mon Sep 20 21:48:57 2010 +0100 + * + * powerpc: fix double syscall restarts + * + * Make sigreturn zero regs->trap, make do_signal() do the same on all + * paths. As it is, signal interrupting e.g. read() from fd 512 (== + * ERESTARTSYS) with another signal getting unblocked when the first + * handler finishes will lead to restart one insn earlier than it ought + * to. Same for multiple signals with in-kernel handlers interrupting + * that sucker at the same time. Same for multiple signals of any kind + * interrupting that sucker on 64bit... 
+ */ +#define _GNU_SOURCE +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <signal.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include "utils.h" + +static void SIGUSR1_handler(int sig) +{ + kill(getpid(), SIGUSR2); + /* + * SIGUSR2 is blocked until the handler exits, at which point it will + * be raised again and think there is a restart to be done because the + * pending restarted syscall has 512 (ERESTARTSYS) in r3. The second + * restart will retreat NIP another 4 bytes to fail case branch. + */ +} + +static void SIGUSR2_handler(int sig) +{ +} + +static ssize_t raw_read(int fd, void *buf, size_t count) +{ + register long nr asm("r0") = __NR_read; + register long _fd asm("r3") = fd; + register void *_buf asm("r4") = buf; + register size_t _count asm("r5") = count; + + asm volatile( +" b 0f \n" +" b 1f \n" +" 0: sc 0 \n" +" bns 2f \n" +" neg %0,%0 \n" +" b 2f \n" +" 1: \n" +" li %0,%4 \n" +" 2: \n" + : "+r"(_fd), "+r"(nr), "+r"(_buf), "+r"(_count) + : "i"(-ENOANO) + : "memory", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "ctr", "cr0"); + + if (_fd < 0) { + errno = -_fd; + _fd = -1; + } + + return _fd; +} + +#define DATA "test 123" +#define DLEN (strlen(DATA)+1) + +int test_restart(void) +{ + int pipefd[2]; + pid_t pid; + char buf[512]; + + if (pipe(pipefd) == -1) { + perror("pipe"); + exit(EXIT_FAILURE); + } + + pid = fork(); + if (pid == -1) { + perror("fork"); + exit(EXIT_FAILURE); + } + + if (pid == 0) { /* Child reads from pipe */ + struct sigaction act; + int fd; + + memset(&act, 0, sizeof(act)); + sigaddset(&act.sa_mask, SIGUSR2); + act.sa_handler = SIGUSR1_handler; + act.sa_flags = SA_RESTART; + if (sigaction(SIGUSR1, &act, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&act, 0, sizeof(act)); + act.sa_handler = SIGUSR2_handler; + act.sa_flags = SA_RESTART; + if (sigaction(SIGUSR2, &act, NULL) == -1) { + perror("sigaction"); + 
exit(EXIT_FAILURE); + } + + /* Let's get ERESTARTSYS into r3 */ + while ((fd = dup(pipefd[0])) != 512) { + if (fd == -1) { + perror("dup"); + exit(EXIT_FAILURE); + } + } + + if (raw_read(fd, buf, 512) == -1) { + if (errno == ENOANO) { + fprintf(stderr, "Double restart moved restart before sc instruction.\n"); + _exit(EXIT_FAILURE); + } + perror("read"); + exit(EXIT_FAILURE); + } + + if (strncmp(buf, DATA, DLEN)) { + fprintf(stderr, "bad test string %s\n", buf); + exit(EXIT_FAILURE); + } + + return 0; + + } else { + int wstatus; + + usleep(100000); /* Hack to get reader waiting */ + kill(pid, SIGUSR1); + usleep(100000); + if (write(pipefd[1], DATA, DLEN) != DLEN) { + perror("write"); + exit(EXIT_FAILURE); + } + close(pipefd[0]); + close(pipefd[1]); + if (wait(&wstatus) == -1) { + perror("wait"); + exit(EXIT_FAILURE); + } + if (!WIFEXITED(wstatus)) { + fprintf(stderr, "child exited abnormally\n"); + exit(EXIT_FAILURE); + } + + FAIL_IF(WEXITSTATUS(wstatus) != EXIT_SUCCESS); + + return 0; + } +} + +int main(void) +{ + test_harness_set_timeout(10); + return test_harness(test_restart, "sig sys restart"); +} diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 4bca5a9327a4..bed4b5318a86 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -2,7 +2,9 @@ /fd-001-lookup /fd-002-posix-eq /fd-003-kthread +/proc-fsconfig-hidepid /proc-loadavg-001 +/proc-multiple-procfs /proc-pid-vm /proc-self-map-files-001 /proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index a8ed0f684829..8be8a03d2973 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -19,5 +19,7 @@ TEST_GEN_PROGS += self TEST_GEN_PROGS += setns-dcache TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self +TEST_GEN_PROGS += proc-multiple-procfs +TEST_GEN_PROGS += proc-fsconfig-hidepid include ../lib.mk diff 
--git a/tools/testing/selftests/proc/proc-fsconfig-hidepid.c b/tools/testing/selftests/proc/proc-fsconfig-hidepid.c new file mode 100644 index 000000000000..b9af8f537185 --- /dev/null +++ b/tools/testing/selftests/proc/proc-fsconfig-hidepid.c @@ -0,0 +1,50 @@ +/* + * Copyright © 2020 Alexey Gladkov <gladkov.alexey@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +#include <assert.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <linux/mount.h> +#include <linux/unistd.h> + +static inline int fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +static inline int fsconfig(int fd, unsigned int cmd, const char *key, const void *val, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, val, aux); +} + +int main(void) +{ + int fsfd, ret; + int hidepid = 2; + + assert((fsfd = fsopen("proc", 0)) != -1); + + ret = fsconfig(fsfd, FSCONFIG_SET_BINARY, "hidepid", &hidepid, 0); + assert(ret == -1); + assert(errno == EINVAL); + + assert(!fsconfig(fsfd, FSCONFIG_SET_STRING, "hidepid", "2", 0)); + assert(!fsconfig(fsfd, FSCONFIG_SET_STRING, "hidepid", "invisible", 0)); + + assert(!close(fsfd)); + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-multiple-procfs.c b/tools/testing/selftests/proc/proc-multiple-procfs.c new file mode 100644 index 000000000000..ab912ad95dab --- /dev/null +++ b/tools/testing/selftests/proc/proc-multiple-procfs.c @@ -0,0 +1,48 @@ +/* + * Copyright © 2020 Alexey Gladkov <gladkov.alexey@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/mount.h> +#include <sys/types.h> +#include <sys/stat.h> + +int main(void) +{ + struct stat proc_st1, proc_st2; + char procbuff[] = "/tmp/proc.XXXXXX/meminfo"; + char procdir1[] = "/tmp/proc.XXXXXX"; + char procdir2[] = "/tmp/proc.XXXXXX"; + + assert(mkdtemp(procdir1) != NULL); + assert(mkdtemp(procdir2) != NULL); + + assert(!mount("proc", procdir1, "proc", 0, "hidepid=1")); + assert(!mount("proc", procdir2, "proc", 0, "hidepid=2")); + + snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir1); + assert(!stat(procbuff, &proc_st1)); + + snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir2); + assert(!stat(procbuff, &proc_st2)); + + umount(procdir1); + umount(procdir2); + + assert(proc_st1.st_dev != proc_st2.st_dev); + + return 0; +} diff --git a/tools/testing/selftests/pstore/pstore_tests b/tools/testing/selftests/pstore/pstore_tests index 1cef54458aff..2aa9a3852a84 100755 --- a/tools/testing/selftests/pstore/pstore_tests +++ b/tools/testing/selftests/pstore/pstore_tests @@ -10,7 +10,7 @@ . ./common_tests prlog -n "Checking pstore console is registered ... " -dmesg | grep -q "console \[pstore" +dmesg | grep -Eq "console \[(pstore|${backend})" show_result $? prlog -n "Checking /dev/pmsg0 exists ... 
" diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c index c0dd10257df5..da7a9dda9490 100644 --- a/tools/testing/selftests/ptp/testptp.c +++ b/tools/testing/selftests/ptp/testptp.c @@ -269,14 +269,16 @@ int main(int argc, char *argv[]) " %d programmable periodic signals\n" " %d pulse per second\n" " %d programmable pins\n" - " %d cross timestamping\n", + " %d cross timestamping\n" + " %d adjust_phase\n", caps.max_adj, caps.n_alarm, caps.n_ext_ts, caps.n_per_out, caps.pps, caps.n_pins, - caps.cross_timestamping); + caps.cross_timestamping, + caps.adjust_phase); } } diff --git a/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh new file mode 100755 index 000000000000..e5cc6b2f195e --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# If this was a KCSAN run, collapse the reports in the various console.log +# files onto pairs of functions. +# +# Usage: kcsan-collapse.sh resultsdir +# +# Copyright (C) 2020 Facebook, Inc. +# +# Authors: Paul E. 
McKenney <paulmck@kernel.org> + +if test -z "$TORTURE_KCONFIG_KCSAN_ARG" +then + exit 0 +fi +cat $1/*/console.log | + grep "BUG: KCSAN: " | + sed -e 's/^\[[^]]*] //' | + sort | + uniq -c | + sort -k1nr > $1/kcsan.sum diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index 9d9a41625dd9..1706cd4466b4 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -41,7 +41,21 @@ else title="$title ($ngpsps/s)" fi echo $title $stopstate $fwdprog - nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'` + nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | \ + awk -v sum=0 ' + { + for (i = 0; i <= NF; i++) { + sum += $i; + if ($i ~ /Batch:/) { + sum = 0; + i = i + 2; + } + } + } + + END { + print sum + }'` if test -z "$nclosecalls" then exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 0326f4a5ff9c..736f04749b90 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -70,6 +70,15 @@ do fi fi done + if test -f "$rd/kcsan.sum" + then + if test -s "$rd/kcsan.sum" + then + echo KCSAN summary in $rd/kcsan.sum + else + echo Clean KCSAN run in $rd + fi + fi done EDITOR=echo kvm-find-errors.sh "${@: -1}" > $T 2>&1 ret=$? 
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index e0352304b98b..6ff611c630d1 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -44,30 +44,32 @@ then fi echo ' ---' `date`: Starting build echo ' ---' Kconfig fragment at: $config_template >> $resdir/log -touch $resdir/ConfigFragment.input $resdir/ConfigFragment -if test -r "$config_dir/CFcommon" -then - echo " --- $config_dir/CFcommon" >> $resdir/ConfigFragment.input - cat < $config_dir/CFcommon >> $resdir/ConfigFragment.input - config_override.sh $config_dir/CFcommon $config_template > $T/Kc1 - grep '#CHECK#' $config_dir/CFcommon >> $resdir/ConfigFragment -else - cp $config_template $T/Kc1 -fi -echo " --- $config_template" >> $resdir/ConfigFragment.input -cat $config_template >> $resdir/ConfigFragment.input -grep '#CHECK#' $config_template >> $resdir/ConfigFragment -if test -n "$TORTURE_KCONFIG_ARG" -then - echo $TORTURE_KCONFIG_ARG | tr -s " " "\012" > $T/cmdline - echo " --- --kconfig argument" >> $resdir/ConfigFragment.input - cat $T/cmdline >> $resdir/ConfigFragment.input - config_override.sh $T/Kc1 $T/cmdline > $T/Kc2 - # Note that "#CHECK#" is not permitted on commandline. -else - cp $T/Kc1 $T/Kc2 -fi -cat $T/Kc2 >> $resdir/ConfigFragment +touch $resdir/ConfigFragment.input + +# Combine additional Kconfig options into an existing set such that +# newer options win. The first argument is the Kconfig source ID, the +# second the to-be-updated file within $T, and the third and final the +# list of additional Kconfig options. Note that a $2.tmp file is +# created when doing the update. 
+config_override_param () { + if test -n "$3" + then + echo $3 | sed -e 's/^ *//' -e 's/ *$//' | tr -s " " "\012" > $T/Kconfig_args + echo " --- $1" >> $resdir/ConfigFragment.input + cat $T/Kconfig_args >> $resdir/ConfigFragment.input + config_override.sh $T/$2 $T/Kconfig_args > $T/$2.tmp + mv $T/$2.tmp $T/$2 + # Note that "#CHECK#" is not permitted on commandline. + fi +} + +echo > $T/KcList +config_override_param "$config_dir/CFcommon" KcList "`cat $config_dir/CFcommon 2> /dev/null`" +config_override_param "$config_template" KcList "`cat $config_template 2> /dev/null`" +config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG" +config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG" +config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" +cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux @@ -80,7 +82,7 @@ then ln -s $base_resdir/.config $resdir # for kvm-recheck.sh # Arch-independent indicator touch $resdir/builtkernel -elif kvm-build.sh $T/Kc2 $resdir +elif kvm-build.sh $T/KcList $resdir then # Had to build a kernel for this test. 
QEMU="`identify_qemu vmlinux`" diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 2315e2ec12d6..c279cf9cb010 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -31,6 +31,8 @@ TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="" +TORTURE_KCONFIG_KASAN_ARG="" +TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 TORTURE_SHUTDOWN_GRACE=180 @@ -133,6 +135,12 @@ do TORTURE_KCONFIG_ARG="$2" shift ;; + --kasan) + TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG + ;; + --kcsan) + TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_KCSAN_INTERRUPT_WATCHER=y"; export TORTURE_KCONFIG_KCSAN_ARG + ;; --kmake-arg) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="$2" @@ -310,6 +318,8 @@ TORTURE_BUILDONLY="$TORTURE_BUILDONLY"; export TORTURE_BUILDONLY TORTURE_DEFCONFIG="$TORTURE_DEFCONFIG"; export TORTURE_DEFCONFIG TORTURE_INITRD="$TORTURE_INITRD"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="$TORTURE_KCONFIG_ARG"; export TORTURE_KCONFIG_ARG +TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_KASAN_ARG +TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE @@ -464,6 +474,7 @@ echo echo echo " --- `date` Test summary:" echo Results directory: $resdir/$ds +kcsan-collapse.sh $resdir/$ds kvm-recheck.sh $resdir/$ds ___EOF___ diff --git 
a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index c3c1fb5a9e1f..f2b20db9e296 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -14,3 +14,6 @@ TINY02 TASKS01 TASKS02 TASKS03 +RUDE01 +TRACE01 +TRACE02 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 new file mode 100644 index 000000000000..bafe94cbd739 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 @@ -0,0 +1,10 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=2 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +#CHECK#CONFIG_PROVE_RCU=y +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot new file mode 100644 index 000000000000..9363708c9075 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot @@ -0,0 +1 @@ +rcutorture.torture_type=tasks-rude diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 new file mode 100644 index 000000000000..12e7661b86f5 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -0,0 +1,11 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +#CHECK#CONFIG_PROVE_RCU=y +CONFIG_TASKS_TRACE_RCU_READ_MB=y +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot new file mode 100644 index 000000000000..9675ad632dcc --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot @@ -0,0 +1 @@ 
+rcutorture.torture_type=tasks-tracing diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 new file mode 100644 index 000000000000..b69ed6673c41 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -0,0 +1,11 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +#CHECK#CONFIG_PROVE_RCU=n +CONFIG_TASKS_TRACE_RCU_READ_MB=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot new file mode 100644 index 000000000000..9675ad632dcc --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot @@ -0,0 +1 @@ +rcutorture.torture_type=tasks-tracing diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 index 2debe7891aeb..7311f84a5876 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -1,5 +1,5 @@ CONFIG_SMP=y -CONFIG_NR_CPUS=100 +CONFIG_NR_CPUS=56 CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n diff --git a/tools/testing/selftests/sysctl/config b/tools/testing/selftests/sysctl/config index 6ca14800d755..fc263efd1fad 100644 --- a/tools/testing/selftests/sysctl/config +++ b/tools/testing/selftests/sysctl/config @@ -1 +1 @@ -CONFIG_TEST_SYSCTL=y +CONFIG_TEST_SYSCTL=m diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 6a970b127c9b..19515dcb7d04 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -39,16 +39,7 @@ ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002" ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001" ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003" ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001" 
- -test_modprobe() -{ - if [ ! -d $DIR ]; then - echo "$0: $DIR not present" >&2 - echo "You must have the following enabled in your kernel:" >&2 - cat $TEST_DIR/config >&2 - exit $ksft_skip - fi -} +ALL_TESTS="$ALL_TESTS 0007:1:1:boot_int" function allow_user_defaults() { @@ -122,13 +113,15 @@ test_reqs() function load_req_mod() { - if [ ! -d $DIR ]; then + if [ ! -d $SYSCTL ]; then if ! modprobe -q -n $TEST_DRIVER; then echo "$0: module $TEST_DRIVER not found [SKIP]" + echo "You must set CONFIG_TEST_SYSCTL=m in your kernel" >&2 exit $ksft_skip fi modprobe $TEST_DRIVER if [ $? -ne 0 ]; then + echo "$0: modprobe $TEST_DRIVER failed." exit fi fi @@ -752,6 +745,46 @@ sysctl_test_0006() run_bitmaptest } +sysctl_test_0007() +{ + TARGET="${SYSCTL}/boot_int" + if [ ! -f $TARGET ]; then + echo "Skipping test for $TARGET as it is not present ..." + return $ksft_skip + fi + + if [ -d $DIR ]; then + echo "Boot param test only possible sysctl_test is built-in, not module:" + cat $TEST_DIR/config >&2 + return $ksft_skip + fi + + echo -n "Testing if $TARGET is set to 1 ..." + ORIG=$(cat "${TARGET}") + + if [ x$ORIG = "x1" ]; then + echo "ok" + return 0 + fi + echo "FAIL" + echo "Checking if /proc/cmdline contains setting of the expected parameter ..." + if [ ! -f /proc/cmdline ]; then + echo "/proc/cmdline does not exist, test inconclusive" + return 0 + fi + + FOUND=$(grep -c "sysctl[./]debug[./]test_sysctl[./]boot_int=1" /proc/cmdline) + if [ $FOUND = "1" ]; then + echo "Kernel param found but $TARGET is not 1, TEST FAILED" + rc=1 + test_rc + fi + + echo "Skipping test, expected kernel parameter missing." 
+ echo "To perform this test, make sure kernel is booted with parameter: sysctl.debug.test_sysctl.boot_int=1" + return $ksft_skip +} + list_tests() { echo "Test ID list:" @@ -766,6 +799,7 @@ list_tests() echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()" + echo "0007 x $(get_test_count 0007) - tests setting sysctl from kernel boot param" } usage() @@ -929,7 +963,6 @@ test_reqs allow_user_defaults check_production_sysctl_writes_strict load_req_mod -test_modprobe trap "test_finish" EXIT diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json index f8ea6f5fa8e9..72cdc3c800a5 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json @@ -1472,6 +1472,31 @@ ] }, { + "id": "94bb", + "name": "Add pedit action with LAYERED_OP ip6 traffic_class", + "category": [ + "actions", + "pedit", + "layered_op" + ], + "setup": [ + [ + "$TC actions flush action pedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action pedit ex munge ip6 traffic_class set 0x40 continue", + "expExitCode": "0", + "verifyCmd": "$TC actions list action pedit", + "matchPattern": "ipv6\\+0: val 04000000 mask f00fffff", + "matchCount": "1", + "teardown": [ + "$TC actions flush action pedit" + ] + }, + { "id": "6f5e", "name": "Add pedit action with LAYERED_OP ip6 flow_lbl", "category": [ diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json index 8877f7b2b809..bb543bf69d69 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json @@ -32,7 +32,7 @@ "setup": [ "$TC qdisc add dev $DEV2 ingress" ], - "cmdUnderTest": 
"$TC filter add dev $DEV2 protocol ip pref 1 parent ffff: handle 0xffffffff flower action ok", + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress handle 0xffffffff flower action ok", "expExitCode": "0", "verifyCmd": "$TC filter show dev $DEV2 ingress", "matchPattern": "filter protocol ip pref 1 flower.*handle 0xffffffff", @@ -77,9 +77,9 @@ }, "setup": [ "$TC qdisc add dev $DEV2 ingress", - "$TC filter add dev $DEV2 protocol ip prio 1 parent ffff: flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop" + "$TC filter add dev $DEV2 protocol ip prio 1 ingress flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop" ], - "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip prio 1 parent ffff: flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop", + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip prio 1 ingress flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop", "expExitCode": "2", "verifyCmd": "$TC -s filter show dev $DEV2 ingress", "matchPattern": "filter protocol ip pref 1 flower chain 0 handle", @@ -87,5 +87,43 @@ "teardown": [ "$TC qdisc del dev $DEV2 ingress" ] + }, + { + "id": "7c65", + "name": "Add flower filter and then terse dump it", + "category": [ + "filter", + "flower" + ], + "setup": [ + "$TC qdisc add dev $DEV2 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", + "expExitCode": "0", + "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "matchPattern": "filter protocol ip pref 1 flower.*handle", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV2 ingress" + ] + }, + { + "id": "d45e", + "name": "Add flower filter and verify that terse dump doesn't output filter key", + "category": [ + "filter", + 
"flower" + ], + "setup": [ + "$TC qdisc add dev $DEV2 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", + "expExitCode": "0", + "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "matchPattern": " dst_mac e4:11:22:11:4a:51", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV2 ingress" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json new file mode 100644 index 000000000000..1cda2e11b3ad --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json @@ -0,0 +1,21 @@ +[ + { + "id": "83be", + "name": "Create FQ-PIE with invalid number of flows", + "category": [ + "qdisc", + "fq_pie" + ], + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY root fq_pie flows 65536", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc", + "matchCount": "0", + "teardown": [ + "$IP link del dev $DUMMY" + ] + } +] diff --git a/tools/testing/selftests/tc-testing/tdc_batch.py b/tools/testing/selftests/tc-testing/tdc_batch.py index 6a2bd2cf528e..995f66ce43eb 100755 --- a/tools/testing/selftests/tc-testing/tdc_batch.py +++ b/tools/testing/selftests/tc-testing/tdc_batch.py @@ -72,21 +72,21 @@ mac_prefix = args.mac_prefix def format_add_filter(device, prio, handle, skip, src_mac, dst_mac, share_action): - return ("filter add dev {} {} protocol ip parent ffff: handle {} " + return ("filter add dev {} {} protocol ip ingress handle {} " " flower {} src_mac {} dst_mac {} action drop {}".format( device, prio, handle, skip, src_mac, dst_mac, share_action)) def format_rep_filter(device, prio, handle, skip, src_mac, dst_mac, share_action): - return ("filter replace dev {} {} protocol ip parent ffff: handle {} " + return ("filter replace dev {} {} protocol ip ingress handle {} " " flower {} 
src_mac {} dst_mac {} action drop {}".format( device, prio, handle, skip, src_mac, dst_mac, share_action)) def format_del_filter(device, prio, handle, skip, src_mac, dst_mac, share_action): - return ("filter del dev {} {} protocol ip parent ffff: handle {} " + return ("filter del dev {} {} protocol ip ingress handle {} " "flower".format(device, prio, handle)) diff --git a/tools/testing/selftests/timens/clock_nanosleep.c b/tools/testing/selftests/timens/clock_nanosleep.c index 8e7b7c72ef65..72d41b955fb2 100644 --- a/tools/testing/selftests/timens/clock_nanosleep.c +++ b/tools/testing/selftests/timens/clock_nanosleep.c @@ -119,7 +119,7 @@ int main(int argc, char *argv[]) ksft_set_plan(4); - check_config_posix_timers(); + check_supported_timers(); if (unshare_timens()) return 1; diff --git a/tools/testing/selftests/timens/timens.c b/tools/testing/selftests/timens/timens.c index 098be7c83be3..52b6a1185f52 100644 --- a/tools/testing/selftests/timens/timens.c +++ b/tools/testing/selftests/timens/timens.c @@ -155,7 +155,7 @@ int main(int argc, char *argv[]) nscheck(); - check_config_posix_timers(); + check_supported_timers(); ksft_set_plan(ARRAY_SIZE(clocks) * 2); diff --git a/tools/testing/selftests/timens/timens.h b/tools/testing/selftests/timens/timens.h index e09e7e39bc52..d4fc52d47146 100644 --- a/tools/testing/selftests/timens/timens.h +++ b/tools/testing/selftests/timens/timens.h @@ -14,15 +14,26 @@ #endif static int config_posix_timers = true; +static int config_alarm_timers = true; -static inline void check_config_posix_timers(void) +static inline void check_supported_timers(void) { + struct timespec ts; + if (timer_create(-1, 0, 0) == -1 && errno == ENOSYS) config_posix_timers = false; + + if (clock_gettime(CLOCK_BOOTTIME_ALARM, &ts) == -1 && errno == EINVAL) + config_alarm_timers = false; } static inline bool check_skip(int clockid) { + if (!config_alarm_timers && clockid == CLOCK_BOOTTIME_ALARM) { + ksft_test_result_skip("CLOCK_BOOTTIME_ALARM isn't 
supported\n"); + return true; + } + if (config_posix_timers) return false; diff --git a/tools/testing/selftests/timens/timer.c b/tools/testing/selftests/timens/timer.c index 96dba11ebe44..5e7f0051bd7b 100644 --- a/tools/testing/selftests/timens/timer.c +++ b/tools/testing/selftests/timens/timer.c @@ -22,6 +22,9 @@ int run_test(int clockid, struct timespec now) timer_t fd; int i; + if (check_skip(clockid)) + return 0; + for (i = 0; i < 2; i++) { struct sigevent sevp = {.sigev_notify = SIGEV_NONE}; int flags = 0; @@ -74,6 +77,8 @@ int main(int argc, char *argv[]) nscheck(); + check_supported_timers(); + ksft_set_plan(3); clock_gettime(CLOCK_MONOTONIC, &mtime_now); diff --git a/tools/testing/selftests/timens/timerfd.c b/tools/testing/selftests/timens/timerfd.c index eff1ec5ff215..9edd43d6b2c1 100644 --- a/tools/testing/selftests/timens/timerfd.c +++ b/tools/testing/selftests/timens/timerfd.c @@ -28,6 +28,9 @@ int run_test(int clockid, struct timespec now) long long elapsed; int fd, i; + if (check_skip(clockid)) + return 0; + if (tclock_gettime(clockid, &now)) return pr_perror("clock_gettime(%d)", clockid); @@ -81,6 +84,8 @@ int main(int argc, char *argv[]) nscheck(); + check_supported_timers(); + ksft_set_plan(3); clock_gettime(CLOCK_MONOTONIC, &mtime_now); diff --git a/tools/testing/selftests/tpm2/test_smoke.sh b/tools/testing/selftests/tpm2/test_smoke.sh index 8155c2ea7ccb..663062701d5a 100755 --- a/tools/testing/selftests/tpm2/test_smoke.sh +++ b/tools/testing/selftests/tpm2/test_smoke.sh @@ -1,6 +1,11 @@ #!/bin/bash # SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +# Kselftest framework requirement - SKIP code is 4. 
+ksft_skip=4 + +[ -e /dev/tpm0 ] || exit $ksft_skip + python -m unittest -v tpm2_tests.SmokeTest python -m unittest -v tpm2_tests.AsyncTest diff --git a/tools/testing/selftests/tpm2/test_space.sh b/tools/testing/selftests/tpm2/test_space.sh index a6f5e346635e..36c9d030a1c6 100755 --- a/tools/testing/selftests/tpm2/test_space.sh +++ b/tools/testing/selftests/tpm2/test_space.sh @@ -1,4 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +[ -e /dev/tpmrm0 ] || exit $ksft_skip + python -m unittest -v tpm2_tests.SpaceTest diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 382cfb39a1a3..5eb64d41e541 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only vdso_test +vdso_test_gettimeofday +vdso_test_getcpu vdso_standalone_test_x86 diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 9e03d61f52fd..0069f2f83f86 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -4,7 +4,7 @@ include ../lib.mk uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -TEST_GEN_PROGS := $(OUTPUT)/vdso_test +TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu ifeq ($(ARCH),x86) TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 endif @@ -17,7 +17,8 @@ LDLIBS += -lgcc_s endif all: $(TEST_GEN_PROGS) -$(OUTPUT)/vdso_test: parse_vdso.c vdso_test.c +$(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c +$(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \ vdso_standalone_test_x86.c parse_vdso.c \ diff --git
a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 1dbb4b87268f..413f75620a35 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -21,29 +21,7 @@ #include <limits.h> #include <elf.h> -/* - * To use this vDSO parser, first call one of the vdso_init_* functions. - * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR - * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv. - * Then call vdso_sym for each symbol you want. For example, to look up - * gettimeofday on x86_64, use: - * - * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday"); - * or - * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); - * - * vdso_sym will return 0 if the symbol doesn't exist or if the init function - * failed or was not called. vdso_sym is a little slow, so its return value - * should be cached. - * - * vdso_sym is threadsafe; the init functions are not. - * - * These are the prototypes: - */ -extern void vdso_init_from_auxv(void *auxv); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void *vdso_sym(const char *version, const char *name); - +#include "parse_vdso.h" /* And here's the code. */ #ifndef ELF_BITS diff --git a/tools/testing/selftests/vDSO/parse_vdso.h b/tools/testing/selftests/vDSO/parse_vdso.h new file mode 100644 index 000000000000..de0453067d7c --- /dev/null +++ b/tools/testing/selftests/vDSO/parse_vdso.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef PARSE_VDSO_H +#define PARSE_VDSO_H + +#include <stdint.h> + +/* + * To use this vDSO parser, first call one of the vdso_init_* functions. + * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR + * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv. + * Then call vdso_sym for each symbol you want. 
For example, to look up + * gettimeofday on x86_64, use: + * + * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday"); + * or + * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); + * + * vdso_sym will return 0 if the symbol doesn't exist or if the init function + * failed or was not called. vdso_sym is a little slow, so its return value + * should be cached. + * + * vdso_sym is threadsafe; the init functions are not. + * + * These are the prototypes: + */ +void *vdso_sym(const char *version, const char *name); +void vdso_init_from_sysinfo_ehdr(uintptr_t base); +void vdso_init_from_auxv(void *auxv); + +#endif diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c index 5ac4b00acfbc..8a44ff973ee1 100644 --- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c +++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c @@ -16,9 +16,7 @@ #include <unistd.h> #include <stdint.h> -extern void *vdso_sym(const char *version, const char *name); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void vdso_init_from_auxv(void *auxv); +#include "parse_vdso.h" /* We need a libc functions... 
*/
 int strcmp(const char *a, const char *b)
diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
new file mode 100644
index 000000000000..fc25ede131b8
--- /dev/null
+++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vdso_test_getcpu.c: Sample code to test parse_vdso.c and vDSO getcpu()
+ *
+ * Copyright (c) 2020 Arm Ltd
+ */
+
+#include <stdint.h>
+#include <elf.h>
+#include <stdio.h>
+#include <sys/auxv.h>
+#include <sys/time.h>
+
+#include "../kselftest.h"
+#include "parse_vdso.h"
+
+const char *version = "LINUX_2.6";
+const char *name = "__vdso_getcpu";
+
+struct getcpu_cache;
+typedef long (*getcpu_t)(unsigned int *, unsigned int *,
+			 struct getcpu_cache *);
+
+/*
+ * Look up the getcpu entry point in the vDSO and call it once.
+ *
+ * Returns KSFT_SKIP when AT_SYSINFO_EHDR or the symbol is missing,
+ * KSFT_FAIL when the call reports an error, and 0 on success.
+ */
+int main(int argc, char **argv)
+{
+	unsigned long sysinfo_ehdr;
+	unsigned int cpu, node;
+	getcpu_t get_cpu;
+	long ret;
+
+	sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR);
+	if (!sysinfo_ehdr) {
+		printf("AT_SYSINFO_EHDR is not present!\n");
+		return KSFT_SKIP;
+	}
+
+	/* Reuse the value fetched above instead of querying auxv again. */
+	vdso_init_from_sysinfo_ehdr(sysinfo_ehdr);
+
+	get_cpu = (getcpu_t)vdso_sym(version, name);
+	if (!get_cpu) {
+		printf("Could not find %s\n", name);
+		return KSFT_SKIP;
+	}
+
+	ret = get_cpu(&cpu, &node, 0);
+	if (ret != 0) {
+		printf("%s failed\n", name);
+		return KSFT_FAIL;
+	}
+
+	printf("Running on CPU %u node %u\n", cpu, node);
+	return 0;
+}
diff --git a/tools/testing/selftests/vDSO/vdso_test.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c
index 719d5a6bd664..8ccc73ed8240 100644
--- a/tools/testing/selftests/vDSO/vdso_test.c
+++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c
@@ -1,10 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * vdso_test.c: Sample code to test parse_vdso.c
+ * vdso_test_gettimeofday.c: Sample code to test parse_vdso.c and
+ * vDSO gettimeofday()
  * Copyright (c) 2014 Andy Lutomirski
  *
  * Compile with:
- * gcc -std=gnu99 vdso_test.c
parse_vdso.c
+ * gcc -std=gnu99 vdso_test_gettimeofday.c parse_vdso.c
  *
  * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too.
  */
@@ -16,10 +17,7 @@
 #include <sys/time.h>
 
 #include "../kselftest.h"
-
-extern void *vdso_sym(const char *version, const char *name);
-extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
-extern void vdso_init_from_auxv(void *auxv);
+#include "parse_vdso.h"
 
 /*
  * ARM64's vDSO exports its gettimeofday() implementation with a different
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 0edb6d900e8d..849e8226395a 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -1,13 +1,16 @@
 # SPDX-License-Identifier: GPL-2.0-only
 hugepage-mmap
 hugepage-shm
+khugepaged
 map_hugetlb
 map_populate
 thuge-gen
 compaction_test
 mlock2-tests
+mremap_dontunmap
 on-fault-limit
 transhuge-stress
+protection_keys
 userfaultfd
 mlock-intersect-test
 mlock-random-test
@@ -16,3 +19,4 @@ gup_benchmark
 va_128TBswitch
 map_fixed_noreplace
 write_to_hugetlbfs
+hmm-tests
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 6998877f707e..a9026706d597 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -7,6 +7,7 @@ CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
 LDLIBS = -lrt
 TEST_GEN_FILES = compaction_test
 TEST_GEN_FILES += gup_benchmark
+TEST_GEN_FILES += hmm-tests
 TEST_GEN_FILES += hugepage-mmap
 TEST_GEN_FILES += hugepage-shm
 TEST_GEN_FILES += map_hugetlb
@@ -19,6 +20,31 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += khugepaged
+
+ifeq ($(ARCH),x86_64)
+CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
+CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_64bit_program.c)
+CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_program.c -no-pie)
+
+TARGETS := protection_keys
+BINARIES_32 := $(TARGETS:%=%_32)
+BINARIES_64 := $(TARGETS:%=%_64)
+
+ifeq ($(CAN_BUILD_WITH_NOPIE),1)
+CFLAGS += -no-pie
+endif
+
+ifeq ($(CAN_BUILD_I386),1)
+TEST_GEN_FILES += $(BINARIES_32)
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+TEST_GEN_FILES += $(BINARIES_64)
+endif
+else
+TEST_GEN_FILES += protection_keys
+endif
 
 ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64))
 TEST_GEN_FILES += va_128TBswitch
@@ -33,6 +59,59 @@ TEST_FILES := test_vmalloc.sh
 KSFT_KHDR_INSTALL := 1
 include ../lib.mk
 
+$(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread
+
+ifeq ($(ARCH),x86_64)
+BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
+BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
+
+define gen-target-rule-32
+$(1) $(1)_32: $(OUTPUT)/$(1)_32
+.PHONY: $(1) $(1)_32
+endef
+
+define gen-target-rule-64
+$(1) $(1)_64: $(OUTPUT)/$(1)_64
+.PHONY: $(1) $(1)_64
+endef
+
+ifeq ($(CAN_BUILD_I386),1)
+$(BINARIES_32): CFLAGS += -m32
+$(BINARIES_32): LDLIBS += -lrt -ldl -lm
+$(BINARIES_32): %_32: %.c
+	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t))))
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+$(BINARIES_64): CFLAGS += -m64
+$(BINARIES_64): LDLIBS += -lrt -ldl
+$(BINARIES_64): %_64: %.c
+	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t))))
+endif
+
+# x86_64 users should be encouraged to install 32-bit libraries
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
+all: warn_32bit_failure
+
+# Advisory only: the text goes to stderr and the recipe always succeeds.
+.PHONY: warn_32bit_failure
+warn_32bit_failure:
+	@echo "Warning: you seem to have a broken 32-bit build" 1>&2; \
+	echo "environment. This will reduce test coverage of 64-bit" 1>&2; \
+	echo "kernels. If you are using a Debian-like distribution," 1>&2; \
+	echo "try:" 1>&2; \
+	echo "" 1>&2; \
+	echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386" 1>&2; \
+	echo "" 1>&2; \
+	echo "If you are using a Fedora-like distribution, try:" 1>&2; \
+	echo "" 1>&2; \
+	echo " yum install glibc-devel.*i686" 1>&2; \
+	exit 0;
+endif
+endif
+
 $(OUTPUT)/userfaultfd: LDLIBS += -lpthread
 
 $(OUTPUT)/mlock-random-test: LDLIBS += -lcap
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config
index 93b90a9b1eeb..3ba674b64fa9 100644
--- a/tools/testing/selftests/vm/config
+++ b/tools/testing/selftests/vm/config
@@ -1,3 +1,5 @@
 CONFIG_SYSVIPC=y
 CONFIG_USERFAULTFD=y
 CONFIG_TEST_VMALLOC=m
+CONFIG_DEVICE_PRIVATE=y
+CONFIG_TEST_HMM=m
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
new file mode 100644
index 000000000000..79db22604019
--- /dev/null
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -0,0 +1,1359 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HMM stands for Heterogeneous Memory Management, it is a helper layer inside
+ * the linux kernel to help device drivers mirror a process address space in
+ * the device. This allows the device to use the same address space which
+ * makes communication and data exchange a lot easier.
+ *
+ * This framework's sole purpose is to exercise various code paths inside
+ * the kernel to make sure that HMM performs as expected and to flush out any
+ * bugs.
+ */
+
+#include "../kselftest_harness.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <strings.h>
+#include <time.h>
+#include <pthread.h>
+#include <hugetlbfs.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+/*
+ * This is a private UAPI to the kernel test module so it isn't exported
+ * in the usual include/uapi/... directory.
+ */
+#include "../../../../lib/test_hmm_uapi.h"
+
+/*
+ * State for one test buffer.
+ *
+ * ptr:    mapping whose address is passed to the driver as cmd.addr;
+ *         hmm_buffer_free() munmap()s size bytes of it when non-NULL.
+ * mirror: malloc()ed memory passed to the driver as cmd.ptr.
+ * size:   length in bytes used when unmapping ptr.
+ * fd:     descriptor given to mmap() for ptr, -1 for anonymous memory.
+ * cpages, faults: counters copied back from the driver after each command.
+ */
+struct hmm_buffer {
+	void *ptr;
+	void *mirror;
+	unsigned long size;
+	int fd;
+	uint64_t cpages;
+	uint64_t faults;
+};
+
+#define TWOMEG (1 << 21)
+#define HMM_BUFFER_SIZE (1024 << 12)
+#define HMM_PATH_MAX 64
+#define NTIMES 256
+
+/*
+ * Round x up to a multiple of a; the mask only works when a is a power of
+ * two. Both arguments are parenthesized so expressions can be passed.
+ */
+#define ALIGN(x, a) (((x) + ((a) - 1)) & (~((a) - 1)))
+
+/* Per-test state for tests that use a single dmirror device. */
+FIXTURE(hmm)
+{
+	int fd;
+	unsigned int page_size;
+	unsigned int page_shift;
+};
+
+/* Per-test state for tests that use two dmirror devices. */
+FIXTURE(hmm2)
+{
+	int fd0;
+	int fd1;
+	unsigned int page_size;
+	unsigned int page_shift;
+};
+
+/*
+ * Open /dev/hmm_dmirror<unit> read-write.
+ *
+ * Returns the descriptor, or a negative value after printing a message to
+ * stderr when the device cannot be opened.
+ */
+static int hmm_open(int unit)
+{
+	char pathname[HMM_PATH_MAX];
+	int fd;
+
+	snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit);
+	fd = open(pathname, O_RDWR, 0);
+	if (fd < 0)
+		fprintf(stderr, "could not open hmm dmirror driver (%s)\n",
+			pathname);
+	return fd;
+}
+
+FIXTURE_SETUP(hmm)
+{
+	self->page_size = sysconf(_SC_PAGE_SIZE);
+	/* ffs() is 1-based, so this is log2(page_size). */
+	self->page_shift = ffs(self->page_size) - 1;
+
+	self->fd = hmm_open(0);
+	ASSERT_GE(self->fd, 0);
+}
+
+FIXTURE_SETUP(hmm2)
+{
+	self->page_size = sysconf(_SC_PAGE_SIZE);
+	/* ffs() is 1-based, so this is log2(page_size). */
+	self->page_shift = ffs(self->page_size) - 1;
+
+	self->fd0 = hmm_open(0);
+	ASSERT_GE(self->fd0, 0);
+	self->fd1 = hmm_open(1);
+	ASSERT_GE(self->fd1, 0);
+}
+
+FIXTURE_TEARDOWN(hmm)
+{
+	int ret = close(self->fd);
+
+	ASSERT_EQ(ret, 0);
+	self->fd = -1;
+}
+
+FIXTURE_TEARDOWN(hmm2)
+{
+	int ret = close(self->fd0);
+
+	ASSERT_EQ(ret, 0);
+	self->fd0 = -1;
+
+	ret = close(self->fd1);
+	ASSERT_EQ(ret, 0);
+	self->fd1 = -1;
+}
+
+/*
+ * Issue one dmirror ioctl covering npages pages of buffer.
+ *
+ * buffer->ptr is passed as the address to operate on and buffer->mirror as
+ * the data pointer. The ioctl is retried for as long as it fails with
+ * EINTR. On success the cpages and faults values filled in by the driver
+ * are stored in buffer and 0 is returned; any other failure returns -errno.
+ */
+static int hmm_dmirror_cmd(int fd,
+			   unsigned long request,
+			   struct hmm_buffer *buffer,
+			   unsigned long npages)
+{
+	struct hmm_dmirror_cmd cmd;
+	int ret;
+
+	/* Don't hand uninitialized stack bytes to the kernel. */
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.addr = (__u64)buffer->ptr;
+	cmd.ptr = (__u64)buffer->mirror;
+	cmd.npages = npages;
+
+	for (;;) {
+		ret = ioctl(fd, request, &cmd);
+		if (ret == 0)
+			break;
+		if (errno == EINTR)
+			continue;
+		return -errno;
+	}
+	buffer->cpages = cmd.cpages;
+	buffer->faults = cmd.faults;
+
+	return 0;
+}
+
+/*
+ * Release a buffer: unmap ptr (when set), free mirror and the descriptor
+ * structure itself. A NULL buffer is ignored. buffer->fd is not closed here.
+ */
+static void hmm_buffer_free(struct hmm_buffer *buffer)
+{
+	if (buffer == NULL)
+		return;
+
+	if (buffer->ptr)
+		munmap(buffer->ptr, buffer->size);
+	free(buffer->mirror);
+	free(buffer);
+}
+
+/*
+ * Create a temporary file that will be deleted on close.
+ *
+ * The file is an unnamed O_TMPFILE in /tmp, extended to size bytes.
+ * Returns the descriptor, or -1 if it could not be created or resized.
+ */
+static int hmm_create_file(unsigned long size)
+{
+	int fd;
+
+	fd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR, 0600);
+	if (fd >= 0) {
+		int r;
+
+		do {
+			r = ftruncate(fd, size);
+		} while (r == -1 && errno == EINTR);
+		if (!r)
+			return fd;
+		close(fd);
+	}
+	return -1;
+}
+
+/*
+ * Return a random unsigned number.
+ *
+ * /dev/urandom is opened on first use and kept open. ~0U is returned when
+ * the device cannot be opened or a full value cannot be read.
+ */
+static unsigned int hmm_random(void)
+{
+	static int fd = -1;
+	unsigned int r;
+
+	if (fd < 0) {
+		fd = open("/dev/urandom", O_RDONLY);
+		if (fd < 0) {
+			fprintf(stderr, "%s:%d failed to open /dev/urandom\n",
+				__FILE__, __LINE__);
+			return ~0U;
+		}
+	}
+	/* A failed or short read would leave r uninitialized. */
+	if (read(fd, &r, sizeof(r)) != sizeof(r)) {
+		fprintf(stderr, "%s:%d failed to read /dev/urandom\n",
+			__FILE__, __LINE__);
+		return ~0U;
+	}
+	return r;
+}
+
+/*
+ * Sleep for n nanoseconds. The value is split into seconds and nanoseconds
+ * because nanosleep() rejects tv_nsec values of one second or more.
+ */
+static void hmm_nanosleep(unsigned int n)
+{
+	struct timespec t;
+
+	t.tv_sec = n / 1000000000U;
+	t.tv_nsec = n % 1000000000U;
+	nanosleep(&t, NULL);
+}
+
+/*
+ * Simple NULL test of device open/close.
+ */
+TEST_F(hmm, open_close)
+{
+}
+
+/*
+ * Read private anonymous memory.
+ */
+TEST_F(hmm, anon_read)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages, size;
+	unsigned long nints, nzero;
+	unsigned long i;
+	int *ptr;
+	int ret, val;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	/* Number of ints in the whole buffer and in its first two pages. */
+	nints = size / sizeof(*ptr);
+	nzero = 2 * self->page_size / sizeof(*ptr);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/*
+	 * Fill the buffer from the third page onwards; the first two pages
+	 * stay untouched (pte_none and pfn_zero).
+	 */
+	ptr = buffer->ptr;
+	for (i = nzero; i < nints; ++i)
+		ptr[i] = i;
+
+	/* Make the whole buffer read-only. */
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Read the second page so the CPU page table maps a zero page. */
+	val = *(int *)(buffer->ptr + self->page_size);
+	ASSERT_EQ(val, 0);
+
+	/* Have the device read the buffer into the mirror. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* The mirror must hold zeros for two pages, then the fill pattern. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nzero; ++i)
+		ASSERT_EQ(ptr[i], 0);
+	for (; i < nints; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Read private anonymous memory which has been protected with
+ * mprotect() PROT_NONE.
+ */
+TEST_F(hmm, anon_read_prot)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages, size;
+	unsigned long nints;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nints = size / sizeof(*ptr);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Fill system memory with an ascending pattern. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nints; ++i)
+		ptr[i] = i;
+
+	/* Fill the mirror with a different pattern to detect any write. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nints; ++i)
+		ptr[i] = -i;
+
+	/* Remove all access to the buffer. */
+	ret = mprotect(buffer->ptr, size, PROT_NONE);
+	ASSERT_EQ(ret, 0);
+
+	/* The device read is expected to be refused. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, -EFAULT);
+
+	/* Restore read access so the CPU can verify the contents. */
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+	ptr = buffer->ptr;
+	for (i = 0; i < nints; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* The mirror must still hold the pattern written above. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nints; ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Write private anonymous memory.
+ */
+TEST_F(hmm, anon_write)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Write private anonymous memory which has been protected with
+ * mprotect() PROT_READ.
+ */
+TEST_F(hmm, anon_write_prot)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* The mapping starts out read-only. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Simulate a device reading a zero page of memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, 1);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* A device write to the read-only mapping must be refused. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, -EPERM);
+
+	/* The refused write must have left the buffer all zero. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], 0);
+
+	/* Now allow writing and see that the zero page is replaced. */
+	ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Check that a device writing an anonymous private mapping
+ * will copy-on-write if a child process inherits the mapping.
+ */
+TEST_F(hmm, anon_write_child)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	pid_t pid;
+	int child_fd;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer->ptr so we can tell if it is written. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = -i;
+
+	pid = fork();
+	if (pid == -1)
+		ASSERT_EQ(pid, 0);
+	if (pid != 0) {
+		/* Parent: wait for the child to finish its device write. */
+		waitpid(pid, &ret, 0);
+		ASSERT_EQ(WIFEXITED(ret), 1);
+
+		/* Check that the parent's buffer did not change. */
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ASSERT_EQ(ptr[i], i);
+
+		/* The parent owns its copy of the buffer; don't leak it. */
+		hmm_buffer_free(buffer);
+		return;
+	}
+
+	/* Check that we see the parent's values. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	/* The child process needs its own mirror to its own mm. */
+	child_fd = hmm_open(0);
+	ASSERT_GE(child_fd, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	/* The child leaves through exit(); the parent reports the result. */
+	close(child_fd);
+	exit(0);
+}
+
+/*
+ * Check that a device writing an anonymous shared mapping
+ * will not copy-on-write if a child process inherits the mapping.
+ */
+TEST_F(hmm, anon_write_child_shared)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	pid_t pid;
+	int child_fd;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* MAP_SHARED: parent and child keep seeing the same pages. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer->ptr so we can tell if it is written. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = -i;
+
+	pid = fork();
+	if (pid == -1)
+		ASSERT_EQ(pid, 0);
+	if (pid != 0) {
+		/* Parent: wait for the child to finish its device write. */
+		waitpid(pid, &ret, 0);
+		ASSERT_EQ(WIFEXITED(ret), 1);
+
+		/* Check that the parent's buffer did change. */
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ASSERT_EQ(ptr[i], -i);
+
+		/* The parent still holds the mapping and mirror; free them. */
+		hmm_buffer_free(buffer);
+		return;
+	}
+
+	/* Check that we see the parent's values. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	/* The child process needs its own mirror to its own mm. */
+	child_fd = hmm_open(0);
+	ASSERT_GE(child_fd, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	/* The child leaves through exit(); the parent reports the result. */
+	close(child_fd);
+	exit(0);
+}
+
+/*
+ * Write private anonymous huge page.
+ */
+TEST_F(hmm, anon_write_huge)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	void *old_ptr;
+	void *map;
+	int *ptr;
+	int ret;
+
+	/* Map twice the target size so an aligned 2MB range fits inside. */
+	size = 2 * TWOMEG;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* From here on only the aligned 2MB window is exercised. */
+	size = TWOMEG;
+	npages = size >> self->page_shift;
+	map = (void *)ALIGN((uintptr_t)buffer->ptr, size);
+	ret = madvise(map, size, MADV_HUGEPAGE);
+	ASSERT_EQ(ret, 0);
+	old_ptr = buffer->ptr;
+	buffer->ptr = map;
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Restore the real mapping start so the whole range is unmapped. */
+	buffer->ptr = old_ptr;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Write huge TLBFS page.
+ */
+TEST_F(hmm, anon_write_hugetlbfs)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	long pagesizes[4];
+	int n, idx;
+
+	/* Skip test if we can't allocate a hugetlbfs page. */
+
+	n = gethugepagesizes(pagesizes, 4);
+	if (n <= 0)
+		return;
+	/* Pick the smallest of the reported huge page sizes. */
+	for (idx = 0; --n > 0; ) {
+		if (pagesizes[n] < pagesizes[idx])
+			idx = n;
+	}
+	size = ALIGN(TWOMEG, pagesizes[idx]);
+	npages = size >> self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->ptr = get_hugepage_region(size, GHR_STRICT);
+	if (buffer->ptr == NULL) {
+		free(buffer);
+		return;
+	}
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/*
+	 * The region came from get_hugepage_region(), so release it with the
+	 * matching call and clear ptr to keep hmm_buffer_free() from
+	 * munmap()ing it as well.
+	 */
+	free_hugepage_region(buffer->ptr);
+	buffer->ptr = NULL;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Read mmap'ed file memory.
+ */
+TEST_F(hmm, file_read)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	int fd;
+	ssize_t len;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	fd = hmm_create_file(size);
+	ASSERT_GE(fd, 0);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = fd;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Write initial contents of the file. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+	len = pwrite(fd, buffer->mirror, size, 0);
+	ASSERT_EQ(len, size);
+	/* Clear the mirror so the check below only sees device data. */
+	memset(buffer->mirror, 0, size);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ,
+			   MAP_SHARED,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Simulate a device reading system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+	/* hmm_buffer_free() does not close the backing file. */
+	close(fd);
+}
+
+/*
+ * Write mmap'ed file memory.
+ */
+TEST_F(hmm, file_write)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	int fd;
+	ssize_t len;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	fd = hmm_create_file(size);
+	ASSERT_GE(fd, 0);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = fd;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Check that the device also wrote the file. */
+	len = pread(fd, buffer->mirror, size, 0);
+	ASSERT_EQ(len, size);
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+	/* hmm_buffer_free() does not close the backing file. */
+	close(fd);
+}
+
+/*
+ * Migrate anonymous memory to device private memory.
+ */
+TEST_F(hmm, migrate)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device private memory and fault it back to system
+ * memory.
+ */
+TEST_F(hmm, migrate_fault)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages, size;
+	unsigned long nints;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nints = size / sizeof(*ptr);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Fill system memory with an ascending pattern. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nints; ++i)
+		ptr[i] = i;
+
+	/* Ask the device to migrate every page. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* The mirror must now hold the pattern. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nints; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Touch the pages from the CPU again and check their contents. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nints; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Try to migrate various memory types to device private memory.
+ */
+TEST_F(hmm2, migrate_mixed)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages, size;
+	unsigned char *p;
+	int *ptr;
+	int ret, val;
+
+	npages = 6;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Reserve six pages of address space without any access rights. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_NONE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+	/* Remember the start; buffer->ptr is repointed further down. */
+	p = buffer->ptr;
+
+	/* Migrating the PROT_NONE range is expected to fail. */
+	ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages);
+	ASSERT_EQ(ret, -EINVAL);
+
+	/* Unmap page 1 to leave a hole after the first page. */
+	ret = munmap(buffer->ptr + self->page_size, self->page_size);
+	ASSERT_EQ(ret, 0);
+
+	/* A range that crosses the hole is expected to fail as well. */
+	ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 3);
+	ASSERT_EQ(ret, -EINVAL);
+
+	/* Page 2: read-only and only ever read, so it stays a zero page. */
+	ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
+		       PROT_READ);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 2 * self->page_size);
+	val = *ptr + 3;
+	ASSERT_EQ(val, 3);
+
+	/* Page 3: written once, then switched to read-only. */
+	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+		       PROT_READ | PROT_WRITE);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 3 * self->page_size);
+	*ptr = val;
+	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+		       PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Pages 4 and 5: read-write and written. */
+	ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size,
+		       PROT_READ | PROT_WRITE);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 4 * self->page_size);
+	*ptr = val;
+	ptr = (int *)(buffer->ptr + 5 * self->page_size);
+	*ptr = val;
+
+	/* Migrate pages 2-5 to device 1. */
+	buffer->ptr = p + 2 * self->page_size;
+	ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 4);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, 4);
+
+	/* Page 5 won't be migrated to device 0 because it's on device 1. */
+	buffer->ptr = p + 5 * self->page_size;
+	ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1);
+	ASSERT_EQ(ret, -ENOENT);
+
+	/* Point back at the start so the whole reservation is unmapped. */
+	buffer->ptr = p;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device private memory and fault it back to system
+ * memory multiple times.
+ */
+TEST_F(hmm, migrate_multiple)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages, size;
+	unsigned long nints;
+	unsigned long i, c;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nints = size / sizeof(*ptr);
+
+	/* Each round allocates, migrates, verifies and frees a new buffer. */
+	for (c = 0; c < NTIMES; c++) {
+		buffer = malloc(sizeof(*buffer));
+		ASSERT_NE(buffer, NULL);
+
+		buffer->fd = -1;
+		buffer->size = size;
+		buffer->mirror = malloc(size);
+		ASSERT_NE(buffer->mirror, NULL);
+
+		buffer->ptr = mmap(NULL, size,
+				   PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS,
+				   buffer->fd, 0);
+		ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+		/* Fill system memory with an ascending pattern. */
+		ptr = buffer->ptr;
+		for (i = 0; i < nints; ++i)
+			ptr[i] = i;
+
+		/* Ask the device to migrate every page. */
+		ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer,
+				      npages);
+		ASSERT_EQ(ret, 0);
+		ASSERT_EQ(buffer->cpages, npages);
+
+		/* The mirror must now hold the pattern. */
+		ptr = buffer->mirror;
+		for (i = 0; i < nints; ++i)
+			ASSERT_EQ(ptr[i], i);
+
+		/* Touch the pages from the CPU again and check them. */
+		ptr = buffer->ptr;
+		for (i = 0; i < nints; ++i)
+			ASSERT_EQ(ptr[i], i);
+
+		hmm_buffer_free(buffer);
+	}
+}
+
+/*
+ * Read anonymous memory multiple times.
+ */ +TEST_F(hmm, anon_read_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i + c); + + hmm_buffer_free(buffer); + } +} + +void *unmap_buffer(void *p) +{ + struct hmm_buffer *buffer = p; + + /* Delay for a bit and then unmap buffer while it is being read. */ + hmm_nanosleep(hmm_random() % 32000); + munmap(buffer->ptr + buffer->size / 2, buffer->size / 2); + buffer->ptr = NULL; + + return NULL; +} + +/* + * Try reading anonymous memory while it is being unmapped. 
+ */ +TEST_F(hmm, anon_teardown) +{ + unsigned long npages; + unsigned long size; + unsigned long c; + void *ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; ++c) { + pthread_t thread; + struct hmm_buffer *buffer; + unsigned long i; + int *ptr; + int rc; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + rc = pthread_create(&thread, NULL, unmap_buffer, buffer); + ASSERT_EQ(rc, 0); + + /* Simulate a device reading system memory. */ + rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + if (rc == 0) { + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); + ++i) + ASSERT_EQ(ptr[i], i + c); + } + + pthread_join(thread, &ret); + hmm_buffer_free(buffer); + } +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm2, snapshot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + unsigned char *m; + int ret; + int val; + + npages = 7; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. 
*/ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* Page 2 will be read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. */ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Page 4-6 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + + /* Page 5 will be migrated to device 0. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Page 6 will be migrated to device 1. */ + buffer->ptr = p + 6 * self->page_size; + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Simulate a device snapshotting CPU pagetables. */ + buffer->ptr = p; + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. 
*/ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + + hmm_buffer_free(buffer); +} + +/* + * Test two devices reading the same memory (double mapped). + */ +TEST_F(hmm2, double_map) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Make region read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate device 0 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Simulate device 1 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. 
*/ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + hmm_buffer_free(buffer); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c new file mode 100644 index 000000000000..51b89cedd09d --- /dev/null +++ b/tools/testing/selftests/vm/khugepaged.c @@ -0,0 +1,1035 @@ +#define _GNU_SOURCE +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> + +#include <sys/mman.h> +#include <sys/wait.h> + +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif + +#define BASE_ADDR ((void *)(1UL << 30)) +static unsigned long hpage_pmd_size; +static unsigned long page_size; +static int hpage_pmd_nr; + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define PID_SMAPS "/proc/self/smaps" + +enum thp_enabled { + THP_ALWAYS, + THP_MADVISE, + THP_NEVER, +}; + +static const char *thp_enabled_strings[] = { + "always", + "madvise", + "never", + NULL +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +static const char *thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +static const char *shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct settings { + 
enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled shmem_enabled; + bool debug_cow; + bool use_zero_page; + struct khugepaged_settings khugepaged; +}; + +static struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_NEVER, + .debug_cow = 0, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, +}; + +static struct settings saved_settings; +static bool skip_settings_restore; + +static int exit_status; + +static void success(const char *msg) +{ + printf(" \e[32m%s\e[0m\n", msg); +} + +static void fail(const char *msg) +{ + printf(" \e[31m%s\e[0m\n", msg); + exit_status++; +} + +static int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static int read_string(const char *name, const char *strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + 
exit(EXIT_FAILURE);
+	}
+	*c = '\0';
+
+	ret = 0;
+	while (strings[ret]) {
+		if (!strcmp(strings[ret], buf))
+			return ret;
+		ret++;
+	}
+
+	printf("Failed to parse %s\n", name);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Write the string @val to the THP sysfs attribute @name (a path
+ * relative to THP_SYSFS).  Exits the test on any failure.
+ */
+static void write_string(const char *name, const char *val)
+{
+	char path[PATH_MAX];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!write_file(path, val, strlen(val) + 1)) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+/*
+ * Read the THP sysfs attribute @name (a path relative to THP_SYSFS) and
+ * return it parsed as a base-10 unsigned long.  Exits the test if the
+ * path does not fit in PATH_MAX or the attribute cannot be read.
+ */
+static unsigned long read_num(const char *name)
+{
+	char path[PATH_MAX];
+	char buf[21];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * read_file() reports failure by returning 0, never a negative
+	 * value; on failure buf is left unterminated, so it must not be
+	 * handed to strtoul().
+	 */
+	ret = read_file(path, buf, sizeof(buf));
+	if (!ret) {
+		perror("read_file(read_num)");
+		exit(EXIT_FAILURE);
+	}
+
+	return strtoul(buf, NULL, 10);
+}
+
+/*
+ * Write @num in decimal to the THP sysfs attribute @name (a path
+ * relative to THP_SYSFS).  Exits the test on any failure.
+ */
+static void write_num(const char *name, unsigned long num)
+{
+	char path[PATH_MAX];
+	char buf[21];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	/* num is unsigned: use %lu and bound the write to buf. */
+	snprintf(buf, sizeof(buf), "%lu", num);
+	if (!write_file(path, buf, strlen(buf) + 1)) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+static void write_settings(struct settings *settings)
+{
+	struct khugepaged_settings *khugepaged = &settings->khugepaged;
+
+	write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
+	write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
+	write_string("shmem_enabled",
+			shmem_enabled_strings[settings->shmem_enabled]);
+	write_num("debug_cow", settings->debug_cow);
+	write_num("use_zero_page", settings->use_zero_page);
+
+	write_num("khugepaged/defrag", khugepaged->defrag);
+	write_num("khugepaged/alloc_sleep_millisecs",
+			khugepaged->alloc_sleep_millisecs);
+	
write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); +} + +static void restore_settings(int sig) +{ + if (skip_settings_restore) + goto out; + + printf("Restore THP and khugepaged settings..."); + write_settings(&saved_settings); + success("OK"); + if (sig) + exit(EXIT_FAILURE); +out: + exit(exit_status); +} + +static void save_settings(void) +{ + printf("Save THP and khugepaged settings..."); + saved_settings = (struct settings) { + .thp_enabled = read_string("enabled", thp_enabled_strings), + .thp_defrag = read_string("defrag", thp_defrag_strings), + .shmem_enabled = + read_string("shmem_enabled", shmem_enabled_strings), + .debug_cow = read_num("debug_cow"), + .use_zero_page = read_num("use_zero_page"), + }; + saved_settings.khugepaged = (struct khugepaged_settings) { + .defrag = read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = read_num("khugepaged/pages_to_scan"), + }; + success("OK"); + + signal(SIGTERM, restore_settings); + signal(SIGINT, restore_settings); + signal(SIGHUP, restore_settings); + signal(SIGQUIT, restore_settings); +} + +static void adjust_settings(void) +{ + + printf("Adjust settings..."); + write_settings(&default_settings); + success("OK"); +} + +#define MAX_LINE_LENGTH 500 + +static bool check_for_pattern(FILE *fp, char *pattern, char *buf) +{ + while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + if (!strncmp(buf, pattern, 
strlen(pattern))) + return true; + } + return false; +} + +static bool check_huge(void *addr) +{ + bool thp = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", + hpage_pmd_size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the AnonHugePages: in the same block and check whether it got + * the expected number of hugeepages next. + */ + if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + thp = true; +err_out: + fclose(fp); + return thp; +} + + +static bool check_swap(void *addr, unsigned long size) +{ + bool swap = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", + size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the Swap: in the same block and check whether it got + * the expected number 
of hugeepages next. + */ + if (!check_for_pattern(fp, "Swap:", buffer)) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + swap = true; +err_out: + fclose(fp); + return swap; +} + +static void *alloc_mapping(void) +{ + void *p; + + p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p != BASE_ADDR) { + printf("Failed to allocate VMA at %p\n", BASE_ADDR); + exit(EXIT_FAILURE); + } + + return p; +} + +static void fill_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) + p[i * page_size / sizeof(*p)] = i + 0xdead0000; +} + +static void validate_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) { + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + printf("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); + exit(EXIT_FAILURE); + } + } +} + +#define TICK 500000 +static bool wait_for_scan(const char *msg, char *p) +{ + int full_scans; + int timeout = 6; /* 3 seconds */ + + /* Sanity check */ + if (check_huge(p)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + + /* Wait until the second full_scan completed */ + full_scans = read_num("khugepaged/full_scans") + 2; + + printf("%s...", msg); + while (timeout--) { + if (check_huge(p)) + break; + if (read_num("khugepaged/full_scans") >= full_scans) + break; + printf("."); + usleep(TICK); + } + + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + + return !timeout; +} + +static void alloc_at_fault(void) +{ + struct settings settings = default_settings; + char *p; + + settings.thp_enabled = THP_ALWAYS; + write_settings(&settings); + + p = alloc_mapping(); + *p = 1; + printf("Allocate huge page on fault..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + 
write_settings(&default_settings); + + madvise(p, page_size, MADV_DONTNEED); + printf("Split huge PMD on MADV_DONTNEED..."); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + munmap(p, hpage_pmd_size); +} + +static void collapse_full(void) +{ + void *p; + + p = alloc_mapping(); + fill_memory(p, 0, hpage_pmd_size); + if (wait_for_scan("Collapse fully populated PTE table", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_empty(void) +{ + void *p; + + p = alloc_mapping(); + if (wait_for_scan("Do not collapse empty PTE table", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + munmap(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry(void) +{ + void *p; + + p = alloc_mapping(); + fill_memory(p, 0, page_size); + if (wait_for_scan("Collapse PTE table with single PTE entry present", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_none(void) +{ + int max_ptes_none = hpage_pmd_nr / 2; + struct settings settings = default_settings; + void *p; + + settings.khugepaged.max_ptes_none = max_ptes_none; + write_settings(&settings); + + p = alloc_mapping(); + + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + + 
munmap(p, hpage_pmd_size); + write_settings(&default_settings); +} + +static void collapse_swapin_single_pte(void) +{ + void *p; + p = alloc_mapping(); + fill_memory(p, 0, hpage_pmd_size); + + printf("Swapout one page..."); + if (madvise(p, page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Collapse with swapping in single PTE entry", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); +out: + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_swap(void) +{ + int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); + void *p; + + p = alloc_mapping(); + + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, (max_ptes_swap + 1) * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + validate_memory(p, 0, hpage_pmd_size); + + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); +out: + munmap(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry_compound(void) +{ + 
void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + + printf("Split huge page leaving single PTE mapping compound page..."); + madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_full_of_compound(void) +{ + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page leaving single PTE page table full of compound pages..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table full of compound pages", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_compound_extreme(void) +{ + void *p; + int i; + + p = alloc_mapping(); + for (i = 0; i < hpage_pmd_nr; i++) { + printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", + i + 1, hpage_pmd_nr); + + madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(BASE_ADDR, 0, hpage_pmd_size); + if (!check_huge(BASE_ADDR)) { + printf("Failed to allocate huge page\n"); + exit(EXIT_FAILURE); + } + madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); + + p = mremap(BASE_ADDR - i * page_size, + i * 
page_size + hpage_pmd_size, + (i + 1) * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR + 2 * hpage_pmd_size); + if (p == MAP_FAILED) { + perror("mremap+unmap"); + exit(EXIT_FAILURE); + } + + p = mremap(BASE_ADDR + 2 * hpage_pmd_size, + (i + 1) * page_size, + (i + 1) * page_size + hpage_pmd_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR - (i + 1) * page_size); + if (p == MAP_FAILED) { + perror("mremap+alloc"); + exit(EXIT_FAILURE); + } + } + + munmap(BASE_ADDR, hpage_pmd_size); + fill_memory(p, 0, hpage_pmd_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table full of different compound pages", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_fork(void) +{ + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate small page..."); + fill_memory(p, 0, page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share small page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + fill_memory(p, page_size, 2 * page_size); + + if (wait_for_scan("Collapse PTE table with single page shared with parent process", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has small page..."); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_fork_compound(void) +{ + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, 
hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page PMD in child process..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + fill_memory(p, 0, page_size); + + write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + write_num("khugepaged/max_ptes_shared", + default_settings.khugepaged.max_ptes_shared); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_shared() +{ + int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if 
(!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) + fail("Timeout"); + else if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + + if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +int main(void) +{ + setbuf(stdout, NULL); + + page_size = getpagesize(); + hpage_pmd_size = read_num("hpage_pmd_size"); + hpage_pmd_nr = hpage_pmd_size / page_size; + + default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; + default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; + default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; + default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + + save_settings(); + adjust_settings(); + + alloc_at_fault(); + collapse_full(); + collapse_empty(); + collapse_single_pte_entry(); + collapse_max_ptes_none(); + collapse_swapin_single_pte(); + collapse_max_ptes_swap(); + collapse_single_pte_entry_compound(); + collapse_full_of_compound(); + collapse_compound_extreme(); + collapse_fork(); + collapse_fork_compound(); + collapse_max_ptes_shared(); + + restore_settings(0); +} diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c index ee06cb0b9efb..3a7b5ef0b0c6 100644 --- a/tools/testing/selftests/vm/mremap_dontunmap.c 
+++ b/tools/testing/selftests/vm/mremap_dontunmap.c @@ -11,7 +11,6 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stdlib.h> #include <unistd.h> #include "../kselftest.h" diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h new file mode 100644 index 000000000000..622a85848f61 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include <string.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <stdlib.h> +#include <ucontext.h> +#include <sys/mman.h> + +/* Define some kernel-like types */ +#define u8 __u8 +#define u16 __u16 +#define u32 __u32 +#define u64 __u64 + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern int test_nr; +extern int iteration_nr; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static inline void sigsafe_printf(const char *format, ...) +{ + va_list ap; + + if (!dprint_in_signal) { + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + } else { + int ret; + /* + * No printf() functions are signal-safe. + * They deadlock easily. Write the format + * string to get some output, even if + * incomplete. + */ + ret = write(1, format, strlen(format)); + if (ret < 0) + exit(1); + } +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) 
dprintf_level(4, args) + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other higher bits + */ + return ((reg >> shift) & PKEY_MASK); +} + +extern u64 shadow_pkey_reg; + +static inline u64 _read_pkey_reg(int line) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" + " shadow: %016llx\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); + + return pkey_reg; +} + +#define read_pkey_reg() _read_pkey_reg(__LINE__) + +static inline void write_pkey_reg(u64 pkey_reg) +{ + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + /* will do the 
shadow check for us: */ + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, + pkey_reg, __read_pkey_reg()); +} + +/* + * These are technically racy. since something could + * change PKEY register between the read and the write. + */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2; + + if (do_allow) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); + write_pkey_reg(pkey_reg); +} + +static inline void __pkey_write_allow(int pkey, int do_allow_write) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2 + 1; + + if (do_allow_write) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + write_pkey_reg(pkey_reg); + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); +} + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) +#define ALIGN_PTR_UP(p, ptr_align_to) \ + ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) +#define ALIGN_PTR_DOWN(p, ptr_align_to) \ + ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) +{ +#ifdef si_pkey + return &si->si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + +#endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h new file mode 100644 index 000000000000..1ebb586b2fbc --- /dev/null +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ + +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, + pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, + pkey-31 and exec-only key */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +static inline 
u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; +} + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 amr = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + asm volatile("isync; mtspr 0xd, %0; isync" + : : "r" ((unsigned long)(amr)) : "memory"); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline bool arch_is_powervm() +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) + return true; + + return false; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* + * powerpc does not allow userspace to change permissions of exec-only + * keys since those keys are not allocated by userspace. The signal + * handler wont be able to reset the permissions, which means the code + * will infinitely continue to segfault here. 
+ */ + return; +} + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h new file mode 100644 index 000000000000..3be20f5d5275 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_X86_H +#define _PKEYS_X86_H + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL<<21) +#define PAGE_SIZE 4096 +#define MB (1<<20) + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes 
= 1 page */ + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); +} + +static inline u64 __read_pkey_reg(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned pkey_reg; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == __read_pkey_reg()); +} + +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile( + "cpuid;" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ + +static inline int cpu_has_pkeys(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + eax = 0x7; + ecx = 0x0; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (!(ecx & X86_FEATURE_PKU)) { + dprintf2("cpu does not have PKU\n"); + return 0; + } + if (!(ecx & X86_FEATURE_OSPKE)) { + dprintf2("cpu does not have OSPKE\n"); + return 0; + } + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +#define XSTATE_PKEY_BIT (9) +#define XSTATE_PKEY 0x200 + +int pkey_reg_xstate_offset(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + int xstate_offset; + int xstate_size; + unsigned long XSTATE_CPUID = 0xd; + int leaf; + + /* assume that XSTATE_PKEY is set in XCR0 */ + leaf = XSTATE_PKEY_BIT; + { + eax = XSTATE_CPUID; + ecx = 
leaf; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (leaf == XSTATE_PKEY_BIT) { + xstate_offset = ebx; + xstate_size = eax; + } + } + + if (xstate_size == 0) { + printf("could not find size/offset of PKEY in xsave state\n"); + return 0; + } + + return xstate_offset; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 480995bceefa..fc19addcb5c8 100644 --- a/tools/testing/selftests/x86/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Tests x86 Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * Tests Memory Protection Keys (see Documentation/vm/protection-keys.txt) * * There are examples in here of: * * how to set protection keys on memory - * * how to set/clear bits in PKRU (the rights register) - * * how to handle SEGV_PKRU signals and extract pkey-relevant + * * how to set/clear bits in pkey registers (the rights register) + * * how to handle SEGV_PKUERR signals and extract pkey-relevant * information from the siginfo * * Things to add: @@ -22,8 +22,10 @@ * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm */ #define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ #include <errno.h> #include <linux/futex.h> +#include <time.h> #include <sys/time.h> #include <sys/syscall.h> #include <string.h> @@ -48,34 +50,10 @@ int iteration_nr = 1; int test_nr; -unsigned int shadow_pkru; - -#define HPAGE_SIZE (1UL<<21) -#define ARRAY_SIZE(x) 
(sizeof(x) / sizeof(*(x))) -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) -#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) -#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) -#define __stringify_1(x...) #x -#define __stringify(x...) __stringify_1(x) - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) - +u64 shadow_pkey_reg; int dprint_in_signal; char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -extern void abort_hooks(void); -#define pkey_assert(condition) do { \ - if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ -} while (0) - void cat_into_file(char *str, char *file) { int fd = open(file, O_RDWR); @@ -158,12 +136,6 @@ void abort_hooks(void) #endif } -static inline void __page_o_noops(void) -{ - /* 8-bytes of instruction * 512 bytes = 1 page */ - asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); -} - /* * This attempts to have roughly a page of instructions followed by a few * instructions that do a write, and another page of instructions. That @@ -174,7 +146,12 @@ static inline void __page_o_noops(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. 
*/ +#ifdef __powerpc64__ +/* This way, both 4K and 64K alignment are maintained */ +__attribute__((__aligned__(65536))) +#else __attribute__((__aligned__(PAGE_SIZE))) +#endif void lots_o_noops_around_write(int *write_to_me) { dprintf3("running %s()\n", __func__); @@ -186,51 +163,134 @@ void lots_o_noops_around_write(int *write_to_me) dprintf3("%s() done\n", __func__); } -/* Define some kernel-like types */ -#define u8 uint8_t -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; -#ifdef __i386__ + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); + } +} -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u64 pkey_reg = __read_pkey_reg(); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); -#define REG_IP_IDX REG_EIP -#define si_pkey_offset 0x14 + return (u32) get_pkey_bits(pkey_reg, pkey); +} -#else +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); -#define REG_IP_IDX REG_RIP -#define si_pkey_offset 0x20 + __write_pkey_reg(new_pkey_reg); -#endif + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %016llx 
old_pkey_reg: %016llx\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); + return 0; +} -void dump_mem(void *dumpme, int len_bytes) +void pkey_disable_set(int pkey, int flags) { - char *c = (void *)dumpme; - int i; + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u64 orig_pkey_reg = read_pkey_reg(); - for (i = 0; i < len_bytes; i += sizeof(u64)) { - u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr); - } + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /* pkey_reg and flags have the same format */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); + if (flags) + pkey_assert(read_pkey_reg() >= orig_pkey_reg); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = hw_pkey_get(pkey, syscall_flags); + u64 orig_pkey_reg = read_pkey_reg(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights &= ~flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + pkey_assert(ret >= 0); + + 
pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); + if (flags) + assert(read_pkey_reg() <= orig_pkey_reg); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); } /* Failed address bound checks: */ @@ -255,7 +315,7 @@ static char *si_code_str(int si_code) return "UNKNOWN"; } -int pkru_faults; +int pkey_faults; int last_si_pkey = -1; void signal_handler(int signum, siginfo_t *si, void *vucontext) { @@ -263,24 +323,28 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) int trapno; unsigned long ip; char *fpregs; - u32 *pkru_ptr; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u32 *pkey_reg_ptr; + int pkey_reg_offset; +#endif /* arch */ u64 siginfo_pkey; u32 *si_pkey_ptr; - int pkru_offset; - fpregset_t fpregset; dprint_in_signal = 1; dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, - __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregset = uctxt->uc_mcontext.fpregs; - fpregs = (void *)fpregset; + fpregs = (char *) uctxt->uc_mcontext.fpregs; - dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, - trapno, ip, si_code_str(si->si_code), si->si_code); + dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", + __func__, trapno, ip, si_code_str(si->si_code), + si->si_code); + +#if 
defined(__i386__) || defined(__x86_64__) /* arch */ #ifdef __i386__ /* * 32-bit has some extra padding so that userspace can tell whether @@ -288,20 +352,22 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) * state. We just assume that it is here. */ fpregs += 0x70; -#endif - pkru_offset = pkru_xstate_offset(); - pkru_ptr = (void *)(&fpregs[pkru_offset]); +#endif /* i386 */ + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); /* - * If we got a PKRU fault, we *HAVE* to have at least one bit set in + * If we got a PKEY fault, we *HAVE* to have at least one bit set in * here. */ - dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); + dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); if (DEBUG_LEVEL > 4) - dump_mem(pkru_ptr - 128, 256); - pkey_assert(*pkru_ptr); + dump_mem(pkey_reg_ptr - 128, 256); + pkey_assert(*pkey_reg_ptr); +#endif /* arch */ + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || @@ -310,20 +376,29 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) exit(4); } - si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); + si_pkey_ptr = siginfo_get_pkey_ptr(si); dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); dump_mem((u8 *)si_pkey_ptr - 8, 24); siginfo_pkey = *si_pkey_ptr; pkey_assert(siginfo_pkey < NR_PKEYS); last_si_pkey = siginfo_pkey; - dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); - /* need __rdpkru() version so we do not do shadow_pkru checking */ - dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); - dprintf1("pkey from siginfo: %jx\n", siginfo_pkey); - *(u64 *)pkru_ptr = 0x00000000; - dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); - pkru_faults++; + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + dprintf1("signal 
pkey_reg from pkey_reg: %016llx\n", + __read_pkey_reg()); + dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); + *(u64 *)pkey_reg_ptr = 0x00000000; + dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); +#elif defined(__powerpc64__) /* arch */ + /* restore access and let the faulting instruction continue */ + pkey_access_allow(siginfo_pkey); +#endif /* arch */ + pkey_faults++; dprintf1("<<<<==================================================\n"); dprint_in_signal = 0; } @@ -391,143 +466,6 @@ pid_t fork_lazy_child(void) return forkret; } -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 pkru = __rdpkru(); - u32 shifted_pkru; - u32 masked_pkru; - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkru: %x\n", __func__, pkru); - - shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); - dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); - masked_pkru = shifted_pkru & mask; - dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru); - /* - * shift down the relevant bits to the lowest two, then - * mask off all the other high bits. 
- */ - return masked_pkru; -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 old_pkru = __rdpkru(); - u32 new_pkru; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* copy old pkru */ - new_pkru = old_pkru; - /* mask out bits from pkey in old value: */ - new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); - /* OR in new bits for pkey: */ - new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); - - __wrpkru(new_pkru); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", - __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u32 orig_pkru = rdpkru(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /*pkru and flags have the same format */ - shadow_pkru |= flags << (pkey * 2); - dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - pkey_assert(rdpkru() > orig_pkru); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u32 orig_pkru = rdpkru(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 
- - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - /* pkru and flags have the same format */ - shadow_pkru &= ~(flags << (pkey * 2)); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - assert(rdpkru() > orig_pkru); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, unsigned long pkey) { @@ -561,33 +499,44 @@ int alloc_pkey(void) int ret; unsigned long init_val = 0x0; - dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", - __LINE__, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); ret = sys_pkey_alloc(0, init_val); /* - * pkey_alloc() sets PKRU, so we need to reflect it in - * shadow_pkru: + * pkey_alloc() sets PKEY register, so we need to reflect it in + * shadow_pkey_reg: */ - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); if (ret) { /* clear both the bits: */ - shadow_pkru &= ~(0x3 << (ret * 2)); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + ~PKEY_MASK); + 
dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, + __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); /* * move the new state in from init_val - * (remember, we cheated and init_val == pkru format) + * (remember, we cheated and init_val == pkey_reg format) */ - shadow_pkru |= (init_val << (ret * 2)); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + init_val); } - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); /* for shadow checking: */ - rdpkru(); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + read_pkey_reg(); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); return ret; } @@ -612,10 +561,10 @@ int alloc_random_pkey(void) int nr_alloced = 0; int random_index; memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + srand((unsigned int)time(NULL)); /* allocate every possible key and make a note of which ones we got */ max_nr_pkey_allocs = NR_PKEYS; - max_nr_pkey_allocs = 1; for (i = 0; i < max_nr_pkey_allocs; i++) { int new_pkey = alloc_pkey(); if (new_pkey < 0) @@ -638,8 +587,9 @@ int alloc_random_pkey(void) free_ret = sys_pkey_free(alloced_pkeys[i]); pkey_assert(!free_ret); } - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -657,11 +607,15 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, if (nr_iterations-- < 
0) break; - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); } pkey_assert(pkey < NR_PKEYS); @@ -669,8 +623,9 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", ptr, size, orig_prot, pkey, ret); pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -752,7 +707,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) void *ptr; int ret; - rdpkru(); + read_pkey_reg(); dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, size, prot, pkey); pkey_assert(pkey < NR_PKEYS); @@ -761,7 +716,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); pkey_assert(!ret); record_pkey_malloc(ptr, size, prot); - rdpkru(); + read_pkey_reg(); dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); return ptr; @@ -798,12 +753,15 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) } int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 void setup_hugetlbfs(void) { int err; int fd; - char buf[] = "123"; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; if (geteuid() != 0) { fprintf(stderr, "WARNING: not run as root, 
can not do hugetlb test\n"); @@ -814,11 +772,16 @@ void setup_hugetlbfs(void) /* * Now go make sure that we got the pages and that they - * are 2M pages. Someone might have made 1G the default. + * are PMD-level pages. Someone might have made PUD-level + * pages the default. */ - fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); + hpagesz_kb = HPAGE_SIZE / 1024; + hpagesz_mb = hpagesz_kb / 1024; + sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); + fd = open(buf, O_RDONLY); if (fd < 0) { - perror("opening sysfs 2M hugetlb config"); + fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } @@ -826,13 +789,14 @@ void setup_hugetlbfs(void) err = read(fd, buf, sizeof(buf)-1); close(fd); if (err <= 0) { - perror("reading sysfs 2M hugetlb config"); + fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", - buf, GET_NR_HUGE_PAGES); + fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", + hpagesz_mb, buf, GET_NR_HUGE_PAGES); return; } @@ -886,6 +850,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { malloc_pkey_with_mprotect, + malloc_pkey_with_mprotect_subpage, malloc_pkey_anon_huge, malloc_pkey_hugetlb /* can not do direct with the pkey_mprotect() API: @@ -924,14 +889,14 @@ void *malloc_pkey(long size, int prot, u16 pkey) return ret; } -int last_pkru_faults; +int last_pkey_faults; #define UNKNOWN_PKEY -2 -void expected_pk_fault(int pkey) +void expected_pkey_fault(int pkey) { - dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", - __func__, last_pkru_faults, pkru_faults); + dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", + __func__, last_pkey_faults, pkey_faults); dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); - 
pkey_assert(last_pkru_faults + 1 == pkru_faults); + pkey_assert(last_pkey_faults + 1 == pkey_faults); /* * For exec-only memory, we do not know the pkey in @@ -940,24 +905,28 @@ void expected_pk_fault(int pkey) if (pkey != UNKNOWN_PKEY) pkey_assert(last_si_pkey == pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ /* - * The signal handler shold have cleared out PKRU to let the + * The signal handler shold have cleared out PKEY register to let the * test program continue. We now have to restore it. */ - if (__rdpkru() != 0) + if (__read_pkey_reg() != 0) +#else /* arch */ + if (__read_pkey_reg() != shadow_pkey_reg) +#endif /* arch */ pkey_assert(0); - __wrpkru(shadow_pkru); - dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", - __func__, shadow_pkru); - last_pkru_faults = pkru_faults; + __write_pkey_reg(shadow_pkey_reg); + dprintf1("%s() set pkey_reg=%016llx to restore state after signal " + "nuked it\n", __func__, shadow_pkey_reg); + last_pkey_faults = pkey_faults; last_si_pkey = -1; } -#define do_not_expect_pk_fault(msg) do { \ - if (last_pkru_faults != pkru_faults) \ - dprintf0("unexpected PK fault: %s\n", msg); \ - pkey_assert(last_pkru_faults == pkru_faults); \ +#define do_not_expect_pkey_fault(msg) do { \ + if (last_pkey_faults != pkey_faults) \ + dprintf0("unexpected PKey fault: %s\n", msg); \ + pkey_assert(last_pkey_faults == pkey_faults); \ } while (0) int test_fds[10] = { -1 }; @@ -1000,6 +969,58 @@ __attribute__((noinline)) int read_ptr(int *ptr) return *ptr; } +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. 
+ */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + void test_read_of_write_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1015,26 +1036,67 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) int ptr_contents; dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - rdpkru(); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); pkey_access_deny(pkey); ptr_contents = read_ptr(ptr); dprintf1("*ptr: %d\n", ptr_contents); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + 
dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); pkey_write_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } void test_write_of_access_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); pkey_access_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { int ret; @@ -1160,9 +1222,11 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, err, __rdpkru(), shadow_pkru); - rdpkru(); /* for shadow checking */ + dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, err, __read_pkey_reg(), + shadow_pkey_reg); + read_pkey_reg(); /* for shadow checking */ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); if ((new_pkey == -1) && (errno == ENOSPC)) { dprintf2("%s() failed to allocate pkey after %d tries\n", @@ -1188,6 +1252,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) dprintf3("%s()::%d\n", __func__, __LINE__); /* + * On x86: * There are 16 pkeys supported in hardware. Three are * allocated by the time we get here: * 1. 
The default key (0) @@ -1195,13 +1260,21 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) * 3. One allocated by the test code and passed in via * 'pkey' to this function. * Ensure that we can allocate at least another 13 (16-3). + * + * On powerpc: + * There are either 5, 28, 29 or 32 pkeys supported in + * hardware depending on the page size (4K or 64K) and + * platform (powernv or powervm). Four are allocated by + * the time we get here. These include pkey-0, pkey-1, + * exec-only pkey and the one allocated by the test code. + * Ensure that we can allocate the remaining. */ - pkey_assert(i >= NR_PKEYS-3); + pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); for (i = 0; i < nr_allocated_pkeys; i++) { err = sys_pkey_free(allocated_pkeys[i]); pkey_assert(!err); - rdpkru(); /* for shadow checking */ + read_pkey_reg(); /* for shadow checking */ } } @@ -1287,7 +1360,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect an exception: */ peek_result = read_ptr(ptr); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); /* * Try to access the NON-pkey-protected "plain_ptr" via ptrace: @@ -1297,7 +1370,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect NO exception: */ peek_result = read_ptr(plain_ptr); - do_not_expect_pk_fault("read plain pointer after ptrace"); + do_not_expect_pkey_fault("read plain pointer after ptrace"); ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); pkey_assert(ret != -1); @@ -1347,17 +1420,15 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) pkey_assert(!ret); pkey_access_deny(pkey); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* * Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = 
read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(pkey); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, pkey); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1378,15 +1449,13 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); pkey_assert(!ret); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(UNKNOWN_PKEY); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); /* * Put the memory back to non-PROT_EXEC. Should clear the @@ -1400,7 +1469,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); pkey_assert(!ret); ptr_contents = read_ptr(p1); - do_not_expect_pk_fault("plain read on recently PROT_EXEC area"); + do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); } void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) @@ -1408,7 +1477,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) int size = PAGE_SIZE; int sret; - if (cpu_has_pku()) { + if (cpu_has_pkeys()) { dprintf1("SKIP: %s: no CPU support\n", __func__); return; } @@ -1420,8 +1489,11 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_write_disabled_region, test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, 
test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, test_kernel_write_of_access_disabled_region, test_kernel_write_of_write_disabled_region, test_kernel_gup_of_access_disabled_region, @@ -1433,6 +1505,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_on_non_allocated_pkey, test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, }; void run_tests_once(void) @@ -1442,7 +1515,7 @@ void run_tests_once(void) for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { int pkey; - int orig_pkru_faults = pkru_faults; + int orig_pkey_faults = pkey_faults; dprintf1("======================\n"); dprintf1("test %d preparing...\n", test_nr); @@ -1457,8 +1530,8 @@ void run_tests_once(void) free_pkey_malloc(ptr); sys_pkey_free(pkey); - dprintf1("pkru_faults: %d\n", pkru_faults); - dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); + dprintf1("pkey_faults: %d\n", pkey_faults); + dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); tracing_off(); close_test_fds(); @@ -1471,18 +1544,19 @@ void run_tests_once(void) void pkey_setup_shadow(void) { - shadow_pkru = __rdpkru(); + shadow_pkey_reg = __read_pkey_reg(); } int main(void) { int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); setup_handlers(); - printf("has pku: %d\n", cpu_has_pku()); + printf("has pkeys: %d\n", pkeys_supported); - if (!cpu_has_pku()) { + if (!pkeys_supported) { int size = PAGE_SIZE; int *ptr; @@ -1495,7 +1569,7 @@ int main(void) } pkey_setup_shadow(); - printf("startup pkru: %x\n", rdpkru()); + printf("startup pkey_reg: %016llx\n", read_pkey_reg()); setup_hugetlbfs(); while (nr_iterations-- > 0) diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 76ca5e7a3951..a3f4f30f0a2e 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -307,4 +307,20 @@ else echo "[FAIL]" exitcode=1 fi + +echo 
"running HMM smoke test" +echo "------------------------------------" +./test_hmm.sh smoke +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh new file mode 100755 index 000000000000..0647b525a625 --- /dev/null +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com> +# +# This is a test script for the kernel test driver to analyse vmalloc +# allocator. Therefore it is just a kernel module loader. You can specify +# and pass different parameters in order to: +# a) analyse performance of vmalloc allocations; +# b) stressing and stability check of vmalloc subsystem. + +TEST_NAME="test_hmm" +DRIVER="test_hmm" + +# 1 if fails +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +check_test_requirements() +{ + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo "$0: Must be run as root" + exit $ksft_skip + fi + + if ! which modprobe > /dev/null 2>&1; then + echo "$0: You need modprobe installed" + exit $ksft_skip + fi + + if ! modinfo $DRIVER > /dev/null 2>&1; then + echo "$0: You must have the following enabled in your kernel:" + echo "CONFIG_TEST_HMM=m" + exit $ksft_skip + fi +} + +load_driver() +{ + modprobe $DRIVER > /dev/null 2>&1 + if [ $? == 0 ]; then + major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) + mknod /dev/hmm_dmirror0 c $major 0 + mknod /dev/hmm_dmirror1 c $major 1 + fi +} + +unload_driver() +{ + modprobe -r $DRIVER > /dev/null 2>&1 + rm -f /dev/hmm_dmirror? +} + +run_smoke() +{ + echo "Running smoke test. Note, this test provides basic coverage." 
+ + load_driver + $(dirname "${BASH_SOURCE[0]}")/hmm-tests + unload_driver +} + +usage() +{ + echo -n "Usage: $0" + echo + echo "Example usage:" + echo + echo "# Shows help message" + echo "./${TEST_NAME}.sh" + echo + echo "# Smoke testing" + echo "./${TEST_NAME}.sh smoke" + echo + exit 0 +} + +function run_test() +{ + if [ $# -eq 0 ]; then + usage + else + if [ "$1" = "smoke" ]; then + run_smoke + else + usage + fi + fi +} + +check_test_requirements +run_test $@ + +exit 0 diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/vm/write_to_hugetlbfs.c index 110bc4e4015d..6a2caba19ee1 100644 --- a/tools/testing/selftests/vm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/vm/write_to_hugetlbfs.c @@ -74,8 +74,6 @@ int main(int argc, char **argv) int write = 0; int reserve = 1; - unsigned long i; - if (signal(SIGINT, sig_handler) == SIG_ERR) err(1, "\ncan't catch SIGINT\n"); diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index 90598a425c18..4bdd6c1a19d3 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -44,7 +44,7 @@ endef $(eval $(call tar_download,MUSL,musl,1.2.0,.tar.gz,https://musl.libc.org/releases/,c6de7b191139142d3f9a7b5b702c9cae1b5ee6e7f57e582da9328629408fd4e8)) $(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c)) $(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d)) -$(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae)) +$(eval $(call 
tar_download,IPROUTE2,iproute2,5.6.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,1b5b0e25ce6e23da7526ea1da044e814ad85ba761b10dd29c2b027c056b04692)) $(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c)) $(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa)) $(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a)) diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 9803dbb54181..b50c2085c1ac 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -57,7 +57,6 @@ CONFIG_RCU_EQS_DEBUG=y CONFIG_USER_STACKTRACE_SUPPORT=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y -CONFIG_DOUBLEFAULT=y CONFIG_X86_DEBUG_FPU=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_DEBUG_PAGEALLOC=y diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore index 022a1f3b64ef..1aaef5bf119a 100644 --- a/tools/testing/selftests/x86/.gitignore +++ b/tools/testing/selftests/x86/.gitignore @@ -12,5 +12,4 @@ ldt_gdt iopl mpx-mini-test ioperm -protection_keys test_vdso diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5d49bfec1e9a..5f16821c7f63 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ - protection_keys test_vdso test_vsyscall mov_ss_trap \ + test_vdso test_vsyscall 
mov_ss_trap \ syscall_arg_fault TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h deleted file mode 100644 index 254e5436bdd9..000000000000 --- a/tools/testing/selftests/x86/pkey-helpers.h +++ /dev/null @@ -1,219 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PKEYS_HELPER_H -#define _PKEYS_HELPER_H -#define _GNU_SOURCE -#include <string.h> -#include <stdarg.h> -#include <stdio.h> -#include <stdint.h> -#include <stdbool.h> -#include <signal.h> -#include <assert.h> -#include <stdlib.h> -#include <ucontext.h> -#include <sys/mman.h> - -#define NR_PKEYS 16 -#define PKRU_BITS_PER_PKEY 2 - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 -extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -static inline void sigsafe_printf(const char *format, ...) -{ - va_list ap; - - if (!dprint_in_signal) { - va_start(ap, format); - vprintf(format, ap); - va_end(ap); - } else { - int ret; - /* - * No printf() functions are signal-safe. - * They deadlock easily. Write the format - * string to get some output, even if - * incomplete. - */ - ret = write(1, format, strlen(format)); - if (ret < 0) - exit(1); - } -} -#define dprintf_level(level, args...) do { \ - if (level <= DEBUG_LEVEL) \ - sigsafe_printf(args); \ -} while (0) -#define dprintf0(args...) dprintf_level(0, args) -#define dprintf1(args...) dprintf_level(1, args) -#define dprintf2(args...) dprintf_level(2, args) -#define dprintf3(args...) dprintf_level(3, args) -#define dprintf4(args...) 
dprintf_level(4, args) - -extern unsigned int shadow_pkru; -static inline unsigned int __rdpkru(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned int pkru; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" - : "=a" (eax), "=d" (edx) - : "c" (ecx)); - pkru = eax; - return pkru; -} - -static inline unsigned int _rdpkru(int line) -{ - unsigned int pkru = __rdpkru(); - - dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", - line, pkru, shadow_pkru); - assert(pkru == shadow_pkru); - - return pkru; -} - -#define rdpkru() _rdpkru(__LINE__) - -static inline void __wrpkru(unsigned int pkru) -{ - unsigned int eax = pkru; - unsigned int ecx = 0; - unsigned int edx = 0; - - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkru == __rdpkru()); -} - -static inline void wrpkru(unsigned int pkru) -{ - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - /* will do the shadow check for us: */ - rdpkru(); - __wrpkru(pkru); - shadow_pkru = pkru; - dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); -} - -/* - * These are technically racy. since something could - * change PKRU between the read and the write. 
- */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - unsigned int pkru = rdpkru(); - int bit = pkey * 2; - - if (do_allow) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - dprintf4("pkru now: %08x\n", rdpkru()); - wrpkru(pkru); -} - -static inline void __pkey_write_allow(int pkey, int do_allow_write) -{ - long pkru = rdpkru(); - int bit = pkey * 2 + 1; - - if (do_allow_write) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - wrpkru(pkru); - dprintf4("pkru now: %08x\n", rdpkru()); -} - -#define PROT_PKEY0 0x10 /* protection key value (bit 0) */ -#define PROT_PKEY1 0x20 /* protection key value (bit 1) */ -#define PROT_PKEY2 0x40 /* protection key value (bit 2) */ -#define PROT_PKEY3 0x80 /* protection key value (bit 3) */ - -#define PAGE_SIZE 4096 -#define MB (1<<20) - -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile( - "cpuid;" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); -} - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ -#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ - -static inline int cpu_has_pku(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - eax = 0x7; - ecx = 0x0; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (!(ecx & X86_FEATURE_PKU)) { - dprintf2("cpu does not have PKU\n"); - return 0; - } - if (!(ecx & X86_FEATURE_OSPKE)) { - dprintf2("cpu does not have OSPKE\n"); - return 0; - } - return 1; -} - -#define XSTATE_PKRU_BIT (9) -#define XSTATE_PKRU 0x200 - -int pkru_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKRU is set in XCR0 */ - leaf = 
XSTATE_PKRU_BIT; - { - eax = XSTATE_CPUID; - ecx = leaf; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (leaf == XSTATE_PKRU_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKRU in xsave state\n"); - return 0; - } - - return xstate_offset; -} - -#endif /* _PKEYS_HELPER_H */ |