From dbb60c8a26daf388f183f599e1e96de5bb9f96e1 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 3 Nov 2022 16:57:45 +0100
Subject: selftests: add tests for the HID-bpf initial implementation

The tests are pretty basic:
- create a virtual uhid device that no userspace will like (to not mess
  up the running system)
- attach a BPF prog to it
- open the matching hidraw node
- inject one event and check:
  * that the BPF program can do something on the event stream
  * can modify the event stream
- add another test where we attach/detach BPF programs to see if we get
  errors

Note: the Makefile is extracted from selftests/bpf so we can rebuild
the libbpf and bpftool components from the current kernel tree without
relying on system installed components.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/Makefile                   |   1 +
 tools/testing/selftests/hid/.gitignore             |   4 +
 tools/testing/selftests/hid/Makefile               | 233 +++++++
 tools/testing/selftests/hid/config                 |  20 +
 tools/testing/selftests/hid/hid_bpf.c              | 675 +++++++++++++++++++++
 tools/testing/selftests/hid/progs/hid.c            |  41 ++
 .../testing/selftests/hid/progs/hid_bpf_helpers.h  |  14 +
 7 files changed, 988 insertions(+)
 create mode 100644 tools/testing/selftests/hid/.gitignore
 create mode 100644 tools/testing/selftests/hid/Makefile
 create mode 100644 tools/testing/selftests/hid/config
 create mode 100644 tools/testing/selftests/hid/hid_bpf.c
 create mode 100644 tools/testing/selftests/hid/progs/hid.c
 create mode 100644 tools/testing/selftests/hid/progs/hid_bpf_helpers.h

(limited to 'tools')

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index f07aef7c592c..479127efddf9 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -26,6 +26,7 @@ TARGETS += fpu
 TARGETS += ftrace
 TARGETS += futex
 TARGETS += gpio
+TARGETS += hid
 TARGETS += intel_pstate
 TARGETS += ipc
 TARGETS += ir
diff --git a/tools/testing/selftests/hid/.gitignore b/tools/testing/selftests/hid/.gitignore
new file mode 100644
index 000000000000..a462ca6ab2c0
--- /dev/null
+++ b/tools/testing/selftests/hid/.gitignore
@@ -0,0 +1,4 @@
+bpftool
+*.skel.h
+/tools
+hid_bpf
diff --git a/tools/testing/selftests/hid/Makefile b/tools/testing/selftests/hid/Makefile
new file mode 100644
index 000000000000..693f1cb6e47a
--- /dev/null
+++ b/tools/testing/selftests/hid/Makefile
@@ -0,0 +1,233 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# based on tools/testing/selftest/bpf/Makefile
+include ../../../build/Build.include
+include ../../../scripts/Makefile.arch
+include ../../../scripts/Makefile.include
+
+CXX ?= $(CROSS_COMPILE)g++
+
+CURDIR := $(abspath .)
+TOOLSDIR := $(abspath ../../..)
+TOP_SRCDIR = $(CURDIR)/../../../..
+KHDR_INCLUDES := $(TOP_SRCDIR)/usr/include
+LIBDIR := $(TOOLSDIR)/lib
+BPFDIR := $(LIBDIR)/bpf
+TOOLSINCDIR := $(TOOLSDIR)/include
+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
+HOSTPKG_CONFIG := pkg-config
+
+CFLAGS += -g -O0 -rdynamic -Wall -Werror -I$(KHDR_INCLUDES)
+LDLIBS += -lelf -lz -lrt -lpthread
+
+# Silence some warnings when compiled with clang
+ifneq ($(LLVM),)
+CFLAGS += -Wno-unused-command-line-argument
+endif
+
+# Order correspond to 'make run_tests' order
+TEST_GEN_PROGS = hid_bpf
+
+# Emit succinct information message describing current building step
+# $1 - generic step name (e.g., CC, LINK, etc);
+# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor];
+# $3 - target (assumed to be file); only file name will be emitted;
+# $4 - optional extra arg, emitted as-is, if provided.
+ifeq ($(V),1)
+Q =
+msg =
+else
+Q = @
+msg = @printf '  %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))";
+MAKEFLAGS += --no-print-directory
+submake_extras := feature_display=0
+endif
+
+# override lib.mk's default rules
+OVERRIDE_TARGETS := 1
+override define CLEAN
+	$(call msg,CLEAN)
+	$(Q)$(RM) -r $(TEST_GEN_PROGS)
+	$(Q)$(RM) -r $(EXTRA_CLEAN)
+endef
+
+include ../lib.mk
+
+SCRATCH_DIR := $(OUTPUT)/tools
+BUILD_DIR := $(SCRATCH_DIR)/build
+INCLUDE_DIR := $(SCRATCH_DIR)/include
+BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
+ifneq ($(CROSS_COMPILE),)
+HOST_BUILD_DIR		:= $(BUILD_DIR)/host
+HOST_SCRATCH_DIR	:= $(OUTPUT)/host-tools
+HOST_INCLUDE_DIR	:= $(HOST_SCRATCH_DIR)/include
+else
+HOST_BUILD_DIR		:= $(BUILD_DIR)
+HOST_SCRATCH_DIR	:= $(SCRATCH_DIR)
+HOST_INCLUDE_DIR	:= $(INCLUDE_DIR)
+endif
+HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a
+RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids
+
+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux)				\
+		     $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)	\
+		     ../../../../vmlinux				\
+		     /sys/kernel/btf/vmlinux				\
+		     /boot/vmlinux-$(shell uname -r)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+ifeq ($(VMLINUX_BTF),)
+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
+endif
+
+# Define simple and short `make test_progs`, `make test_sysctl`, etc targets
+# to build individual tests.
+# NOTE: Semicolon at the end is critical to override lib.mk's default static
+# rule for binaries.
+$(notdir $(TEST_GEN_PROGS)): %: $(OUTPUT)/% ;
+
+# sort removes libbpf duplicates when not cross-building
+MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf		\
+	       $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids	\
+	       $(INCLUDE_DIR))
+$(MAKE_DIRS):
+	$(call msg,MKDIR,,$@)
+	$(Q)mkdir -p $@
+
+$(OUTPUT)/%.o: %.c
+	$(call msg,CC,,$@)
+	$(Q)$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
+
+# LLVM's ld.lld doesn't support all the architectures, so use it only on x86
+ifeq ($(SRCARCH),x86)
+LLD := lld
+else
+LLD := ld
+endif
+
+DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool
+
+TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL)
+
+$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(BPFOBJ)
+
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
+		    $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool
+	$(Q)$(MAKE) $(submake_extras)  -C $(BPFTOOLDIR)			       \
+		    ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) 	       \
+		    EXTRA_CFLAGS='-g -O0'				       \
+		    OUTPUT=$(HOST_BUILD_DIR)/bpftool/			       \
+		    LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/		       \
+		    LIBBPF_DESTDIR=$(HOST_SCRATCH_DIR)/			       \
+		    prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install-bin
+
+$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
+	   $(KHDR_INCLUDES)/linux/bpf.h					       \
+	   | $(BUILD_DIR)/libbpf
+	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \
+		    EXTRA_CFLAGS='-g -O0'				       \
+		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
+
+ifneq ($(BPFOBJ),$(HOST_BPFOBJ))
+$(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
+		$(KHDR_INCLUDES)/linux/bpf.h					       \
+		| $(HOST_BUILD_DIR)/libbpf
+	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR)                             \
+		    EXTRA_CFLAGS='-g -O0' ARCH= CROSS_COMPILE=		       \
+		    OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \
+		    DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers
+endif
+
+$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR)
+ifeq ($(VMLINUX_H),)
+	$(call msg,GEN,,$@)
+	$(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
+else
+	$(call msg,CP,,$@)
+	$(Q)cp "$(VMLINUX_H)" $@
+endif
+
+$(RESOLVE_BTFIDS): $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/resolve_btfids	\
+		       $(TOOLSDIR)/bpf/resolve_btfids/main.c	\
+		       $(TOOLSDIR)/lib/rbtree.c			\
+		       $(TOOLSDIR)/lib/zalloc.c			\
+		       $(TOOLSDIR)/lib/string.c			\
+		       $(TOOLSDIR)/lib/ctype.c			\
+		       $(TOOLSDIR)/lib/str_error_r.c
+	$(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/resolve_btfids	\
+		CC=$(HOSTCC) LD=$(HOSTLD) AR=$(HOSTAR) \
+		LIBBPF_INCLUDE=$(HOST_INCLUDE_DIR) \
+		OUTPUT=$(HOST_BUILD_DIR)/resolve_btfids/ BPFOBJ=$(HOST_BPFOBJ)
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '-target bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
+endef
+
+# Determine target endianness.
+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
+			grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
+MENDIAN=$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian)
+
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG))
+BPF_CFLAGS = -g -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) 		\
+	     -I$(INCLUDE_DIR) -I$(CURDIR) -I$(KHDR_INCLUDES)		\
+	     -I$(abspath $(OUTPUT)/../usr/include)
+
+CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \
+	       -Wno-compare-distinct-pointer-types
+
+# Build BPF object using Clang
+# $1 - input .c file
+# $2 - output .o file
+# $3 - CFLAGS
+define CLANG_BPF_BUILD_RULE
+	$(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2)
+	$(Q)$(CLANG) $3 -O2 -target bpf -c $1 -mcpu=v3 -o $2
+endef
+# Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32
+define CLANG_NOALU32_BPF_BUILD_RULE
+	$(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2)
+	$(Q)$(CLANG) $3 -O2 -target bpf -c $1 -mcpu=v2 -o $2
+endef
+# Build BPF object using GCC
+define GCC_BPF_BUILD_RULE
+	$(call msg,GCC-BPF,$(TRUNNER_BINARY),$2)
+	$(Q)$(BPF_GCC) $3 -O2 -c $1 -o $2
+endef
+
+BPF_PROGS_DIR := progs
+BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE
+BPF_SRCS := $(notdir $(wildcard $(BPF_PROGS_DIR)/*.c))
+BPF_OBJS := $(patsubst %.c,$(OUTPUT)/%.bpf.o, $(BPF_SRCS))
+BPF_SKELS := $(patsubst %.c,$(OUTPUT)/%.skel.h, $(BPF_SRCS))
+TEST_GEN_FILES += $(BPF_OBJS)
+
+$(BPF_PROGS_DIR)-bpfobjs := y
+$(BPF_OBJS): $(OUTPUT)/%.bpf.o:				\
+	     $(BPF_PROGS_DIR)/%.c			\
+	     $(wildcard $(BPF_PROGS_DIR)/*.h)		\
+	     $(INCLUDE_DIR)/vmlinux.h				\
+	     $(wildcard $(BPFDIR)/hid_bpf_*.h)			\
+	     $(wildcard $(BPFDIR)/*.bpf.h)			\
+	     | $(OUTPUT) $(BPFOBJ)
+	$(call $(BPF_BUILD_RULE),$<,$@, $(BPF_CFLAGS))
+
+$(BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(OUTPUT)
+	$(call msg,GEN-SKEL,$(BINARY),$@)
+	$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
+	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked1.o) name $(notdir $(<:.bpf.o=)) > $@
+
+$(OUTPUT)/%:%.c $(BPF_SKELS)
+	$(call msg,BINARY,,$@)
+	$(Q)$(LINK.c) $^ $(LDLIBS) -o $@
+
+EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) feature bpftool	\
+	$(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32)
diff --git a/tools/testing/selftests/hid/config b/tools/testing/selftests/hid/config
new file mode 100644
index 000000000000..d4130489c1b1
--- /dev/null
+++ b/tools/testing/selftests/hid/config
@@ -0,0 +1,20 @@
+CONFIG_BPF_EVENTS=y
+CONFIG_BPFILTER=y
+CONFIG_BPF_JIT_ALWAYS_ON=y
+CONFIG_BPF_JIT=y
+CONFIG_BPF_KPROBE_OVERRIDE=y
+CONFIG_BPF_LSM=y
+CONFIG_BPF_PRELOAD_UMD=y
+CONFIG_BPF_PRELOAD=y
+CONFIG_BPF_STREAM_PARSER=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BPF=y
+CONFIG_CGROUP_BPF=y
+CONFIG_DEBUG_INFO_BTF=y
+CONFIG_FPROBE=y
+CONFIG_FTRACE_SYSCALLS=y
+CONFIG_FUNCTION_TRACER=y
+CONFIG_HIDRAW=y
+CONFIG_HID=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_UHID=y
diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
new file mode 100644
index 000000000000..4fc7794c6f8b
--- /dev/null
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -0,0 +1,675 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Red Hat */
+#include "hid.skel.h"
+
+#include "../kselftest_harness.h"
+
+#include <bpf/bpf.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <dirent.h>
+#include <poll.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <linux/hidraw.h>
+#include <linux/uhid.h>
+
+#define SHOW_UHID_DEBUG 0
+
+static unsigned char rdesc[] = {
+	0x06, 0x00, 0xff,	/* Usage Page (Vendor Defined Page 1) */
+	0x09, 0x21,		/* Usage (Vendor Usage 0x21) */
+	0xa1, 0x01,		/* COLLECTION (Application) */
+	0x09, 0x01,			/* Usage (Vendor Usage 0x01) */
+	0xa1, 0x00,			/* COLLECTION (Physical) */
+	0x85, 0x01,				/* REPORT_ID (1) */
+	0x06, 0x00, 0xff,			/* Usage Page (Vendor Defined Page 1) */
+	0x19, 0x01,				/* USAGE_MINIMUM (1) */
+	0x29, 0x03,				/* USAGE_MAXIMUM (3) */
+	0x15, 0x00,				/* LOGICAL_MINIMUM (0) */
+	0x25, 0x01,				/* LOGICAL_MAXIMUM (1) */
+	0x95, 0x03,				/* REPORT_COUNT (3) */
+	0x75, 0x01,				/* REPORT_SIZE (1) */
+	0x81, 0x02,				/* INPUT (Data,Var,Abs) */
+	0x95, 0x01,				/* REPORT_COUNT (1) */
+	0x75, 0x05,				/* REPORT_SIZE (5) */
+	0x81, 0x01,				/* INPUT (Cnst,Var,Abs) */
+	0x05, 0x01,				/* USAGE_PAGE (Generic Desktop) */
+	0x09, 0x30,				/* USAGE (X) */
+	0x09, 0x31,				/* USAGE (Y) */
+	0x15, 0x81,				/* LOGICAL_MINIMUM (-127) */
+	0x25, 0x7f,				/* LOGICAL_MAXIMUM (127) */
+	0x75, 0x10,				/* REPORT_SIZE (16) */
+	0x95, 0x02,				/* REPORT_COUNT (2) */
+	0x81, 0x06,				/* INPUT (Data,Var,Rel) */
+
+	0x06, 0x00, 0xff,			/* Usage Page (Vendor Defined Page 1) */
+	0x19, 0x01,				/* USAGE_MINIMUM (1) */
+	0x29, 0x03,				/* USAGE_MAXIMUM (3) */
+	0x15, 0x00,				/* LOGICAL_MINIMUM (0) */
+	0x25, 0x01,				/* LOGICAL_MAXIMUM (1) */
+	0x95, 0x03,				/* REPORT_COUNT (3) */
+	0x75, 0x01,				/* REPORT_SIZE (1) */
+	0x91, 0x02,				/* Output (Data,Var,Abs) */
+	0x95, 0x01,				/* REPORT_COUNT (1) */
+	0x75, 0x05,				/* REPORT_SIZE (5) */
+	0x91, 0x01,				/* Output (Cnst,Var,Abs) */
+
+	0x06, 0x00, 0xff,			/* Usage Page (Vendor Defined Page 1) */
+	0x19, 0x06,				/* USAGE_MINIMUM (6) */
+	0x29, 0x08,				/* USAGE_MAXIMUM (8) */
+	0x15, 0x00,				/* LOGICAL_MINIMUM (0) */
+	0x25, 0x01,				/* LOGICAL_MAXIMUM (1) */
+	0x95, 0x03,				/* REPORT_COUNT (3) */
+	0x75, 0x01,				/* REPORT_SIZE (1) */
+	0xb1, 0x02,				/* Feature (Data,Var,Abs) */
+	0x95, 0x01,				/* REPORT_COUNT (1) */
+	0x75, 0x05,				/* REPORT_SIZE (5) */
+	0x91, 0x01,				/* Output (Cnst,Var,Abs) */
+
+	0xc0,				/* END_COLLECTION */
+	0xc0,			/* END_COLLECTION */
+};
+
+struct attach_prog_args {
+	int prog_fd;
+	unsigned int hid;
+	int retval;
+};
+
+#define ASSERT_OK(data) ASSERT_FALSE(data)
+#define ASSERT_OK_PTR(ptr) ASSERT_NE(NULL, ptr)
+
+#define UHID_LOG(fmt, ...) do { \
+	if (SHOW_UHID_DEBUG) \
+		TH_LOG(fmt, ##__VA_ARGS__); \
+} while (0)
+
+static pthread_mutex_t uhid_started_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t uhid_started = PTHREAD_COND_INITIALIZER;
+
+/* no need to protect uhid_stopped, only one thread accesses it */
+static bool uhid_stopped;
+
+static int uhid_write(struct __test_metadata *_metadata, int fd, const struct uhid_event *ev)
+{
+	ssize_t ret;
+
+	ret = write(fd, ev, sizeof(*ev));
+	if (ret < 0) {
+		TH_LOG("Cannot write to uhid: %m");
+		return -errno;
+	} else if (ret != sizeof(*ev)) {
+		TH_LOG("Wrong size written to uhid: %zd != %zu",
+			ret, sizeof(ev));
+		return -EFAULT;
+	} else {
+		return 0;
+	}
+}
+
+static int uhid_create(struct __test_metadata *_metadata, int fd, int rand_nb)
+{
+	struct uhid_event ev;
+	char buf[25];
+
+	sprintf(buf, "test-uhid-device-%d", rand_nb);
+
+	memset(&ev, 0, sizeof(ev));
+	ev.type = UHID_CREATE;
+	strcpy((char *)ev.u.create.name, buf);
+	ev.u.create.rd_data = rdesc;
+	ev.u.create.rd_size = sizeof(rdesc);
+	ev.u.create.bus = BUS_USB;
+	ev.u.create.vendor = 0x0001;
+	ev.u.create.product = 0x0a37;
+	ev.u.create.version = 0;
+	ev.u.create.country = 0;
+
+	sprintf(buf, "%d", rand_nb);
+	strcpy((char *)ev.u.create.phys, buf);
+
+	return uhid_write(_metadata, fd, &ev);
+}
+
+static void uhid_destroy(struct __test_metadata *_metadata, int fd)
+{
+	struct uhid_event ev;
+
+	memset(&ev, 0, sizeof(ev));
+	ev.type = UHID_DESTROY;
+
+	uhid_write(_metadata, fd, &ev);
+}
+
+static int uhid_event(struct __test_metadata *_metadata, int fd)
+{
+	struct uhid_event ev;
+	ssize_t ret;
+
+	memset(&ev, 0, sizeof(ev));
+	ret = read(fd, &ev, sizeof(ev));
+	if (ret == 0) {
+		UHID_LOG("Read HUP on uhid-cdev");
+		return -EFAULT;
+	} else if (ret < 0) {
+		UHID_LOG("Cannot read uhid-cdev: %m");
+		return -errno;
+	} else if (ret != sizeof(ev)) {
+		UHID_LOG("Invalid size read from uhid-dev: %zd != %zu",
+			ret, sizeof(ev));
+		return -EFAULT;
+	}
+
+	switch (ev.type) {
+	case UHID_START:
+		pthread_mutex_lock(&uhid_started_mtx);
+		pthread_cond_signal(&uhid_started);
+		pthread_mutex_unlock(&uhid_started_mtx);
+
+		UHID_LOG("UHID_START from uhid-dev");
+		break;
+	case UHID_STOP:
+		uhid_stopped = true;
+
+		UHID_LOG("UHID_STOP from uhid-dev");
+		break;
+	case UHID_OPEN:
+		UHID_LOG("UHID_OPEN from uhid-dev");
+		break;
+	case UHID_CLOSE:
+		UHID_LOG("UHID_CLOSE from uhid-dev");
+		break;
+	case UHID_OUTPUT:
+		UHID_LOG("UHID_OUTPUT from uhid-dev");
+		break;
+	case UHID_GET_REPORT:
+		UHID_LOG("UHID_GET_REPORT from uhid-dev");
+		break;
+	case UHID_SET_REPORT:
+		UHID_LOG("UHID_SET_REPORT from uhid-dev");
+		break;
+	default:
+		TH_LOG("Invalid event from uhid-dev: %u", ev.type);
+	}
+
+	return 0;
+}
+
+struct uhid_thread_args {
+	int fd;
+	struct __test_metadata *_metadata;
+};
+static void *uhid_read_events_thread(void *arg)
+{
+	struct uhid_thread_args *args = (struct uhid_thread_args *)arg;
+	struct __test_metadata *_metadata = args->_metadata;
+	struct pollfd pfds[1];
+	int fd = args->fd;
+	int ret = 0;
+
+	pfds[0].fd = fd;
+	pfds[0].events = POLLIN;
+
+	uhid_stopped = false;
+
+	while (!uhid_stopped) {
+		ret = poll(pfds, 1, 100);
+		if (ret < 0) {
+			TH_LOG("Cannot poll for fds: %m");
+			break;
+		}
+		if (pfds[0].revents & POLLIN) {
+			ret = uhid_event(_metadata, fd);
+			if (ret)
+				break;
+		}
+	}
+
+	return (void *)(long)ret;
+}
+
+static int uhid_start_listener(struct __test_metadata *_metadata, pthread_t *tid, int uhid_fd)
+{
+	struct uhid_thread_args args = {
+		.fd = uhid_fd,
+		._metadata = _metadata,
+	};
+	int err;
+
+	pthread_mutex_lock(&uhid_started_mtx);
+	err = pthread_create(tid, NULL, uhid_read_events_thread, (void *)&args);
+	ASSERT_EQ(0, err) {
+		TH_LOG("Could not start the uhid thread: %d", err);
+		pthread_mutex_unlock(&uhid_started_mtx);
+		close(uhid_fd);
+		return -EIO;
+	}
+	pthread_cond_wait(&uhid_started, &uhid_started_mtx);
+	pthread_mutex_unlock(&uhid_started_mtx);
+
+	return 0;
+}
+
+static int uhid_send_event(struct __test_metadata *_metadata, int fd, __u8 *buf, size_t size)
+{
+	struct uhid_event ev;
+
+	if (size > sizeof(ev.u.input.data))
+		return -E2BIG;
+
+	memset(&ev, 0, sizeof(ev));
+	ev.type = UHID_INPUT2;
+	ev.u.input2.size = size;
+
+	memcpy(ev.u.input2.data, buf, size);
+
+	return uhid_write(_metadata, fd, &ev);
+}
+
+static int setup_uhid(struct __test_metadata *_metadata, int rand_nb)
+{
+	int fd;
+	const char *path = "/dev/uhid";
+	int ret;
+
+	fd = open(path, O_RDWR | O_CLOEXEC);
+	ASSERT_GE(fd, 0) TH_LOG("open uhid-cdev failed; %d", fd);
+
+	ret = uhid_create(_metadata, fd, rand_nb);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("create uhid device failed: %d", ret);
+		close(fd);
+	}
+
+	return fd;
+}
+
+static bool match_sysfs_device(int dev_id, const char *workdir, struct dirent *dir)
+{
+	const char *target = "0003:0001:0A37.*";
+	char phys[512];
+	char uevent[1024];
+	char temp[512];
+	int fd, nread;
+	bool found = false;
+
+	if (fnmatch(target, dir->d_name, 0))
+		return false;
+
+	/* we found the correct VID/PID, now check for phys */
+	sprintf(uevent, "%s/%s/uevent", workdir, dir->d_name);
+
+	fd = open(uevent, O_RDONLY | O_NONBLOCK);
+	if (fd < 0)
+		return false;
+
+	sprintf(phys, "PHYS=%d", dev_id);
+
+	nread = read(fd, temp, ARRAY_SIZE(temp));
+	if (nread > 0 && (strstr(temp, phys)) != NULL)
+		found = true;
+
+	close(fd);
+
+	return found;
+}
+
+static int get_hid_id(int dev_id)
+{
+	const char *workdir = "/sys/devices/virtual/misc/uhid";
+	const char *str_id;
+	DIR *d;
+	struct dirent *dir;
+	int found = -1, attempts = 3;
+
+	/* it would be nice to be able to use nftw, but the no_alu32 target doesn't support it */
+
+	while (found < 0 && attempts > 0) {
+		attempts--;
+		d = opendir(workdir);
+		if (d) {
+			while ((dir = readdir(d)) != NULL) {
+				if (!match_sysfs_device(dev_id, workdir, dir))
+					continue;
+
+				str_id = dir->d_name + sizeof("0003:0001:0A37.");
+				found = (int)strtol(str_id, NULL, 16);
+
+				break;
+			}
+			closedir(d);
+		}
+		if (found < 0)
+			usleep(100000);
+	}
+
+	return found;
+}
+
+static int get_hidraw(int dev_id)
+{
+	const char *workdir = "/sys/devices/virtual/misc/uhid";
+	char sysfs[1024];
+	DIR *d, *subd;
+	struct dirent *dir, *subdir;
+	int i, found = -1;
+
+	/* retry 5 times in case the system is loaded */
+	for (i = 5; i > 0; i--) {
+		usleep(10);
+		d = opendir(workdir);
+
+		if (!d)
+			continue;
+
+		while ((dir = readdir(d)) != NULL) {
+			if (!match_sysfs_device(dev_id, workdir, dir))
+				continue;
+
+			sprintf(sysfs, "%s/%s/hidraw", workdir, dir->d_name);
+
+			subd = opendir(sysfs);
+			if (!subd)
+				continue;
+
+			while ((subdir = readdir(subd)) != NULL) {
+				if (fnmatch("hidraw*", subdir->d_name, 0))
+					continue;
+
+				found = atoi(subdir->d_name + strlen("hidraw"));
+			}
+
+			closedir(subd);
+
+			if (found > 0)
+				break;
+		}
+		closedir(d);
+	}
+
+	return found;
+}
+
+static int open_hidraw(int dev_id)
+{
+	int hidraw_number;
+	char hidraw_path[64] = { 0 };
+
+	hidraw_number = get_hidraw(dev_id);
+	if (hidraw_number < 0)
+		return hidraw_number;
+
+	/* open hidraw node to check the other side of the pipe */
+	sprintf(hidraw_path, "/dev/hidraw%d", hidraw_number);
+	return open(hidraw_path, O_RDWR | O_NONBLOCK);
+}
+
+FIXTURE(hid_bpf) {
+	int dev_id;
+	int uhid_fd;
+	int hidraw_fd;
+	int hid_id;
+	pthread_t tid;
+	struct hid *skel;
+};
+static void detach_bpf(FIXTURE_DATA(hid_bpf) * self)
+{
+	if (self->hidraw_fd)
+		close(self->hidraw_fd);
+	self->hidraw_fd = 0;
+
+	hid__destroy(self->skel);
+	self->skel = NULL;
+}
+
+FIXTURE_TEARDOWN(hid_bpf) {
+	void *uhid_err;
+
+	uhid_destroy(_metadata, self->uhid_fd);
+
+	detach_bpf(self);
+	pthread_join(self->tid, &uhid_err);
+}
+#define TEARDOWN_LOG(fmt, ...) do { \
+	TH_LOG(fmt, ##__VA_ARGS__); \
+	hid_bpf_teardown(_metadata, self, variant); \
+} while (0)
+
+FIXTURE_SETUP(hid_bpf)
+{
+	time_t t;
+	int err;
+
+	/* initialize random number generator */
+	srand((unsigned int)time(&t));
+
+	self->dev_id = rand() % 1024;
+
+	self->uhid_fd = setup_uhid(_metadata, self->dev_id);
+
+	/* locate the uev, self, variant);ent file of the created device */
+	self->hid_id = get_hid_id(self->dev_id);
+	ASSERT_GT(self->hid_id, 0)
+		TEARDOWN_LOG("Could not locate uhid device id: %d", self->hid_id);
+
+	err = uhid_start_listener(_metadata, &self->tid, self->uhid_fd);
+	ASSERT_EQ(0, err) TEARDOWN_LOG("could not start udev listener: %d", err);
+}
+
+struct test_program {
+	const char *name;
+};
+#define LOAD_PROGRAMS(progs) \
+	load_programs(progs, ARRAY_SIZE(progs), _metadata, self, variant)
+#define LOAD_BPF \
+	load_programs(NULL, 0, _metadata, self, variant)
+static void load_programs(const struct test_program programs[],
+			  const size_t progs_count,
+			  struct __test_metadata *_metadata,
+			  FIXTURE_DATA(hid_bpf) * self,
+			  const FIXTURE_VARIANT(hid_bpf) * variant)
+{
+	int attach_fd, err = -EINVAL;
+	struct attach_prog_args args = {
+		.retval = -1,
+	};
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattr,
+			    .ctx_in = &args,
+			    .ctx_size_in = sizeof(args),
+	);
+
+	/* open the bpf file */
+	self->skel = hid__open();
+	ASSERT_OK_PTR(self->skel) TEARDOWN_LOG("Error while calling hid__open");
+
+	for (int i = 0; i < progs_count; i++) {
+		struct bpf_program *prog;
+
+		prog = bpf_object__find_program_by_name(*self->skel->skeleton->obj,
+							programs[i].name);
+		ASSERT_OK_PTR(prog) TH_LOG("can not find program by name '%s'", programs[i].name);
+
+		bpf_program__set_autoload(prog, true);
+	}
+
+	err = hid__load(self->skel);
+	ASSERT_OK(err) TH_LOG("hid_skel_load failed: %d", err);
+
+	attach_fd = bpf_program__fd(self->skel->progs.attach_prog);
+	ASSERT_GE(attach_fd, 0) TH_LOG("locate attach_prog: %d", attach_fd);
+
+	for (int i = 0; i < progs_count; i++) {
+		struct bpf_program *prog;
+
+		prog = bpf_object__find_program_by_name(*self->skel->skeleton->obj,
+							programs[i].name);
+		ASSERT_OK_PTR(prog) TH_LOG("can not find program by name '%s'", programs[i].name);
+
+		args.prog_fd = bpf_program__fd(prog);
+		args.hid = self->hid_id;
+		err = bpf_prog_test_run_opts(attach_fd, &tattr);
+		ASSERT_OK(args.retval) TH_LOG("attach_hid(%s): %d", programs[i].name, args.retval);
+	}
+
+	self->hidraw_fd = open_hidraw(self->dev_id);
+	ASSERT_GE(self->hidraw_fd, 0) TH_LOG("open_hidraw");
+}
+
+/*
+ * A simple test to see if the fixture is working fine.
+ * If this fails, none of the other tests will pass.
+ */
+TEST_F(hid_bpf, test_create_uhid)
+{
+}
+
+/*
+ * Attach hid_first_event to the given uhid device,
+ * retrieve and open the matching hidraw node,
+ * inject one event in the uhid device,
+ * check that the program sees it and can change the data
+ */
+TEST_F(hid_bpf, raw_event)
+{
+	const struct test_program progs[] = {
+		{ .name = "hid_first_event" },
+	};
+	__u8 buf[10] = {0};
+	int err;
+
+	LOAD_PROGRAMS(progs);
+
+	/* check that the program is correctly loaded */
+	ASSERT_EQ(self->skel->data->callback_check, 52) TH_LOG("callback_check1");
+	ASSERT_EQ(self->skel->data->callback2_check, 52) TH_LOG("callback2_check1");
+
+	/* inject one event */
+	buf[0] = 1;
+	buf[1] = 42;
+	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+
+	/* check that hid_first_event() was executed */
+	ASSERT_EQ(self->skel->data->callback_check, 42) TH_LOG("callback_check1");
+
+	/* read the data from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, 6) TH_LOG("read_hidraw");
+	ASSERT_EQ(buf[0], 1);
+	ASSERT_EQ(buf[2], 47);
+
+	/* inject another event */
+	memset(buf, 0, sizeof(buf));
+	buf[0] = 1;
+	buf[1] = 47;
+	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+
+	/* check that hid_first_event() was executed */
+	ASSERT_EQ(self->skel->data->callback_check, 47) TH_LOG("callback_check1");
+
+	/* read the data from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, 6) TH_LOG("read_hidraw");
+	ASSERT_EQ(buf[2], 52);
+}
+
+/*
+ * Ensures that we can attach/detach programs
+ */
+TEST_F(hid_bpf, test_attach_detach)
+{
+	const struct test_program progs[] = {
+		{ .name = "hid_first_event" },
+	};
+	__u8 buf[10] = {0};
+	int err;
+
+	LOAD_PROGRAMS(progs);
+
+	/* inject one event */
+	buf[0] = 1;
+	buf[1] = 42;
+	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+
+	/* read the data from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, 6) TH_LOG("read_hidraw");
+	ASSERT_EQ(buf[0], 1);
+	ASSERT_EQ(buf[2], 47);
+
+	/* pin the program and immediately unpin it */
+#define PIN_PATH "/sys/fs/bpf/hid_first_event"
+	bpf_program__pin(self->skel->progs.hid_first_event, PIN_PATH);
+	remove(PIN_PATH);
+#undef PIN_PATH
+	usleep(100000);
+
+	/* detach the program */
+	detach_bpf(self);
+
+	self->hidraw_fd = open_hidraw(self->dev_id);
+	ASSERT_GE(self->hidraw_fd, 0) TH_LOG("open_hidraw");
+
+	/* inject another event */
+	memset(buf, 0, sizeof(buf));
+	buf[0] = 1;
+	buf[1] = 47;
+	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+
+	/* read the data from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, 6) TH_LOG("read_hidraw_no_bpf");
+	ASSERT_EQ(buf[0], 1);
+	ASSERT_EQ(buf[1], 47);
+	ASSERT_EQ(buf[2], 0);
+
+	/* re-attach our program */
+
+	LOAD_PROGRAMS(progs);
+
+	/* inject one event */
+	memset(buf, 0, sizeof(buf));
+	buf[0] = 1;
+	buf[1] = 42;
+	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+
+	/* read the data from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, 6) TH_LOG("read_hidraw");
+	ASSERT_EQ(buf[0], 1);
+	ASSERT_EQ(buf[2], 47);
+}
+
+static int libbpf_print_fn(enum libbpf_print_level level,
+			   const char *format, va_list args)
+{
+	char buf[1024];
+
+	if (level == LIBBPF_DEBUG)
+		return 0;
+
+	snprintf(buf, sizeof(buf), "# %s", format);
+
+	vfprintf(stdout, buf, args);
+	return 0;
+}
+
+static void __attribute__((constructor)) __constructor_order_last(void)
+{
+	if (!__constructor_order)
+		__constructor_order = _CONSTRUCTOR_ORDER_BACKWARD;
+}
+
+int main(int argc, char **argv)
+{
+	/* Use libbpf 1.0 API mode */
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+	libbpf_set_print(libbpf_print_fn);
+
+	return test_harness_run(argc, argv);
+}
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
new file mode 100644
index 000000000000..b23ae2304346
--- /dev/null
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Red hat */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "hid_bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct attach_prog_args {
+	int prog_fd;
+	unsigned int hid;
+	int retval;
+};
+
+__u64 callback_check = 52;
+__u64 callback2_check = 52;
+
+SEC("?fmod_ret/hid_bpf_device_event")
+int BPF_PROG(hid_first_event, struct hid_bpf_ctx *hid_ctx)
+{
+	__u8 *rw_data = hid_bpf_get_data(hid_ctx, 0 /* offset */, 3 /* size */);
+
+	if (!rw_data)
+		return 0; /* EPERM check */
+
+	callback_check = rw_data[1];
+
+	rw_data[2] = rw_data[1] + 5;
+
+	return 0;
+}
+
+SEC("syscall")
+int attach_prog(struct attach_prog_args *ctx)
+{
+	ctx->retval = hid_bpf_attach_prog(ctx->hid,
+					  ctx->prog_fd,
+					  0);
+	return 0;
+}
diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
new file mode 100644
index 000000000000..b9fa33f7d580
--- /dev/null
+++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2022 Benjamin Tissoires
+ */
+
+#ifndef __HID_BPF_HELPERS_H
+#define __HID_BPF_HELPERS_H
+
+/* following are kfuncs exported by HID for HID-BPF */
+extern __u8 *hid_bpf_get_data(struct hid_bpf_ctx *ctx,
+			      unsigned int offset,
+			      const size_t __sz) __ksym;
+extern int hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, u32 flags) __ksym;
+
+#endif /* __HID_BPF_HELPERS_H */
-- 
cgit 


From 0330f725cc5b01f4149a2a21e474b2090a1dcead Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 3 Nov 2022 16:57:48 +0100
Subject: selftests/hid: add test to change the report size

Use a different report with a bigger size and ensures we are doing
things properly.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/hid_bpf.c   | 41 +++++++++++++++++++++++++++++++++
 tools/testing/selftests/hid/progs/hid.c | 15 +++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index 4fc7794c6f8b..6c1f43ec4bdd 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -22,6 +22,17 @@ static unsigned char rdesc[] = {
 	0xa1, 0x01,		/* COLLECTION (Application) */
 	0x09, 0x01,			/* Usage (Vendor Usage 0x01) */
 	0xa1, 0x00,			/* COLLECTION (Physical) */
+	0x85, 0x02,				/* REPORT_ID (2) */
+	0x19, 0x01,				/* USAGE_MINIMUM (1) */
+	0x29, 0x08,				/* USAGE_MAXIMUM (3) */
+	0x15, 0x00,				/* LOGICAL_MINIMUM (0) */
+	0x25, 0xff,				/* LOGICAL_MAXIMUM (255) */
+	0x95, 0x08,				/* REPORT_COUNT (8) */
+	0x75, 0x08,				/* REPORT_SIZE (8) */
+	0x81, 0x02,				/* INPUT (Data,Var,Abs) */
+	0xc0,				/* END_COLLECTION */
+	0x09, 0x01,			/* Usage (Vendor Usage 0x01) */
+	0xa1, 0x00,			/* COLLECTION (Physical) */
 	0x85, 0x01,				/* REPORT_ID (1) */
 	0x06, 0x00, 0xff,			/* Usage Page (Vendor Defined Page 1) */
 	0x19, 0x01,				/* USAGE_MINIMUM (1) */
@@ -645,6 +656,36 @@ TEST_F(hid_bpf, test_attach_detach)
 	ASSERT_EQ(buf[2], 47);
 }
 
+/*
+ * Attach hid_change_report_id to the given uhid device,
+ * retrieve and open the matching hidraw node,
+ * inject one event in the uhid device,
+ * check that the program sees it and can change the data
+ */
+TEST_F(hid_bpf, test_hid_change_report)
+{
+	const struct test_program progs[] = {
+		{ .name = "hid_change_report_id" },
+	};
+	__u8 buf[10] = {0};
+	int err;
+
+	LOAD_PROGRAMS(progs);
+
+	/* inject one event */
+	buf[0] = 1;
+	buf[1] = 42;
+	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+
+	/* read the data from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, 9) TH_LOG("read_hidraw");
+	ASSERT_EQ(buf[0], 2);
+	ASSERT_EQ(buf[1], 42);
+	ASSERT_EQ(buf[2], 0) TH_LOG("leftovers_from_previous_test");
+}
+
 static int libbpf_print_fn(enum libbpf_print_level level,
 			   const char *format, va_list args)
 {
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
index b23ae2304346..b69c3f8e1ac3 100644
--- a/tools/testing/selftests/hid/progs/hid.c
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -28,7 +28,20 @@ int BPF_PROG(hid_first_event, struct hid_bpf_ctx *hid_ctx)
 
 	rw_data[2] = rw_data[1] + 5;
 
-	return 0;
+	return hid_ctx->size;
+}
+
+SEC("?fmod_ret/hid_bpf_device_event")
+int BPF_PROG(hid_change_report_id, struct hid_bpf_ctx *hid_ctx)
+{
+	__u8 *rw_data = hid_bpf_get_data(hid_ctx, 0 /* offset */, 3 /* size */);
+
+	if (!rw_data)
+		return 0; /* EPERM check */
+
+	rw_data[0] = 2;
+
+	return 9;
 }
 
 SEC("syscall")
-- 
cgit 


From 4f7153cf461ed0f78d8da8c9fd02d38728a76b90 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 3 Nov 2022 16:57:50 +0100
Subject: selftests/hid: add tests for bpf_hid_hw_request

Add tests for the newly implemented function.
We test here only the GET_REPORT part because the other calls are pure
HID protocol and won't infer the result of the test of the bpf hook.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/hid_bpf.c              | 56 +++++++++++++++++++++-
 tools/testing/selftests/hid/progs/hid.c            | 36 ++++++++++++++
 .../testing/selftests/hid/progs/hid_bpf_helpers.h  |  7 +++
 3 files changed, 98 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index 6c1f43ec4bdd..f77ee7fe4af4 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -82,12 +82,23 @@ static unsigned char rdesc[] = {
 	0xc0,			/* END_COLLECTION */
 };
 
+static __u8 feature_data[] = { 1, 2 };
+
 struct attach_prog_args {
 	int prog_fd;
 	unsigned int hid;
 	int retval;
 };
 
+struct hid_hw_request_syscall_args {
+	__u8 data[10];
+	unsigned int hid;
+	int retval;
+	size_t size;
+	enum hid_report_type type;
+	__u8 request_type;
+};
+
 #define ASSERT_OK(data) ASSERT_FALSE(data)
 #define ASSERT_OK_PTR(ptr) ASSERT_NE(NULL, ptr)
 
@@ -155,7 +166,7 @@ static void uhid_destroy(struct __test_metadata *_metadata, int fd)
 
 static int uhid_event(struct __test_metadata *_metadata, int fd)
 {
-	struct uhid_event ev;
+	struct uhid_event ev, answer;
 	ssize_t ret;
 
 	memset(&ev, 0, sizeof(ev));
@@ -196,6 +207,15 @@ static int uhid_event(struct __test_metadata *_metadata, int fd)
 		break;
 	case UHID_GET_REPORT:
 		UHID_LOG("UHID_GET_REPORT from uhid-dev");
+
+		answer.type = UHID_GET_REPORT_REPLY;
+		answer.u.get_report_reply.id = ev.u.get_report.id;
+		answer.u.get_report_reply.err = ev.u.get_report.rnum == 1 ? 0 : -EIO;
+		answer.u.get_report_reply.size = sizeof(feature_data);
+		memcpy(answer.u.get_report_reply.data, feature_data, sizeof(feature_data));
+
+		uhid_write(_metadata, fd, &answer);
+
 		break;
 	case UHID_SET_REPORT:
 		UHID_LOG("UHID_SET_REPORT from uhid-dev");
@@ -686,6 +706,40 @@ TEST_F(hid_bpf, test_hid_change_report)
 	ASSERT_EQ(buf[2], 0) TH_LOG("leftovers_from_previous_test");
 }
 
+/*
+ * Attach hid_user_raw_request to the given uhid device,
+ * call the bpf program from userspace
+ * check that the program is called and does the expected.
+ */
+TEST_F(hid_bpf, test_hid_user_raw_request_call)
+{
+	struct hid_hw_request_syscall_args args = {
+		.retval = -1,
+		.type = HID_FEATURE_REPORT,
+		.request_type = HID_REQ_GET_REPORT,
+		.size = 10,
+	};
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattrs,
+			    .ctx_in = &args,
+			    .ctx_size_in = sizeof(args),
+	);
+	int err, prog_fd;
+
+	LOAD_BPF;
+
+	args.hid = self->hid_id;
+	args.data[0] = 1; /* report ID */
+
+	prog_fd = bpf_program__fd(self->skel->progs.hid_user_raw_request);
+
+	err = bpf_prog_test_run_opts(prog_fd, &tattrs);
+	ASSERT_OK(err) TH_LOG("error while calling bpf_prog_test_run_opts");
+
+	ASSERT_EQ(args.retval, 2);
+
+	ASSERT_EQ(args.data[1], 2);
+}
+
 static int libbpf_print_fn(enum libbpf_print_level level,
 			   const char *format, va_list args)
 {
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
index b69c3f8e1ac3..7e35bb8e1abf 100644
--- a/tools/testing/selftests/hid/progs/hid.c
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -52,3 +52,39 @@ int attach_prog(struct attach_prog_args *ctx)
 					  0);
 	return 0;
 }
+
+struct hid_hw_request_syscall_args {
+	/* data needs to come at offset 0 so we can use it in calls */
+	__u8 data[10];
+	unsigned int hid;
+	int retval;
+	size_t size;
+	enum hid_report_type type;
+	__u8 request_type;
+};
+
+SEC("syscall")
+int hid_user_raw_request(struct hid_hw_request_syscall_args *args)
+{
+	struct hid_bpf_ctx *ctx;
+	const size_t size = args->size;
+	int i, ret = 0;
+
+	if (size > sizeof(args->data))
+		return -7; /* -E2BIG */
+
+	ctx = hid_bpf_allocate_context(args->hid);
+	if (!ctx)
+		return -1; /* EPERM check */
+
+	ret = hid_bpf_hw_request(ctx,
+				 args->data,
+				 size,
+				 args->type,
+				 args->request_type);
+	args->retval = ret;
+
+	hid_bpf_release_context(ctx);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
index b9fa33f7d580..4fff31dbe0e7 100644
--- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
+++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
@@ -10,5 +10,12 @@ extern __u8 *hid_bpf_get_data(struct hid_bpf_ctx *ctx,
 			      unsigned int offset,
 			      const size_t __sz) __ksym;
 extern int hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, u32 flags) __ksym;
+extern struct hid_bpf_ctx *hid_bpf_allocate_context(unsigned int hid_id) __ksym;
+extern void hid_bpf_release_context(struct hid_bpf_ctx *ctx) __ksym;
+extern int hid_bpf_hw_request(struct hid_bpf_ctx *ctx,
+			      __u8 *data,
+			      size_t buf__sz,
+			      enum hid_report_type type,
+			      enum hid_class_request reqtype) __ksym;
 
 #endif /* __HID_BPF_HELPERS_H */
-- 
cgit 


From e8445737c0264cf4ddac682c278e0ef5b8a61a3d Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 3 Nov 2022 16:57:52 +0100
Subject: selftests/hid: add report descriptor fixup tests

Simple report descriptor override in HID: replace part of the report
descriptor from a static definition in the bpf kernel program.

Note that this test should be run last because we disconnect/reconnect
the device, meaning that it changes the overall uhid device.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/hid_bpf.c   | 32 ++++++++++++++++++++
 tools/testing/selftests/hid/progs/hid.c | 53 +++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index f77ee7fe4af4..eb329b5d299f 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -740,6 +740,38 @@ TEST_F(hid_bpf, test_hid_user_raw_request_call)
 	ASSERT_EQ(args.data[1], 2);
 }
 
+/*
+ * Attach hid_rdesc_fixup to the given uhid device,
+ * retrieve and open the matching hidraw node,
+ * check that the hidraw report descriptor has been updated.
+ */
+TEST_F(hid_bpf, test_rdesc_fixup)
+{
+	struct hidraw_report_descriptor rpt_desc = {0};
+	const struct test_program progs[] = {
+		{ .name = "hid_rdesc_fixup" },
+	};
+	int err, desc_size;
+
+	LOAD_PROGRAMS(progs);
+
+	/* check that hid_rdesc_fixup() was executed */
+	ASSERT_EQ(self->skel->data->callback2_check, 0x21);
+
+	/* read the exposed report descriptor from hidraw */
+	err = ioctl(self->hidraw_fd, HIDIOCGRDESCSIZE, &desc_size);
+	ASSERT_GE(err, 0) TH_LOG("error while reading HIDIOCGRDESCSIZE: %d", err);
+
+	/* ensure the new size of the rdesc is bigger than the old one */
+	ASSERT_GT(desc_size, sizeof(rdesc));
+
+	rpt_desc.size = desc_size;
+	err = ioctl(self->hidraw_fd, HIDIOCGRDESC, &rpt_desc);
+	ASSERT_GE(err, 0) TH_LOG("error while reading HIDIOCGRDESC: %d", err);
+
+	ASSERT_EQ(rpt_desc.value[4], 0x42);
+}
+
 static int libbpf_print_fn(enum libbpf_print_level level,
 			   const char *format, va_list args)
 {
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
index 7e35bb8e1abf..5758366a90e6 100644
--- a/tools/testing/selftests/hid/progs/hid.c
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -88,3 +88,56 @@ int hid_user_raw_request(struct hid_hw_request_syscall_args *args)
 
 	return 0;
 }
+
+static const __u8 rdesc[] = {
+	0x05, 0x01,				/* USAGE_PAGE (Generic Desktop) */
+	0x09, 0x32,				/* USAGE (Z) */
+	0x95, 0x01,				/* REPORT_COUNT (1) */
+	0x81, 0x06,				/* INPUT (Data,Var,Rel) */
+
+	0x06, 0x00, 0xff,			/* Usage Page (Vendor Defined Page 1) */
+	0x19, 0x01,				/* USAGE_MINIMUM (1) */
+	0x29, 0x03,				/* USAGE_MAXIMUM (3) */
+	0x15, 0x00,				/* LOGICAL_MINIMUM (0) */
+	0x25, 0x01,				/* LOGICAL_MAXIMUM (1) */
+	0x95, 0x03,				/* REPORT_COUNT (3) */
+	0x75, 0x01,				/* REPORT_SIZE (1) */
+	0x91, 0x02,				/* Output (Data,Var,Abs) */
+	0x95, 0x01,				/* REPORT_COUNT (1) */
+	0x75, 0x05,				/* REPORT_SIZE (5) */
+	0x91, 0x01,				/* Output (Cnst,Var,Abs) */
+
+	0x06, 0x00, 0xff,			/* Usage Page (Vendor Defined Page 1) */
+	0x19, 0x06,				/* USAGE_MINIMUM (6) */
+	0x29, 0x08,				/* USAGE_MAXIMUM (8) */
+	0x15, 0x00,				/* LOGICAL_MINIMUM (0) */
+	0x25, 0x01,				/* LOGICAL_MAXIMUM (1) */
+	0x95, 0x03,				/* REPORT_COUNT (3) */
+	0x75, 0x01,				/* REPORT_SIZE (1) */
+	0xb1, 0x02,				/* Feature (Data,Var,Abs) */
+	0x95, 0x01,				/* REPORT_COUNT (1) */
+	0x75, 0x05,				/* REPORT_SIZE (5) */
+	0x91, 0x01,				/* Output (Cnst,Var,Abs) */
+
+	0xc0,				/* END_COLLECTION */
+	0xc0,			/* END_COLLECTION */
+};
+
+SEC("?fmod_ret/hid_bpf_rdesc_fixup")
+int BPF_PROG(hid_rdesc_fixup, struct hid_bpf_ctx *hid_ctx)
+{
+	__u8 *data = hid_bpf_get_data(hid_ctx, 0 /* offset */, 4096 /* size */);
+
+	if (!data)
+		return 0; /* EPERM check */
+
+	callback2_check = data[4];
+
+	/* insert rdesc at offset 73 */
+	__builtin_memcpy(&data[73], rdesc, sizeof(rdesc));
+
+	/* Change Usage Vendor globally */
+	data[4] = 0x42;
+
+	return sizeof(rdesc) + 73;
+}
-- 
cgit 


From 80e189f2af37d383b4840f3f89896071392cb58f Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 3 Nov 2022 16:57:53 +0100
Subject: selftests/hid: Add a test for BPF_F_INSERT_HEAD

Insert 3 programs to check that we are doing the correct thing:
'2', '1', '3' are inserted, but '1' is supposed to be executed first.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/hid_bpf.c   | 43 ++++++++++++++++++++++++++
 tools/testing/selftests/hid/progs/hid.c | 55 ++++++++++++++++++++++++++++++++-
 2 files changed, 97 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index eb329b5d299f..6615c26fb5dd 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -88,6 +88,7 @@ struct attach_prog_args {
 	int prog_fd;
 	unsigned int hid;
 	int retval;
+	int insert_head;
 };
 
 struct hid_hw_request_syscall_args {
@@ -490,6 +491,7 @@ FIXTURE_SETUP(hid_bpf)
 
 struct test_program {
 	const char *name;
+	int insert_head;
 };
 #define LOAD_PROGRAMS(progs) \
 	load_programs(progs, ARRAY_SIZE(progs), _metadata, self, variant)
@@ -539,6 +541,7 @@ static void load_programs(const struct test_program programs[],
 
 		args.prog_fd = bpf_program__fd(prog);
 		args.hid = self->hid_id;
+		args.insert_head = programs[i].insert_head;
 		err = bpf_prog_test_run_opts(attach_fd, &tattr);
 		ASSERT_OK(args.retval) TH_LOG("attach_hid(%s): %d", programs[i].name, args.retval);
 	}
@@ -740,6 +743,46 @@ TEST_F(hid_bpf, test_hid_user_raw_request_call)
 	ASSERT_EQ(args.data[1], 2);
 }
 
+/*
+ * Attach hid_insert{0,1,2} to the given uhid device,
+ * retrieve and open the matching hidraw node,
+ * inject one event in the uhid device,
+ * check that the programs have been inserted in the correct order.
+ */
+TEST_F(hid_bpf, test_hid_attach_flags)
+{
+	const struct test_program progs[] = {
+		{
+			.name = "hid_test_insert2",
+			.insert_head = 0,
+		},
+		{
+			.name = "hid_test_insert1",
+			.insert_head = 1,
+		},
+		{
+			.name = "hid_test_insert3",
+			.insert_head = 0,
+		},
+	};
+	__u8 buf[10] = {0};
+	int err;
+
+	LOAD_PROGRAMS(progs);
+
+	/* inject one event */
+	buf[0] = 1;
+	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+
+	/* read the data from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, 6) TH_LOG("read_hidraw");
+	ASSERT_EQ(buf[1], 1);
+	ASSERT_EQ(buf[2], 2);
+	ASSERT_EQ(buf[3], 3);
+}
+
 /*
  * Attach hid_rdesc_fixup to the given uhid device,
  * retrieve and open the matching hidraw node,
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
index 5758366a90e6..6a86af0aa545 100644
--- a/tools/testing/selftests/hid/progs/hid.c
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -11,6 +11,7 @@ struct attach_prog_args {
 	int prog_fd;
 	unsigned int hid;
 	int retval;
+	int insert_head;
 };
 
 __u64 callback_check = 52;
@@ -49,7 +50,8 @@ int attach_prog(struct attach_prog_args *ctx)
 {
 	ctx->retval = hid_bpf_attach_prog(ctx->hid,
 					  ctx->prog_fd,
-					  0);
+					  ctx->insert_head ? HID_BPF_FLAG_INSERT_HEAD :
+							     HID_BPF_FLAG_NONE);
 	return 0;
 }
 
@@ -141,3 +143,54 @@ int BPF_PROG(hid_rdesc_fixup, struct hid_bpf_ctx *hid_ctx)
 
 	return sizeof(rdesc) + 73;
 }
+
+SEC("?fmod_ret/hid_bpf_device_event")
+int BPF_PROG(hid_test_insert1, struct hid_bpf_ctx *hid_ctx)
+{
+	__u8 *data = hid_bpf_get_data(hid_ctx, 0 /* offset */, 4 /* size */);
+
+	if (!data)
+		return 0; /* EPERM check */
+
+	/* we need to be run first */
+	if (data[2] || data[3])
+		return -1;
+
+	data[1] = 1;
+
+	return 0;
+}
+
+SEC("?fmod_ret/hid_bpf_device_event")
+int BPF_PROG(hid_test_insert2, struct hid_bpf_ctx *hid_ctx)
+{
+	__u8 *data = hid_bpf_get_data(hid_ctx, 0 /* offset */, 4 /* size */);
+
+	if (!data)
+		return 0; /* EPERM check */
+
+	/* after insert0 and before insert2 */
+	if (!data[1] || data[3])
+		return -1;
+
+	data[2] = 2;
+
+	return 0;
+}
+
+SEC("?fmod_ret/hid_bpf_device_event")
+int BPF_PROG(hid_test_insert3, struct hid_bpf_ctx *hid_ctx)
+{
+	__u8 *data = hid_bpf_get_data(hid_ctx, 0 /* offset */, 4 /* size */);
+
+	if (!data)
+		return 0; /* EPERM check */
+
+	/* at the end */
+	if (!data[1] || !data[2])
+		return -1;
+
+	data[3] = 3;
+
+	return 0;
+}
-- 
cgit 


From d0b93a0d2b00a719d6c761bab5bbe72f41a81a30 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Tue, 6 Dec 2022 15:59:35 +0100
Subject: selftests: hid: ensures we have the proper requirements in config

DYNAMIC_FTRACE_WITH_DIRECT_CALLS is implicit on x86_64 but is still a
WIP on aarm64. Ensure we get it selected to not have any surprises.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Reviewed-by: Jiri Kosina <jkosina@suse.cz>
Link: https://lore.kernel.org/r/20221206145936.922196-5-benjamin.tissoires@redhat.com
---
 tools/testing/selftests/hid/config | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/config b/tools/testing/selftests/hid/config
index d4130489c1b1..9c5a55abca6b 100644
--- a/tools/testing/selftests/hid/config
+++ b/tools/testing/selftests/hid/config
@@ -11,6 +11,7 @@ CONFIG_BPF_SYSCALL=y
 CONFIG_BPF=y
 CONFIG_CGROUP_BPF=y
 CONFIG_DEBUG_INFO_BTF=y
+CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y
 CONFIG_FPROBE=y
 CONFIG_FTRACE_SYSCALLS=y
 CONFIG_FUNCTION_TRACER=y
-- 
cgit 


From e9d48abbce93e2138e37a695ba5931f3d3c9fdd9 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Tue, 6 Dec 2022 15:59:36 +0100
Subject: kselftests: hid: fix missing headers_install step

The Makefile was assuming that headers_install was already done in
the top source directory, and was searching for installed uapi headers
there.
Unfortunately this is not the case and we need to manually call that step.

To do so, reorder the declaration of the variables, and reuses top_srcdir
provided by lib.mk

Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/all/202212060216.a6X8Py5H-lkp@intel.com/#t
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Reviewed-by: Jiri Kosina <jkosina@suse.cz>
Link: https://lore.kernel.org/r/20221206145936.922196-6-benjamin.tissoires@redhat.com
---
 tools/testing/selftests/hid/Makefile | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/Makefile b/tools/testing/selftests/hid/Makefile
index 693f1cb6e47a..f6fc5cfff770 100644
--- a/tools/testing/selftests/hid/Makefile
+++ b/tools/testing/selftests/hid/Makefile
@@ -7,17 +7,9 @@ include ../../../scripts/Makefile.include
 
 CXX ?= $(CROSS_COMPILE)g++
 
-CURDIR := $(abspath .)
-TOOLSDIR := $(abspath ../../..)
-TOP_SRCDIR = $(CURDIR)/../../../..
-KHDR_INCLUDES := $(TOP_SRCDIR)/usr/include
-LIBDIR := $(TOOLSDIR)/lib
-BPFDIR := $(LIBDIR)/bpf
-TOOLSINCDIR := $(TOOLSDIR)/include
-BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
 HOSTPKG_CONFIG := pkg-config
 
-CFLAGS += -g -O0 -rdynamic -Wall -Werror -I$(KHDR_INCLUDES)
+CFLAGS += -g -O0 -rdynamic -Wall -Werror -I$(KHDR_INCLUDES) -I$(OUTPUT)
 LDLIBS += -lelf -lz -lrt -lpthread
 
 # Silence some warnings when compiled with clang
@@ -53,9 +45,15 @@ endef
 
 include ../lib.mk
 
+TOOLSDIR := $(top_srcdir)/tools
+LIBDIR := $(TOOLSDIR)/lib
+BPFDIR := $(LIBDIR)/bpf
+TOOLSINCDIR := $(TOOLSDIR)/include
+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
 SCRATCH_DIR := $(OUTPUT)/tools
 BUILD_DIR := $(SCRATCH_DIR)/build
 INCLUDE_DIR := $(SCRATCH_DIR)/include
+KHDR_INCLUDES := $(SCRATCH_DIR)/uapi/include
 BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
 ifneq ($(CROSS_COMPILE),)
 HOST_BUILD_DIR		:= $(BUILD_DIR)/host
@@ -122,7 +120,6 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
 		    prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install-bin
 
 $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
-	   $(KHDR_INCLUDES)/linux/bpf.h					       \
 	   | $(BUILD_DIR)/libbpf
 	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \
 		    EXTRA_CFLAGS='-g -O0'				       \
@@ -130,7 +127,6 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
 
 ifneq ($(BPFOBJ),$(HOST_BPFOBJ))
 $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
-		$(KHDR_INCLUDES)/linux/bpf.h					       \
 		| $(HOST_BUILD_DIR)/libbpf
 	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR)                             \
 		    EXTRA_CFLAGS='-g -O0' ARCH= CROSS_COMPILE=		       \
@@ -147,6 +143,9 @@ else
 	$(Q)cp "$(VMLINUX_H)" $@
 endif
 
+$(KHDR_INCLUDES)/linux/hid.h: $(top_srcdir)/include/uapi/linux/hid.h
+	$(MAKE) -C $(top_srcdir) INSTALL_HDR_PATH=$(SCRATCH_DIR)/uapi headers_install
+
 $(RESOLVE_BTFIDS): $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/resolve_btfids	\
 		       $(TOOLSDIR)/bpf/resolve_btfids/main.c	\
 		       $(TOOLSDIR)/lib/rbtree.c			\
@@ -178,8 +177,7 @@ MENDIAN=$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian)
 
 CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG))
 BPF_CFLAGS = -g -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) 		\
-	     -I$(INCLUDE_DIR) -I$(CURDIR) -I$(KHDR_INCLUDES)		\
-	     -I$(abspath $(OUTPUT)/../usr/include)
+	     -I$(INCLUDE_DIR)
 
 CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \
 	       -Wno-compare-distinct-pointer-types
@@ -225,7 +223,7 @@ $(BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(OUTPUT)
 	$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
 	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked1.o) name $(notdir $(<:.bpf.o=)) > $@
 
-$(OUTPUT)/%:%.c $(BPF_SKELS)
+$(OUTPUT)/%:%.c $(BPF_SKELS) $(KHDR_INCLUDES)/linux/hid.h
 	$(call msg,BINARY,,$@)
 	$(Q)$(LINK.c) $^ $(LDLIBS) -o $@
 
-- 
cgit 


From c9883ee9d110703ccb3dfe2ca13e0b7a01351077 Mon Sep 17 00:00:00 2001
From: Xin Liu <liuxin350@huawei.com>
Date: Sat, 10 Dec 2022 16:20:45 +0800
Subject: libbpf: Optimized return value in libbpf_strerror when errno is
 libbpf errno

This is a small improvement in libbpf_strerror. When libbpf_strerror
is used to obtain the system error description, if the length of the
buf is insufficient, libbpf_sterror returns ERANGE and sets errno to
ERANGE.

However, this processing is not performed when the error code
customized by libbpf is obtained. Make some minor improvements here,
return -ERANGE and set errno to ERANGE when buf is not enough for
custom description.

Signed-off-by: Xin Liu <liuxin350@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20221210082045.233697-1-liuxin350@huawei.com
---
 tools/lib/bpf/libbpf_errno.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c
index 96f67a772a1b..6b180172ec6b 100644
--- a/tools/lib/bpf/libbpf_errno.c
+++ b/tools/lib/bpf/libbpf_errno.c
@@ -39,14 +39,14 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {
 
 int libbpf_strerror(int err, char *buf, size_t size)
 {
+	int ret;
+
 	if (!buf || !size)
 		return libbpf_err(-EINVAL);
 
 	err = err > 0 ? err : -err;
 
 	if (err < __LIBBPF_ERRNO__START) {
-		int ret;
-
 		ret = strerror_r(err, buf, size);
 		buf[size - 1] = '\0';
 		return libbpf_err_errno(ret);
@@ -56,12 +56,20 @@ int libbpf_strerror(int err, char *buf, size_t size)
 		const char *msg;
 
 		msg = libbpf_strerror_table[ERRNO_OFFSET(err)];
-		snprintf(buf, size, "%s", msg);
+		ret = snprintf(buf, size, "%s", msg);
 		buf[size - 1] = '\0';
+		/* The length of the buf and msg is positive.
+		 * A negative number may be returned only when the
+		 * size exceeds INT_MAX. Not likely to appear.
+		 */
+		if (ret >= size)
+			return libbpf_err(-ERANGE);
 		return 0;
 	}
 
-	snprintf(buf, size, "Unknown libbpf error %d", err);
+	ret = snprintf(buf, size, "Unknown libbpf error %d", err);
 	buf[size - 1] = '\0';
+	if (ret >= size)
+		return libbpf_err(-ERANGE);
 	return libbpf_err(-ENOENT);
 }
-- 
cgit 


From 872aec4b5f635d94111d48ec3c57fbe078d64e7d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 12 Dec 2022 13:15:00 -0800
Subject: libbpf: Fix single-line struct definition output in btf_dump

btf_dump APIs emit unnecessary tabs when emitting struct/union
definition that fits on the single line. Before this patch we'd get:

struct blah {<tab>};

This patch fixes this and makes sure that we get more natural:

struct blah {};

Fixes: 44a726c3f23c ("bpftool: Print newline before '}' for struct with padding only fields")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20221212211505.558851-2-andrii@kernel.org
---
 tools/lib/bpf/btf_dump.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index deb2bc9a0a7b..69e80ee5f70e 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -959,9 +959,12 @@ static void btf_dump_emit_struct_def(struct btf_dump *d,
 	 * Keep `struct empty {}` on a single line,
 	 * only print newline when there are regular or padding fields.
 	 */
-	if (vlen || t->size)
+	if (vlen || t->size) {
 		btf_dump_printf(d, "\n");
-	btf_dump_printf(d, "%s}", pfx(lvl));
+		btf_dump_printf(d, "%s}", pfx(lvl));
+	} else {
+		btf_dump_printf(d, "}");
+	}
 	if (packed)
 		btf_dump_printf(d, " __attribute__((packed))");
 }
-- 
cgit 


From 21a9a1bcccaa4f0337a24d666fe55944abcb171e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 12 Dec 2022 13:15:01 -0800
Subject: libbpf: Handle non-standardly sized enums better in BTF-to-C dumper

Turns out C allows to force enum to be 1-byte or 8-byte explicitly using
mode(byte) or mode(word), respecticely. Linux sources are using this in
some cases. This is imporant to handle correctly, as enum size
determines corresponding fields in a struct that use that enum type. And
if enum size is incorrect, this will lead to invalid struct layout. So
add mode(byte) and mode(word) attribute support to btf_dump APIs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20221212211505.558851-3-andrii@kernel.org
---
 tools/lib/bpf/btf_dump.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 69e80ee5f70e..234e82334d56 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -13,6 +13,7 @@
 #include <ctype.h>
 #include <endian.h>
 #include <errno.h>
+#include <limits.h>
 #include <linux/err.h>
 #include <linux/btf.h>
 #include <linux/kernel.h>
@@ -1076,6 +1077,43 @@ static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id,
 	else
 		btf_dump_emit_enum64_val(d, t, lvl, vlen);
 	btf_dump_printf(d, "\n%s}", pfx(lvl));
+
+	/* special case enums with special sizes */
+	if (t->size == 1) {
+		/* one-byte enums can be forced with mode(byte) attribute */
+		btf_dump_printf(d, " __attribute__((mode(byte)))");
+	} else if (t->size == 8 && d->ptr_sz == 8) {
+		/* enum can be 8-byte sized if one of the enumerator values
+		 * doesn't fit in 32-bit integer, or by adding mode(word)
+		 * attribute (but probably only on 64-bit architectures); do
+		 * our best here to try to satisfy the contract without adding
+		 * unnecessary attributes
+		 */
+		bool needs_word_mode;
+
+		if (btf_is_enum(t)) {
+			/* enum can't represent 64-bit values, so we need word mode */
+			needs_word_mode = true;
+		} else {
+			/* enum64 needs mode(word) if none of its values has
+			 * non-zero upper 32-bits (which means that all values
+			 * fit in 32-bit integers and won't cause compiler to
+			 * bump enum to be 64-bit naturally
+			 */
+			int i;
+
+			needs_word_mode = true;
+			for (i = 0; i < vlen; i++) {
+				if (btf_enum64(t)[i].val_hi32 != 0) {
+					needs_word_mode = false;
+					break;
+				}
+			}
+		}
+		if (needs_word_mode)
+			btf_dump_printf(d, " __attribute__((mode(word)))");
+	}
+
 }
 
 static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id,
-- 
cgit 


From 9d2349740e430b20660457b7c88fa06467457272 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 12 Dec 2022 13:15:02 -0800
Subject: selftests/bpf: Add non-standardly sized enum tests for btf_dump

Add few custom enum definitions testing mode(byte) and mode(word)
attributes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20221212211505.558851-4-andrii@kernel.org
---
 .../bpf/progs/btf_dump_test_case_syntax.c          | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
index 4ee4748133fe..26fffb02ed10 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
@@ -25,6 +25,39 @@ typedef enum {
 	H = 2,
 } e3_t;
 
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *enum e_byte {
+ *	EBYTE_1 = 0,
+ *	EBYTE_2 = 1,
+ *} __attribute__((mode(byte)));
+ *
+ */
+/* ----- END-EXPECTED-OUTPUT ----- */
+enum e_byte {
+	EBYTE_1,
+	EBYTE_2,
+} __attribute__((mode(byte)));
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *enum e_word {
+ *	EWORD_1 = 0LL,
+ *	EWORD_2 = 1LL,
+ *} __attribute__((mode(word)));
+ *
+ */
+/* ----- END-EXPECTED-OUTPUT ----- */
+enum e_word {
+	EWORD_1,
+	EWORD_2,
+} __attribute__((mode(word))); /* force to use 8-byte backing for this enum */
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+enum e_big {
+	EBIG_1 = 1000000000000ULL,
+};
+
 typedef int int_t;
 
 typedef volatile const int * volatile const crazy_ptr_t;
@@ -224,6 +257,9 @@ struct root_struct {
 	enum e2 _2;
 	e2_t _2_1;
 	e3_t _2_2;
+	enum e_byte _100;
+	enum e_word _101;
+	enum e_big _102;
 	struct struct_w_typedefs _3;
 	anon_struct_t _7;
 	struct struct_fwd *_8;
-- 
cgit 


From 25a4481b4136af7794e1df2d6c90ed2f354d60ce Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 12 Dec 2022 13:15:03 -0800
Subject: libbpf: Fix btf__align_of() by taking into account field offsets

btf__align_of() is supposed to be return alignment requirement of
a requested BTF type. For STRUCT/UNION it doesn't always return correct
value, because it calculates alignment only based on field types. But
for packed structs this is not enough, we need to also check field
offsets and struct size. If field offset isn't aligned according to
field type's natural alignment, then struct must be packed. Similarly,
if struct size is not a multiple of struct's natural alignment, then
struct must be packed as well.

This patch fixes this issue precisely by additionally checking these
conditions.

Fixes: 3d208f4ca111 ("libbpf: Expose btf__align_of() API")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20221212211505.558851-5-andrii@kernel.org
---
 tools/lib/bpf/btf.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 71e165b09ed5..8cbcef959456 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -688,8 +688,21 @@ int btf__align_of(const struct btf *btf, __u32 id)
 			if (align <= 0)
 				return libbpf_err(align);
 			max_align = max(max_align, align);
+
+			/* if field offset isn't aligned according to field
+			 * type's alignment, then struct must be packed
+			 */
+			if (btf_member_bitfield_size(t, i) == 0 &&
+			    (m->offset % (8 * align)) != 0)
+				return 1;
 		}
 
+		/* if struct/union size isn't a multiple of its alignment,
+		 * then struct must be packed
+		 */
+		if ((t->size % max_align) != 0)
+			return 1;
+
 		return max_align;
 	}
 	default:
-- 
cgit 


From ea2ce1ba99aa6a60c8d8a706e3abadf3de372163 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 12 Dec 2022 13:15:04 -0800
Subject: libbpf: Fix BTF-to-C converter's padding logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turns out that btf_dump API doesn't handle a bunch of tricky corner
cases, as reported by Per, and further discovered using his testing
Python script ([0]).

This patch revamps btf_dump's padding logic significantly, making it
more correct and also avoiding unnecessary explicit padding, where
compiler would pad naturally. This overall topic turned out to be very
tricky and subtle, there are lots of subtle corner cases. The comments
in the code tries to give some clues, but comments themselves are
supposed to be paired with good understanding of C alignment and padding
rules. Plus some experimentation to figure out subtle things like
whether `long :0;` means that struct is now forced to be long-aligned
(no, it's not, turns out).

Anyways, Per's script, while not completely correct in some known
situations, doesn't show any obvious cases where this logic breaks, so
this is a nice improvement over the previous state of this logic.

Some selftests had to be adjusted to accommodate better use of natural
alignment rules, eliminating some unnecessary padding, or changing it to
`type: 0;` alignment markers.

Note also that for when we are in between bitfields, we emit explicit
bit size, while otherwise we use `: 0`, this feels much more natural in
practice.

Next patch will add few more test cases, found through randomized Per's
script.

  [0] https://lore.kernel.org/bpf/85f83c333f5355c8ac026f835b18d15060725fcb.camel@ericsson.com/

Reported-by: Per Sundström XP <per.xp.sundstrom@ericsson.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20221212211505.558851-6-andrii@kernel.org
---
 tools/lib/bpf/btf_dump.c                           | 169 +++++++++++++++------
 .../bpf/progs/btf_dump_test_case_bitfields.c       |   2 +-
 .../bpf/progs/btf_dump_test_case_padding.c         |  58 ++++---
 3 files changed, 164 insertions(+), 65 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 234e82334d56..d6fd93a57f11 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -830,6 +830,25 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 	}
 }
 
+static int btf_natural_align_of(const struct btf *btf, __u32 id)
+{
+	const struct btf_type *t = btf__type_by_id(btf, id);
+	int i, align, vlen;
+	const struct btf_member *m;
+
+	if (!btf_is_composite(t))
+		return btf__align_of(btf, id);
+
+	align = 1;
+	m = btf_members(t);
+	vlen = btf_vlen(t);
+	for (i = 0; i < vlen; i++, m++) {
+		align = max(align, btf__align_of(btf, m->type));
+	}
+
+	return align;
+}
+
 static bool btf_is_struct_packed(const struct btf *btf, __u32 id,
 				 const struct btf_type *t)
 {
@@ -837,16 +856,16 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id,
 	int align, i, bit_sz;
 	__u16 vlen;
 
-	align = btf__align_of(btf, id);
-	/* size of a non-packed struct has to be a multiple of its alignment*/
-	if (align && t->size % align)
+	align = btf_natural_align_of(btf, id);
+	/* size of a non-packed struct has to be a multiple of its alignment */
+	if (align && (t->size % align) != 0)
 		return true;
 
 	m = btf_members(t);
 	vlen = btf_vlen(t);
 	/* all non-bitfield fields have to be naturally aligned */
 	for (i = 0; i < vlen; i++, m++) {
-		align = btf__align_of(btf, m->type);
+		align = btf_natural_align_of(btf, m->type);
 		bit_sz = btf_member_bitfield_size(t, i);
 		if (align && bit_sz == 0 && m->offset % (8 * align) != 0)
 			return true;
@@ -859,44 +878,97 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id,
 	return false;
 }
 
-static int chip_away_bits(int total, int at_most)
-{
-	return total % at_most ? : at_most;
-}
-
 static void btf_dump_emit_bit_padding(const struct btf_dump *d,
-				      int cur_off, int m_off, int m_bit_sz,
-				      int align, int lvl)
+				      int cur_off, int next_off, int next_align,
+				      bool in_bitfield, int lvl)
 {
-	int off_diff = m_off - cur_off;
-	int ptr_bits = d->ptr_sz * 8;
+	const struct {
+		const char *name;
+		int bits;
+	} pads[] = {
+		{"long", d->ptr_sz * 8}, {"int", 32}, {"short", 16}, {"char", 8}
+	};
+	int new_off, pad_bits, bits, i;
+	const char *pad_type;
+
+	if (cur_off >= next_off)
+		return; /* no gap */
+
+	/* For filling out padding we want to take advantage of
+	 * natural alignment rules to minimize unnecessary explicit
+	 * padding. First, we find the largest type (among long, int,
+	 * short, or char) that can be used to force naturally aligned
+	 * boundary. Once determined, we'll use such type to fill in
+	 * the remaining padding gap. In some cases we can rely on
+	 * compiler filling some gaps, but sometimes we need to force
+	 * alignment to close natural alignment with markers like
+	 * `long: 0` (this is always the case for bitfields).  Note
+	 * that even if struct itself has, let's say 4-byte alignment
+	 * (i.e., it only uses up to int-aligned types), using `long:
+	 * X;` explicit padding doesn't actually change struct's
+	 * overall alignment requirements, but compiler does take into
+	 * account that type's (long, in this example) natural
+	 * alignment requirements when adding implicit padding. We use
+	 * this fact heavily and don't worry about ruining correct
+	 * struct alignment requirement.
+	 */
+	for (i = 0; i < ARRAY_SIZE(pads); i++) {
+		pad_bits = pads[i].bits;
+		pad_type = pads[i].name;
 
-	if (off_diff <= 0)
-		/* no gap */
-		return;
-	if (m_bit_sz == 0 && off_diff < align * 8)
-		/* natural padding will take care of a gap */
-		return;
+		new_off = roundup(cur_off, pad_bits);
+		if (new_off <= next_off)
+			break;
+	}
 
-	while (off_diff > 0) {
-		const char *pad_type;
-		int pad_bits;
-
-		if (ptr_bits > 32 && off_diff > 32) {
-			pad_type = "long";
-			pad_bits = chip_away_bits(off_diff, ptr_bits);
-		} else if (off_diff > 16) {
-			pad_type = "int";
-			pad_bits = chip_away_bits(off_diff, 32);
-		} else if (off_diff > 8) {
-			pad_type = "short";
-			pad_bits = chip_away_bits(off_diff, 16);
-		} else {
-			pad_type = "char";
-			pad_bits = chip_away_bits(off_diff, 8);
+	if (new_off > cur_off && new_off <= next_off) {
+		/* We need explicit `<type>: 0` aligning mark if next
+		 * field is right on alignment offset and its
+		 * alignment requirement is less strict than <type>'s
+		 * alignment (so compiler won't naturally align to the
+		 * offset we expect), or if subsequent `<type>: X`,
+		 * will actually completely fit in the remaining hole,
+		 * making compiler basically ignore `<type>: X`
+		 * completely.
+		 */
+		if (in_bitfield ||
+		    (new_off == next_off && roundup(cur_off, next_align * 8) != new_off) ||
+		    (new_off != next_off && next_off - new_off <= new_off - cur_off))
+			/* but for bitfields we'll emit explicit bit count */
+			btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type,
+					in_bitfield ? new_off - cur_off : 0);
+		cur_off = new_off;
+	}
+
+	/* Now we know we start at naturally aligned offset for a chosen
+	 * padding type (long, int, short, or char), and so the rest is just
+	 * a straightforward filling of remaining padding gap with full
+	 * `<type>: sizeof(<type>);` markers, except for the last one, which
+	 * might need smaller than sizeof(<type>) padding.
+	 */
+	while (cur_off != next_off) {
+		bits = min(next_off - cur_off, pad_bits);
+		if (bits == pad_bits) {
+			btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits);
+			cur_off += bits;
+			continue;
+		}
+		/* For the remainder padding that doesn't cover entire
+		 * pad_type bit length, we pick the smallest necessary type.
+		 * This is pure aesthetics, we could have just used `long`,
+		 * but having smallest necessary one communicates better the
+		 * scale of the padding gap.
+		 */
+		for (i = ARRAY_SIZE(pads) - 1; i >= 0; i--) {
+			pad_type = pads[i].name;
+			pad_bits = pads[i].bits;
+			if (pad_bits < bits)
+				continue;
+
+			btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, bits);
+			cur_off += bits;
+			break;
 		}
-		btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits);
-		off_diff -= pad_bits;
 	}
 }
 
@@ -916,9 +988,11 @@ static void btf_dump_emit_struct_def(struct btf_dump *d,
 {
 	const struct btf_member *m = btf_members(t);
 	bool is_struct = btf_is_struct(t);
-	int align, i, packed, off = 0;
+	bool packed, prev_bitfield = false;
+	int align, i, off = 0;
 	__u16 vlen = btf_vlen(t);
 
+	align = btf__align_of(d->btf, id);
 	packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0;
 
 	btf_dump_printf(d, "%s%s%s {",
@@ -928,33 +1002,36 @@ static void btf_dump_emit_struct_def(struct btf_dump *d,
 
 	for (i = 0; i < vlen; i++, m++) {
 		const char *fname;
-		int m_off, m_sz;
+		int m_off, m_sz, m_align;
+		bool in_bitfield;
 
 		fname = btf_name_of(d, m->name_off);
 		m_sz = btf_member_bitfield_size(t, i);
 		m_off = btf_member_bit_offset(t, i);
-		align = packed ? 1 : btf__align_of(d->btf, m->type);
+		m_align = packed ? 1 : btf__align_of(d->btf, m->type);
+
+		in_bitfield = prev_bitfield && m_sz != 0;
 
-		btf_dump_emit_bit_padding(d, off, m_off, m_sz, align, lvl + 1);
+		btf_dump_emit_bit_padding(d, off, m_off, m_align, in_bitfield, lvl + 1);
 		btf_dump_printf(d, "\n%s", pfx(lvl + 1));
 		btf_dump_emit_type_decl(d, m->type, fname, lvl + 1);
 
 		if (m_sz) {
 			btf_dump_printf(d, ": %d", m_sz);
 			off = m_off + m_sz;
+			prev_bitfield = true;
 		} else {
 			m_sz = max((__s64)0, btf__resolve_size(d->btf, m->type));
 			off = m_off + m_sz * 8;
+			prev_bitfield = false;
 		}
+
 		btf_dump_printf(d, ";");
 	}
 
 	/* pad at the end, if necessary */
-	if (is_struct) {
-		align = packed ? 1 : btf__align_of(d->btf, id);
-		btf_dump_emit_bit_padding(d, off, t->size * 8, 0, align,
-					  lvl + 1);
-	}
+	if (is_struct)
+		btf_dump_emit_bit_padding(d, off, t->size * 8, align, false, lvl + 1);
 
 	/*
 	 * Keep `struct empty {}` on a single line,
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c
index e5560a656030..e01690618e1e 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c
@@ -53,7 +53,7 @@ struct bitfields_only_mixed_types {
  */
 /* ------ END-EXPECTED-OUTPUT ------ */
 struct bitfield_mixed_with_others {
-	long: 4; /* char is enough as a backing field */
+	char: 4; /* char is enough as a backing field */
 	int a: 4;
 	/* 8-bit implicit padding */
 	short b; /* combined with previous bitfield */
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
index 7cb522d22a66..6f963d34c45b 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
@@ -19,7 +19,7 @@ struct padded_implicitly {
 /*
  *struct padded_explicitly {
  *	int a;
- *	int: 32;
+ *	long: 0;
  *	int b;
  *};
  *
@@ -28,41 +28,28 @@ struct padded_implicitly {
 
 struct padded_explicitly {
 	int a;
-	int: 1; /* algo will explicitly pad with full 32 bits here */
+	int: 1; /* algo will emit aligning `long: 0;` here */
 	int b;
 };
 
 /* ----- START-EXPECTED-OUTPUT ----- */
-/*
- *struct padded_a_lot {
- *	int a;
- *	long: 32;
- *	long: 64;
- *	long: 64;
- *	int b;
- *};
- *
- */
-/* ------ END-EXPECTED-OUTPUT ------ */
-
 struct padded_a_lot {
 	int a;
-	/* 32 bit of implicit padding here, which algo will make explicit */
 	long: 64;
 	long: 64;
 	int b;
 };
 
+/* ------ END-EXPECTED-OUTPUT ------ */
+
 /* ----- START-EXPECTED-OUTPUT ----- */
 /*
  *struct padded_cache_line {
  *	int a;
- *	long: 32;
  *	long: 64;
  *	long: 64;
  *	long: 64;
  *	int b;
- *	long: 32;
  *	long: 64;
  *	long: 64;
  *	long: 64;
@@ -85,7 +72,7 @@ struct padded_cache_line {
  *struct zone {
  *	int a;
  *	short b;
- *	short: 16;
+ *	long: 0;
  *	struct zone_padding __pad__;
  *};
  *
@@ -108,6 +95,39 @@ struct padding_wo_named_members {
 	long: 64;
 };
 
+struct padding_weird_1 {
+	int a;
+	long: 64;
+	short: 16;
+	short b;
+};
+
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct padding_weird_2 {
+ *	long: 56;
+ *	char a;
+ *	long: 56;
+ *	char b;
+ *	char: 8;
+ *};
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+struct padding_weird_2 {
+	int: 32;	/* these paddings will be collapsed into `long: 56;` */
+	short: 16;
+	char: 8;
+	char a;
+	int: 32;	/* these paddings will be collapsed into `long: 56;` */
+	short: 16;
+	char: 8;
+	char b;
+	char: 8;
+};
+
 /* ------ END-EXPECTED-OUTPUT ------ */
 
 int f(struct {
@@ -117,6 +137,8 @@ int f(struct {
 	struct padded_cache_line _4;
 	struct zone _5;
 	struct padding_wo_named_members _6;
+	struct padding_weird_1 _7;
+	struct padding_weird_2 _8;
 } *_)
 {
 	return 0;
-- 
cgit 


From b148c8b9b926e257a59c8eb2cd6fa3adfd443254 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 12 Dec 2022 13:15:05 -0800
Subject: selftests/bpf: Add few corner cases to test padding handling of
 btf_dump

Add few hand-crafted cases and few randomized cases found using script
from [0] that tests btf_dump's padding logic.

  [0] https://lore.kernel.org/bpf/85f83c333f5355c8ac026f835b18d15060725fcb.camel@ericsson.com/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20221212211505.558851-7-andrii@kernel.org
---
 .../bpf/progs/btf_dump_test_case_packing.c         |  61 +++++++++++-
 .../bpf/progs/btf_dump_test_case_padding.c         | 104 +++++++++++++++++++++
 2 files changed, 164 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c
index e304b6204bd9..5c6c62f7ed32 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c
@@ -58,7 +58,64 @@ union jump_code_union {
 	} __attribute__((packed));
 };
 
-/*------ END-EXPECTED-OUTPUT ------ */
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct nested_packed_but_aligned_struct {
+ *	int x1;
+ *	int x2;
+ *};
+ *
+ *struct outer_implicitly_packed_struct {
+ *	char y1;
+ *	struct nested_packed_but_aligned_struct y2;
+ *} __attribute__((packed));
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+struct nested_packed_but_aligned_struct {
+	int x1;
+	int x2;
+} __attribute__((packed));
+
+struct outer_implicitly_packed_struct {
+	char y1;
+	struct nested_packed_but_aligned_struct y2;
+};
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct usb_ss_ep_comp_descriptor {
+ *	char: 8;
+ *	char bDescriptorType;
+ *	char bMaxBurst;
+ *	short wBytesPerInterval;
+ *};
+ *
+ *struct usb_host_endpoint {
+ *	long: 64;
+ *	char: 8;
+ *	struct usb_ss_ep_comp_descriptor ss_ep_comp;
+ *	long: 0;
+ *} __attribute__((packed));
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+struct usb_ss_ep_comp_descriptor {
+	char: 8;
+	char bDescriptorType;
+	char bMaxBurst;
+	int: 0;
+	short wBytesPerInterval;
+} __attribute__((packed));
+
+struct usb_host_endpoint {
+	long: 64;
+	char: 8;
+	struct usb_ss_ep_comp_descriptor ss_ep_comp;
+	long: 0;
+};
+
 
 int f(struct {
 	struct packed_trailing_space _1;
@@ -69,6 +126,8 @@ int f(struct {
 	union union_is_never_packed _6;
 	union union_does_not_need_packing _7;
 	union jump_code_union _8;
+	struct outer_implicitly_packed_struct _9;
+	struct usb_host_endpoint _10;
 } *_)
 {
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
index 6f963d34c45b..79276fbe454a 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
@@ -128,6 +128,98 @@ struct padding_weird_2 {
 	char: 8;
 };
 
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct exact_1byte {
+	char x;
+};
+
+struct padded_1byte {
+	char: 8;
+};
+
+struct exact_2bytes {
+	short x;
+};
+
+struct padded_2bytes {
+	short: 16;
+};
+
+struct exact_4bytes {
+	int x;
+};
+
+struct padded_4bytes {
+	int: 32;
+};
+
+struct exact_8bytes {
+	long x;
+};
+
+struct padded_8bytes {
+	long: 64;
+};
+
+struct ff_periodic_effect {
+	int: 32;
+	short magnitude;
+	long: 0;
+	short phase;
+	long: 0;
+	int: 32;
+	int custom_len;
+	short *custom_data;
+};
+
+struct ib_wc {
+	long: 64;
+	long: 64;
+	int: 32;
+	int byte_len;
+	void *qp;
+	union {} ex;
+	long: 64;
+	int slid;
+	int wc_flags;
+	long: 64;
+	char smac[6];
+	long: 0;
+	char network_hdr_type;
+};
+
+struct acpi_object_method {
+	long: 64;
+	char: 8;
+	char type;
+	short reference_count;
+	char flags;
+	short: 0;
+	char: 8;
+	char sync_level;
+	long: 64;
+	void *node;
+	void *aml_start;
+	union {} dispatch;
+	long: 64;
+	int aml_length;
+};
+
+struct nested_unpacked {
+	int x;
+};
+
+struct nested_packed {
+	struct nested_unpacked a;
+	char c;
+} __attribute__((packed));
+
+struct outer_mixed_but_unpacked {
+	struct nested_packed b1;
+	short a1;
+	struct nested_packed b2;
+};
+
 /* ------ END-EXPECTED-OUTPUT ------ */
 
 int f(struct {
@@ -139,6 +231,18 @@ int f(struct {
 	struct padding_wo_named_members _6;
 	struct padding_weird_1 _7;
 	struct padding_weird_2 _8;
+	struct exact_1byte _100;
+	struct padded_1byte _101;
+	struct exact_2bytes _102;
+	struct padded_2bytes _103;
+	struct exact_4bytes _104;
+	struct padded_4bytes _105;
+	struct exact_8bytes _106;
+	struct padded_8bytes _107;
+	struct ff_periodic_effect _200;
+	struct ib_wc _201;
+	struct acpi_object_method _202;
+	struct outer_mixed_but_unpacked _203;
 } *_)
 {
 	return 0;
-- 
cgit 


From 4fb877aaa179dcdb1676d55216482febaada457e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 15 Dec 2022 10:36:05 -0800
Subject: libbpf: Fix btf_dump's packed struct determination

Fix bug in btf_dump's logic of determining if a given struct type is
packed or not. The notion of "natural alignment" is not needed and is
even harmful in this case, so drop it altogether. The biggest difference
in btf_is_struct_packed() compared to its original implementation is
that we don't really use btf__align_of() to determine overall alignment
of a struct type (because it could be 1 for both packed and non-packed
struct, depending on specifci field definitions), and just use field's
actual alignment to calculate whether any field is requiring packing or
struct's size overall necessitates packing.

Add two simple test cases that demonstrate the difference this change
would make.

Fixes: ea2ce1ba99aa ("libbpf: Fix BTF-to-C converter's padding logic")
Reported-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20221215183605.4149488-1-andrii@kernel.org
---
 tools/lib/bpf/btf_dump.c                           | 33 ++++------------------
 .../bpf/progs/btf_dump_test_case_packing.c         | 19 +++++++++++++
 2 files changed, 25 insertions(+), 27 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index d6fd93a57f11..580985ee5545 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -830,47 +830,26 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 	}
 }
 
-static int btf_natural_align_of(const struct btf *btf, __u32 id)
-{
-	const struct btf_type *t = btf__type_by_id(btf, id);
-	int i, align, vlen;
-	const struct btf_member *m;
-
-	if (!btf_is_composite(t))
-		return btf__align_of(btf, id);
-
-	align = 1;
-	m = btf_members(t);
-	vlen = btf_vlen(t);
-	for (i = 0; i < vlen; i++, m++) {
-		align = max(align, btf__align_of(btf, m->type));
-	}
-
-	return align;
-}
-
 static bool btf_is_struct_packed(const struct btf *btf, __u32 id,
 				 const struct btf_type *t)
 {
 	const struct btf_member *m;
-	int align, i, bit_sz;
+	int max_align = 1, align, i, bit_sz;
 	__u16 vlen;
 
-	align = btf_natural_align_of(btf, id);
-	/* size of a non-packed struct has to be a multiple of its alignment */
-	if (align && (t->size % align) != 0)
-		return true;
-
 	m = btf_members(t);
 	vlen = btf_vlen(t);
 	/* all non-bitfield fields have to be naturally aligned */
 	for (i = 0; i < vlen; i++, m++) {
-		align = btf_natural_align_of(btf, m->type);
+		align = btf__align_of(btf, m->type);
 		bit_sz = btf_member_bitfield_size(t, i);
 		if (align && bit_sz == 0 && m->offset % (8 * align) != 0)
 			return true;
+		max_align = max(align, max_align);
 	}
-
+	/* size of a non-packed struct has to be a multiple of its alignment */
+	if (t->size % max_align != 0)
+		return true;
 	/*
 	 * if original struct was marked as packed, but its layout is
 	 * naturally aligned, we'll detect that it's not packed
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c
index 5c6c62f7ed32..7998f27df7dd 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c
@@ -116,6 +116,23 @@ struct usb_host_endpoint {
 	long: 0;
 };
 
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct nested_packed_struct {
+	int a;
+	char b;
+} __attribute__((packed));
+
+struct outer_nonpacked_struct {
+	short a;
+	struct nested_packed_struct b;
+};
+
+struct outer_packed_struct {
+	short a;
+	struct nested_packed_struct b;
+} __attribute__((packed));
+
+/* ------ END-EXPECTED-OUTPUT ------ */
 
 int f(struct {
 	struct packed_trailing_space _1;
@@ -128,6 +145,8 @@ int f(struct {
 	union jump_code_union _8;
 	struct outer_implicitly_packed_struct _9;
 	struct usb_host_endpoint _10;
+	struct outer_nonpacked_struct _11;
+	struct outer_packed_struct _12;
 } *_)
 {
 	return 0;
-- 
cgit 


From 0e43662e61f2569500ab83b8188c065603530785 Mon Sep 17 00:00:00 2001
From: Shen Jiamin <shen_jiamin@comp.nus.edu.sg>
Date: Thu, 15 Dec 2022 12:47:03 +0800
Subject: tools/resolve_btfids: Use pkg-config to locate libelf

When libelf was not installed in the standard location, it cannot be
located by the current building config.

Use pkg-config to help locate libelf in such cases.

Signed-off-by: Shen Jiamin <shen_jiamin@comp.nus.edu.sg>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20221215044703.400139-1-shen_jiamin@comp.nus.edu.sg
---
 tools/bpf/resolve_btfids/Makefile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index 19a3112e271a..f7375a119f54 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -56,13 +56,17 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU
 		    DESTDIR=$(LIBBPF_DESTDIR) prefix= EXTRA_CFLAGS="$(CFLAGS)" \
 		    $(abspath $@) install_headers
 
+LIBELF_FLAGS := $(shell $(HOSTPKG_CONFIG) libelf --cflags 2>/dev/null)
+LIBELF_LIBS  := $(shell $(HOSTPKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf)
+
 CFLAGS += -g \
           -I$(srctree)/tools/include \
           -I$(srctree)/tools/include/uapi \
           -I$(LIBBPF_INCLUDE) \
-          -I$(SUBCMD_SRC)
+          -I$(SUBCMD_SRC) \
+          $(LIBELF_FLAGS)
 
-LIBS = -lelf -lz
+LIBS = $(LIBELF_LIBS) -lz
 
 export srctree OUTPUT CFLAGS Q
 include $(srctree)/tools/build/Makefile.include
-- 
cgit 


From e26aa600ba6a62fe84659f1df497a381bab6d07e Mon Sep 17 00:00:00 2001
From: Christian Ehrig <cehrig@cloudflare.com>
Date: Sun, 18 Dec 2022 06:17:31 +0100
Subject: bpf: Add flag BPF_F_NO_TUNNEL_KEY to bpf_skb_set_tunnel_key()

This patch allows to remove TUNNEL_KEY from the tunnel flags bitmap
when using bpf_skb_set_tunnel_key by providing a BPF_F_NO_TUNNEL_KEY
flag. On egress, the resulting tunnel header will not contain a tunnel
key if the protocol and implementation supports it.

At the moment bpf_tunnel_key wants a user to specify a numeric tunnel
key. This will wrap the inner packet into a tunnel header with the key
bit and value set accordingly. This is problematic when using a tunnel
protocol that supports optional tunnel keys and a receiving tunnel
device that is not expecting packets with the key bit set. The receiver
won't decapsulate and drop the packet.

RFC 2890 and RFC 2784 GRE tunnels are examples where this flag is
useful. It allows for generating packets, that can be decapsulated by
a GRE tunnel device not operating in collect metadata mode or not
expecting the key bit set.

Signed-off-by: Christian Ehrig <cehrig@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/20221218051734.31411-1-cehrig@cloudflare.com
---
 include/uapi/linux/bpf.h       | 4 ++++
 net/core/filter.c              | 5 ++++-
 tools/include/uapi/linux/bpf.h | 4 ++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 464ca3f01fe7..bc1a3d232ae4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2001,6 +2001,9 @@ union bpf_attr {
  * 			sending the packet. This flag was added for GRE
  * 			encapsulation, but might be used with other protocols
  * 			as well in the future.
+ * 		**BPF_F_NO_TUNNEL_KEY**
+ * 			Add a flag to tunnel metadata indicating that no tunnel
+ * 			key should be set in the resulting tunnel header.
  *
  * 		Here is a typical usage on the transmit path:
  *
@@ -5764,6 +5767,7 @@ enum {
 	BPF_F_ZERO_CSUM_TX		= (1ULL << 1),
 	BPF_F_DONT_FRAGMENT		= (1ULL << 2),
 	BPF_F_SEQ_NUMBER		= (1ULL << 3),
+	BPF_F_NO_TUNNEL_KEY		= (1ULL << 4),
 };
 
 /* BPF_FUNC_skb_get_tunnel_key flags. */
diff --git a/net/core/filter.c b/net/core/filter.c
index 929358677183..c746e4d77214 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4615,7 +4615,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 	struct ip_tunnel_info *info;
 
 	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
-			       BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER)))
+			       BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER |
+			       BPF_F_NO_TUNNEL_KEY)))
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
@@ -4653,6 +4654,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 		info->key.tun_flags &= ~TUNNEL_CSUM;
 	if (flags & BPF_F_SEQ_NUMBER)
 		info->key.tun_flags |= TUNNEL_SEQ;
+	if (flags & BPF_F_NO_TUNNEL_KEY)
+		info->key.tun_flags &= ~TUNNEL_KEY;
 
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 464ca3f01fe7..bc1a3d232ae4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2001,6 +2001,9 @@ union bpf_attr {
  * 			sending the packet. This flag was added for GRE
  * 			encapsulation, but might be used with other protocols
  * 			as well in the future.
+ * 		**BPF_F_NO_TUNNEL_KEY**
+ * 			Add a flag to tunnel metadata indicating that no tunnel
+ * 			key should be set in the resulting tunnel header.
  *
  * 		Here is a typical usage on the transmit path:
  *
@@ -5764,6 +5767,7 @@ enum {
 	BPF_F_ZERO_CSUM_TX		= (1ULL << 1),
 	BPF_F_DONT_FRAGMENT		= (1ULL << 2),
 	BPF_F_SEQ_NUMBER		= (1ULL << 3),
+	BPF_F_NO_TUNNEL_KEY		= (1ULL << 4),
 };
 
 /* BPF_FUNC_skb_get_tunnel_key flags. */
-- 
cgit 


From ac6e45e05857464a1e347c50da9917141f1fbb80 Mon Sep 17 00:00:00 2001
From: Christian Ehrig <cehrig@cloudflare.com>
Date: Sun, 18 Dec 2022 06:17:32 +0100
Subject: selftests/bpf: Add BPF_F_NO_TUNNEL_KEY test

This patch adds a selftest simulating a GRE sender and receiver using
tunnel headers without tunnel keys. It validates if packets encapsulated
using BPF_F_NO_TUNNEL_KEY are decapsulated by a GRE receiver not
configured with tunnel keys.

Signed-off-by: Christian Ehrig <cehrig@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/20221218051734.31411-2-cehrig@cloudflare.com
---
 .../testing/selftests/bpf/progs/test_tunnel_kern.c | 21 ++++++++++++
 tools/testing/selftests/bpf/test_tunnel.sh         | 40 ++++++++++++++++++++--
 2 files changed, 58 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index 98af55f0bcd3..508da4a23c4f 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -81,6 +81,27 @@ int gre_set_tunnel(struct __sk_buff *skb)
 	return TC_ACT_OK;
 }
 
+SEC("tc")
+int gre_set_tunnel_no_key(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+	key.tunnel_ttl = 64;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER |
+				     BPF_F_NO_TUNNEL_KEY);
+	if (ret < 0) {
+		log_err(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
 SEC("tc")
 int gre_get_tunnel(struct __sk_buff *skb)
 {
diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh
index 2eaedc1d9ed3..06857b689c11 100755
--- a/tools/testing/selftests/bpf/test_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tunnel.sh
@@ -66,15 +66,20 @@ config_device()
 
 add_gre_tunnel()
 {
+	tun_key=
+	if [ -n "$1" ]; then
+		tun_key="key $1"
+	fi
+
 	# at_ns0 namespace
 	ip netns exec at_ns0 \
-        ip link add dev $DEV_NS type $TYPE seq key 2 \
+        ip link add dev $DEV_NS type $TYPE seq $tun_key \
 		local 172.16.1.100 remote 172.16.1.200
 	ip netns exec at_ns0 ip link set dev $DEV_NS up
 	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
 
 	# root namespace
-	ip link add dev $DEV type $TYPE key 2 external
+	ip link add dev $DEV type $TYPE $tun_key external
 	ip link set dev $DEV up
 	ip addr add dev $DEV 10.1.1.200/24
 }
@@ -238,7 +243,7 @@ test_gre()
 
 	check $TYPE
 	config_device
-	add_gre_tunnel
+	add_gre_tunnel 2
 	attach_bpf $DEV gre_set_tunnel gre_get_tunnel
 	ping $PING_ARG 10.1.1.100
 	check_err $?
@@ -253,6 +258,30 @@ test_gre()
         echo -e ${GREEN}"PASS: $TYPE"${NC}
 }
 
+test_gre_no_tunnel_key()
+{
+	TYPE=gre
+	DEV_NS=gre00
+	DEV=gre11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_gre_tunnel
+	attach_bpf $DEV gre_set_tunnel_no_key gre_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+        if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
 test_ip6gre()
 {
 	TYPE=ip6gre
@@ -589,6 +618,7 @@ cleanup()
 	ip link del ipip6tnl11 2> /dev/null
 	ip link del ip6ip6tnl11 2> /dev/null
 	ip link del gretap11 2> /dev/null
+	ip link del gre11 2> /dev/null
 	ip link del ip6gre11 2> /dev/null
 	ip link del ip6gretap11 2> /dev/null
 	ip link del geneve11 2> /dev/null
@@ -641,6 +671,10 @@ bpf_tunnel_test()
 	test_gre
 	errors=$(( $errors + $? ))
 
+	echo "Testing GRE tunnel (without tunnel keys)..."
+	test_gre_no_tunnel_key
+	errors=$(( $errors + $? ))
+
 	echo "Testing IP6GRE tunnel..."
 	test_ip6gre
 	errors=$(( $errors + $? ))
-- 
cgit 


From 1520e8466d683b6c5e1aa53aa65165ebd5da46cf Mon Sep 17 00:00:00 2001
From: Khem Raj <raj.khem@gmail.com>
Date: Mon, 19 Dec 2022 11:15:26 -0800
Subject: libbpf: Fix build warning on ref_ctr_off for 32-bit architectures

Clang warns on 32-bit ARM on this comparision:

libbpf.c:10497:18: error: result of comparison of constant 4294967296 with expression of type 'size_t' (aka 'unsigned int') is always false [-Werror,-Wtautological-constant-out-of-range-compare]
        if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS))
            ~~~~~~~~~~~ ^  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Typecast ref_ctr_off to __u64 in the check conditional, it is false on
32bit anyways.

Signed-off-by: Khem Raj <raj.khem@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20221219191526.296264-1-raj.khem@gmail.com
---
 tools/lib/bpf/libbpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2a82f49ce16f..a5c67a3c93c5 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -9903,7 +9903,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name,
 	char errmsg[STRERR_BUFSIZE];
 	int type, pfd;
 
-	if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS))
+	if ((__u64)ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS))
 		return -EINVAL;
 
 	memset(&attr, 0, attr_sz);
-- 
cgit 


From e6b4e1d759d3bfb7cb84c87cc8f1858da7db8dea Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Sun, 18 Dec 2022 06:35:08 +0800
Subject: libbpf: Show error info about missing ".BTF" section

Show the real problem instead of just saying "No such file or directory".

Now will print below info:
libbpf: failed to find '.BTF' ELF section in /home/changbin/work/linux/vmlinux
Error: failed to load BTF from /home/changbin/work/linux/vmlinux: No such file or directory

Signed-off-by: Changbin Du <changbin.du@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20221217223509.88254-2-changbin.du@gmail.com
---
 tools/lib/bpf/btf.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 8cbcef959456..b03250007018 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1003,6 +1003,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
 	err = 0;
 
 	if (!btf_data) {
+		pr_warn("failed to find '%s' ELF section in %s\n", BTF_ELF_SEC, path);
 		err = -ENOENT;
 		goto done;
 	}
-- 
cgit 


From e7f0d5cdd023d8fa53d9ca541b9a55f0eb45618c Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Sun, 18 Dec 2022 06:35:09 +0800
Subject: bpf: makefiles: Do not generate empty vmlinux.h

Remove the empty vmlinux.h if bpftool failed to dump btf info.
The empty vmlinux.h can hide real error when reading output
of make.

This is done by adding .DELETE_ON_ERROR special target in related
makefiles.

Signed-off-by: Changbin Du <changbin.du@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20221217223509.88254-3-changbin.du@gmail.com
---
 tools/bpf/bpftool/Makefile           | 3 +++
 tools/testing/selftests/bpf/Makefile | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 787b857d3fb5..313fd1b09189 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -289,3 +289,6 @@ FORCE:
 .PHONY: all FORCE bootstrap clean install-bin install uninstall
 .PHONY: doc doc-clean doc-install doc-uninstall
 .DEFAULT_GOAL := all
+
+# Delete partially updated (corrupted) files on error
+.DELETE_ON_ERROR:
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index c22c43bbee19..205e8c3c346a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -626,3 +626,6 @@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)	\
 			       liburandom_read.so)
 
 .PHONY: docs docs-clean
+
+# Delete partially updated (corrupted) files on error
+.DELETE_ON_ERROR:
-- 
cgit 


From 4ec38eda85b9d07942a1cc7a29bba8aa7c810780 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 21 Dec 2022 10:00:49 -0800
Subject: libbpf: start v1.2 development cycle

Bump current version for new development cycle to v1.2.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20221221180049.853365-1-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/lib/bpf/libbpf.map       | 3 +++
 tools/lib/bpf/libbpf_version.h | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 71bf5691a689..11c36a3c1a9f 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -382,3 +382,6 @@ LIBBPF_1.1.0 {
 		user_ring_buffer__reserve_blocking;
 		user_ring_buffer__submit;
 } LIBBPF_1.0.0;
+
+LIBBPF_1.2.0 {
+} LIBBPF_1.1.0;
diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h
index e944f5bce728..1fd2eeac5cfc 100644
--- a/tools/lib/bpf/libbpf_version.h
+++ b/tools/lib/bpf/libbpf_version.h
@@ -4,6 +4,6 @@
 #define __LIBBPF_VERSION_H
 
 #define LIBBPF_MAJOR_VERSION 1
-#define LIBBPF_MINOR_VERSION 1
+#define LIBBPF_MINOR_VERSION 2
 
 #endif /* __LIBBPF_VERSION_H */
-- 
cgit 


From 59fe41b5255f7b8a19a3347ec4b03fd830d6f4aa Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Fri, 16 Dec 2022 13:43:19 -0800
Subject: selftests/bpf: Add verifier test exercising jit PROBE_MEM logic

This patch adds a test exercising logic that was fixed / improved in
the previous patch in the series, as well as general sanity checking for
jit's PROBE_MEM logic which should've been unaffected by the previous
patch.

The added verifier test does the following:

  * Acquire a referenced kptr to struct prog_test_ref_kfunc using
    existing net/bpf/test_run.c kfunc
    * Helper returns ptr to a specific prog_test_ref_kfunc whose first
      two fields - both ints - have been prepopulated w/ vals 42 and
      108, respectively
  * kptr_xchg the acquired ptr into an arraymap
  * Do a direct map_value load of the just-added ptr
    * Goal of all this setup is to get an unreferenced kptr pointing to
      struct with ints of known value, which is the result of this step
  * Using unreferenced kptr obtained in previous step, do loads of
    prog_test_ref_kfunc.a (offset 0) and .b (offset 4)
  * Then incr the kptr by 8 and load prog_test_ref_kfunc.a again (this
    time at offset -8)
  * Add all the loaded ints together and return

Before the PROBE_MEM fixes in previous patch, the loads at offset 0 and
4 would succeed, while the load at offset -8 would incorrectly fail
runtime check emitted by the JIT and 0 out dst reg as a result. This
confirmed by retval of 150 for this test before previous patch - since
second .a read is 0'd out - and a retval of 192 with the fixed logic.

The test exercises the two optimizations to fixed logic added in last
patch as well:

  * First load, with insn "r8 = *(u32 *)(r9 + 0)" exercises "insn->off
    is 0, no need to add / sub from src_reg" optimization
  * Third load, with insn "r9 = *(u32 *)(r9 - 8)" exercises "src_reg ==
    dst_reg, no need to restore src_reg after load" optimization

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20221216214319.3408356-2-davemarchevsky@fb.com
---
 .../selftests/bpf/prog_tests/jit_probe_mem.c       | 28 ++++++++++
 tools/testing/selftests/bpf/progs/jit_probe_mem.c  | 61 ++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/jit_probe_mem.c
 create mode 100644 tools/testing/selftests/bpf/progs/jit_probe_mem.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/jit_probe_mem.c b/tools/testing/selftests/bpf/prog_tests/jit_probe_mem.c
new file mode 100644
index 000000000000..5639428607e6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/jit_probe_mem.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <network_helpers.h>
+
+#include "jit_probe_mem.skel.h"
+
+void test_jit_probe_mem(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts,
+		.data_in = &pkt_v4,
+		.data_size_in = sizeof(pkt_v4),
+		.repeat = 1,
+	);
+	struct jit_probe_mem *skel;
+	int ret;
+
+	skel = jit_probe_mem__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "jit_probe_mem__open_and_load"))
+		return;
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_jit_probe_mem), &opts);
+	ASSERT_OK(ret, "jit_probe_mem ret");
+	ASSERT_OK(opts.retval, "jit_probe_mem opts.retval");
+	ASSERT_EQ(skel->data->total_sum, 192, "jit_probe_mem total_sum");
+
+	jit_probe_mem__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/jit_probe_mem.c b/tools/testing/selftests/bpf/progs/jit_probe_mem.c
new file mode 100644
index 000000000000..2d2e61470794
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/jit_probe_mem.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+static struct prog_test_ref_kfunc __kptr_ref *v;
+long total_sum = -1;
+
+extern struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym;
+extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym;
+
+SEC("tc")
+int test_jit_probe_mem(struct __sk_buff *ctx)
+{
+	struct prog_test_ref_kfunc *p;
+	unsigned long zero = 0, sum;
+
+	p = bpf_kfunc_call_test_acquire(&zero);
+	if (!p)
+		return 1;
+
+	p = bpf_kptr_xchg(&v, p);
+	if (p)
+		goto release_out;
+
+	/* Direct map value access of kptr, should be PTR_UNTRUSTED */
+	p = v;
+	if (!p)
+		return 1;
+
+	asm volatile (
+		"r9 = %[p];"
+		"%[sum] = 0;"
+
+		/* r8 = p->a */
+		"r8 = *(u32 *)(r9 + 0);"
+		"%[sum] += r8;"
+
+		/* r8 = p->b */
+		"r8 = *(u32 *)(r9 + 4);"
+		"%[sum] += r8;"
+
+		"r9 += 8;"
+		/* r9 = p->a */
+		"r9 = *(u32 *)(r9 - 8);"
+		"%[sum] += r9;"
+
+		: [sum] "=r"(sum)
+		: [p] "r"(p)
+		: "r8", "r9"
+	);
+
+	total_sum = sum;
+	return 0;
+release_out:
+	bpf_kfunc_call_test_release(p);
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit 


From 5fbf8c24b66d8d63fd6a452fc60e7e11d98ac5f6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 22 Dec 2022 15:30:58 +0100
Subject: selftests/bpf: Add jit probe_mem corner case tests to s390x denylist

BPF CI fails for s390x with the following result:

  [...]
  All error logs:

  libbpf: prog 'test_jit_probe_mem': BPF program load failed: ERROR: strerror_r(524)=22
  libbpf: prog 'test_jit_probe_mem':
    -- BEGIN PROG LOAD LOG --
    JIT does not support calling kernel function
    processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
    -- END PROG LOAD LOG --
  libbpf: prog 'test_jit_probe_mem': failed to load: -524
  libbpf: failed to load object 'jit_probe_mem'
  libbpf: failed to load BPF skeleton 'jit_probe_mem': -524
  test_jit_probe_mem:FAIL:jit_probe_mem__open_and_load unexpected error: -524
  #89      jit_probe_mem:FAIL
  [...]

Add the test to the deny list.

Fixes: 59fe41b5255f ("selftests/bpf: Add verifier test exercising jit PROBE_MEM logic")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/DENYLIST.s390x | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 585fcf73c731..3efe091255bf 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -26,6 +26,7 @@ get_func_args_test	                 # trampoline
 get_func_ip_test                         # get_func_ip_test__attach unexpected error: -524                             (trampoline)
 get_stack_raw_tp                         # user_stack corrupted user stack                                             (no backchain userspace)
 htab_update                              # failed to attach: ERROR: strerror_r(-524)=22                                (trampoline)
+jit_probe_mem                            # jit_probe_mem__open_and_load unexpected error: -524                         (kfunc)
 kfree_skb                                # attach fentry unexpected error: -524                                        (trampoline)
 kfunc_call                               # 'bpf_prog_active': not found in kernel BTF                                  (?)
 kfunc_dynptr_param                       # JIT does not support calling kernel function                                (kfunc)
-- 
cgit 


From 4842dadfc66f627083ec46c4e9a426e805c765f3 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:03 -0500
Subject: selftests/rseq: Fix: Fail thread registration when CONFIG_RSEQ=n

When linking the selftests against a libc which does not handle rseq
registration (before 2.35),  rseq thread registration silently succeed
even with CONFIG_RSEQ=n because it erroneously thinks that libc is
handling rseq registration.

This is caused by setting the rseq ownership flag only after the
rseq_available() check. It should rather be set before the
rseq_available() check.

Set the rseq_size to 0 (error value) immediately after the
rseq_available() check fails rather than in the thread registration
functions.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-2-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/rseq.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 4177f9507bbe..376a73f1ac41 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -119,9 +119,11 @@ void rseq_init(void)
 		rseq_flags = *libc_rseq_flags_p;
 		return;
 	}
-	if (!rseq_available())
-		return;
 	rseq_ownership = 1;
+	if (!rseq_available()) {
+		rseq_size = 0;
+		return;
+	}
 	rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer();
 	rseq_size = sizeof(struct rseq_abi);
 	rseq_flags = 0;
-- 
cgit 


From 03f5c0272d1b59343144e199becc911dae52c37e Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:07 -0500
Subject: selftests/rseq: Use ELF auxiliary vector for extensible rseq

Use the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE to detect the RSEQ
features supported by the kernel.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-6-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/rseq-abi.h |  5 +++
 tools/testing/selftests/rseq/rseq.c     | 67 +++++++++++++++++++++++++++++----
 tools/testing/selftests/rseq/rseq.h     | 18 +++++++--
 3 files changed, 78 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h
index a8c44d9af71f..00ac846d85b0 100644
--- a/tools/testing/selftests/rseq/rseq-abi.h
+++ b/tools/testing/selftests/rseq/rseq-abi.h
@@ -146,6 +146,11 @@ struct rseq_abi {
 	 *     this thread.
 	 */
 	__u32 flags;
+
+	/*
+	 * Flexible array member at end of structure, after last feature field.
+	 */
+	char end[];
 } __attribute__((aligned(4 * sizeof(__u64))));
 
 #endif /* _RSEQ_ABI_H */
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 376a73f1ac41..1e8e3265bdbf 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -28,6 +28,8 @@
 #include <limits.h>
 #include <dlfcn.h>
 #include <stddef.h>
+#include <sys/auxv.h>
+#include <linux/auxvec.h>
 
 #include "../kselftest.h"
 #include "rseq.h"
@@ -36,20 +38,38 @@ static const ptrdiff_t *libc_rseq_offset_p;
 static const unsigned int *libc_rseq_size_p;
 static const unsigned int *libc_rseq_flags_p;
 
-/* Offset from the thread pointer to the rseq area.  */
+/* Offset from the thread pointer to the rseq area. */
 ptrdiff_t rseq_offset;
 
-/* Size of the registered rseq area.  0 if the registration was
-   unsuccessful.  */
+/*
+ * Size of the registered rseq area. 0 if the registration was
+ * unsuccessful.
+ */
 unsigned int rseq_size = -1U;
 
 /* Flags used during rseq registration.  */
 unsigned int rseq_flags;
 
+/*
+ * rseq feature size supported by the kernel. 0 if the registration was
+ * unsuccessful.
+ */
+unsigned int rseq_feature_size = -1U;
+
 static int rseq_ownership;
+static int rseq_reg_success;	/* At least one rseq registration has succeded. */
+
+/* Allocate a large area for the TLS. */
+#define RSEQ_THREAD_AREA_ALLOC_SIZE	1024
+
+/* Original struct rseq feature size is 20 bytes. */
+#define ORIG_RSEQ_FEATURE_SIZE		20
+
+/* Original struct rseq allocation size is 32 bytes. */
+#define ORIG_RSEQ_ALLOC_SIZE		32
 
 static
-__thread struct rseq_abi __rseq_abi __attribute__((tls_model("initial-exec"))) = {
+__thread struct rseq_abi __rseq_abi __attribute__((tls_model("initial-exec"), aligned(RSEQ_THREAD_AREA_ALLOC_SIZE))) = {
 	.cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
 };
 
@@ -84,10 +104,16 @@ int rseq_register_current_thread(void)
 		/* Treat libc's ownership as a successful registration. */
 		return 0;
 	}
-	rc = sys_rseq(&__rseq_abi, sizeof(struct rseq_abi), 0, RSEQ_SIG);
-	if (rc)
+	rc = sys_rseq(&__rseq_abi, rseq_size, 0, RSEQ_SIG);
+	if (rc) {
+		if (RSEQ_READ_ONCE(rseq_reg_success)) {
+			/* Incoherent success/failure within process. */
+			abort();
+		}
 		return -1;
+	}
 	assert(rseq_current_cpu_raw() >= 0);
+	RSEQ_WRITE_ONCE(rseq_reg_success, 1);
 	return 0;
 }
 
@@ -99,12 +125,28 @@ int rseq_unregister_current_thread(void)
 		/* Treat libc's ownership as a successful unregistration. */
 		return 0;
 	}
-	rc = sys_rseq(&__rseq_abi, sizeof(struct rseq_abi), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
+	rc = sys_rseq(&__rseq_abi, rseq_size, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
 	if (rc)
 		return -1;
 	return 0;
 }
 
+static
+unsigned int get_rseq_feature_size(void)
+{
+	unsigned long auxv_rseq_feature_size, auxv_rseq_align;
+
+	auxv_rseq_align = getauxval(AT_RSEQ_ALIGN);
+	assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE);
+
+	auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
+	assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE);
+	if (auxv_rseq_feature_size)
+		return auxv_rseq_feature_size;
+	else
+		return ORIG_RSEQ_FEATURE_SIZE;
+}
+
 static __attribute__((constructor))
 void rseq_init(void)
 {
@@ -117,16 +159,24 @@ void rseq_init(void)
 		rseq_offset = *libc_rseq_offset_p;
 		rseq_size = *libc_rseq_size_p;
 		rseq_flags = *libc_rseq_flags_p;
+		rseq_feature_size = get_rseq_feature_size();
+		if (rseq_feature_size > rseq_size)
+			rseq_feature_size = rseq_size;
 		return;
 	}
 	rseq_ownership = 1;
 	if (!rseq_available()) {
 		rseq_size = 0;
+		rseq_feature_size = 0;
 		return;
 	}
 	rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer();
-	rseq_size = sizeof(struct rseq_abi);
 	rseq_flags = 0;
+	rseq_feature_size = get_rseq_feature_size();
+	if (rseq_feature_size == ORIG_RSEQ_FEATURE_SIZE)
+		rseq_size = ORIG_RSEQ_ALLOC_SIZE;
+	else
+		rseq_size = RSEQ_THREAD_AREA_ALLOC_SIZE;
 }
 
 static __attribute__((destructor))
@@ -136,6 +186,7 @@ void rseq_exit(void)
 		return;
 	rseq_offset = 0;
 	rseq_size = -1U;
+	rseq_feature_size = -1U;
 	rseq_ownership = 0;
 }
 
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index 6f7513384bf5..95adc1e1b0db 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -47,14 +47,24 @@
 
 #include "rseq-thread-pointer.h"
 
-/* Offset from the thread pointer to the rseq area.  */
+/* Offset from the thread pointer to the rseq area. */
 extern ptrdiff_t rseq_offset;
-/* Size of the registered rseq area.  0 if the registration was
-   unsuccessful.  */
+
+/*
+ * Size of the registered rseq area. 0 if the registration was
+ * unsuccessful.
+ */
 extern unsigned int rseq_size;
-/* Flags used during rseq registration.  */
+
+/* Flags used during rseq registration. */
 extern unsigned int rseq_flags;
 
+/*
+ * rseq feature size supported by the kernel. 0 if the registration was
+ * unsuccessful.
+ */
+extern unsigned int rseq_feature_size;
+
 static inline struct rseq_abi *rseq_get_abi(void)
 {
 	return (struct rseq_abi *) ((uintptr_t) rseq_thread_pointer() + rseq_offset);
-- 
cgit 


From 99babd04b25054717d21840298b0b46046b42cd9 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:08 -0500
Subject: selftests/rseq: Implement rseq numa node id field selftest

Test the NUMA node id extension rseq field. Compare it against the value
returned by the getcpu(2) system call while pinned on a specific core.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-7-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/basic_test.c |  4 ++++
 tools/testing/selftests/rseq/rseq-abi.h   |  8 ++++++++
 tools/testing/selftests/rseq/rseq.c       | 18 ++++++++++++++++++
 tools/testing/selftests/rseq/rseq.h       | 28 ++++++++++++++++++++++++++++
 4 files changed, 58 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/basic_test.c b/tools/testing/selftests/rseq/basic_test.c
index d8efbfb89193..295eea16466f 100644
--- a/tools/testing/selftests/rseq/basic_test.c
+++ b/tools/testing/selftests/rseq/basic_test.c
@@ -22,6 +22,8 @@ void test_cpu_pointer(void)
 	CPU_ZERO(&test_affinity);
 	for (i = 0; i < CPU_SETSIZE; i++) {
 		if (CPU_ISSET(i, &affinity)) {
+			int node;
+
 			CPU_SET(i, &test_affinity);
 			sched_setaffinity(0, sizeof(test_affinity),
 					&test_affinity);
@@ -29,6 +31,8 @@ void test_cpu_pointer(void)
 			assert(rseq_current_cpu() == i);
 			assert(rseq_current_cpu_raw() == i);
 			assert(rseq_cpu_start() == i);
+			node = rseq_fallback_current_node();
+			assert(rseq_current_node_id() == node);
 			CPU_CLR(i, &test_affinity);
 		}
 	}
diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h
index 00ac846d85b0..a1faa9162d52 100644
--- a/tools/testing/selftests/rseq/rseq-abi.h
+++ b/tools/testing/selftests/rseq/rseq-abi.h
@@ -147,6 +147,14 @@ struct rseq_abi {
 	 */
 	__u32 flags;
 
+	/*
+	 * Restartable sequences node_id field. Updated by the kernel. Read by
+	 * user-space with single-copy atomicity semantics. This field should
+	 * only be read by the thread which registered this data structure.
+	 * Aligned on 32-bit. Contains the current NUMA node ID.
+	 */
+	__u32 node_id;
+
 	/*
 	 * Flexible array member at end of structure, after last feature field.
 	 */
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 1e8e3265bdbf..4e4aa006004c 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -79,6 +79,11 @@ static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
 	return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);
 }
 
+static int sys_getcpu(unsigned *cpu, unsigned *node)
+{
+	return syscall(__NR_getcpu, cpu, node, NULL);
+}
+
 int rseq_available(void)
 {
 	int rc;
@@ -201,3 +206,16 @@ int32_t rseq_fallback_current_cpu(void)
 	}
 	return cpu;
 }
+
+int32_t rseq_fallback_current_node(void)
+{
+	uint32_t cpu_id, node_id;
+	int ret;
+
+	ret = sys_getcpu(&cpu_id, &node_id);
+	if (ret) {
+		perror("sys_getcpu()");
+		return ret;
+	}
+	return (int32_t) node_id;
+}
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index 95adc1e1b0db..fd17d0e54a1b 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -20,6 +20,15 @@
 #include "rseq-abi.h"
 #include "compiler.h"
 
+#ifndef rseq_sizeof_field
+#define rseq_sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
+#endif
+
+#ifndef rseq_offsetofend
+#define rseq_offsetofend(TYPE, MEMBER) \
+	(offsetof(TYPE, MEMBER)	+ rseq_sizeof_field(TYPE, MEMBER))
+#endif
+
 /*
  * Empty code injection macros, override when testing.
  * It is important to consider that the ASM injection macros need to be
@@ -128,6 +137,11 @@ int rseq_unregister_current_thread(void);
  */
 int32_t rseq_fallback_current_cpu(void);
 
+/*
+ * Restartable sequence fallback for reading the current node number.
+ */
+int32_t rseq_fallback_current_node(void);
+
 /*
  * Values returned can be either the current CPU number, -1 (rseq is
  * uninitialized), or -2 (rseq initialization has failed).
@@ -163,6 +177,20 @@ static inline uint32_t rseq_current_cpu(void)
 	return cpu;
 }
 
+static inline bool rseq_node_id_available(void)
+{
+	return (int) rseq_feature_size >= rseq_offsetofend(struct rseq_abi, node_id);
+}
+
+/*
+ * Current NUMA node number.
+ */
+static inline uint32_t rseq_current_node_id(void)
+{
+	assert(rseq_node_id_available());
+	return RSEQ_ACCESS_ONCE(rseq_get_abi()->node_id);
+}
+
 static inline void rseq_clear_rseq_cs(void)
 {
 	RSEQ_WRITE_ONCE(rseq_get_abi()->rseq_cs.arch.ptr, 0);
-- 
cgit 


From 72cb1d7f2faca4fba81ab8417367d63852cd490c Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:11 -0500
Subject: selftests/rseq: Remove RSEQ_SKIP_FASTPATH code

This code is not currently build by the test Makefile, adds complexity,
and is not overall useful considering that the abort handling loops to
retry the fast-path.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-10-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/param_test.c |  4 --
 tools/testing/selftests/rseq/rseq-arm.h   |  6 ---
 tools/testing/selftests/rseq/rseq-arm64.h |  6 ---
 tools/testing/selftests/rseq/rseq-mips.h  |  6 ---
 tools/testing/selftests/rseq/rseq-ppc.h   |  6 ---
 tools/testing/selftests/rseq/rseq-riscv.h |  6 ---
 tools/testing/selftests/rseq/rseq-s390.h  |  5 ---
 tools/testing/selftests/rseq/rseq-skip.h  | 65 -------------------------------
 tools/testing/selftests/rseq/rseq-x86.h   | 12 ------
 9 files changed, 116 deletions(-)
 delete mode 100644 tools/testing/selftests/rseq/rseq-skip.h

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index ef29bc16f358..9869369a8607 100644
--- a/tools/testing/selftests/rseq/param_test.c
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -38,11 +38,7 @@ static int opt_yield, opt_signal, opt_sleep,
 		opt_disable_rseq, opt_threads = 200,
 		opt_disable_mod = 0, opt_test = 's', opt_mb = 0;
 
-#ifndef RSEQ_SKIP_FASTPATH
 static long long opt_reps = 5000;
-#else
-static long long opt_reps = 100;
-#endif
 
 static __thread __attribute__((tls_model("initial-exec")))
 unsigned int signals_delivered;
diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h
index 893a11eca9d5..7445107f842b 100644
--- a/tools/testing/selftests/rseq/rseq-arm.h
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -79,10 +79,6 @@ do {									\
 	RSEQ_WRITE_ONCE(*p, v);						\
 } while (0)
 
-#ifdef RSEQ_SKIP_FASTPATH
-#include "rseq-skip.h"
-#else /* !RSEQ_SKIP_FASTPATH */
-
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,	\
 				post_commit_offset, abort_ip)		\
 		".pushsection __rseq_cs, \"aw\"\n\t"			\
@@ -823,5 +819,3 @@ error2:
 	rseq_bug("expected value comparison failed");
 #endif
 }
-
-#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index cbe190a4d005..49c387fcd868 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -85,10 +85,6 @@ do {										\
 	}									\
 } while (0)
 
-#ifdef RSEQ_SKIP_FASTPATH
-#include "rseq-skip.h"
-#else /* !RSEQ_SKIP_FASTPATH */
-
 #define RSEQ_ASM_TMP_REG32	"w15"
 #define RSEQ_ASM_TMP_REG	"x15"
 #define RSEQ_ASM_TMP_REG_2	"x14"
@@ -691,5 +687,3 @@ error2:
 	rseq_bug("expected value comparison failed");
 #endif
 }
-
-#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-mips.h b/tools/testing/selftests/rseq/rseq-mips.h
index 878739fae2fd..dd199952d649 100644
--- a/tools/testing/selftests/rseq/rseq-mips.h
+++ b/tools/testing/selftests/rseq/rseq-mips.h
@@ -60,10 +60,6 @@ do {									\
 	RSEQ_WRITE_ONCE(*p, v);						\
 } while (0)
 
-#ifdef RSEQ_SKIP_FASTPATH
-#include "rseq-skip.h"
-#else /* !RSEQ_SKIP_FASTPATH */
-
 #if _MIPS_SZLONG == 64
 # define LONG			".dword"
 # define LONG_LA		"dla"
@@ -773,5 +769,3 @@ error2:
 	rseq_bug("expected value comparison failed");
 #endif
 }
-
-#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-ppc.h b/tools/testing/selftests/rseq/rseq-ppc.h
index bab8e0b9fb11..f82d95c1bb3f 100644
--- a/tools/testing/selftests/rseq/rseq-ppc.h
+++ b/tools/testing/selftests/rseq/rseq-ppc.h
@@ -36,10 +36,6 @@ do {									\
 	RSEQ_WRITE_ONCE(*p, v);						\
 } while (0)
 
-#ifdef RSEQ_SKIP_FASTPATH
-#include "rseq-skip.h"
-#else /* !RSEQ_SKIP_FASTPATH */
-
 /*
  * The __rseq_cs_ptr_array and __rseq_cs sections can be used by debuggers to
  * better handle single-stepping through the restartable critical sections.
@@ -787,5 +783,3 @@ error2:
 	rseq_bug("expected value comparison failed");
 #endif
 }
-
-#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-riscv.h b/tools/testing/selftests/rseq/rseq-riscv.h
index 3a391c9bf468..b16d943a63e1 100644
--- a/tools/testing/selftests/rseq/rseq-riscv.h
+++ b/tools/testing/selftests/rseq/rseq-riscv.h
@@ -49,10 +49,6 @@ do {									\
 	RSEQ_WRITE_ONCE(*(p), v);						\
 } while (0)
 
-#ifdef RSEQ_SKIP_FASTPATH
-#include "rseq-skip.h"
-#else /* !RSEQ_SKIP_FASTPATH */
-
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,	\
 				post_commit_offset, abort_ip)		\
 	".pushsection	__rseq_cs, \"aw\"\n"				\
@@ -673,5 +669,3 @@ error1:
 	rseq_bug("cpu_id comparison failed");
 #endif
 }
-
-#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
index 4e6dc5f0cb42..4d3286453bbf 100644
--- a/tools/testing/selftests/rseq/rseq-s390.h
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -28,10 +28,6 @@ do {									\
 	RSEQ_WRITE_ONCE(*p, v);						\
 } while (0)
 
-#ifdef RSEQ_SKIP_FASTPATH
-#include "rseq-skip.h"
-#else /* !RSEQ_SKIP_FASTPATH */
-
 #ifdef __s390x__
 
 #define LONG_L			"lg"
@@ -607,4 +603,3 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 	return rseq_cmpeqv_trymemcpy_storev(v, expect, dst, src, len,
 					    newv, cpu);
 }
-#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-skip.h b/tools/testing/selftests/rseq/rseq-skip.h
deleted file mode 100644
index 7b53dac1fcdd..000000000000
--- a/tools/testing/selftests/rseq/rseq-skip.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
-/*
- * rseq-skip.h
- *
- * (C) Copyright 2017-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- */
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
-{
-	return -1;
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
-			       long voffp, intptr_t *load, int cpu)
-{
-	return -1;
-}
-
-static inline __attribute__((always_inline))
-int rseq_addv(intptr_t *v, intptr_t count, int cpu)
-{
-	return -1;
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
-				 intptr_t *v2, intptr_t newv2,
-				 intptr_t newv, int cpu)
-{
-	return -1;
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
-					 intptr_t *v2, intptr_t newv2,
-					 intptr_t newv, int cpu)
-{
-	return -1;
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
-			      intptr_t *v2, intptr_t expect2,
-			      intptr_t newv, int cpu)
-{
-	return -1;
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
-				 void *dst, void *src, size_t len,
-				 intptr_t newv, int cpu)
-{
-	return -1;
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
-					 void *dst, void *src, size_t len,
-					 intptr_t newv, int cpu)
-{
-	return -1;
-}
diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
index bd01dc41ca13..e148dfb2f68a 100644
--- a/tools/testing/selftests/rseq/rseq-x86.h
+++ b/tools/testing/selftests/rseq/rseq-x86.h
@@ -50,10 +50,6 @@ do {									\
 	RSEQ_WRITE_ONCE(*p, v);						\
 } while (0)
 
-#ifdef RSEQ_SKIP_FASTPATH
-#include "rseq-skip.h"
-#else /* !RSEQ_SKIP_FASTPATH */
-
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags,			\
 				start_ip, post_commit_offset, abort_ip)	\
 		".pushsection __rseq_cs, \"aw\"\n\t"			\
@@ -629,8 +625,6 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 					    newv, cpu);
 }
 
-#endif /* !RSEQ_SKIP_FASTPATH */
-
 #elif defined(__i386__)
 
 #define RSEQ_ASM_TP_SEGMENT	%%gs
@@ -657,10 +651,6 @@ do {									\
 	RSEQ_WRITE_ONCE(*p, v);						\
 } while (0)
 
-#ifdef RSEQ_SKIP_FASTPATH
-#include "rseq-skip.h"
-#else /* !RSEQ_SKIP_FASTPATH */
-
 /*
  * Use eax as scratch register and take memory operands as input to
  * lessen register pressure. Especially needed when compiling in O0.
@@ -1360,6 +1350,4 @@ error2:
 #endif
 }
 
-#endif /* !RSEQ_SKIP_FASTPATH */
-
 #endif
-- 
cgit 


From 18c2355838e76788f61849f4d83513b103d68b95 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:12 -0500
Subject: selftests/rseq: Implement rseq mm_cid field support

Add support for the mm_cid field (per-memory-map concurrency ID) of
struct rseq to rseq selftests.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-11-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/rseq-abi.h |  9 +++++++++
 tools/testing/selftests/rseq/rseq.h     | 10 ++++++++++
 2 files changed, 19 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h
index a1faa9162d52..fb4ec8a75dd4 100644
--- a/tools/testing/selftests/rseq/rseq-abi.h
+++ b/tools/testing/selftests/rseq/rseq-abi.h
@@ -155,6 +155,15 @@ struct rseq_abi {
 	 */
 	__u32 node_id;
 
+	/*
+	 * Restartable sequences mm_cid field. Updated by the kernel. Read by
+	 * user-space with single-copy atomicity semantics. This field should
+	 * only be read by the thread which registered this data structure.
+	 * Aligned on 32-bit. Contains the current thread's concurrency ID
+	 * (allocated uniquely within a memory map).
+	 */
+	__u32 mm_cid;
+
 	/*
 	 * Flexible array member at end of structure, after last feature field.
 	 */
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index fd17d0e54a1b..10ebf13644fa 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -191,6 +191,16 @@ static inline uint32_t rseq_current_node_id(void)
 	return RSEQ_ACCESS_ONCE(rseq_get_abi()->node_id);
 }
 
+static inline bool rseq_mm_cid_available(void)
+{
+	return (int) rseq_feature_size >= rseq_offsetofend(struct rseq_abi, mm_cid);
+}
+
+static inline uint32_t rseq_current_mm_cid(void)
+{
+	return RSEQ_ACCESS_ONCE(rseq_get_abi()->mm_cid);
+}
+
 static inline void rseq_clear_rseq_cs(void)
 {
 	RSEQ_WRITE_ONCE(rseq_get_abi()->rseq_cs.arch.ptr, 0);
-- 
cgit 


From ae31573843028ad17b1a807081c542d17fa9a83a Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:13 -0500
Subject: selftests/rseq: x86: Template memory ordering and percpu access mode

Introduce a rseq-x86-bits.h template header which is internally included
to generate the static inline functions covering:

- relaxed and release memory ordering,
- per-cpu-id and per-mm-cid per-cpu data access.

This introduces changes to the rseq.h selftests API which require to
update the rseq selftest programs. Similar API/templating changes need
to be done for other architectures.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-12-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/compiler.h           |    6 +
 tools/testing/selftests/rseq/rseq-bits-reset.h    |   11 +
 tools/testing/selftests/rseq/rseq-bits-template.h |   41 +
 tools/testing/selftests/rseq/rseq-x86-bits.h      |  993 +++++++++++++++++
 tools/testing/selftests/rseq/rseq-x86.h           | 1181 +--------------------
 tools/testing/selftests/rseq/rseq.h               |  159 +++
 6 files changed, 1241 insertions(+), 1150 deletions(-)
 create mode 100644 tools/testing/selftests/rseq/rseq-bits-reset.h
 create mode 100644 tools/testing/selftests/rseq/rseq-bits-template.h
 create mode 100644 tools/testing/selftests/rseq/rseq-x86-bits.h

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/compiler.h b/tools/testing/selftests/rseq/compiler.h
index 876eb6a7f75b..f47092bddeba 100644
--- a/tools/testing/selftests/rseq/compiler.h
+++ b/tools/testing/selftests/rseq/compiler.h
@@ -27,4 +27,10 @@
  */
 #define rseq_after_asm_goto()	asm volatile ("" : : : "memory")
 
+/* Combine two tokens. */
+#define RSEQ__COMBINE_TOKENS(_tokena, _tokenb)	\
+	_tokena##_tokenb
+#define RSEQ_COMBINE_TOKENS(_tokena, _tokenb)	\
+	RSEQ__COMBINE_TOKENS(_tokena, _tokenb)
+
 #endif  /* RSEQ_COMPILER_H_ */
diff --git a/tools/testing/selftests/rseq/rseq-bits-reset.h b/tools/testing/selftests/rseq/rseq-bits-reset.h
new file mode 100644
index 000000000000..e8655089f9cb
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-bits-reset.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-bits-reset.h
+ *
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#undef RSEQ_TEMPLATE_IDENTIFIER
+#undef RSEQ_TEMPLATE_CPU_ID_FIELD
+#undef RSEQ_TEMPLATE_CPU_ID_OFFSET
+#undef RSEQ_TEMPLATE_SUFFIX
diff --git a/tools/testing/selftests/rseq/rseq-bits-template.h b/tools/testing/selftests/rseq/rseq-bits-template.h
new file mode 100644
index 000000000000..65698d6a6cf9
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-bits-template.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-bits-template.h
+ *
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifdef RSEQ_TEMPLATE_CPU_ID
+# define RSEQ_TEMPLATE_CPU_ID_OFFSET	RSEQ_CPU_ID_OFFSET
+# define RSEQ_TEMPLATE_CPU_ID_FIELD	cpu_id
+# ifdef RSEQ_TEMPLATE_MO_RELEASE
+#  define RSEQ_TEMPLATE_SUFFIX		_release_cpu_id
+# elif defined (RSEQ_TEMPLATE_MO_RELAXED)
+#  define RSEQ_TEMPLATE_SUFFIX		_relaxed_cpu_id
+# else
+#  error "Never use <rseq-bits-template.h> directly; include <rseq.h> instead."
+# endif
+#elif defined(RSEQ_TEMPLATE_MM_CID)
+# define RSEQ_TEMPLATE_CPU_ID_OFFSET	RSEQ_MM_CID_OFFSET
+# define RSEQ_TEMPLATE_CPU_ID_FIELD	mm_cid
+# ifdef RSEQ_TEMPLATE_MO_RELEASE
+#  define RSEQ_TEMPLATE_SUFFIX		_release_mm_cid
+# elif defined (RSEQ_TEMPLATE_MO_RELAXED)
+#  define RSEQ_TEMPLATE_SUFFIX		_relaxed_mm_cid
+# else
+#  error "Never use <rseq-bits-template.h> directly; include <rseq.h> instead."
+# endif
+#elif defined (RSEQ_TEMPLATE_CPU_ID_NONE)
+# ifdef RSEQ_TEMPLATE_MO_RELEASE
+#  define RSEQ_TEMPLATE_SUFFIX		_release
+# elif defined (RSEQ_TEMPLATE_MO_RELAXED)
+#  define RSEQ_TEMPLATE_SUFFIX		_relaxed
+# else
+#  error "Never use <rseq-bits-template.h> directly; include <rseq.h> instead."
+# endif
+#else
+# error "Never use <rseq-bits-template.h> directly; include <rseq.h> instead."
+#endif
+
+#define RSEQ_TEMPLATE_IDENTIFIER(x)	RSEQ_COMBINE_TOKENS(x, RSEQ_TEMPLATE_SUFFIX)
+
diff --git a/tools/testing/selftests/rseq/rseq-x86-bits.h b/tools/testing/selftests/rseq/rseq-x86-bits.h
new file mode 100644
index 000000000000..8a9431eec467
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-x86-bits.h
@@ -0,0 +1,993 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-x86-bits.h
+ *
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include "rseq-bits-template.h"
+
+#ifdef __x86_64__
+
+#if defined(RSEQ_TEMPLATE_MO_RELAXED) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_storev)(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"cmpq %[v], %[expect]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+		"cmpq %[v], %[expect]\n\t"
+		"jnz %l[error2]\n\t"
+#endif
+		/* final store */
+		"movq %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		: "memory", "cc", "rax"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Compare @v against @expectnot. When it does _not_ match, load @v
+ * into @load, and store the content of *@v + voffp into @v.
+ */
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpnev_storeoffp_load)(intptr_t *v, intptr_t expectnot,
+			       long voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"movq %[v], %%rbx\n\t"
+		"cmpq %%rbx, %[expectnot]\n\t"
+		"je %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+		"movq %[v], %%rbx\n\t"
+		"cmpq %%rbx, %[expectnot]\n\t"
+		"je %l[error2]\n\t"
+#endif
+		"movq %%rbx, %[load]\n\t"
+		"addq %[voffp], %%rbx\n\t"
+		"movq (%%rbx), %%rbx\n\t"
+		/* final store */
+		"movq %%rbx, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [voffp]		"er" (voffp),
+		  [load]		"m" (*load)
+		: "memory", "cc", "rax", "rbx"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_addv)(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+#endif
+		/* final store */
+		"addq %[count], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [count]		"er" (count)
+		: "memory", "cc", "rax"
+		  RSEQ_INJECT_CLOBBER
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+
+/*
+ *   pval = *(ptr+off)
+ *  *pval += inc;
+ */
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_offset_deref_addv)(intptr_t *ptr, long off, intptr_t inc, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+#endif
+		/* get p+v */
+		"movq %[ptr], %%rbx\n\t"
+		"addq %[off], %%rbx\n\t"
+		/* get pv */
+		"movq (%%rbx), %%rcx\n\t"
+		/* *pv += inc */
+		"addq %[inc], (%%rcx)\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* final store input */
+		  [ptr]			"m" (*ptr),
+		  [off]			"er" (off),
+		  [inc]			"er" (inc)
+		: "memory", "cc", "rax", "rbx", "rcx"
+		  RSEQ_INJECT_CLOBBER
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_cmpeqv_storev)(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"cmpq %[v], %[expect]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+		"cmpq %[v2], %[expect2]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+		"cmpq %[v], %[expect]\n\t"
+		"jnz %l[error2]\n\t"
+		"cmpq %[v2], %[expect2]\n\t"
+		"jnz %l[error3]\n\t"
+#endif
+		/* final store */
+		"movq %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* cmp2 input */
+		  [v2]			"m" (*v2),
+		  [expect2]		"r" (expect2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		: "memory", "cc", "rax"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("1st expected value comparison failed");
+error3:
+	rseq_after_asm_goto();
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+#endif /* #if defined(RSEQ_TEMPLATE_MO_RELAXED) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trystorev_storev)(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"cmpq %[v], %[expect]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+		"cmpq %[v], %[expect]\n\t"
+		"jnz %l[error2]\n\t"
+#endif
+		/* try store */
+		"movq %[newv2], %[v2]\n\t"
+		RSEQ_INJECT_ASM(5)
+		/* final store */
+		"movq %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* try store input */
+		  [v2]			"m" (*v2),
+		  [newv2]		"r" (newv2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		: "memory", "cc", "rax"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trymemcpy_storev)(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	uint64_t rseq_scratch[3];
+
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		"movq %[src], %[rseq_scratch0]\n\t"
+		"movq %[dst], %[rseq_scratch1]\n\t"
+		"movq %[len], %[rseq_scratch2]\n\t"
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"cmpq %[v], %[expect]\n\t"
+		"jnz 5f\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 6f)
+		"cmpq %[v], %[expect]\n\t"
+		"jnz 7f\n\t"
+#endif
+		/* try memcpy */
+		"test %[len], %[len]\n\t" \
+		"jz 333f\n\t" \
+		"222:\n\t" \
+		"movb (%[src]), %%al\n\t" \
+		"movb %%al, (%[dst])\n\t" \
+		"inc %[src]\n\t" \
+		"inc %[dst]\n\t" \
+		"dec %[len]\n\t" \
+		"jnz 222b\n\t" \
+		"333:\n\t" \
+		RSEQ_INJECT_ASM(5)
+		/* final store */
+		"movq %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		/* teardown */
+		"movq %[rseq_scratch2], %[len]\n\t"
+		"movq %[rseq_scratch1], %[dst]\n\t"
+		"movq %[rseq_scratch0], %[src]\n\t"
+		RSEQ_ASM_DEFINE_ABORT(4,
+			"movq %[rseq_scratch2], %[len]\n\t"
+			"movq %[rseq_scratch1], %[dst]\n\t"
+			"movq %[rseq_scratch0], %[src]\n\t",
+			abort)
+		RSEQ_ASM_DEFINE_CMPFAIL(5,
+			"movq %[rseq_scratch2], %[len]\n\t"
+			"movq %[rseq_scratch1], %[dst]\n\t"
+			"movq %[rseq_scratch0], %[src]\n\t",
+			cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_CMPFAIL(6,
+			"movq %[rseq_scratch2], %[len]\n\t"
+			"movq %[rseq_scratch1], %[dst]\n\t"
+			"movq %[rseq_scratch0], %[src]\n\t",
+			error1)
+		RSEQ_ASM_DEFINE_CMPFAIL(7,
+			"movq %[rseq_scratch2], %[len]\n\t"
+			"movq %[rseq_scratch1], %[dst]\n\t"
+			"movq %[rseq_scratch0], %[src]\n\t",
+			error2)
+#endif
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv),
+		  /* try memcpy input */
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len),
+		  [rseq_scratch0]	"m" (rseq_scratch[0]),
+		  [rseq_scratch1]	"m" (rseq_scratch[1]),
+		  [rseq_scratch2]	"m" (rseq_scratch[2])
+		: "memory", "cc", "rax"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* #if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#elif defined(__i386__)
+
+#if defined(RSEQ_TEMPLATE_MO_RELAXED) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_storev)(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"cmpl %[v], %[expect]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+		"cmpl %[v], %[expect]\n\t"
+		"jnz %l[error2]\n\t"
+#endif
+		/* final store */
+		"movl %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		: "memory", "cc", "eax"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Compare @v against @expectnot. When it does _not_ match, load @v
+ * into @load, and store the content of *@v + voffp into @v.
+ */
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpnev_storeoffp_load)(intptr_t *v, intptr_t expectnot,
+			       long voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"movl %[v], %%ebx\n\t"
+		"cmpl %%ebx, %[expectnot]\n\t"
+		"je %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+		"movl %[v], %%ebx\n\t"
+		"cmpl %%ebx, %[expectnot]\n\t"
+		"je %l[error2]\n\t"
+#endif
+		"movl %%ebx, %[load]\n\t"
+		"addl %[voffp], %%ebx\n\t"
+		"movl (%%ebx), %%ebx\n\t"
+		/* final store */
+		"movl %%ebx, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [voffp]		"ir" (voffp),
+		  [load]		"m" (*load)
+		: "memory", "cc", "eax", "ebx"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_addv)(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+#endif
+		/* final store */
+		"addl %[count], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [count]		"ir" (count)
+		: "memory", "cc", "eax"
+		  RSEQ_INJECT_CLOBBER
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_cmpeqv_storev)(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"cmpl %[v], %[expect]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+		"cmpl %[expect2], %[v2]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+		"cmpl %[v], %[expect]\n\t"
+		"jnz %l[error2]\n\t"
+		"cmpl %[expect2], %[v2]\n\t"
+		"jnz %l[error3]\n\t"
+#endif
+		"movl %[newv], %%eax\n\t"
+		/* final store */
+		"movl %%eax, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* cmp2 input */
+		  [v2]			"m" (*v2),
+		  [expect2]		"r" (expect2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"m" (newv)
+		: "memory", "cc", "eax"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("1st expected value comparison failed");
+error3:
+	rseq_after_asm_goto();
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+#endif /* #if defined(RSEQ_TEMPLATE_MO_RELAXED) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trystorev_storev)(intptr_t *v, intptr_t expect,
+					 intptr_t *v2, intptr_t newv2,
+					 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"movl %[expect], %%eax\n\t"
+		"cmpl %[v], %%eax\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+		"movl %[expect], %%eax\n\t"
+		"cmpl %[v], %%eax\n\t"
+		"jnz %l[error2]\n\t"
+#endif
+		/* try store */
+		"movl %[newv2], %[v2]\n\t"
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		"lock; addl $0,-128(%%esp)\n\t"
+#endif
+		/* final store */
+		"movl %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* try store input */
+		  [v2]			"m" (*v2),
+		  [newv2]		"r" (newv2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"m" (expect),
+		  [newv]		"r" (newv)
+		: "memory", "cc", "eax"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+
+}
+
+/* TODO: implement a faster memcpy. */
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trymemcpy_storev)(intptr_t *v, intptr_t expect,
+					 void *dst, void *src, size_t len,
+					 intptr_t newv, int cpu)
+{
+	uint32_t rseq_scratch[3];
+
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		"movl %[src], %[rseq_scratch0]\n\t"
+		"movl %[dst], %[rseq_scratch1]\n\t"
+		"movl %[len], %[rseq_scratch2]\n\t"
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+		RSEQ_INJECT_ASM(3)
+		"movl %[expect], %%eax\n\t"
+		"cmpl %%eax, %[v]\n\t"
+		"jnz 5f\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_TEMPLATE_CPU_ID_OFFSET(%[rseq_offset]), 6f)
+		"movl %[expect], %%eax\n\t"
+		"cmpl %%eax, %[v]\n\t"
+		"jnz 7f\n\t"
+#endif
+		/* try memcpy */
+		"test %[len], %[len]\n\t" \
+		"jz 333f\n\t" \
+		"222:\n\t" \
+		"movb (%[src]), %%al\n\t" \
+		"movb %%al, (%[dst])\n\t" \
+		"inc %[src]\n\t" \
+		"inc %[dst]\n\t" \
+		"dec %[len]\n\t" \
+		"jnz 222b\n\t" \
+		"333:\n\t" \
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		"lock; addl $0,-128(%%esp)\n\t"
+#endif
+		"movl %[newv], %%eax\n\t"
+		/* final store */
+		"movl %%eax, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		/* teardown */
+		"movl %[rseq_scratch2], %[len]\n\t"
+		"movl %[rseq_scratch1], %[dst]\n\t"
+		"movl %[rseq_scratch0], %[src]\n\t"
+		RSEQ_ASM_DEFINE_ABORT(4,
+			"movl %[rseq_scratch2], %[len]\n\t"
+			"movl %[rseq_scratch1], %[dst]\n\t"
+			"movl %[rseq_scratch0], %[src]\n\t",
+			abort)
+		RSEQ_ASM_DEFINE_CMPFAIL(5,
+			"movl %[rseq_scratch2], %[len]\n\t"
+			"movl %[rseq_scratch1], %[dst]\n\t"
+			"movl %[rseq_scratch0], %[src]\n\t",
+			cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_CMPFAIL(6,
+			"movl %[rseq_scratch2], %[len]\n\t"
+			"movl %[rseq_scratch1], %[dst]\n\t"
+			"movl %[rseq_scratch0], %[src]\n\t",
+			error1)
+		RSEQ_ASM_DEFINE_CMPFAIL(7,
+			"movl %[rseq_scratch2], %[len]\n\t"
+			"movl %[rseq_scratch1], %[dst]\n\t"
+			"movl %[rseq_scratch0], %[src]\n\t",
+			error2)
+#endif
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [rseq_offset]		"r" (rseq_offset),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"m" (expect),
+		  [newv]		"m" (newv),
+		  /* try memcpy input */
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len),
+		  [rseq_scratch0]	"m" (rseq_scratch[0]),
+		  [rseq_scratch1]	"m" (rseq_scratch[1]),
+		  [rseq_scratch2]	"m" (rseq_scratch[2])
+		: "memory", "cc", "eax"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* #if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#endif
+
+#include "rseq-bits-reset.h"
diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
index e148dfb2f68a..fb65ef54b0fb 100644
--- a/tools/testing/selftests/rseq/rseq-x86.h
+++ b/tools/testing/selftests/rseq/rseq-x86.h
@@ -2,9 +2,13 @@
 /*
  * rseq-x86.h
  *
- * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  */
 
+#ifndef RSEQ_H
+#error "Never use <rseq-x86.h> directly; include <rseq.h> instead."
+#endif
+
 #include <stdint.h>
 
 /*
@@ -22,9 +26,10 @@
  * address through a "r" input operand.
  */
 
-/* Offset of cpu_id and rseq_cs fields in struct rseq. */
+/* Offset of cpu_id, rseq_cs, and mm_cid fields in struct rseq. */
 #define RSEQ_CPU_ID_OFFSET	4
 #define RSEQ_CS_OFFSET		8
+#define RSEQ_MM_CID_OFFSET	24
 
 #ifdef __x86_64__
 
@@ -108,523 +113,6 @@ do {									\
 		"jmp %l[" __rseq_str(cmpfail_label) "]\n\t"		\
 		".popsection\n\t"
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"cmpq %[v], %[expect]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-		"cmpq %[v], %[expect]\n\t"
-		"jnz %l[error2]\n\t"
-#endif
-		/* final store */
-		"movq %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		: "memory", "cc", "rax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-/*
- * Compare @v against @expectnot. When it does _not_ match, load @v
- * into @load, and store the content of *@v + voffp into @v.
- */
-static inline __attribute__((always_inline))
-int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
-			       long voffp, intptr_t *load, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"movq %[v], %%rbx\n\t"
-		"cmpq %%rbx, %[expectnot]\n\t"
-		"je %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-		"movq %[v], %%rbx\n\t"
-		"cmpq %%rbx, %[expectnot]\n\t"
-		"je %l[error2]\n\t"
-#endif
-		"movq %%rbx, %[load]\n\t"
-		"addq %[voffp], %%rbx\n\t"
-		"movq (%%rbx), %%rbx\n\t"
-		/* final store */
-		"movq %%rbx, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expectnot]		"r" (expectnot),
-		  [voffp]		"er" (voffp),
-		  [load]		"m" (*load)
-		: "memory", "cc", "rax", "rbx"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_addv(intptr_t *v, intptr_t count, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-#endif
-		/* final store */
-		"addq %[count], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [count]		"er" (count)
-		: "memory", "cc", "rax"
-		  RSEQ_INJECT_CLOBBER
-		: abort
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
-
-#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
-
-/*
- *   pval = *(ptr+off)
- *  *pval += inc;
- */
-static inline __attribute__((always_inline))
-int rseq_offset_deref_addv(intptr_t *ptr, long off, intptr_t inc, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-#endif
-		/* get p+v */
-		"movq %[ptr], %%rbx\n\t"
-		"addq %[off], %%rbx\n\t"
-		/* get pv */
-		"movq (%%rbx), %%rcx\n\t"
-		/* *pv += inc */
-		"addq %[inc], (%%rcx)\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* final store input */
-		  [ptr]			"m" (*ptr),
-		  [off]			"er" (off),
-		  [inc]			"er" (inc)
-		: "memory", "cc", "rax", "rbx", "rcx"
-		  RSEQ_INJECT_CLOBBER
-		: abort
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
-				 intptr_t *v2, intptr_t newv2,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"cmpq %[v], %[expect]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-		"cmpq %[v], %[expect]\n\t"
-		"jnz %l[error2]\n\t"
-#endif
-		/* try store */
-		"movq %[newv2], %[v2]\n\t"
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		"movq %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"r" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		: "memory", "cc", "rax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-/* x86-64 is TSO. */
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
-					 intptr_t *v2, intptr_t newv2,
-					 intptr_t newv, int cpu)
-{
-	return rseq_cmpeqv_trystorev_storev(v, expect, v2, newv2, newv, cpu);
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
-			      intptr_t *v2, intptr_t expect2,
-			      intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"cmpq %[v], %[expect]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-		"cmpq %[v2], %[expect2]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(5)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-		"cmpq %[v], %[expect]\n\t"
-		"jnz %l[error2]\n\t"
-		"cmpq %[v2], %[expect2]\n\t"
-		"jnz %l[error3]\n\t"
-#endif
-		/* final store */
-		"movq %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* cmp2 input */
-		  [v2]			"m" (*v2),
-		  [expect2]		"r" (expect2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		: "memory", "cc", "rax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2, error3
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("1st expected value comparison failed");
-error3:
-	rseq_after_asm_goto();
-	rseq_bug("2nd expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
-				 void *dst, void *src, size_t len,
-				 intptr_t newv, int cpu)
-{
-	uint64_t rseq_scratch[3];
-
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		"movq %[src], %[rseq_scratch0]\n\t"
-		"movq %[dst], %[rseq_scratch1]\n\t"
-		"movq %[len], %[rseq_scratch2]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"cmpq %[v], %[expect]\n\t"
-		"jnz 5f\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 6f)
-		"cmpq %[v], %[expect]\n\t"
-		"jnz 7f\n\t"
-#endif
-		/* try memcpy */
-		"test %[len], %[len]\n\t" \
-		"jz 333f\n\t" \
-		"222:\n\t" \
-		"movb (%[src]), %%al\n\t" \
-		"movb %%al, (%[dst])\n\t" \
-		"inc %[src]\n\t" \
-		"inc %[dst]\n\t" \
-		"dec %[len]\n\t" \
-		"jnz 222b\n\t" \
-		"333:\n\t" \
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		"movq %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		"movq %[rseq_scratch2], %[len]\n\t"
-		"movq %[rseq_scratch1], %[dst]\n\t"
-		"movq %[rseq_scratch0], %[src]\n\t"
-		RSEQ_ASM_DEFINE_ABORT(4,
-			"movq %[rseq_scratch2], %[len]\n\t"
-			"movq %[rseq_scratch1], %[dst]\n\t"
-			"movq %[rseq_scratch0], %[src]\n\t",
-			abort)
-		RSEQ_ASM_DEFINE_CMPFAIL(5,
-			"movq %[rseq_scratch2], %[len]\n\t"
-			"movq %[rseq_scratch1], %[dst]\n\t"
-			"movq %[rseq_scratch0], %[src]\n\t",
-			cmpfail)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_CMPFAIL(6,
-			"movq %[rseq_scratch2], %[len]\n\t"
-			"movq %[rseq_scratch1], %[dst]\n\t"
-			"movq %[rseq_scratch0], %[src]\n\t",
-			error1)
-		RSEQ_ASM_DEFINE_CMPFAIL(7,
-			"movq %[rseq_scratch2], %[len]\n\t"
-			"movq %[rseq_scratch1], %[dst]\n\t"
-			"movq %[rseq_scratch0], %[src]\n\t",
-			error2)
-#endif
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len),
-		  [rseq_scratch0]	"m" (rseq_scratch[0]),
-		  [rseq_scratch1]	"m" (rseq_scratch[1]),
-		  [rseq_scratch2]	"m" (rseq_scratch[2])
-		: "memory", "cc", "rax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-/* x86-64 is TSO. */
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
-					 void *dst, void *src, size_t len,
-					 intptr_t newv, int cpu)
-{
-	return rseq_cmpeqv_trymemcpy_storev(v, expect, dst, src, len,
-					    newv, cpu);
-}
-
 #elif defined(__i386__)
 
 #define RSEQ_ASM_TP_SEGMENT	%%gs
@@ -711,643 +199,36 @@ do {									\
 		"jmp %l[" __rseq_str(cmpfail_label) "]\n\t"		\
 		".popsection\n\t"
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"cmpl %[v], %[expect]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-		"cmpl %[v], %[expect]\n\t"
-		"jnz %l[error2]\n\t"
-#endif
-		/* final store */
-		"movl %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		: "memory", "cc", "eax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-/*
- * Compare @v against @expectnot. When it does _not_ match, load @v
- * into @load, and store the content of *@v + voffp into @v.
- */
-static inline __attribute__((always_inline))
-int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
-			       long voffp, intptr_t *load, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"movl %[v], %%ebx\n\t"
-		"cmpl %%ebx, %[expectnot]\n\t"
-		"je %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-		"movl %[v], %%ebx\n\t"
-		"cmpl %%ebx, %[expectnot]\n\t"
-		"je %l[error2]\n\t"
-#endif
-		"movl %%ebx, %[load]\n\t"
-		"addl %[voffp], %%ebx\n\t"
-		"movl (%%ebx), %%ebx\n\t"
-		/* final store */
-		"movl %%ebx, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expectnot]		"r" (expectnot),
-		  [voffp]		"ir" (voffp),
-		  [load]		"m" (*load)
-		: "memory", "cc", "eax", "ebx"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_addv(intptr_t *v, intptr_t count, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-#endif
-		/* final store */
-		"addl %[count], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [count]		"ir" (count)
-		: "memory", "cc", "eax"
-		  RSEQ_INJECT_CLOBBER
-		: abort
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
-				 intptr_t *v2, intptr_t newv2,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"cmpl %[v], %[expect]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-		"cmpl %[v], %[expect]\n\t"
-		"jnz %l[error2]\n\t"
-#endif
-		/* try store */
-		"movl %[newv2], %%eax\n\t"
-		"movl %%eax, %[v2]\n\t"
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		"movl %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"m" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		: "memory", "cc", "eax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
-					 intptr_t *v2, intptr_t newv2,
-					 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"movl %[expect], %%eax\n\t"
-		"cmpl %[v], %%eax\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-		"movl %[expect], %%eax\n\t"
-		"cmpl %[v], %%eax\n\t"
-		"jnz %l[error2]\n\t"
-#endif
-		/* try store */
-		"movl %[newv2], %[v2]\n\t"
-		RSEQ_INJECT_ASM(5)
-		"lock; addl $0,-128(%%esp)\n\t"
-		/* final store */
-		"movl %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"r" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"m" (expect),
-		  [newv]		"r" (newv)
-		: "memory", "cc", "eax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
 #endif
 
-}
+/* Per-cpu-id indexing. */
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
-			      intptr_t *v2, intptr_t expect2,
-			      intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_CPU_ID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-x86-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"cmpl %[v], %[expect]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-		"cmpl %[expect2], %[v2]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(5)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
-		"cmpl %[v], %[expect]\n\t"
-		"jnz %l[error2]\n\t"
-		"cmpl %[expect2], %[v2]\n\t"
-		"jnz %l[error3]\n\t"
-#endif
-		"movl %[newv], %%eax\n\t"
-		/* final store */
-		"movl %%eax, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* cmp2 input */
-		  [v2]			"m" (*v2),
-		  [expect2]		"r" (expect2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"m" (newv)
-		: "memory", "cc", "eax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2, error3
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("1st expected value comparison failed");
-error3:
-	rseq_after_asm_goto();
-	rseq_bug("2nd expected value comparison failed");
-#endif
-}
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-x86-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_CPU_ID
 
-/* TODO: implement a faster memcpy. */
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
-				 void *dst, void *src, size_t len,
-				 intptr_t newv, int cpu)
-{
-	uint32_t rseq_scratch[3];
+/* Per-mm-cid indexing. */
 
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_MM_CID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-x86-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		"movl %[src], %[rseq_scratch0]\n\t"
-		"movl %[dst], %[rseq_scratch1]\n\t"
-		"movl %[len], %[rseq_scratch2]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"movl %[expect], %%eax\n\t"
-		"cmpl %%eax, %[v]\n\t"
-		"jnz 5f\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 6f)
-		"movl %[expect], %%eax\n\t"
-		"cmpl %%eax, %[v]\n\t"
-		"jnz 7f\n\t"
-#endif
-		/* try memcpy */
-		"test %[len], %[len]\n\t" \
-		"jz 333f\n\t" \
-		"222:\n\t" \
-		"movb (%[src]), %%al\n\t" \
-		"movb %%al, (%[dst])\n\t" \
-		"inc %[src]\n\t" \
-		"inc %[dst]\n\t" \
-		"dec %[len]\n\t" \
-		"jnz 222b\n\t" \
-		"333:\n\t" \
-		RSEQ_INJECT_ASM(5)
-		"movl %[newv], %%eax\n\t"
-		/* final store */
-		"movl %%eax, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		"movl %[rseq_scratch2], %[len]\n\t"
-		"movl %[rseq_scratch1], %[dst]\n\t"
-		"movl %[rseq_scratch0], %[src]\n\t"
-		RSEQ_ASM_DEFINE_ABORT(4,
-			"movl %[rseq_scratch2], %[len]\n\t"
-			"movl %[rseq_scratch1], %[dst]\n\t"
-			"movl %[rseq_scratch0], %[src]\n\t",
-			abort)
-		RSEQ_ASM_DEFINE_CMPFAIL(5,
-			"movl %[rseq_scratch2], %[len]\n\t"
-			"movl %[rseq_scratch1], %[dst]\n\t"
-			"movl %[rseq_scratch0], %[src]\n\t",
-			cmpfail)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_CMPFAIL(6,
-			"movl %[rseq_scratch2], %[len]\n\t"
-			"movl %[rseq_scratch1], %[dst]\n\t"
-			"movl %[rseq_scratch0], %[src]\n\t",
-			error1)
-		RSEQ_ASM_DEFINE_CMPFAIL(7,
-			"movl %[rseq_scratch2], %[len]\n\t"
-			"movl %[rseq_scratch1], %[dst]\n\t"
-			"movl %[rseq_scratch0], %[src]\n\t",
-			error2)
-#endif
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"m" (expect),
-		  [newv]		"m" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len),
-		  [rseq_scratch0]	"m" (rseq_scratch[0]),
-		  [rseq_scratch1]	"m" (rseq_scratch[1]),
-		  [rseq_scratch2]	"m" (rseq_scratch[2])
-		: "memory", "cc", "eax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-/* TODO: implement a faster memcpy. */
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
-					 void *dst, void *src, size_t len,
-					 intptr_t newv, int cpu)
-{
-	uint32_t rseq_scratch[3];
-
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-x86-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_MM_CID
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		"movl %[src], %[rseq_scratch0]\n\t"
-		"movl %[dst], %[rseq_scratch1]\n\t"
-		"movl %[len], %[rseq_scratch2]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
-		RSEQ_INJECT_ASM(3)
-		"movl %[expect], %%eax\n\t"
-		"cmpl %%eax, %[v]\n\t"
-		"jnz 5f\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 6f)
-		"movl %[expect], %%eax\n\t"
-		"cmpl %%eax, %[v]\n\t"
-		"jnz 7f\n\t"
-#endif
-		/* try memcpy */
-		"test %[len], %[len]\n\t" \
-		"jz 333f\n\t" \
-		"222:\n\t" \
-		"movb (%[src]), %%al\n\t" \
-		"movb %%al, (%[dst])\n\t" \
-		"inc %[src]\n\t" \
-		"inc %[dst]\n\t" \
-		"dec %[len]\n\t" \
-		"jnz 222b\n\t" \
-		"333:\n\t" \
-		RSEQ_INJECT_ASM(5)
-		"lock; addl $0,-128(%%esp)\n\t"
-		"movl %[newv], %%eax\n\t"
-		/* final store */
-		"movl %%eax, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		"movl %[rseq_scratch2], %[len]\n\t"
-		"movl %[rseq_scratch1], %[dst]\n\t"
-		"movl %[rseq_scratch0], %[src]\n\t"
-		RSEQ_ASM_DEFINE_ABORT(4,
-			"movl %[rseq_scratch2], %[len]\n\t"
-			"movl %[rseq_scratch1], %[dst]\n\t"
-			"movl %[rseq_scratch0], %[src]\n\t",
-			abort)
-		RSEQ_ASM_DEFINE_CMPFAIL(5,
-			"movl %[rseq_scratch2], %[len]\n\t"
-			"movl %[rseq_scratch1], %[dst]\n\t"
-			"movl %[rseq_scratch0], %[src]\n\t",
-			cmpfail)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_CMPFAIL(6,
-			"movl %[rseq_scratch2], %[len]\n\t"
-			"movl %[rseq_scratch1], %[dst]\n\t"
-			"movl %[rseq_scratch0], %[src]\n\t",
-			error1)
-		RSEQ_ASM_DEFINE_CMPFAIL(7,
-			"movl %[rseq_scratch2], %[len]\n\t"
-			"movl %[rseq_scratch1], %[dst]\n\t"
-			"movl %[rseq_scratch0], %[src]\n\t",
-			error2)
-#endif
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [rseq_offset]		"r" (rseq_offset),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"m" (expect),
-		  [newv]		"m" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len),
-		  [rseq_scratch0]	"m" (rseq_scratch[0]),
-		  [rseq_scratch1]	"m" (rseq_scratch[1]),
-		  [rseq_scratch2]	"m" (rseq_scratch[2])
-		: "memory", "cc", "eax"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
+/* APIs which are not based on cpu ids. */
 
-#endif
+#define RSEQ_TEMPLATE_CPU_ID_NONE
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-x86-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+#undef RSEQ_TEMPLATE_CPU_ID_NONE
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
index 10ebf13644fa..d7364ea4d201 100644
--- a/tools/testing/selftests/rseq/rseq.h
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -74,6 +74,20 @@ extern unsigned int rseq_flags;
  */
 extern unsigned int rseq_feature_size;
 
+enum rseq_mo {
+	RSEQ_MO_RELAXED = 0,
+	RSEQ_MO_CONSUME = 1,	/* Unused */
+	RSEQ_MO_ACQUIRE = 2,	/* Unused */
+	RSEQ_MO_RELEASE = 3,
+	RSEQ_MO_ACQ_REL = 4,	/* Unused */
+	RSEQ_MO_SEQ_CST = 5,	/* Unused */
+};
+
+enum rseq_percpu_mode {
+	RSEQ_PERCPU_CPU_ID = 0,
+	RSEQ_PERCPU_MM_CID = 1,
+};
+
 static inline struct rseq_abi *rseq_get_abi(void)
 {
 	return (struct rseq_abi *) ((uintptr_t) rseq_thread_pointer() + rseq_offset);
@@ -222,4 +236,149 @@ static inline void rseq_prepare_unload(void)
 	rseq_clear_rseq_cs();
 }
 
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(enum rseq_mo rseq_mo, enum rseq_percpu_mode percpu_mode,
+		       intptr_t *v, intptr_t expect,
+		       intptr_t newv, int cpu)
+{
+	if (rseq_mo != RSEQ_MO_RELAXED)
+		return -1;
+	switch (percpu_mode) {
+	case RSEQ_PERCPU_CPU_ID:
+		return rseq_cmpeqv_storev_relaxed_cpu_id(v, expect, newv, cpu);
+	case RSEQ_PERCPU_MM_CID:
+		return rseq_cmpeqv_storev_relaxed_mm_cid(v, expect, newv, cpu);
+	}
+	return -1;
+}
+
+/*
+ * Compare @v against @expectnot. When it does _not_ match, load @v
+ * into @load, and store the content of *@v + voffp into @v.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(enum rseq_mo rseq_mo, enum rseq_percpu_mode percpu_mode,
+			       intptr_t *v, intptr_t expectnot, long voffp, intptr_t *load,
+			       int cpu)
+{
+	if (rseq_mo != RSEQ_MO_RELAXED)
+		return -1;
+	switch (percpu_mode) {
+	case RSEQ_PERCPU_CPU_ID:
+		return rseq_cmpnev_storeoffp_load_relaxed_cpu_id(v, expectnot, voffp, load, cpu);
+	case RSEQ_PERCPU_MM_CID:
+		return rseq_cmpnev_storeoffp_load_relaxed_mm_cid(v, expectnot, voffp, load, cpu);
+	}
+	return -1;
+}
+
+static inline __attribute__((always_inline))
+int rseq_addv(enum rseq_mo rseq_mo, enum rseq_percpu_mode percpu_mode,
+	      intptr_t *v, intptr_t count, int cpu)
+{
+	if (rseq_mo != RSEQ_MO_RELAXED)
+		return -1;
+	switch (percpu_mode) {
+	case RSEQ_PERCPU_CPU_ID:
+		return rseq_addv_relaxed_cpu_id(v, count, cpu);
+	case RSEQ_PERCPU_MM_CID:
+		return rseq_addv_relaxed_mm_cid(v, count, cpu);
+	}
+	return -1;
+}
+
+#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+/*
+ *   pval = *(ptr+off)
+ *  *pval += inc;
+ */
+static inline __attribute__((always_inline))
+int rseq_offset_deref_addv(enum rseq_mo rseq_mo, enum rseq_percpu_mode percpu_mode,
+			   intptr_t *ptr, long off, intptr_t inc, int cpu)
+{
+	if (rseq_mo != RSEQ_MO_RELAXED)
+		return -1;
+	switch (percpu_mode) {
+	case RSEQ_PERCPU_CPU_ID:
+		return rseq_offset_deref_addv_relaxed_cpu_id(ptr, off, inc, cpu);
+	case RSEQ_PERCPU_MM_CID:
+		return rseq_offset_deref_addv_relaxed_mm_cid(ptr, off, inc, cpu);
+	}
+	return -1;
+}
+#endif
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(enum rseq_mo rseq_mo, enum rseq_percpu_mode percpu_mode,
+				 intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	switch (rseq_mo) {
+	case RSEQ_MO_RELAXED:
+		switch (percpu_mode) {
+		case RSEQ_PERCPU_CPU_ID:
+			return rseq_cmpeqv_trystorev_storev_relaxed_cpu_id(v, expect, v2, newv2, newv, cpu);
+		case RSEQ_PERCPU_MM_CID:
+			return rseq_cmpeqv_trystorev_storev_relaxed_mm_cid(v, expect, v2, newv2, newv, cpu);
+		}
+		return -1;
+	case RSEQ_MO_RELEASE:
+		switch (percpu_mode) {
+		case RSEQ_PERCPU_CPU_ID:
+			return rseq_cmpeqv_trystorev_storev_release_cpu_id(v, expect, v2, newv2, newv, cpu);
+		case RSEQ_PERCPU_MM_CID:
+			return rseq_cmpeqv_trystorev_storev_release_mm_cid(v, expect, v2, newv2, newv, cpu);
+		}
+		return -1;
+	default:
+		return -1;
+	}
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(enum rseq_mo rseq_mo, enum rseq_percpu_mode percpu_mode,
+			      intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	if (rseq_mo != RSEQ_MO_RELAXED)
+		return -1;
+	switch (percpu_mode) {
+	case RSEQ_PERCPU_CPU_ID:
+		return rseq_cmpeqv_cmpeqv_storev_relaxed_cpu_id(v, expect, v2, expect2, newv, cpu);
+	case RSEQ_PERCPU_MM_CID:
+		return rseq_cmpeqv_cmpeqv_storev_relaxed_mm_cid(v, expect, v2, expect2, newv, cpu);
+	}
+	return -1;
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(enum rseq_mo rseq_mo, enum rseq_percpu_mode percpu_mode,
+				 intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	switch (rseq_mo) {
+	case RSEQ_MO_RELAXED:
+		switch (percpu_mode) {
+		case RSEQ_PERCPU_CPU_ID:
+			return rseq_cmpeqv_trymemcpy_storev_relaxed_cpu_id(v, expect, dst, src, len, newv, cpu);
+		case RSEQ_PERCPU_MM_CID:
+			return rseq_cmpeqv_trymemcpy_storev_relaxed_mm_cid(v, expect, dst, src, len, newv, cpu);
+		}
+		return -1;
+	case RSEQ_MO_RELEASE:
+		switch (percpu_mode) {
+		case RSEQ_PERCPU_CPU_ID:
+			return rseq_cmpeqv_trymemcpy_storev_release_cpu_id(v, expect, dst, src, len, newv, cpu);
+		case RSEQ_PERCPU_MM_CID:
+			return rseq_cmpeqv_trymemcpy_storev_release_mm_cid(v, expect, dst, src, len, newv, cpu);
+		}
+		return -1;
+	default:
+		return -1;
+	}
+}
+
 #endif  /* RSEQ_H_ */
-- 
cgit 


From 5bf4aba38a7686fd13f9800a42c39d2adef04fb0 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:14 -0500
Subject: selftests/rseq: arm: Template memory ordering and percpu access mode

Introduce a rseq-arm-bits.h template header which is internally included
to generate the static inline functions covering:

- relaxed and release memory ordering,
- per-cpu-id and per-mm-cid per-cpu data access.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-13-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/rseq-arm-bits.h | 505 +++++++++++++++++++
 tools/testing/selftests/rseq/rseq-arm.h      | 695 +--------------------------
 2 files changed, 530 insertions(+), 670 deletions(-)
 create mode 100644 tools/testing/selftests/rseq/rseq-arm-bits.h

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/rseq-arm-bits.h b/tools/testing/selftests/rseq/rseq-arm-bits.h
new file mode 100644
index 000000000000..4f03cb395462
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-arm-bits.h
@@ -0,0 +1,505 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-arm-bits.h
+ *
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include "rseq-bits-template.h"
+
+#if defined(RSEQ_TEMPLATE_MO_RELAXED) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_storev)(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		"ldr r0, %[v]\n\t"
+		"cmp %[expect], r0\n\t"
+		"bne %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		"ldr r0, %[v]\n\t"
+		"cmp %[expect], r0\n\t"
+		"bne %l[error2]\n\t"
+#endif
+		/* final store */
+		"str %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "r0", "memory", "cc"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpnev_storeoffp_load)(intptr_t *v, intptr_t expectnot,
+			       long voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		"ldr r0, %[v]\n\t"
+		"cmp %[expectnot], r0\n\t"
+		"beq %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		"ldr r0, %[v]\n\t"
+		"cmp %[expectnot], r0\n\t"
+		"beq %l[error2]\n\t"
+#endif
+		"str r0, %[load]\n\t"
+		"add r0, %[voffp]\n\t"
+		"ldr r0, [r0]\n\t"
+		/* final store */
+		"str r0, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [voffp]		"Ir" (voffp),
+		  [load]		"m" (*load)
+		  RSEQ_INJECT_INPUT
+		: "r0", "memory", "cc"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_addv)(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+		"ldr r0, %[v]\n\t"
+		"add r0, %[count]\n\t"
+		/* final store */
+		"str r0, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(4)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"m" (*v),
+		  [count]		"Ir" (count)
+		  RSEQ_INJECT_INPUT
+		: "r0", "memory", "cc"
+		  RSEQ_INJECT_CLOBBER
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_cmpeqv_storev)(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		"ldr r0, %[v]\n\t"
+		"cmp %[expect], r0\n\t"
+		"bne %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+		"ldr r0, %[v2]\n\t"
+		"cmp %[expect2], r0\n\t"
+		"bne %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		"ldr r0, %[v]\n\t"
+		"cmp %[expect], r0\n\t"
+		"bne %l[error2]\n\t"
+		"ldr r0, %[v2]\n\t"
+		"cmp %[expect2], r0\n\t"
+		"bne %l[error3]\n\t"
+#endif
+		/* final store */
+		"str %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* cmp2 input */
+		  [v2]			"m" (*v2),
+		  [expect2]		"r" (expect2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "r0", "memory", "cc"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("1st expected value comparison failed");
+error3:
+	rseq_after_asm_goto();
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+#endif /* #if defined(RSEQ_TEMPLATE_MO_RELAXED) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trystorev_storev)(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		"ldr r0, %[v]\n\t"
+		"cmp %[expect], r0\n\t"
+		"bne %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		"ldr r0, %[v]\n\t"
+		"cmp %[expect], r0\n\t"
+		"bne %l[error2]\n\t"
+#endif
+		/* try store */
+		"str %[newv2], %[v2]\n\t"
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		"dmb\n\t"	/* full mb provides store-release */
+#endif
+		/* final store */
+		"str %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* try store input */
+		  [v2]			"m" (*v2),
+		  [newv2]		"r" (newv2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "r0", "memory", "cc"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trymemcpy_storev)(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	uint32_t rseq_scratch[3];
+
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		"str %[src], %[rseq_scratch0]\n\t"
+		"str %[dst], %[rseq_scratch1]\n\t"
+		"str %[len], %[rseq_scratch2]\n\t"
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		"ldr r0, %[v]\n\t"
+		"cmp %[expect], r0\n\t"
+		"bne 5f\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+		"ldr r0, %[v]\n\t"
+		"cmp %[expect], r0\n\t"
+		"bne 7f\n\t"
+#endif
+		/* try memcpy */
+		"cmp %[len], #0\n\t" \
+		"beq 333f\n\t" \
+		"222:\n\t" \
+		"ldrb %%r0, [%[src]]\n\t" \
+		"strb %%r0, [%[dst]]\n\t" \
+		"adds %[src], #1\n\t" \
+		"adds %[dst], #1\n\t" \
+		"subs %[len], #1\n\t" \
+		"bne 222b\n\t" \
+		"333:\n\t" \
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		"dmb\n\t"	/* full mb provides store-release */
+#endif
+		/* final store */
+		"str %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		/* teardown */
+		"ldr %[len], %[rseq_scratch2]\n\t"
+		"ldr %[dst], %[rseq_scratch1]\n\t"
+		"ldr %[src], %[rseq_scratch0]\n\t"
+		"b 8f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4,
+				      /* teardown */
+				      "ldr %[len], %[rseq_scratch2]\n\t"
+				      "ldr %[dst], %[rseq_scratch1]\n\t"
+				      "ldr %[src], %[rseq_scratch0]\n\t",
+				      abort, 1b, 2b, 4f)
+		RSEQ_ASM_DEFINE_CMPFAIL(5,
+					/* teardown */
+					"ldr %[len], %[rseq_scratch2]\n\t"
+					"ldr %[dst], %[rseq_scratch1]\n\t"
+					"ldr %[src], %[rseq_scratch0]\n\t",
+					cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_CMPFAIL(6,
+					/* teardown */
+					"ldr %[len], %[rseq_scratch2]\n\t"
+					"ldr %[dst], %[rseq_scratch1]\n\t"
+					"ldr %[src], %[rseq_scratch0]\n\t",
+					error1)
+		RSEQ_ASM_DEFINE_CMPFAIL(7,
+					/* teardown */
+					"ldr %[len], %[rseq_scratch2]\n\t"
+					"ldr %[dst], %[rseq_scratch1]\n\t"
+					"ldr %[src], %[rseq_scratch0]\n\t",
+					error2)
+#endif
+		"8:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv),
+		  /* try memcpy input */
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len),
+		  [rseq_scratch0]	"m" (rseq_scratch[0]),
+		  [rseq_scratch1]	"m" (rseq_scratch[1]),
+		  [rseq_scratch2]	"m" (rseq_scratch[2])
+		  RSEQ_INJECT_INPUT
+		: "r0", "memory", "cc"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* #if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#include "rseq-bits-reset.h"
diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h
index 7445107f842b..8414fc3eac15 100644
--- a/tools/testing/selftests/rseq/rseq-arm.h
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -2,7 +2,7 @@
 /*
  * rseq-arm.h
  *
- * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  */
 
 /*
@@ -143,679 +143,34 @@ do {									\
 		teardown						\
 		"b %l[" __rseq_str(cmpfail_label) "]\n\t"
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
+/* Per-cpu-id indexing. */
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne %l[error2]\n\t"
-#endif
-		/* final store */
-		"str %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "r0", "memory", "cc"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
-			       long voffp, intptr_t *load, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		"ldr r0, %[v]\n\t"
-		"cmp %[expectnot], r0\n\t"
-		"beq %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		"ldr r0, %[v]\n\t"
-		"cmp %[expectnot], r0\n\t"
-		"beq %l[error2]\n\t"
-#endif
-		"str r0, %[load]\n\t"
-		"add r0, %[voffp]\n\t"
-		"ldr r0, [r0]\n\t"
-		/* final store */
-		"str r0, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expectnot]		"r" (expectnot),
-		  [voffp]		"Ir" (voffp),
-		  [load]		"m" (*load)
-		  RSEQ_INJECT_INPUT
-		: "r0", "memory", "cc"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_addv(intptr_t *v, intptr_t count, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-#endif
-		"ldr r0, %[v]\n\t"
-		"add r0, %[count]\n\t"
-		/* final store */
-		"str r0, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(4)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"m" (*v),
-		  [count]		"Ir" (count)
-		  RSEQ_INJECT_INPUT
-		: "r0", "memory", "cc"
-		  RSEQ_INJECT_CLOBBER
-		: abort
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
-				 intptr_t *v2, intptr_t newv2,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne %l[error2]\n\t"
-#endif
-		/* try store */
-		"str %[newv2], %[v2]\n\t"
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		"str %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"r" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "r0", "memory", "cc"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
+#define RSEQ_TEMPLATE_CPU_ID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-arm-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
-					 intptr_t *v2, intptr_t newv2,
-					 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-arm-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_CPU_ID
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne %l[error2]\n\t"
-#endif
-		/* try store */
-		"str %[newv2], %[v2]\n\t"
-		RSEQ_INJECT_ASM(5)
-		"dmb\n\t"	/* full mb provides store-release */
-		/* final store */
-		"str %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"r" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "r0", "memory", "cc"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
-			      intptr_t *v2, intptr_t expect2,
-			      intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-		"ldr r0, %[v2]\n\t"
-		"cmp %[expect2], r0\n\t"
-		"bne %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(5)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne %l[error2]\n\t"
-		"ldr r0, %[v2]\n\t"
-		"cmp %[expect2], r0\n\t"
-		"bne %l[error3]\n\t"
-#endif
-		/* final store */
-		"str %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* cmp2 input */
-		  [v2]			"m" (*v2),
-		  [expect2]		"r" (expect2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "r0", "memory", "cc"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2, error3
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("1st expected value comparison failed");
-error3:
-	rseq_after_asm_goto();
-	rseq_bug("2nd expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
-				 void *dst, void *src, size_t len,
-				 intptr_t newv, int cpu)
-{
-	uint32_t rseq_scratch[3];
+/* Per-mm-cid indexing. */
 
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_MM_CID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-arm-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		"str %[src], %[rseq_scratch0]\n\t"
-		"str %[dst], %[rseq_scratch1]\n\t"
-		"str %[len], %[rseq_scratch2]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne 5f\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne 7f\n\t"
-#endif
-		/* try memcpy */
-		"cmp %[len], #0\n\t" \
-		"beq 333f\n\t" \
-		"222:\n\t" \
-		"ldrb %%r0, [%[src]]\n\t" \
-		"strb %%r0, [%[dst]]\n\t" \
-		"adds %[src], #1\n\t" \
-		"adds %[dst], #1\n\t" \
-		"subs %[len], #1\n\t" \
-		"bne 222b\n\t" \
-		"333:\n\t" \
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		"str %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		"ldr %[len], %[rseq_scratch2]\n\t"
-		"ldr %[dst], %[rseq_scratch1]\n\t"
-		"ldr %[src], %[rseq_scratch0]\n\t"
-		"b 8f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4,
-				      /* teardown */
-				      "ldr %[len], %[rseq_scratch2]\n\t"
-				      "ldr %[dst], %[rseq_scratch1]\n\t"
-				      "ldr %[src], %[rseq_scratch0]\n\t",
-				      abort, 1b, 2b, 4f)
-		RSEQ_ASM_DEFINE_CMPFAIL(5,
-					/* teardown */
-					"ldr %[len], %[rseq_scratch2]\n\t"
-					"ldr %[dst], %[rseq_scratch1]\n\t"
-					"ldr %[src], %[rseq_scratch0]\n\t",
-					cmpfail)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_CMPFAIL(6,
-					/* teardown */
-					"ldr %[len], %[rseq_scratch2]\n\t"
-					"ldr %[dst], %[rseq_scratch1]\n\t"
-					"ldr %[src], %[rseq_scratch0]\n\t",
-					error1)
-		RSEQ_ASM_DEFINE_CMPFAIL(7,
-					/* teardown */
-					"ldr %[len], %[rseq_scratch2]\n\t"
-					"ldr %[dst], %[rseq_scratch1]\n\t"
-					"ldr %[src], %[rseq_scratch0]\n\t",
-					error2)
-#endif
-		"8:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len),
-		  [rseq_scratch0]	"m" (rseq_scratch[0]),
-		  [rseq_scratch1]	"m" (rseq_scratch[1]),
-		  [rseq_scratch2]	"m" (rseq_scratch[2])
-		  RSEQ_INJECT_INPUT
-		: "r0", "memory", "cc"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
-					 void *dst, void *src, size_t len,
-					 intptr_t newv, int cpu)
-{
-	uint32_t rseq_scratch[3];
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-arm-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_MM_CID
 
-	RSEQ_INJECT_C(9)
+/* APIs which are not based on cpu ids. */
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		"str %[src], %[rseq_scratch0]\n\t"
-		"str %[dst], %[rseq_scratch1]\n\t"
-		"str %[len], %[rseq_scratch2]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne 5f\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
-		"ldr r0, %[v]\n\t"
-		"cmp %[expect], r0\n\t"
-		"bne 7f\n\t"
-#endif
-		/* try memcpy */
-		"cmp %[len], #0\n\t" \
-		"beq 333f\n\t" \
-		"222:\n\t" \
-		"ldrb %%r0, [%[src]]\n\t" \
-		"strb %%r0, [%[dst]]\n\t" \
-		"adds %[src], #1\n\t" \
-		"adds %[dst], #1\n\t" \
-		"subs %[len], #1\n\t" \
-		"bne 222b\n\t" \
-		"333:\n\t" \
-		RSEQ_INJECT_ASM(5)
-		"dmb\n\t"	/* full mb provides store-release */
-		/* final store */
-		"str %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		"ldr %[len], %[rseq_scratch2]\n\t"
-		"ldr %[dst], %[rseq_scratch1]\n\t"
-		"ldr %[src], %[rseq_scratch0]\n\t"
-		"b 8f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4,
-				      /* teardown */
-				      "ldr %[len], %[rseq_scratch2]\n\t"
-				      "ldr %[dst], %[rseq_scratch1]\n\t"
-				      "ldr %[src], %[rseq_scratch0]\n\t",
-				      abort, 1b, 2b, 4f)
-		RSEQ_ASM_DEFINE_CMPFAIL(5,
-					/* teardown */
-					"ldr %[len], %[rseq_scratch2]\n\t"
-					"ldr %[dst], %[rseq_scratch1]\n\t"
-					"ldr %[src], %[rseq_scratch0]\n\t",
-					cmpfail)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_CMPFAIL(6,
-					/* teardown */
-					"ldr %[len], %[rseq_scratch2]\n\t"
-					"ldr %[dst], %[rseq_scratch1]\n\t"
-					"ldr %[src], %[rseq_scratch0]\n\t",
-					error1)
-		RSEQ_ASM_DEFINE_CMPFAIL(7,
-					/* teardown */
-					"ldr %[len], %[rseq_scratch2]\n\t"
-					"ldr %[dst], %[rseq_scratch1]\n\t"
-					"ldr %[src], %[rseq_scratch0]\n\t",
-					error2)
-#endif
-		"8:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len),
-		  [rseq_scratch0]	"m" (rseq_scratch[0]),
-		  [rseq_scratch1]	"m" (rseq_scratch[1]),
-		  [rseq_scratch2]	"m" (rseq_scratch[2])
-		  RSEQ_INJECT_INPUT
-		: "r0", "memory", "cc"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
+#define RSEQ_TEMPLATE_CPU_ID_NONE
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-arm-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+#undef RSEQ_TEMPLATE_CPU_ID_NONE
-- 
cgit 


From 8d4eeb8bb43262509c3e929e4d086ccaaec627a4 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:15 -0500
Subject: selftests/rseq: arm64: Template memory ordering and percpu access
 mode

Introduce a rseq-arm64-bits.h template header which is internally
included to generate the static inline functions covering:

- relaxed and release memory ordering,
- per-cpu-id and per-mm-cid per-cpu data access.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-14-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/rseq-arm64-bits.h | 392 +++++++++++++++++++
 tools/testing/selftests/rseq/rseq-arm64.h      | 516 ++-----------------------
 2 files changed, 422 insertions(+), 486 deletions(-)
 create mode 100644 tools/testing/selftests/rseq/rseq-arm64-bits.h

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/rseq-arm64-bits.h b/tools/testing/selftests/rseq/rseq-arm64-bits.h
new file mode 100644
index 000000000000..cc7226b1efe1
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-arm64-bits.h
@@ -0,0 +1,392 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-arm64-bits.h
+ *
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2018 - Will Deacon <will.deacon@arm.com>
+ */
+
+#include "rseq-bits-template.h"
+
+#if defined(RSEQ_TEMPLATE_MO_RELAXED) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_storev)(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"Qo" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpnev_storeoffp_load)(intptr_t *v, intptr_t expectnot,
+			       long voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_LOAD(v)
+		RSEQ_ASM_OP_R_STORE(load)
+		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"Qo" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [load]		"Qo" (*load),
+		  [voffp]		"r" (voffp)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_addv)(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+#endif
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+		RSEQ_ASM_OP_R_LOAD(v)
+		RSEQ_ASM_OP_R_ADD(count)
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"Qo" (*v),
+		  [count]		"r" (count)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_cmpeqv_storev)(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error3])
+#endif
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[cmpfail])
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
+#endif
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"Qo" (*v),
+		  [expect]		"r" (expect),
+		  [v2]			"Qo" (*v2),
+		  [expect2]		"r" (expect2),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+error3:
+	rseq_after_asm_goto();
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+#endif /* #if defined(RSEQ_TEMPLATE_MO_RELAXED) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trystorev_storev)(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_STORE(newv2, v2)
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+#else
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+#endif
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [v2]			"Qo" (*v2),
+		  [newv2]		"r" (newv2)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trymemcpy_storev)(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+#else
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+#endif
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"Qo" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [expect]		"r" (expect),
+		  [v]			"Qo" (*v),
+		  [newv]		"r" (newv),
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len)
+		  RSEQ_INJECT_INPUT
+		: "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* #if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#include "rseq-bits-reset.h"
diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
index 49c387fcd868..85b90977e7e6 100644
--- a/tools/testing/selftests/rseq/rseq-arm64.h
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -2,7 +2,7 @@
 /*
  * rseq-arm64.h
  *
- * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  * (C) Copyright 2018 - Will Deacon <will.deacon@arm.com>
  */
 
@@ -200,490 +200,34 @@ do {										\
 	"	cbnz	" RSEQ_ASM_TMP_REG_2 ", 222b\n"				\
 	"333:\n"
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
-#endif
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"Qo" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"Qo" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "memory", RSEQ_ASM_TMP_REG
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
-			       long voffp, intptr_t *load, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
-#endif
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[error2])
-#endif
-		RSEQ_ASM_OP_R_LOAD(v)
-		RSEQ_ASM_OP_R_STORE(load)
-		RSEQ_ASM_OP_R_LOAD_OFF(voffp)
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"Qo" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"Qo" (*v),
-		  [expectnot]		"r" (expectnot),
-		  [load]		"Qo" (*load),
-		  [voffp]		"r" (voffp)
-		  RSEQ_INJECT_INPUT
-		: "memory", RSEQ_ASM_TMP_REG
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
+/* Per-cpu-id indexing. */
 
-static inline __attribute__((always_inline))
-int rseq_addv(intptr_t *v, intptr_t count, int cpu)
-{
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_CPU_ID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-arm64-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
-#endif
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-#endif
-		RSEQ_ASM_OP_R_LOAD(v)
-		RSEQ_ASM_OP_R_ADD(count)
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
-		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"Qo" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"Qo" (*v),
-		  [count]		"r" (count)
-		  RSEQ_INJECT_INPUT
-		: "memory", RSEQ_ASM_TMP_REG
-		: abort
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
-				 intptr_t *v2, intptr_t newv2,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
-#endif
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		RSEQ_ASM_OP_STORE(newv2, v2)
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"Qo" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [expect]		"r" (expect),
-		  [v]			"Qo" (*v),
-		  [newv]		"r" (newv),
-		  [v2]			"Qo" (*v2),
-		  [newv2]		"r" (newv2)
-		  RSEQ_INJECT_INPUT
-		: "memory", RSEQ_ASM_TMP_REG
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
-					 intptr_t *v2, intptr_t newv2,
-					 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
-#endif
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		RSEQ_ASM_OP_STORE(newv2, v2)
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"Qo" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [expect]		"r" (expect),
-		  [v]			"Qo" (*v),
-		  [newv]		"r" (newv),
-		  [v2]			"Qo" (*v2),
-		  [newv2]		"r" (newv2)
-		  RSEQ_INJECT_INPUT
-		: "memory", RSEQ_ASM_TMP_REG
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
-			      intptr_t *v2, intptr_t expect2,
-			      intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error3])
-#endif
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[cmpfail])
-		RSEQ_INJECT_ASM(5)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
-#endif
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"Qo" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"Qo" (*v),
-		  [expect]		"r" (expect),
-		  [v2]			"Qo" (*v2),
-		  [expect2]		"r" (expect2),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "memory", RSEQ_ASM_TMP_REG
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2, error3
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-error3:
-	rseq_after_asm_goto();
-	rseq_bug("2nd expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
-				 void *dst, void *src, size_t len,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
-#endif
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"Qo" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [expect]		"r" (expect),
-		  [v]			"Qo" (*v),
-		  [newv]		"r" (newv),
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len)
-		  RSEQ_INJECT_INPUT
-		: "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
-					 void *dst, void *src, size_t len,
-					 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
-#endif
-		RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"Qo" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [expect]		"r" (expect),
-		  [v]			"Qo" (*v),
-		  [newv]		"r" (newv),
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len)
-		  RSEQ_INJECT_INPUT
-		: "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-arm64-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_CPU_ID
+
+/* Per-mm-cid indexing. */
+
+#define RSEQ_TEMPLATE_MM_CID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-arm64-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-arm64-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_MM_CID
+
+/* APIs which are not based on cpu ids. */
+
+#define RSEQ_TEMPLATE_CPU_ID_NONE
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-arm64-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+#undef RSEQ_TEMPLATE_CPU_ID_NONE
-- 
cgit 


From 431b63285abc46d791874eadee8da12c835ce71d Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:16 -0500
Subject: selftests/rseq: mips: Template memory ordering and percpu access mode

Introduce a rseq-mips-bits.h template header which is internally
included to generate the static inline functions covering:

- relaxed and release memory ordering,
- per-cpu-id and per-mm-cid per-cpu data access.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-15-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/rseq-mips-bits.h | 462 +++++++++++++++++++
 tools/testing/selftests/rseq/rseq-mips.h      | 640 +-------------------------
 2 files changed, 487 insertions(+), 615 deletions(-)
 create mode 100644 tools/testing/selftests/rseq/rseq-mips-bits.h

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/rseq-mips-bits.h b/tools/testing/selftests/rseq/rseq-mips-bits.h
new file mode 100644
index 000000000000..6c48af4d0944
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-mips-bits.h
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * Author: Paul Burton <paul.burton@mips.com>
+ * (C) Copyright 2018 MIPS Tech LLC
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include "rseq-bits-template.h"
+
+#if defined(RSEQ_TEMPLATE_MO_RELAXED) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_storev)(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_L " $4, %[v]\n\t"
+		"bne $4, %[expect], %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		LONG_L " $4, %[v]\n\t"
+		"bne $4, %[expect], %l[error2]\n\t"
+#endif
+		/* final store */
+		LONG_S " %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "$4", "memory"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpnev_storeoffp_load)(intptr_t *v, intptr_t expectnot,
+			       long voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_L " $4, %[v]\n\t"
+		"beq $4, %[expectnot], %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		LONG_L " $4, %[v]\n\t"
+		"beq $4, %[expectnot], %l[error2]\n\t"
+#endif
+		LONG_S " $4, %[load]\n\t"
+		LONG_ADDI " $4, %[voffp]\n\t"
+		LONG_L " $4, 0($4)\n\t"
+		/* final store */
+		LONG_S " $4, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [voffp]		"Ir" (voffp),
+		  [load]		"m" (*load)
+		  RSEQ_INJECT_INPUT
+		: "$4", "memory"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_addv)(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+		LONG_L " $4, %[v]\n\t"
+		LONG_ADDI " $4, %[count]\n\t"
+		/* final store */
+		LONG_S " $4, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(4)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"m" (*v),
+		  [count]		"Ir" (count)
+		  RSEQ_INJECT_INPUT
+		: "$4", "memory"
+		  RSEQ_INJECT_CLOBBER
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_cmpeqv_storev)(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_L " $4, %[v]\n\t"
+		"bne $4, %[expect], %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+		LONG_L " $4, %[v2]\n\t"
+		"bne $4, %[expect2], %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		LONG_L " $4, %[v]\n\t"
+		"bne $4, %[expect], %l[error2]\n\t"
+		LONG_L " $4, %[v2]\n\t"
+		"bne $4, %[expect2], %l[error3]\n\t"
+#endif
+		/* final store */
+		LONG_S " %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* cmp2 input */
+		  [v2]			"m" (*v2),
+		  [expect2]		"r" (expect2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "$4", "memory"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("1st expected value comparison failed");
+error3:
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+#endif /* #if defined(RSEQ_TEMPLATE_MO_RELAXED) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trystorev_storev)(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_L " $4, %[v]\n\t"
+		"bne $4, %[expect], %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		LONG_L " $4, %[v]\n\t"
+		"bne $4, %[expect], %l[error2]\n\t"
+#endif
+		/* try store */
+		LONG_S " %[newv2], %[v2]\n\t"
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		"sync\n\t"	/* full sync provides store-release */
+#endif
+		/* final store */
+		LONG_S " %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		"b 5f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+		"5:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* try store input */
+		  [v2]			"m" (*v2),
+		  [newv2]		"r" (newv2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "$4", "memory"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trymemcpy_storev)(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	uintptr_t rseq_scratch[3];
+
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		LONG_S " %[src], %[rseq_scratch0]\n\t"
+		LONG_S "  %[dst], %[rseq_scratch1]\n\t"
+		LONG_S " %[len], %[rseq_scratch2]\n\t"
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_L " $4, %[v]\n\t"
+		"bne $4, %[expect], 5f\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+		LONG_L " $4, %[v]\n\t"
+		"bne $4, %[expect], 7f\n\t"
+#endif
+		/* try memcpy */
+		"beqz %[len], 333f\n\t" \
+		"222:\n\t" \
+		"lb   $4, 0(%[src])\n\t" \
+		"sb   $4, 0(%[dst])\n\t" \
+		LONG_ADDI " %[src], 1\n\t" \
+		LONG_ADDI " %[dst], 1\n\t" \
+		LONG_ADDI " %[len], -1\n\t" \
+		"bnez %[len], 222b\n\t" \
+		"333:\n\t" \
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		"sync\n\t"	/* full sync provides store-release */
+#endif
+		/* final store */
+		LONG_S " %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		/* teardown */
+		LONG_L " %[len], %[rseq_scratch2]\n\t"
+		LONG_L " %[dst], %[rseq_scratch1]\n\t"
+		LONG_L " %[src], %[rseq_scratch0]\n\t"
+		"b 8f\n\t"
+		RSEQ_ASM_DEFINE_ABORT(3, 4,
+				      /* teardown */
+				      LONG_L " %[len], %[rseq_scratch2]\n\t"
+				      LONG_L " %[dst], %[rseq_scratch1]\n\t"
+				      LONG_L " %[src], %[rseq_scratch0]\n\t",
+				      abort, 1b, 2b, 4f)
+		RSEQ_ASM_DEFINE_CMPFAIL(5,
+					/* teardown */
+					LONG_L " %[len], %[rseq_scratch2]\n\t"
+					LONG_L " %[dst], %[rseq_scratch1]\n\t"
+					LONG_L " %[src], %[rseq_scratch0]\n\t",
+					cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_CMPFAIL(6,
+					/* teardown */
+					LONG_L " %[len], %[rseq_scratch2]\n\t"
+					LONG_L " %[dst], %[rseq_scratch1]\n\t"
+					LONG_L " %[src], %[rseq_scratch0]\n\t",
+					error1)
+		RSEQ_ASM_DEFINE_CMPFAIL(7,
+					/* teardown */
+					LONG_L " %[len], %[rseq_scratch2]\n\t"
+					LONG_L " %[dst], %[rseq_scratch1]\n\t"
+					LONG_L " %[src], %[rseq_scratch0]\n\t",
+					error2)
+#endif
+		"8:\n\t"
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv),
+		  /* try memcpy input */
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len),
+		  [rseq_scratch0]	"m" (rseq_scratch[0]),
+		  [rseq_scratch1]	"m" (rseq_scratch[1]),
+		  [rseq_scratch2]	"m" (rseq_scratch[2])
+		  RSEQ_INJECT_INPUT
+		: "$4", "memory"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* #if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#include "rseq-bits-reset.h"
diff --git a/tools/testing/selftests/rseq/rseq-mips.h b/tools/testing/selftests/rseq/rseq-mips.h
index dd199952d649..50b950cf9585 100644
--- a/tools/testing/selftests/rseq/rseq-mips.h
+++ b/tools/testing/selftests/rseq/rseq-mips.h
@@ -2,9 +2,7 @@
 /*
  * Author: Paul Burton <paul.burton@mips.com>
  * (C) Copyright 2018 MIPS Tech LLC
- *
- * Based on rseq-arm.h:
- * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  */
 
 /*
@@ -150,622 +148,34 @@ do {									\
 		teardown \
 		"b %l[" __rseq_str(cmpfail_label) "]\n\t"
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
+/* Per-cpu-id indexing. */
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], %l[error2]\n\t"
-#endif
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "$4", "memory"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
+#define RSEQ_TEMPLATE_CPU_ID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-mips-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-static inline __attribute__((always_inline))
-int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
-			       long voffp, intptr_t *load, int cpu)
-{
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-mips-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_CPU_ID
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_L " $4, %[v]\n\t"
-		"beq $4, %[expectnot], %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		LONG_L " $4, %[v]\n\t"
-		"beq $4, %[expectnot], %l[error2]\n\t"
-#endif
-		LONG_S " $4, %[load]\n\t"
-		LONG_ADDI " $4, %[voffp]\n\t"
-		LONG_L " $4, 0($4)\n\t"
-		/* final store */
-		LONG_S " $4, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expectnot]		"r" (expectnot),
-		  [voffp]		"Ir" (voffp),
-		  [load]		"m" (*load)
-		  RSEQ_INJECT_INPUT
-		: "$4", "memory"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
+/* Per-mm-cid indexing. */
 
-static inline __attribute__((always_inline))
-int rseq_addv(intptr_t *v, intptr_t count, int cpu)
-{
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_MM_CID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-mips-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-#endif
-		LONG_L " $4, %[v]\n\t"
-		LONG_ADDI " $4, %[count]\n\t"
-		/* final store */
-		LONG_S " $4, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(4)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"m" (*v),
-		  [count]		"Ir" (count)
-		  RSEQ_INJECT_INPUT
-		: "$4", "memory"
-		  RSEQ_INJECT_CLOBBER
-		: abort
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
-				 intptr_t *v2, intptr_t newv2,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-mips-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_MM_CID
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], %l[error2]\n\t"
-#endif
-		/* try store */
-		LONG_S " %[newv2], %[v2]\n\t"
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"r" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "$4", "memory"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
+/* APIs which are not based on cpu ids. */
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
-					 intptr_t *v2, intptr_t newv2,
-					 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], %l[error2]\n\t"
-#endif
-		/* try store */
-		LONG_S " %[newv2], %[v2]\n\t"
-		RSEQ_INJECT_ASM(5)
-		"sync\n\t"	/* full sync provides store-release */
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"r" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "$4", "memory"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
-			      intptr_t *v2, intptr_t expect2,
-			      intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-		LONG_L " $4, %[v2]\n\t"
-		"bne $4, %[expect2], %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(5)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], %l[error2]\n\t"
-		LONG_L " $4, %[v2]\n\t"
-		"bne $4, %[expect2], %l[error3]\n\t"
-#endif
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		"b 5f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
-		"5:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* cmp2 input */
-		  [v2]			"m" (*v2),
-		  [expect2]		"r" (expect2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "$4", "memory"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2, error3
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("1st expected value comparison failed");
-error3:
-	rseq_bug("2nd expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
-				 void *dst, void *src, size_t len,
-				 intptr_t newv, int cpu)
-{
-	uintptr_t rseq_scratch[3];
-
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		LONG_S " %[src], %[rseq_scratch0]\n\t"
-		LONG_S "  %[dst], %[rseq_scratch1]\n\t"
-		LONG_S " %[len], %[rseq_scratch2]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], 5f\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], 7f\n\t"
-#endif
-		/* try memcpy */
-		"beqz %[len], 333f\n\t" \
-		"222:\n\t" \
-		"lb   $4, 0(%[src])\n\t" \
-		"sb   $4, 0(%[dst])\n\t" \
-		LONG_ADDI " %[src], 1\n\t" \
-		LONG_ADDI " %[dst], 1\n\t" \
-		LONG_ADDI " %[len], -1\n\t" \
-		"bnez %[len], 222b\n\t" \
-		"333:\n\t" \
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		LONG_L " %[len], %[rseq_scratch2]\n\t"
-		LONG_L " %[dst], %[rseq_scratch1]\n\t"
-		LONG_L " %[src], %[rseq_scratch0]\n\t"
-		"b 8f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4,
-				      /* teardown */
-				      LONG_L " %[len], %[rseq_scratch2]\n\t"
-				      LONG_L " %[dst], %[rseq_scratch1]\n\t"
-				      LONG_L " %[src], %[rseq_scratch0]\n\t",
-				      abort, 1b, 2b, 4f)
-		RSEQ_ASM_DEFINE_CMPFAIL(5,
-					/* teardown */
-					LONG_L " %[len], %[rseq_scratch2]\n\t"
-					LONG_L " %[dst], %[rseq_scratch1]\n\t"
-					LONG_L " %[src], %[rseq_scratch0]\n\t",
-					cmpfail)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_CMPFAIL(6,
-					/* teardown */
-					LONG_L " %[len], %[rseq_scratch2]\n\t"
-					LONG_L " %[dst], %[rseq_scratch1]\n\t"
-					LONG_L " %[src], %[rseq_scratch0]\n\t",
-					error1)
-		RSEQ_ASM_DEFINE_CMPFAIL(7,
-					/* teardown */
-					LONG_L " %[len], %[rseq_scratch2]\n\t"
-					LONG_L " %[dst], %[rseq_scratch1]\n\t"
-					LONG_L " %[src], %[rseq_scratch0]\n\t",
-					error2)
-#endif
-		"8:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len),
-		  [rseq_scratch0]	"m" (rseq_scratch[0]),
-		  [rseq_scratch1]	"m" (rseq_scratch[1]),
-		  [rseq_scratch2]	"m" (rseq_scratch[2])
-		  RSEQ_INJECT_INPUT
-		: "$4", "memory"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
-					 void *dst, void *src, size_t len,
-					 intptr_t newv, int cpu)
-{
-	uintptr_t rseq_scratch[3];
-
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		LONG_S " %[src], %[rseq_scratch0]\n\t"
-		LONG_S " %[dst], %[rseq_scratch1]\n\t"
-		LONG_S " %[len], %[rseq_scratch2]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], 5f\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
-		LONG_L " $4, %[v]\n\t"
-		"bne $4, %[expect], 7f\n\t"
-#endif
-		/* try memcpy */
-		"beqz %[len], 333f\n\t" \
-		"222:\n\t" \
-		"lb   $4, 0(%[src])\n\t" \
-		"sb   $4, 0(%[dst])\n\t" \
-		LONG_ADDI " %[src], 1\n\t" \
-		LONG_ADDI " %[dst], 1\n\t" \
-		LONG_ADDI " %[len], -1\n\t" \
-		"bnez %[len], 222b\n\t" \
-		"333:\n\t" \
-		RSEQ_INJECT_ASM(5)
-		"sync\n\t"	/* full sync provides store-release */
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		LONG_L " %[len], %[rseq_scratch2]\n\t"
-		LONG_L " %[dst], %[rseq_scratch1]\n\t"
-		LONG_L " %[src], %[rseq_scratch0]\n\t"
-		"b 8f\n\t"
-		RSEQ_ASM_DEFINE_ABORT(3, 4,
-				      /* teardown */
-				      LONG_L " %[len], %[rseq_scratch2]\n\t"
-				      LONG_L " %[dst], %[rseq_scratch1]\n\t"
-				      LONG_L " %[src], %[rseq_scratch0]\n\t",
-				      abort, 1b, 2b, 4f)
-		RSEQ_ASM_DEFINE_CMPFAIL(5,
-					/* teardown */
-					LONG_L " %[len], %[rseq_scratch2]\n\t"
-					LONG_L " %[dst], %[rseq_scratch1]\n\t"
-					LONG_L " %[src], %[rseq_scratch0]\n\t",
-					cmpfail)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_CMPFAIL(6,
-					/* teardown */
-					LONG_L " %[len], %[rseq_scratch2]\n\t"
-					LONG_L " %[dst], %[rseq_scratch1]\n\t"
-					LONG_L " %[src], %[rseq_scratch0]\n\t",
-					error1)
-		RSEQ_ASM_DEFINE_CMPFAIL(7,
-					/* teardown */
-					LONG_L " %[len], %[rseq_scratch2]\n\t"
-					LONG_L " %[dst], %[rseq_scratch1]\n\t"
-					LONG_L " %[src], %[rseq_scratch0]\n\t",
-					error2)
-#endif
-		"8:\n\t"
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len),
-		  [rseq_scratch0]	"m" (rseq_scratch[0]),
-		  [rseq_scratch1]	"m" (rseq_scratch[1]),
-		  [rseq_scratch2]	"m" (rseq_scratch[2])
-		  RSEQ_INJECT_INPUT
-		: "$4", "memory"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
+#define RSEQ_TEMPLATE_CPU_ID_NONE
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-mips-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+#undef RSEQ_TEMPLATE_CPU_ID_NONE
-- 
cgit 


From e61bd94c2bbd6b258fb408092172e37d39b4bf5c Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:17 -0500
Subject: selftests/rseq: ppc: Template memory ordering and percpu access mode

Introduce a rseq-ppc-bits.h template header which is internally included
to generate the static inline functions covering:

- relaxed and release memory ordering,
- per-cpu-id and per-mm-cid per-cpu data access.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-16-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/rseq-ppc-bits.h | 454 ++++++++++++++++++++
 tools/testing/selftests/rseq/rseq-ppc.h      | 611 ++-------------------------
 2 files changed, 486 insertions(+), 579 deletions(-)
 create mode 100644 tools/testing/selftests/rseq/rseq-ppc-bits.h

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/rseq-ppc-bits.h b/tools/testing/selftests/rseq/rseq-ppc-bits.h
new file mode 100644
index 000000000000..98e69eae1e62
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-ppc-bits.h
@@ -0,0 +1,454 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-ppc-bits.h
+ *
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2016-2018 - Boqun Feng <boqun.feng@gmail.com>
+ */
+
+#include "rseq-bits-template.h"
+
+#if defined(RSEQ_TEMPLATE_MO_RELAXED) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_storev)(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		/* cmp @v equal to @expect */
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		/* cmp @v equal to @expect */
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		/* final store */
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r17"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpnev_storeoffp_load)(intptr_t *v, intptr_t expectnot,
+			       long voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		/* cmp @v not equal to @expectnot */
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		/* cmp @v not equal to @expectnot */
+		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[error2])
+#endif
+		/* load the value of @v */
+		RSEQ_ASM_OP_R_LOAD(v)
+		/* store it in @load */
+		RSEQ_ASM_OP_R_STORE(load)
+		/* dereference voffp(v) */
+		RSEQ_ASM_OP_R_LOADX(voffp)
+		/* final store the value at voffp(v) */
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [voffp]		"b" (voffp),
+		  [load]		"m" (*load)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r17"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_addv)(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+		/* load the value of @v */
+		RSEQ_ASM_OP_R_LOAD(v)
+		/* add @count to it */
+		RSEQ_ASM_OP_R_ADD(count)
+		/* final store */
+		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [count]		"r" (count)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r17"
+		  RSEQ_INJECT_CLOBBER
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_cmpeqv_storev)(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		/* cmp @v equal to @expect */
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+		/* cmp @v2 equal to @expct2 */
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[cmpfail])
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		/* cmp @v equal to @expect */
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+		/* cmp @v2 equal to @expct2 */
+		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
+#endif
+		/* final store */
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* cmp2 input */
+		  [v2]			"m" (*v2),
+		  [expect2]		"r" (expect2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r17"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("1st expected value comparison failed");
+error3:
+	rseq_after_asm_goto();
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+#endif /* #if defined(RSEQ_TEMPLATE_MO_RELAXED) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trystorev_storev)(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		/* cmp @v equal to @expect */
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		/* cmp @v equal to @expect */
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		/* try store */
+		RSEQ_ASM_OP_STORE(newv2, v2)
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		/* for 'release' */
+		"lwsync\n\t"
+#endif
+		/* final store */
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* try store input */
+		  [v2]			"m" (*v2),
+		  [newv2]		"r" (newv2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r17"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trymemcpy_storev)(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* setup for mempcy */
+		"mr %%r19, %[len]\n\t"
+		"mr %%r20, %[src]\n\t"
+		"mr %%r21, %[dst]\n\t"
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		/* cmp @v equal to @expect */
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		/* cmp cpuid */
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		/* cmp @v equal to @expect */
+		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+		/* try memcpy */
+		RSEQ_ASM_OP_R_MEMCPY()
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+		/* for 'release' */
+		"lwsync\n\t"
+#endif
+		/* final store */
+		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+		RSEQ_INJECT_ASM(6)
+		/* teardown */
+		RSEQ_ASM_DEFINE_ABORT(4, abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv),
+		  /* try memcpy input */
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r17", "r18", "r19", "r20", "r21"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* #if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#include "rseq-bits-reset.h"
diff --git a/tools/testing/selftests/rseq/rseq-ppc.h b/tools/testing/selftests/rseq/rseq-ppc.h
index f82d95c1bb3f..dc9190facee9 100644
--- a/tools/testing/selftests/rseq/rseq-ppc.h
+++ b/tools/testing/selftests/rseq/rseq-ppc.h
@@ -2,7 +2,7 @@
 /*
  * rseq-ppc.h
  *
- * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2016-2022 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  * (C) Copyright 2016-2018 - Boqun Feng <boqun.feng@gmail.com>
  */
 
@@ -205,581 +205,34 @@ do {									\
 		RSEQ_STORE_LONG(var) "%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n\t" \
 		__rseq_str(post_commit_label) ":\n\t"
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		/* final store */
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r17"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
-			       long voffp, intptr_t *load, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		/* cmp @v not equal to @expectnot */
-		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		/* cmp @v not equal to @expectnot */
-		RSEQ_ASM_OP_CMPNE(v, expectnot, %l[error2])
-#endif
-		/* load the value of @v */
-		RSEQ_ASM_OP_R_LOAD(v)
-		/* store it in @load */
-		RSEQ_ASM_OP_R_STORE(load)
-		/* dereference voffp(v) */
-		RSEQ_ASM_OP_R_LOADX(voffp)
-		/* final store the value at voffp(v) */
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expectnot]		"r" (expectnot),
-		  [voffp]		"b" (voffp),
-		  [load]		"m" (*load)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r17"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_addv(intptr_t *v, intptr_t count, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-#endif
-		/* load the value of @v */
-		RSEQ_ASM_OP_R_LOAD(v)
-		/* add @count to it */
-		RSEQ_ASM_OP_R_ADD(count)
-		/* final store */
-		RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
-		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [count]		"r" (count)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r17"
-		  RSEQ_INJECT_CLOBBER
-		: abort
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
-				 intptr_t *v2, intptr_t newv2,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		/* try store */
-		RSEQ_ASM_OP_STORE(newv2, v2)
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"r" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r17"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
-					 intptr_t *v2, intptr_t newv2,
-					 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		/* try store */
-		RSEQ_ASM_OP_STORE(newv2, v2)
-		RSEQ_INJECT_ASM(5)
-		/* for 'release' */
-		"lwsync\n\t"
-		/* final store */
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"r" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r17"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
-			      intptr_t *v2, intptr_t expect2,
-			      intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-		/* cmp @v2 equal to @expct2 */
-		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[cmpfail])
-		RSEQ_INJECT_ASM(5)
-#ifdef RSEQ_COMPARE_TWICE
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-		/* cmp @v2 equal to @expct2 */
-		RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
-#endif
-		/* final store */
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* cmp2 input */
-		  [v2]			"m" (*v2),
-		  [expect2]		"r" (expect2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r17"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2, error3
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("1st expected value comparison failed");
-error3:
-	rseq_after_asm_goto();
-	rseq_bug("2nd expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
-				 void *dst, void *src, size_t len,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* setup for mempcy */
-		"mr %%r19, %[len]\n\t"
-		"mr %%r20, %[src]\n\t"
-		"mr %%r21, %[dst]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		/* try memcpy */
-		RSEQ_ASM_OP_R_MEMCPY()
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r17", "r18", "r19", "r20", "r21"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
-					 void *dst, void *src, size_t len,
-					 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* setup for mempcy */
-		"mr %%r19, %[len]\n\t"
-		"mr %%r20, %[src]\n\t"
-		"mr %%r21, %[dst]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		/* cmp cpuid */
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		/* cmp @v equal to @expect */
-		RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
-#endif
-		/* try memcpy */
-		RSEQ_ASM_OP_R_MEMCPY()
-		RSEQ_INJECT_ASM(5)
-		/* for 'release' */
-		"lwsync\n\t"
-		/* final store */
-		RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		RSEQ_ASM_DEFINE_ABORT(4, abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r17", "r18", "r19", "r20", "r21"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
+/* Per-cpu-id indexing. */
+
+#define RSEQ_TEMPLATE_CPU_ID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-ppc-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-ppc-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_CPU_ID
+
+/* Per-mm-cid indexing. */
+
+#define RSEQ_TEMPLATE_MM_CID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-ppc-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-ppc-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_MM_CID
+
+/* APIs which are not based on cpu ids. */
+
+#define RSEQ_TEMPLATE_CPU_ID_NONE
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-ppc-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+#undef RSEQ_TEMPLATE_CPU_ID_NONE
-- 
cgit 


From a94af3c58462d23c45879e1bbe86ed5702d5bd86 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:18 -0500
Subject: selftests/rseq: s390: Template memory ordering and percpu access mode

Introduce a rseq-s390-bits.h template header which is internally included
to generate the static inline functions covering:

- relaxed and release memory ordering,
- per-cpu-id and per-mm-cid per-cpu data access.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-17-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/rseq-s390-bits.h | 474 +++++++++++++++++++++++++
 tools/testing/selftests/rseq/rseq-s390.h      | 490 ++------------------------
 2 files changed, 498 insertions(+), 466 deletions(-)
 create mode 100644 tools/testing/selftests/rseq/rseq-s390-bits.h

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/rseq-s390-bits.h b/tools/testing/selftests/rseq/rseq-s390-bits.h
new file mode 100644
index 000000000000..0cf17d9f170a
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-s390-bits.h
@@ -0,0 +1,474 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+
+#include "rseq-bits-template.h"
+
+#if defined(RSEQ_TEMPLATE_MO_RELAXED) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_storev)(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_CMP " %[expect], %[v]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		LONG_CMP " %[expect], %[v]\n\t"
+		"jnz %l[error2]\n\t"
+#endif
+		/* final store */
+		LONG_S " %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r0"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Compare @v against @expectnot. When it does _not_ match, load @v
+ * into @load, and store the content of *@v + voffp into @v.
+ */
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpnev_storeoffp_load)(intptr_t *v, intptr_t expectnot,
+			       long voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_L " %%r1, %[v]\n\t"
+		LONG_CMP_R " %%r1, %[expectnot]\n\t"
+		"je %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		LONG_L " %%r1, %[v]\n\t"
+		LONG_CMP_R " %%r1, %[expectnot]\n\t"
+		"je %l[error2]\n\t"
+#endif
+		LONG_S " %%r1, %[load]\n\t"
+		LONG_ADD_R " %%r1, %[voffp]\n\t"
+		LONG_L " %%r1, 0(%%r1)\n\t"
+		/* final store */
+		LONG_S " %%r1, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(5)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expectnot]		"r" (expectnot),
+		  [voffp]		"r" (voffp),
+		  [load]		"m" (*load)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r0", "r1"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_addv)(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+		LONG_L " %%r0, %[v]\n\t"
+		LONG_ADD_R " %%r0, %[count]\n\t"
+		/* final store */
+		LONG_S " %%r0, %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [count]		"r" (count)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r0"
+		  RSEQ_INJECT_CLOBBER
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_cmpeqv_storev)(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_CMP " %[expect], %[v]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+		LONG_CMP " %[expect2], %[v2]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		LONG_CMP " %[expect], %[v]\n\t"
+		"jnz %l[error2]\n\t"
+		LONG_CMP " %[expect2], %[v2]\n\t"
+		"jnz %l[error3]\n\t"
+#endif
+		/* final store */
+		LONG_S " %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* cmp2 input */
+		  [v2]			"m" (*v2),
+		  [expect2]		"r" (expect2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r0"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2, error3
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("1st expected value comparison failed");
+error3:
+	rseq_after_asm_goto();
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+#endif /* #if defined(RSEQ_TEMPLATE_MO_RELAXED) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+/* s390 is TSO. */
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trystorev_storev)(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_CMP " %[expect], %[v]\n\t"
+		"jnz %l[cmpfail]\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+		LONG_CMP " %[expect], %[v]\n\t"
+		"jnz %l[error2]\n\t"
+#endif
+		/* try store */
+		LONG_S " %[newv2], %[v2]\n\t"
+		RSEQ_INJECT_ASM(5)
+		/* final store */
+		LONG_S " %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* try store input */
+		  [v2]			"m" (*v2),
+		  [newv2]		"r" (newv2),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv)
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r0"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+/* s390 is TSO. */
+static inline __attribute__((always_inline))
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trymemcpy_storev)(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	uint64_t rseq_scratch[3];
+
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+		LONG_S " %[src], %[rseq_scratch0]\n\t"
+		LONG_S " %[dst], %[rseq_scratch1]\n\t"
+		LONG_S " %[len], %[rseq_scratch2]\n\t"
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+		RSEQ_INJECT_ASM(3)
+		LONG_CMP " %[expect], %[v]\n\t"
+		"jnz 5f\n\t"
+		RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+		LONG_CMP " %[expect], %[v]\n\t"
+		"jnz 7f\n\t"
+#endif
+		/* try memcpy */
+		LONG_LT_R " %[len], %[len]\n\t"
+		"jz 333f\n\t"
+		"222:\n\t"
+		"ic %%r0,0(%[src])\n\t"
+		"stc %%r0,0(%[dst])\n\t"
+		LONG_ADDI " %[src], 1\n\t"
+		LONG_ADDI " %[dst], 1\n\t"
+		LONG_ADDI " %[len], -1\n\t"
+		"jnz 222b\n\t"
+		"333:\n\t"
+		RSEQ_INJECT_ASM(5)
+		/* final store */
+		LONG_S " %[newv], %[v]\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(6)
+		/* teardown */
+		LONG_L " %[len], %[rseq_scratch2]\n\t"
+		LONG_L " %[dst], %[rseq_scratch1]\n\t"
+		LONG_L " %[src], %[rseq_scratch0]\n\t"
+		RSEQ_ASM_DEFINE_ABORT(4,
+			LONG_L " %[len], %[rseq_scratch2]\n\t"
+			LONG_L " %[dst], %[rseq_scratch1]\n\t"
+			LONG_L " %[src], %[rseq_scratch0]\n\t",
+			abort)
+		RSEQ_ASM_DEFINE_CMPFAIL(5,
+			LONG_L " %[len], %[rseq_scratch2]\n\t"
+			LONG_L " %[dst], %[rseq_scratch1]\n\t"
+			LONG_L " %[src], %[rseq_scratch0]\n\t",
+			cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_CMPFAIL(6,
+			LONG_L " %[len], %[rseq_scratch2]\n\t"
+			LONG_L " %[dst], %[rseq_scratch1]\n\t"
+			LONG_L " %[src], %[rseq_scratch0]\n\t",
+			error1)
+		RSEQ_ASM_DEFINE_CMPFAIL(7,
+			LONG_L " %[len], %[rseq_scratch2]\n\t"
+			LONG_L " %[dst], %[rseq_scratch1]\n\t"
+			LONG_L " %[src], %[rseq_scratch0]\n\t",
+			error2)
+#endif
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]		"r" (cpu),
+		  [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+		  /* final store input */
+		  [v]			"m" (*v),
+		  [expect]		"r" (expect),
+		  [newv]		"r" (newv),
+		  /* try memcpy input */
+		  [dst]			"r" (dst),
+		  [src]			"r" (src),
+		  [len]			"r" (len),
+		  [rseq_scratch0]	"m" (rseq_scratch[0]),
+		  [rseq_scratch1]	"m" (rseq_scratch[1]),
+		  [rseq_scratch2]	"m" (rseq_scratch[2])
+		  RSEQ_INJECT_INPUT
+		: "memory", "cc", "r0"
+		  RSEQ_INJECT_CLOBBER
+		: abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1, error2
+#endif
+	);
+	rseq_after_asm_goto();
+	return 0;
+abort:
+	rseq_after_asm_goto();
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	rseq_after_asm_goto();
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_after_asm_goto();
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_after_asm_goto();
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* #if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#include "rseq-bits-reset.h"
diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
index 4d3286453bbf..46c92598acc7 100644
--- a/tools/testing/selftests/rseq/rseq-s390.h
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -130,476 +130,34 @@ do {									\
 		"jg %l[" __rseq_str(cmpfail_label) "]\n\t"		\
 		".popsection\n\t"
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
+/* Per-cpu-id indexing. */
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_CMP " %[expect], %[v]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		LONG_CMP " %[expect], %[v]\n\t"
-		"jnz %l[error2]\n\t"
-#endif
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r0"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-/*
- * Compare @v against @expectnot. When it does _not_ match, load @v
- * into @load, and store the content of *@v + voffp into @v.
- */
-static inline __attribute__((always_inline))
-int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
-			       long voffp, intptr_t *load, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_L " %%r1, %[v]\n\t"
-		LONG_CMP_R " %%r1, %[expectnot]\n\t"
-		"je %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		LONG_L " %%r1, %[v]\n\t"
-		LONG_CMP_R " %%r1, %[expectnot]\n\t"
-		"je %l[error2]\n\t"
-#endif
-		LONG_S " %%r1, %[load]\n\t"
-		LONG_ADD_R " %%r1, %[voffp]\n\t"
-		LONG_L " %%r1, 0(%%r1)\n\t"
-		/* final store */
-		LONG_S " %%r1, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(5)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expectnot]		"r" (expectnot),
-		  [voffp]		"r" (voffp),
-		  [load]		"m" (*load)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r0", "r1"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_addv(intptr_t *v, intptr_t count, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-#endif
-		LONG_L " %%r0, %[v]\n\t"
-		LONG_ADD_R " %%r0, %[count]\n\t"
-		/* final store */
-		LONG_S " %%r0, %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(4)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [count]		"r" (count)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r0"
-		  RSEQ_INJECT_CLOBBER
-		: abort
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
+#define RSEQ_TEMPLATE_CPU_ID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-s390-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
-				 intptr_t *v2, intptr_t newv2,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-s390-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_CPU_ID
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_CMP " %[expect], %[v]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		LONG_CMP " %[expect], %[v]\n\t"
-		"jnz %l[error2]\n\t"
-#endif
-		/* try store */
-		LONG_S " %[newv2], %[v2]\n\t"
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* try store input */
-		  [v2]			"m" (*v2),
-		  [newv2]		"r" (newv2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r0"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
+/* Per-mm-cid indexing. */
 
-/* s390 is TSO. */
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
-					 intptr_t *v2, intptr_t newv2,
-					 intptr_t newv, int cpu)
-{
-	return rseq_cmpeqv_trystorev_storev(v, expect, v2, newv2, newv, cpu);
-}
+#define RSEQ_TEMPLATE_MM_CID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-s390-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
-			      intptr_t *v2, intptr_t expect2,
-			      intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-s390-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_MM_CID
 
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
-#endif
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_CMP " %[expect], %[v]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(4)
-		LONG_CMP " %[expect2], %[v2]\n\t"
-		"jnz %l[cmpfail]\n\t"
-		RSEQ_INJECT_ASM(5)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
-		LONG_CMP " %[expect], %[v]\n\t"
-		"jnz %l[error2]\n\t"
-		LONG_CMP " %[expect2], %[v2]\n\t"
-		"jnz %l[error3]\n\t"
-#endif
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* cmp2 input */
-		  [v2]			"m" (*v2),
-		  [expect2]		"r" (expect2),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv)
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r0"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2, error3
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("1st expected value comparison failed");
-error3:
-	rseq_after_asm_goto();
-	rseq_bug("2nd expected value comparison failed");
-#endif
-}
-
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
-				 void *dst, void *src, size_t len,
-				 intptr_t newv, int cpu)
-{
-	uint64_t rseq_scratch[3];
-
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto (
-		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
-		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
-#endif
-		LONG_S " %[src], %[rseq_scratch0]\n\t"
-		LONG_S " %[dst], %[rseq_scratch1]\n\t"
-		LONG_S " %[len], %[rseq_scratch2]\n\t"
-		/* Start rseq by storing table entry pointer into rseq_cs. */
-		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-		RSEQ_INJECT_ASM(3)
-		LONG_CMP " %[expect], %[v]\n\t"
-		"jnz 5f\n\t"
-		RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
-		LONG_CMP " %[expect], %[v]\n\t"
-		"jnz 7f\n\t"
-#endif
-		/* try memcpy */
-		LONG_LT_R " %[len], %[len]\n\t"
-		"jz 333f\n\t"
-		"222:\n\t"
-		"ic %%r0,0(%[src])\n\t"
-		"stc %%r0,0(%[dst])\n\t"
-		LONG_ADDI " %[src], 1\n\t"
-		LONG_ADDI " %[dst], 1\n\t"
-		LONG_ADDI " %[len], -1\n\t"
-		"jnz 222b\n\t"
-		"333:\n\t"
-		RSEQ_INJECT_ASM(5)
-		/* final store */
-		LONG_S " %[newv], %[v]\n\t"
-		"2:\n\t"
-		RSEQ_INJECT_ASM(6)
-		/* teardown */
-		LONG_L " %[len], %[rseq_scratch2]\n\t"
-		LONG_L " %[dst], %[rseq_scratch1]\n\t"
-		LONG_L " %[src], %[rseq_scratch0]\n\t"
-		RSEQ_ASM_DEFINE_ABORT(4,
-			LONG_L " %[len], %[rseq_scratch2]\n\t"
-			LONG_L " %[dst], %[rseq_scratch1]\n\t"
-			LONG_L " %[src], %[rseq_scratch0]\n\t",
-			abort)
-		RSEQ_ASM_DEFINE_CMPFAIL(5,
-			LONG_L " %[len], %[rseq_scratch2]\n\t"
-			LONG_L " %[dst], %[rseq_scratch1]\n\t"
-			LONG_L " %[src], %[rseq_scratch0]\n\t",
-			cmpfail)
-#ifdef RSEQ_COMPARE_TWICE
-		RSEQ_ASM_DEFINE_CMPFAIL(6,
-			LONG_L " %[len], %[rseq_scratch2]\n\t"
-			LONG_L " %[dst], %[rseq_scratch1]\n\t"
-			LONG_L " %[src], %[rseq_scratch0]\n\t",
-			error1)
-		RSEQ_ASM_DEFINE_CMPFAIL(7,
-			LONG_L " %[len], %[rseq_scratch2]\n\t"
-			LONG_L " %[dst], %[rseq_scratch1]\n\t"
-			LONG_L " %[src], %[rseq_scratch0]\n\t",
-			error2)
-#endif
-		: /* gcc asm goto does not allow outputs */
-		: [cpu_id]		"r" (cpu),
-		  [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-		  [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-		  /* final store input */
-		  [v]			"m" (*v),
-		  [expect]		"r" (expect),
-		  [newv]		"r" (newv),
-		  /* try memcpy input */
-		  [dst]			"r" (dst),
-		  [src]			"r" (src),
-		  [len]			"r" (len),
-		  [rseq_scratch0]	"m" (rseq_scratch[0]),
-		  [rseq_scratch1]	"m" (rseq_scratch[1]),
-		  [rseq_scratch2]	"m" (rseq_scratch[2])
-		  RSEQ_INJECT_INPUT
-		: "memory", "cc", "r0"
-		  RSEQ_INJECT_CLOBBER
-		: abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-		  , error1, error2
-#endif
-	);
-	rseq_after_asm_goto();
-	return 0;
-abort:
-	rseq_after_asm_goto();
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	rseq_after_asm_goto();
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_after_asm_goto();
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_after_asm_goto();
-	rseq_bug("expected value comparison failed");
-#endif
-}
+/* APIs which are not based on cpu ids. */
 
-/* s390 is TSO. */
-static inline __attribute__((always_inline))
-int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
-					 void *dst, void *src, size_t len,
-					 intptr_t newv, int cpu)
-{
-	return rseq_cmpeqv_trymemcpy_storev(v, expect, dst, src, len,
-					    newv, cpu);
-}
+#define RSEQ_TEMPLATE_CPU_ID_NONE
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-s390-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+#undef RSEQ_TEMPLATE_CPU_ID_NONE
-- 
cgit 


From 171586a6ab66fb6be064e399ac2024ab459dfcf9 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:19 -0500
Subject: selftests/rseq: riscv: Template memory ordering and percpu access
 mode

Introduce a rseq-riscv-bits.h template header which is internally included
to generate the static inline functions covering:

- relaxed and release memory ordering,
- per-cpu-id and per-mm-cid per-cpu data access.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-18-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/rseq-riscv-bits.h | 410 +++++++++++++++++++
 tools/testing/selftests/rseq/rseq-riscv.h      | 527 ++-----------------------
 2 files changed, 437 insertions(+), 500 deletions(-)
 create mode 100644 tools/testing/selftests/rseq/rseq-riscv-bits.h

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/rseq-riscv-bits.h b/tools/testing/selftests/rseq/rseq-riscv-bits.h
new file mode 100644
index 000000000000..de31a0143139
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-riscv-bits.h
@@ -0,0 +1,410 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+
+#include "rseq-bits-template.h"
+
+#if defined(RSEQ_TEMPLATE_MO_RELAXED) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __always_inline
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_storev)(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
+#endif
+				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+				  RSEQ_INJECT_ASM(3)
+				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
+				  RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
+				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
+#endif
+				  RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+				  RSEQ_INJECT_ASM(5)
+				  RSEQ_ASM_DEFINE_ABORT(4, abort)
+				  : /* gcc asm goto does not allow outputs */
+				  : [cpu_id]		"r" (cpu),
+				    [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+				    [v]			"m" (*v),
+				    [expect]		"r" (expect),
+				    [newv]		"r" (newv)
+				    RSEQ_INJECT_INPUT
+				  : "memory", RSEQ_ASM_TMP_REG_1
+				    RSEQ_INJECT_CLOBBER
+				  : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+				    , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __always_inline
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpnev_storeoffp_load)(intptr_t *v, intptr_t expectnot,
+			       off_t voffp, intptr_t *load, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
+#endif
+				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+				  RSEQ_INJECT_ASM(3)
+				  RSEQ_ASM_OP_CMPNE(v, expectnot, "%l[cmpfail]")
+				  RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
+				  RSEQ_ASM_OP_CMPNE(v, expectnot, "%l[error2]")
+#endif
+				  RSEQ_ASM_OP_R_LOAD(v)
+				  RSEQ_ASM_OP_R_STORE(load)
+				  RSEQ_ASM_OP_R_LOAD_OFF(voffp)
+				  RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+				  RSEQ_INJECT_ASM(5)
+				  RSEQ_ASM_DEFINE_ABORT(4, abort)
+				  : /* gcc asm goto does not allow outputs */
+				  : [cpu_id]		"r" (cpu),
+				    [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+				    [v]			"m" (*v),
+				    [expectnot]		"r" (expectnot),
+				    [load]		"m" (*load),
+				    [voffp]		"r" (voffp)
+				    RSEQ_INJECT_INPUT
+				  : "memory", RSEQ_ASM_TMP_REG_1
+				    RSEQ_INJECT_CLOBBER
+				  : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+				    , error1, error2
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __always_inline
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_addv)(intptr_t *v, intptr_t count, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
+#endif
+				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+				  RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
+#endif
+				  RSEQ_ASM_OP_R_LOAD(v)
+				  RSEQ_ASM_OP_R_ADD(count)
+				  RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+				  RSEQ_INJECT_ASM(4)
+				  RSEQ_ASM_DEFINE_ABORT(4, abort)
+				  : /* gcc asm goto does not allow outputs */
+				  : [cpu_id]		"r" (cpu),
+				    [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+				    [v]			"m" (*v),
+				    [count]		"r" (count)
+				    RSEQ_INJECT_INPUT
+				  : "memory", RSEQ_ASM_TMP_REG_1
+				    RSEQ_INJECT_CLOBBER
+				  : abort
+#ifdef RSEQ_COMPARE_TWICE
+				    , error1
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __always_inline
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_cmpeqv_storev)(intptr_t *v, intptr_t expect,
+			      intptr_t *v2, intptr_t expect2,
+			      intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error3]")
+#endif
+				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+				  RSEQ_INJECT_ASM(3)
+				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
+				  RSEQ_INJECT_ASM(4)
+				  RSEQ_ASM_OP_CMPEQ(v2, expect2, "%l[cmpfail]")
+				  RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
+				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
+				  RSEQ_ASM_OP_CMPEQ(v2, expect2, "%l[error3]")
+#endif
+				  RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+				  RSEQ_INJECT_ASM(6)
+				  RSEQ_ASM_DEFINE_ABORT(4, abort)
+				  : /* gcc asm goto does not allow outputs */
+				  : [cpu_id]		"r" (cpu),
+				    [current_cpu_id]	"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+				    [v]			"m" (*v),
+				    [expect]		"r" (expect),
+				    [v2]			"m" (*v2),
+				    [expect2]		"r" (expect2),
+				    [newv]		"r" (newv)
+				    RSEQ_INJECT_INPUT
+				  : "memory", RSEQ_ASM_TMP_REG_1
+				    RSEQ_INJECT_CLOBBER
+				  : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+				    , error1, error2, error3
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+error3:
+	rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+
+/*
+ *   pval = *(ptr+off)
+ *  *pval += inc;
+ */
+static inline __always_inline
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_offset_deref_addv)(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
+#endif
+				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+				  RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
+#endif
+				  RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, 3)
+				  RSEQ_INJECT_ASM(4)
+				  RSEQ_ASM_DEFINE_ABORT(4, abort)
+				  : /* gcc asm goto does not allow outputs */
+				  : [cpu_id]			"r" (cpu),
+				    [current_cpu_id]		"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+				    [rseq_cs]			"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+				    [ptr]			"r" (ptr),
+				    [off]			"er" (off),
+				    [inc]			"er" (inc)
+				    RSEQ_INJECT_INPUT
+				  : "memory", RSEQ_ASM_TMP_REG_1
+				    RSEQ_INJECT_CLOBBER
+				  : abort
+#ifdef RSEQ_COMPARE_TWICE
+				    , error1
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+#endif /* #if defined(RSEQ_TEMPLATE_MO_RELAXED) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) && \
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID))
+
+static inline __always_inline
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trystorev_storev)(intptr_t *v, intptr_t expect,
+				 intptr_t *v2, intptr_t newv2,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
+#endif
+				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+				  RSEQ_INJECT_ASM(3)
+				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
+				  RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
+				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
+#endif
+				  RSEQ_ASM_OP_STORE(newv2, v2)
+				  RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+				  RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+#else
+				  RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+#endif
+				  RSEQ_INJECT_ASM(6)
+				  RSEQ_ASM_DEFINE_ABORT(4, abort)
+				  : /* gcc asm goto does not allow outputs */
+				  : [cpu_id]			"r" (cpu),
+				    [current_cpu_id]		"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+				    [rseq_cs]			"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+				    [expect]			"r" (expect),
+				    [v]				"m" (*v),
+				    [newv]			"r" (newv),
+				    [v2]			"m" (*v2),
+				    [newv2]			"r" (newv2)
+				    RSEQ_INJECT_INPUT
+				  : "memory", RSEQ_ASM_TMP_REG_1
+				    RSEQ_INJECT_CLOBBER
+				  : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+				    , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __always_inline
+int RSEQ_TEMPLATE_IDENTIFIER(rseq_cmpeqv_trymemcpy_storev)(intptr_t *v, intptr_t expect,
+				 void *dst, void *src, size_t len,
+				 intptr_t newv, int cpu)
+{
+	RSEQ_INJECT_C(9)
+	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
+				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
+#endif
+				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+				  RSEQ_INJECT_ASM(3)
+				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
+				  RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
+				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
+#endif
+				  RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+				  RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_TEMPLATE_MO_RELEASE
+				  RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+#else
+				  RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+#endif
+				  RSEQ_INJECT_ASM(6)
+				  RSEQ_ASM_DEFINE_ABORT(4, abort)
+				  : /* gcc asm goto does not allow outputs */
+				  : [cpu_id]			"r" (cpu),
+				    [current_cpu_id]		"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD),
+				    [rseq_cs]			"m" (rseq_get_abi()->rseq_cs.arch.ptr),
+				    [expect]			"r" (expect),
+				    [v]				"m" (*v),
+				    [newv]			"r" (newv),
+				    [dst]			"r" (dst),
+				    [src]			"r" (src),
+				    [len]			"r" (len)
+				    RSEQ_INJECT_INPUT
+				  : "memory", RSEQ_ASM_TMP_REG_1, RSEQ_ASM_TMP_REG_2,
+				    RSEQ_ASM_TMP_REG_3, RSEQ_ASM_TMP_REG_4
+				    RSEQ_INJECT_CLOBBER
+				  : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+				    , error1, error2
+#endif
+	);
+
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+cmpfail:
+	return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+error2:
+	rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* #if (defined(RSEQ_TEMPLATE_MO_RELAXED) || defined(RSEQ_TEMPLATE_MO_RELEASE)) &&
+	(defined(RSEQ_TEMPLATE_CPU_ID) || defined(RSEQ_TEMPLATE_MM_CID)) */
+
+#include "rseq-bits-reset.h"
diff --git a/tools/testing/selftests/rseq/rseq-riscv.h b/tools/testing/selftests/rseq/rseq-riscv.h
index b16d943a63e1..17932a79e066 100644
--- a/tools/testing/selftests/rseq/rseq-riscv.h
+++ b/tools/testing/selftests/rseq/rseq-riscv.h
@@ -165,507 +165,34 @@ do {									\
 	RSEQ_ASM_OP_R_ADD(inc)						\
 	__rseq_str(post_commit_label) ":\n"
 
-static inline __always_inline
-int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
-#endif
-				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-				  RSEQ_INJECT_ASM(3)
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
-				  RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
-#endif
-				  RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
-				  RSEQ_INJECT_ASM(5)
-				  RSEQ_ASM_DEFINE_ABORT(4, abort)
-				  : /* gcc asm goto does not allow outputs */
-				  : [cpu_id]		"r" (cpu),
-				    [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-				    [v]			"m" (*v),
-				    [expect]		"r" (expect),
-				    [newv]		"r" (newv)
-				    RSEQ_INJECT_INPUT
-				  : "memory", RSEQ_ASM_TMP_REG_1
-				    RSEQ_INJECT_CLOBBER
-				  : abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-				    , error1, error2
-#endif
-	);
-
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __always_inline
-int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
-			       off_t voffp, intptr_t *load, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
-#endif
-				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-				  RSEQ_INJECT_ASM(3)
-				  RSEQ_ASM_OP_CMPNE(v, expectnot, "%l[cmpfail]")
-				  RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
-				  RSEQ_ASM_OP_CMPNE(v, expectnot, "%l[error2]")
-#endif
-				  RSEQ_ASM_OP_R_LOAD(v)
-				  RSEQ_ASM_OP_R_STORE(load)
-				  RSEQ_ASM_OP_R_LOAD_OFF(voffp)
-				  RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
-				  RSEQ_INJECT_ASM(5)
-				  RSEQ_ASM_DEFINE_ABORT(4, abort)
-				  : /* gcc asm goto does not allow outputs */
-				  : [cpu_id]		"r" (cpu),
-				    [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-				    [v]			"m" (*v),
-				    [expectnot]		"r" (expectnot),
-				    [load]		"m" (*load),
-				    [voffp]		"r" (voffp)
-				    RSEQ_INJECT_INPUT
-				  : "memory", RSEQ_ASM_TMP_REG_1
-				    RSEQ_INJECT_CLOBBER
-				  : abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-				    , error1, error2
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
+/* Per-cpu-id indexing. */
 
-static inline __always_inline
-int rseq_addv(intptr_t *v, intptr_t count, int cpu)
-{
-	RSEQ_INJECT_C(9)
+#define RSEQ_TEMPLATE_CPU_ID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-riscv-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
 
-	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
-#endif
-				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-				  RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
-#endif
-				  RSEQ_ASM_OP_R_LOAD(v)
-				  RSEQ_ASM_OP_R_ADD(count)
-				  RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
-				  RSEQ_INJECT_ASM(4)
-				  RSEQ_ASM_DEFINE_ABORT(4, abort)
-				  : /* gcc asm goto does not allow outputs */
-				  : [cpu_id]		"r" (cpu),
-				    [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-				    [v]			"m" (*v),
-				    [count]		"r" (count)
-				    RSEQ_INJECT_INPUT
-				  : "memory", RSEQ_ASM_TMP_REG_1
-				    RSEQ_INJECT_CLOBBER
-				  : abort
-#ifdef RSEQ_COMPARE_TWICE
-				    , error1
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
-
-static inline __always_inline
-int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
-				 intptr_t *v2, intptr_t newv2,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
-#endif
-				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-				  RSEQ_INJECT_ASM(3)
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
-				  RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
-#endif
-				  RSEQ_ASM_OP_STORE(newv2, v2)
-				  RSEQ_INJECT_ASM(5)
-				  RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
-				  RSEQ_INJECT_ASM(6)
-				  RSEQ_ASM_DEFINE_ABORT(4, abort)
-				  : /* gcc asm goto does not allow outputs */
-				  : [cpu_id]		"r" (cpu),
-				    [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-				    [expect]		"r" (expect),
-				    [v]			"m" (*v),
-				    [newv]		"r" (newv),
-				    [v2]			"m" (*v2),
-				    [newv2]		"r" (newv2)
-				    RSEQ_INJECT_INPUT
-				  : "memory", RSEQ_ASM_TMP_REG_1
-				    RSEQ_INJECT_CLOBBER
-				  : abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-				    , error1, error2
-#endif
-	);
-
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __always_inline
-int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
-					 intptr_t *v2, intptr_t newv2,
-					 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
-#endif
-				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-				  RSEQ_INJECT_ASM(3)
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
-				  RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
-#endif
-				  RSEQ_ASM_OP_STORE(newv2, v2)
-				  RSEQ_INJECT_ASM(5)
-				  RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
-				  RSEQ_INJECT_ASM(6)
-				  RSEQ_ASM_DEFINE_ABORT(4, abort)
-				  : /* gcc asm goto does not allow outputs */
-				  : [cpu_id]		"r" (cpu),
-				    [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-				    [expect]		"r" (expect),
-				    [v]			"m" (*v),
-				    [newv]		"r" (newv),
-				    [v2]			"m" (*v2),
-				    [newv2]		"r" (newv2)
-				    RSEQ_INJECT_INPUT
-				  : "memory", RSEQ_ASM_TMP_REG_1
-				    RSEQ_INJECT_CLOBBER
-				  : abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-				    , error1, error2
-#endif
-	);
-
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __always_inline
-int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
-			      intptr_t *v2, intptr_t expect2,
-			      intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error3]")
-#endif
-				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-				  RSEQ_INJECT_ASM(3)
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
-				  RSEQ_INJECT_ASM(4)
-				  RSEQ_ASM_OP_CMPEQ(v2, expect2, "%l[cmpfail]")
-				  RSEQ_INJECT_ASM(5)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
-				  RSEQ_ASM_OP_CMPEQ(v2, expect2, "%l[error3]")
-#endif
-				  RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
-				  RSEQ_INJECT_ASM(6)
-				  RSEQ_ASM_DEFINE_ABORT(4, abort)
-				  : /* gcc asm goto does not allow outputs */
-				  : [cpu_id]		"r" (cpu),
-				    [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-				    [v]			"m" (*v),
-				    [expect]		"r" (expect),
-				    [v2]			"m" (*v2),
-				    [expect2]		"r" (expect2),
-				    [newv]		"r" (newv)
-				    RSEQ_INJECT_INPUT
-				  : "memory", RSEQ_ASM_TMP_REG_1
-				    RSEQ_INJECT_CLOBBER
-				  : abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-				    , error1, error2, error3
-#endif
-	);
-
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-error3:
-	rseq_bug("2nd expected value comparison failed");
-#endif
-}
-
-static inline __always_inline
-int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
-				 void *dst, void *src, size_t len,
-				 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
-#endif
-				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-				  RSEQ_INJECT_ASM(3)
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
-				  RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
-#endif
-				  RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
-				  RSEQ_INJECT_ASM(5)
-				  RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
-				  RSEQ_INJECT_ASM(6)
-				  RSEQ_ASM_DEFINE_ABORT(4, abort)
-				  : /* gcc asm goto does not allow outputs */
-				  : [cpu_id]		"r" (cpu),
-				    [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-				    [expect]		"r" (expect),
-				    [v]			"m" (*v),
-				    [newv]		"r" (newv),
-				    [dst]			"r" (dst),
-				    [src]			"r" (src),
-				    [len]			"r" (len)
-				    RSEQ_INJECT_INPUT
-				  : "memory", RSEQ_ASM_TMP_REG_1, RSEQ_ASM_TMP_REG_2,
-				    RSEQ_ASM_TMP_REG_3, RSEQ_ASM_TMP_REG_4
-				    RSEQ_INJECT_CLOBBER
-				  : abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-				    , error1, error2
-#endif
-	);
-
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
-
-static inline __always_inline
-int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
-					 void *dst, void *src, size_t len,
-					 intptr_t newv, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[cmpfail]")
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error2]")
-#endif
-				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-				  RSEQ_INJECT_ASM(3)
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[cmpfail]")
-				  RSEQ_INJECT_ASM(4)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
-				  RSEQ_ASM_OP_CMPEQ(v, expect, "%l[error2]")
-#endif
-				  RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
-				  RSEQ_INJECT_ASM(5)
-				  RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
-				  RSEQ_INJECT_ASM(6)
-				  RSEQ_ASM_DEFINE_ABORT(4, abort)
-				  : /* gcc asm goto does not allow outputs */
-				  : [cpu_id]		"r" (cpu),
-				    [current_cpu_id]	"m" (rseq_get_abi()->cpu_id),
-				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-				    [expect]		"r" (expect),
-				    [v]			"m" (*v),
-				    [newv]		"r" (newv),
-				    [dst]			"r" (dst),
-				    [src]			"r" (src),
-				    [len]			"r" (len)
-				    RSEQ_INJECT_INPUT
-				  : "memory", RSEQ_ASM_TMP_REG_1, RSEQ_ASM_TMP_REG_2,
-				    RSEQ_ASM_TMP_REG_3, RSEQ_ASM_TMP_REG_4
-				    RSEQ_INJECT_CLOBBER
-				  : abort, cmpfail
-#ifdef RSEQ_COMPARE_TWICE
-				    , error1, error2
-#endif
-	);
-
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-cmpfail:
-	return 1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-error2:
-	rseq_bug("expected value comparison failed");
-#endif
-}
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-riscv-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_CPU_ID
 
-#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+/* Per-mm-cid indexing. */
 
-/*
- *   pval = *(ptr+off)
- *  *pval += inc;
- */
-static inline __always_inline
-int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
-{
-	RSEQ_INJECT_C(9)
-
-	__asm__ __volatile__ goto(RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_DEFINE_EXIT_POINT(2f, "%l[error1]")
-#endif
-				  RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
-				  RSEQ_INJECT_ASM(3)
-#ifdef RSEQ_COMPARE_TWICE
-				  RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]")
-#endif
-				  RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, 3)
-				  RSEQ_INJECT_ASM(4)
-				  RSEQ_ASM_DEFINE_ABORT(4, abort)
-				  : /* gcc asm goto does not allow outputs */
-				  : [cpu_id]		"r" (cpu),
-				    [current_cpu_id]      "m" (rseq_get_abi()->cpu_id),
-				    [rseq_cs]		"m" (rseq_get_abi()->rseq_cs.arch.ptr),
-				    [ptr]			"r" (ptr),
-				    [off]			"er" (off),
-				    [inc]			"er" (inc)
-				    RSEQ_INJECT_INPUT
-				  : "memory", RSEQ_ASM_TMP_REG_1
-				    RSEQ_INJECT_CLOBBER
-				  : abort
-#ifdef RSEQ_COMPARE_TWICE
-				    , error1
-#endif
-	);
-	return 0;
-abort:
-	RSEQ_INJECT_FAILED
-	return -1;
-#ifdef RSEQ_COMPARE_TWICE
-error1:
-	rseq_bug("cpu_id comparison failed");
-#endif
-}
+#define RSEQ_TEMPLATE_MM_CID
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-riscv-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+
+#define RSEQ_TEMPLATE_MO_RELEASE
+#include "rseq-riscv-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELEASE
+#undef RSEQ_TEMPLATE_MM_CID
+
+/* APIs which are not based on cpu ids. */
+
+#define RSEQ_TEMPLATE_CPU_ID_NONE
+#define RSEQ_TEMPLATE_MO_RELAXED
+#include "rseq-riscv-bits.h"
+#undef RSEQ_TEMPLATE_MO_RELAXED
+#undef RSEQ_TEMPLATE_CPU_ID_NONE
-- 
cgit 


From cead72062756294dff01314b57b68e3e925ef321 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:20 -0500
Subject: selftests/rseq: Implement basic percpu ops mm_cid test

Adapt to the rseq.h API changes introduced by commits
"selftests/rseq: <arch>: Template memory ordering and percpu access mode".

Build a new basic_percpu_ops_mm_cid_test to test the new "mm_cid" rseq
field.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-19-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/.gitignore            |  1 +
 tools/testing/selftests/rseq/Makefile              |  5 ++-
 .../testing/selftests/rseq/basic_percpu_ops_test.c | 46 ++++++++++++++++++----
 3 files changed, 44 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore
index 5910888ebfe1..7e99554748cb 100644
--- a/tools/testing/selftests/rseq/.gitignore
+++ b/tools/testing/selftests/rseq/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 basic_percpu_ops_test
+basic_percpu_ops_mm_cid_test
 basic_test
 basic_rseq_op_test
 param_test
diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index 215e1067f037..2e5e3eac2ca0 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -12,7 +12,7 @@ LDLIBS += -lpthread -ldl
 # still track changes to header files and depend on shared object.
 OVERRIDE_TARGETS = 1
 
-TEST_GEN_PROGS = basic_test basic_percpu_ops_test param_test \
+TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \
 		param_test_benchmark param_test_compare_twice
 
 TEST_GEN_PROGS_EXTENDED = librseq.so
@@ -29,6 +29,9 @@ $(OUTPUT)/librseq.so: rseq.c rseq.h rseq-*.h
 $(OUTPUT)/%: %.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h
 	$(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@
 
+$(OUTPUT)/basic_percpu_ops_mm_cid_test: basic_percpu_ops_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h
+	$(CC) $(CFLAGS) -DBUILDOPT_RSEQ_PERCPU_MM_CID_ID $< $(LDLIBS) -lrseq -o $@
+
 $(OUTPUT)/param_test_benchmark: param_test.c $(TEST_GEN_PROGS_EXTENDED) \
 					rseq.h rseq-*.h
 	$(CC) $(CFLAGS) -DBENCHMARK $< $(LDLIBS) -lrseq -o $@
diff --git a/tools/testing/selftests/rseq/basic_percpu_ops_test.c b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
index 517756afc2a4..887542961968 100644
--- a/tools/testing/selftests/rseq/basic_percpu_ops_test.c
+++ b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
@@ -12,6 +12,32 @@
 #include "../kselftest.h"
 #include "rseq.h"
 
+#ifdef BUILDOPT_RSEQ_PERCPU_MM_CID
+# define RSEQ_PERCPU	RSEQ_PERCPU_MM_CID
+static
+int get_current_cpu_id(void)
+{
+	return rseq_current_mm_cid();
+}
+static
+bool rseq_validate_cpu_id(void)
+{
+	return rseq_mm_cid_available();
+}
+#else
+# define RSEQ_PERCPU	RSEQ_PERCPU_CPU_ID
+static
+int get_current_cpu_id(void)
+{
+	return rseq_cpu_start();
+}
+static
+bool rseq_validate_cpu_id(void)
+{
+	return rseq_current_cpu_raw() >= 0;
+}
+#endif
+
 struct percpu_lock_entry {
 	intptr_t v;
 } __attribute__((aligned(128)));
@@ -51,9 +77,9 @@ int rseq_this_cpu_lock(struct percpu_lock *lock)
 	for (;;) {
 		int ret;
 
-		cpu = rseq_cpu_start();
-		ret = rseq_cmpeqv_storev(&lock->c[cpu].v,
-					 0, 1, cpu);
+		cpu = get_current_cpu_id();
+		ret = rseq_cmpeqv_storev(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+					 &lock->c[cpu].v, 0, 1, cpu);
 		if (rseq_likely(!ret))
 			break;
 		/* Retry if comparison fails or rseq aborts. */
@@ -141,13 +167,14 @@ void this_cpu_list_push(struct percpu_list *list,
 		intptr_t *targetptr, newval, expect;
 		int ret;
 
-		cpu = rseq_cpu_start();
+		cpu = get_current_cpu_id();
 		/* Load list->c[cpu].head with single-copy atomicity. */
 		expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
 		newval = (intptr_t)node;
 		targetptr = (intptr_t *)&list->c[cpu].head;
 		node->next = (struct percpu_list_node *)expect;
-		ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
+		ret = rseq_cmpeqv_storev(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+					 targetptr, expect, newval, cpu);
 		if (rseq_likely(!ret))
 			break;
 		/* Retry if comparison fails or rseq aborts. */
@@ -170,12 +197,13 @@ struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
 		long offset;
 		int ret, cpu;
 
-		cpu = rseq_cpu_start();
+		cpu = get_current_cpu_id();
 		targetptr = (intptr_t *)&list->c[cpu].head;
 		expectnot = (intptr_t)NULL;
 		offset = offsetof(struct percpu_list_node, next);
 		load = (intptr_t *)&head;
-		ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
+		ret = rseq_cmpnev_storeoffp_load(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+						 targetptr, expectnot,
 						 offset, load, cpu);
 		if (rseq_likely(!ret)) {
 			if (_cpu)
@@ -295,6 +323,10 @@ int main(int argc, char **argv)
 			errno, strerror(errno));
 		goto error;
 	}
+	if (!rseq_validate_cpu_id()) {
+		fprintf(stderr, "Error: cpu id getter unavailable\n");
+		goto error;
+	}
 	printf("spinlock\n");
 	test_percpu_spinlock();
 	printf("percpu_list\n");
-- 
cgit 


From ee31fff091fa9c185d10844f9caf38784afe4745 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:21 -0500
Subject: selftests/rseq: Implement parametrized mm_cid test

Adapt to the rseq.h API changes introduced by commits
"selftests/rseq: <arch>: Template memory ordering and percpu access mode".

Build a new param_test_mm_cid, param_test_mm_cid_benchmark, and
param_test_mm_cid_compare_twice executables to test the new "mm_cid"
rseq field.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-20-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/.gitignore        |   3 +
 tools/testing/selftests/rseq/Makefile          |  15 ++-
 tools/testing/selftests/rseq/param_test.c      | 148 +++++++++++++++++--------
 tools/testing/selftests/rseq/run_param_test.sh |   5 +
 4 files changed, 122 insertions(+), 49 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore
index 7e99554748cb..16496de5f6ce 100644
--- a/tools/testing/selftests/rseq/.gitignore
+++ b/tools/testing/selftests/rseq/.gitignore
@@ -6,3 +6,6 @@ basic_rseq_op_test
 param_test
 param_test_benchmark
 param_test_compare_twice
+param_test_mm_cid
+param_test_mm_cid_benchmark
+param_test_mm_cid_compare_twice
diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index 2e5e3eac2ca0..82a52810a649 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -13,7 +13,8 @@ LDLIBS += -lpthread -ldl
 OVERRIDE_TARGETS = 1
 
 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \
-		param_test_benchmark param_test_compare_twice
+		param_test_benchmark param_test_compare_twice param_test_mm_cid \
+		param_test_mm_cid_benchmark param_test_mm_cid_compare_twice
 
 TEST_GEN_PROGS_EXTENDED = librseq.so
 
@@ -39,3 +40,15 @@ $(OUTPUT)/param_test_benchmark: param_test.c $(TEST_GEN_PROGS_EXTENDED) \
 $(OUTPUT)/param_test_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDED) \
 					rseq.h rseq-*.h
 	$(CC) $(CFLAGS) -DRSEQ_COMPARE_TWICE $< $(LDLIBS) -lrseq -o $@
+
+$(OUTPUT)/param_test_mm_cid: param_test.c $(TEST_GEN_PROGS_EXTENDED) \
+					rseq.h rseq-*.h
+	$(CC) $(CFLAGS) -DBUILDOPT_RSEQ_PERCPU_MM_CID $< $(LDLIBS) -lrseq -o $@
+
+$(OUTPUT)/param_test_mm_cid_benchmark: param_test.c $(TEST_GEN_PROGS_EXTENDED) \
+					rseq.h rseq-*.h
+	$(CC) $(CFLAGS) -DBUILDOPT_RSEQ_PERCPU_MM_CID -DBENCHMARK $< $(LDLIBS) -lrseq -o $@
+
+$(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDED) \
+					rseq.h rseq-*.h
+	$(CC) $(CFLAGS) -DBUILDOPT_RSEQ_PERCPU_MM_CID -DRSEQ_COMPARE_TWICE $< $(LDLIBS) -lrseq -o $@
diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index 9869369a8607..cadb9d884811 100644
--- a/tools/testing/selftests/rseq/param_test.c
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -16,6 +16,7 @@
 #include <signal.h>
 #include <errno.h>
 #include <stddef.h>
+#include <stdbool.h>
 
 static inline pid_t rseq_gettid(void)
 {
@@ -36,7 +37,7 @@ static int opt_modulo, verbose;
 
 static int opt_yield, opt_signal, opt_sleep,
 		opt_disable_rseq, opt_threads = 200,
-		opt_disable_mod = 0, opt_test = 's', opt_mb = 0;
+		opt_disable_mod = 0, opt_test = 's';
 
 static long long opt_reps = 5000;
 
@@ -264,6 +265,63 @@ unsigned int yield_mod_cnt, nr_abort;
 
 #include "rseq.h"
 
+static enum rseq_mo opt_mo = RSEQ_MO_RELAXED;
+
+#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+#define TEST_MEMBARRIER
+
+static int sys_membarrier(int cmd, int flags, int cpu_id)
+{
+	return syscall(__NR_membarrier, cmd, flags, cpu_id);
+}
+#endif
+
+#ifdef BUILDOPT_RSEQ_PERCPU_MM_CID
+# define RSEQ_PERCPU	RSEQ_PERCPU_MM_CID
+static
+int get_current_cpu_id(void)
+{
+	return rseq_current_mm_cid();
+}
+static
+bool rseq_validate_cpu_id(void)
+{
+	return rseq_mm_cid_available();
+}
+# ifdef TEST_MEMBARRIER
+/*
+ * Membarrier does not currently support targeting a mm_cid, so
+ * issue the barrier on all cpus.
+ */
+static
+int rseq_membarrier_expedited(int cpu)
+{
+	return sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+			      0, 0);
+}
+# endif /* TEST_MEMBARRIER */
+#else
+# define RSEQ_PERCPU	RSEQ_PERCPU_CPU_ID
+static
+int get_current_cpu_id(void)
+{
+	return rseq_cpu_start();
+}
+static
+bool rseq_validate_cpu_id(void)
+{
+	return rseq_current_cpu_raw() >= 0;
+}
+# ifdef TEST_MEMBARRIER
+static
+int rseq_membarrier_expedited(int cpu)
+{
+	return sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+			      MEMBARRIER_CMD_FLAG_CPU, cpu);
+}
+# endif /* TEST_MEMBARRIER */
+#endif
+
 struct percpu_lock_entry {
 	intptr_t v;
 } __attribute__((aligned(128)));
@@ -351,8 +409,9 @@ static int rseq_this_cpu_lock(struct percpu_lock *lock)
 	for (;;) {
 		int ret;
 
-		cpu = rseq_cpu_start();
-		ret = rseq_cmpeqv_storev(&lock->c[cpu].v,
+		cpu = get_current_cpu_id();
+		ret = rseq_cmpeqv_storev(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+					 &lock->c[cpu].v,
 					 0, 1, cpu);
 		if (rseq_likely(!ret))
 			break;
@@ -469,8 +528,9 @@ void *test_percpu_inc_thread(void *arg)
 		do {
 			int cpu;
 
-			cpu = rseq_cpu_start();
-			ret = rseq_addv(&data->c[cpu].count, 1, cpu);
+			cpu = get_current_cpu_id();
+			ret = rseq_addv(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+					&data->c[cpu].count, 1, cpu);
 		} while (rseq_unlikely(ret));
 #ifndef BENCHMARK
 		if (i != 0 && !(i % (reps / 10)))
@@ -539,13 +599,14 @@ void this_cpu_list_push(struct percpu_list *list,
 		intptr_t *targetptr, newval, expect;
 		int ret;
 
-		cpu = rseq_cpu_start();
+		cpu = get_current_cpu_id();
 		/* Load list->c[cpu].head with single-copy atomicity. */
 		expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
 		newval = (intptr_t)node;
 		targetptr = (intptr_t *)&list->c[cpu].head;
 		node->next = (struct percpu_list_node *)expect;
-		ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
+		ret = rseq_cmpeqv_storev(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+					 targetptr, expect, newval, cpu);
 		if (rseq_likely(!ret))
 			break;
 		/* Retry if comparison fails or rseq aborts. */
@@ -571,13 +632,14 @@ struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
 		long offset;
 		int ret;
 
-		cpu = rseq_cpu_start();
+		cpu = get_current_cpu_id();
 		targetptr = (intptr_t *)&list->c[cpu].head;
 		expectnot = (intptr_t)NULL;
 		offset = offsetof(struct percpu_list_node, next);
 		load = (intptr_t *)&head;
-		ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
-						   offset, load, cpu);
+		ret = rseq_cmpnev_storeoffp_load(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+						 targetptr, expectnot,
+						 offset, load, cpu);
 		if (rseq_likely(!ret)) {
 			node = head;
 			break;
@@ -715,7 +777,7 @@ bool this_cpu_buffer_push(struct percpu_buffer *buffer,
 		intptr_t offset;
 		int ret;
 
-		cpu = rseq_cpu_start();
+		cpu = get_current_cpu_id();
 		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
 		if (offset == buffer->c[cpu].buflen)
 			break;
@@ -723,14 +785,9 @@ bool this_cpu_buffer_push(struct percpu_buffer *buffer,
 		targetptr_spec = (intptr_t *)&buffer->c[cpu].array[offset];
 		newval_final = offset + 1;
 		targetptr_final = &buffer->c[cpu].offset;
-		if (opt_mb)
-			ret = rseq_cmpeqv_trystorev_storev_release(
-				targetptr_final, offset, targetptr_spec,
-				newval_spec, newval_final, cpu);
-		else
-			ret = rseq_cmpeqv_trystorev_storev(targetptr_final,
-				offset, targetptr_spec, newval_spec,
-				newval_final, cpu);
+		ret = rseq_cmpeqv_trystorev_storev(opt_mo, RSEQ_PERCPU,
+			targetptr_final, offset, targetptr_spec,
+			newval_spec, newval_final, cpu);
 		if (rseq_likely(!ret)) {
 			result = true;
 			break;
@@ -753,7 +810,7 @@ struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
 		intptr_t offset;
 		int ret;
 
-		cpu = rseq_cpu_start();
+		cpu = get_current_cpu_id();
 		/* Load offset with single-copy atomicity. */
 		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
 		if (offset == 0) {
@@ -763,7 +820,8 @@ struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
 		head = RSEQ_READ_ONCE(buffer->c[cpu].array[offset - 1]);
 		newval = offset - 1;
 		targetptr = (intptr_t *)&buffer->c[cpu].offset;
-		ret = rseq_cmpeqv_cmpeqv_storev(targetptr, offset,
+		ret = rseq_cmpeqv_cmpeqv_storev(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+			targetptr, offset,
 			(intptr_t *)&buffer->c[cpu].array[offset - 1],
 			(intptr_t)head, newval, cpu);
 		if (rseq_likely(!ret))
@@ -920,7 +978,7 @@ bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
 		size_t copylen;
 		int ret;
 
-		cpu = rseq_cpu_start();
+		cpu = get_current_cpu_id();
 		/* Load offset with single-copy atomicity. */
 		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
 		if (offset == buffer->c[cpu].buflen)
@@ -931,15 +989,11 @@ bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
 		copylen = sizeof(item);
 		newval_final = offset + 1;
 		targetptr_final = &buffer->c[cpu].offset;
-		if (opt_mb)
-			ret = rseq_cmpeqv_trymemcpy_storev_release(
-				targetptr_final, offset,
-				destptr, srcptr, copylen,
-				newval_final, cpu);
-		else
-			ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
-				offset, destptr, srcptr, copylen,
-				newval_final, cpu);
+		ret = rseq_cmpeqv_trymemcpy_storev(
+			opt_mo, RSEQ_PERCPU,
+			targetptr_final, offset,
+			destptr, srcptr, copylen,
+			newval_final, cpu);
 		if (rseq_likely(!ret)) {
 			result = true;
 			break;
@@ -964,7 +1018,7 @@ bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
 		size_t copylen;
 		int ret;
 
-		cpu = rseq_cpu_start();
+		cpu = get_current_cpu_id();
 		/* Load offset with single-copy atomicity. */
 		offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
 		if (offset == 0)
@@ -975,8 +1029,8 @@ bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
 		copylen = sizeof(*item);
 		newval_final = offset - 1;
 		targetptr_final = &buffer->c[cpu].offset;
-		ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
-			offset, destptr, srcptr, copylen,
+		ret = rseq_cmpeqv_trymemcpy_storev(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+			targetptr_final, offset, destptr, srcptr, copylen,
 			newval_final, cpu);
 		if (rseq_likely(!ret)) {
 			result = true;
@@ -1151,7 +1205,7 @@ static int set_signal_handler(void)
 }
 
 /* Test MEMBARRIER_CMD_PRIVATE_RESTART_RSEQ_ON_CPU membarrier command. */
-#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+#ifdef TEST_MEMBARRIER
 struct test_membarrier_thread_args {
 	int stop;
 	intptr_t percpu_list_ptr;
@@ -1178,9 +1232,10 @@ void *test_membarrier_worker_thread(void *arg)
 		int ret;
 
 		do {
-			int cpu = rseq_cpu_start();
+			int cpu = get_current_cpu_id();
 
-			ret = rseq_offset_deref_addv(&args->percpu_list_ptr,
+			ret = rseq_offset_deref_addv(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+				&args->percpu_list_ptr,
 				sizeof(struct percpu_list_entry) * cpu, 1, cpu);
 		} while (rseq_unlikely(ret));
 	}
@@ -1217,11 +1272,6 @@ void test_membarrier_free_percpu_list(struct percpu_list *list)
 		free(list->c[i].head);
 }
 
-static int sys_membarrier(int cmd, int flags, int cpu_id)
-{
-	return syscall(__NR_membarrier, cmd, flags, cpu_id);
-}
-
 /*
  * The manager thread swaps per-cpu lists that worker threads see,
  * and validates that there are no unexpected modifications.
@@ -1260,8 +1310,7 @@ void *test_membarrier_manager_thread(void *arg)
 
 		/* Make list_b "active". */
 		atomic_store(&args->percpu_list_ptr, (intptr_t)&list_b);
-		if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
-					MEMBARRIER_CMD_FLAG_CPU, cpu_a) &&
+		if (rseq_membarrier_expedited(cpu_a) &&
 				errno != ENXIO /* missing CPU */) {
 			perror("sys_membarrier");
 			abort();
@@ -1284,8 +1333,7 @@ void *test_membarrier_manager_thread(void *arg)
 
 		/* Make list_a "active". */
 		atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
-		if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
-					MEMBARRIER_CMD_FLAG_CPU, cpu_b) &&
+		if (rseq_membarrier_expedited(cpu_b) &&
 				errno != ENXIO /* missing CPU*/) {
 			perror("sys_membarrier");
 			abort();
@@ -1356,7 +1404,7 @@ void test_membarrier(void)
 		abort();
 	}
 }
-#else /* RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV */
+#else /* TEST_MEMBARRIER */
 void test_membarrier(void)
 {
 	fprintf(stderr, "rseq_offset_deref_addv is not implemented on this architecture. "
@@ -1513,7 +1561,7 @@ int main(int argc, char **argv)
 			verbose = 1;
 			break;
 		case 'M':
-			opt_mb = 1;
+			opt_mo = RSEQ_MO_RELEASE;
 			break;
 		default:
 			show_usage(argc, argv);
@@ -1533,6 +1581,10 @@ int main(int argc, char **argv)
 
 	if (!opt_disable_rseq && rseq_register_current_thread())
 		goto error;
+	if (!opt_disable_rseq && !rseq_validate_cpu_id()) {
+		fprintf(stderr, "Error: cpu id getter unavailable\n");
+		goto error;
+	}
 	switch (opt_test) {
 	case 's':
 		printf_verbose("spinlock\n");
diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh
index f51bc83c9e41..8d31426ab41f 100755
--- a/tools/testing/selftests/rseq/run_param_test.sh
+++ b/tools/testing/selftests/rseq/run_param_test.sh
@@ -42,6 +42,11 @@ function do_tests()
 		./param_test ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
 		echo "Running compare-twice test ${TEST_NAME[$i]}"
 		./param_test_compare_twice ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
+
+		echo "Running mm_cid test ${TEST_NAME[$i]}"
+		./param_test_mm_cid ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
+		echo "Running mm_cid compare-twice test ${TEST_NAME[$i]}"
+		./param_test_mm_cid_compare_twice ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
 		let "i++"
 	done
 }
-- 
cgit 


From a3798e6ffb37481c47773dd1d785c395c0785af8 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 22 Nov 2022 15:39:22 -0500
Subject: selftests/rseq: parametrized test: Report/abort on negative
 concurrency ID

Report and abort when a negative concurrency ID value is observed by the
spinlock test.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221122203932.231377-21-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/param_test.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index cadb9d884811..bf951a490bb4 100644
--- a/tools/testing/selftests/rseq/param_test.c
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -410,6 +410,11 @@ static int rseq_this_cpu_lock(struct percpu_lock *lock)
 		int ret;
 
 		cpu = get_current_cpu_id();
+		if (cpu < 0) {
+			fprintf(stderr, "pid: %d: tid: %d, cpu: %d: cid: %d\n",
+					getpid(), (int) rseq_gettid(), rseq_current_cpu_raw(), cpu);
+			abort();
+		}
 		ret = rseq_cmpeqv_storev(RSEQ_MO_RELAXED, RSEQ_PERCPU,
 					 &lock->c[cpu].v,
 					 0, 1, cpu);
-- 
cgit 


From b344b8f2d88dbf095caf97ac57fd3645843fa70f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 16 Dec 2022 09:53:32 -0500
Subject: selftests/rseq: Add mm_numa_cid to test script

Add mm_numa_cid tests to the run_param_test.sh test script.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221216145332.205095-1-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/rseq/run_param_test.sh | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh
index 8d31426ab41f..603b3b69d20c 100755
--- a/tools/testing/selftests/rseq/run_param_test.sh
+++ b/tools/testing/selftests/rseq/run_param_test.sh
@@ -47,6 +47,11 @@ function do_tests()
 		./param_test_mm_cid ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
 		echo "Running mm_cid compare-twice test ${TEST_NAME[$i]}"
 		./param_test_mm_cid_compare_twice ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
+
+		echo "Running mm_numa_cid test ${TEST_NAME[$i]}"
+		./param_test_mm_numa_cid ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
+		echo "Running mm_numa_cid compare-twice test ${TEST_NAME[$i]}"
+		./param_test_mm_numa_cid_compare_twice ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
 		let "i++"
 	done
 }
-- 
cgit 


From 07453245620c075779abefa2a9f469fa336e7510 Mon Sep 17 00:00:00 2001
From: Xin Liu <liuxin350@huawei.com>
Date: Fri, 23 Dec 2022 21:36:18 +0800
Subject: libbpf: fix errno is overwritten after being closed.

In the ensure_good_fd function, if the fcntl function succeeds but
the close function fails, ensure_good_fd returns a normal fd and
sets errno, which may cause users to misunderstand. The close
failure is not a serious problem, and the correct FD has been
handed over to the upper-layer application. Let's restore errno here.

Signed-off-by: Xin Liu <liuxin350@huawei.com>
Link: https://lore.kernel.org/r/20221223133618.10323-1-liuxin350@huawei.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/libbpf_internal.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 377642ff51fc..98333a6c38e9 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -543,10 +543,9 @@ static inline int ensure_good_fd(int fd)
 		fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
 		saved_errno = errno;
 		close(old_fd);
-		if (fd < 0) {
+		errno = saved_errno;
+		if (fd < 0)
 			pr_warn("failed to dup FD %d to FD > 2: %d\n", old_fd, -saved_errno);
-			errno = saved_errno;
-		}
 	}
 	return fd;
 }
-- 
cgit 


From 8125b6cda9b17f5a0163cca3904c430f4f03591b Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Tue, 4 Oct 2022 11:31:29 +0200
Subject: KVM: selftests: Use TAP interface in the kvm_binary_stats_test

The kvm_binary_stats_test test currently does not have any output (unless
one of the TEST_ASSERT statement fails), so it's hard to say for a user
how far it did proceed already. Thus let's make this a little bit more
user-friendly and include some TAP output via the kselftest.h interface.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Andrew Jones <andrew.jones@linux.dev>
Message-Id: <20221004093131.40392-2-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/kvm_binary_stats_test.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c
index 0b45ac593387..894417c96f70 100644
--- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c
+++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c
@@ -19,6 +19,7 @@
 #include "kvm_util.h"
 #include "asm/kvm.h"
 #include "linux/kvm.h"
+#include "kselftest.h"
 
 static void stats_test(int stats_fd)
 {
@@ -51,7 +52,7 @@ static void stats_test(int stats_fd)
 
 	/* Sanity check for other fields in header */
 	if (header.num_desc == 0) {
-		printf("No KVM stats defined!");
+		ksft_print_msg("No KVM stats defined!\n");
 		return;
 	}
 	/*
@@ -224,9 +225,13 @@ int main(int argc, char *argv[])
 			max_vcpu = DEFAULT_NUM_VCPU;
 	}
 
+	ksft_print_header();
+
 	/* Check the extension for binary stats */
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_BINARY_STATS_FD));
 
+	ksft_set_plan(max_vm);
+
 	/* Create VMs and VCPUs */
 	vms = malloc(sizeof(vms[0]) * max_vm);
 	TEST_ASSERT(vms, "Allocate memory for storing VM pointers");
@@ -245,10 +250,12 @@ int main(int argc, char *argv[])
 		vm_stats_test(vms[i]);
 		for (j = 0; j < max_vcpu; ++j)
 			vcpu_stats_test(vcpus[i * max_vcpu + j]);
+		ksft_test_result_pass("vm%i\n", i);
 	}
 
 	for (i = 0; i < max_vm; ++i)
 		kvm_vm_free(vms[i]);
 	free(vms);
-	return 0;
+
+	ksft_finished();	/* Print results and exit() accordingly */
 }
-- 
cgit 


From 79edd55049527596187ee493d5f949fd1eda7067 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Tue, 4 Oct 2022 11:31:31 +0200
Subject: KVM: selftests: x86: Use TAP interface in the tsc_msrs_test

Let's add some output here so that the user has some feedback
about what is being run.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-Id: <20221004093131.40392-4-thuth@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
index 22d366c697f7..c9f67702f657 100644
--- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
@@ -72,11 +72,16 @@ static void run_vcpu(struct kvm_vcpu *vcpu, int stage)
 
 	switch (get_ucall(vcpu, &uc)) {
 	case UCALL_SYNC:
-		TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
-			    uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
-			    stage + 1, (ulong)uc.args[1]);
+		if (!strcmp((const char *)uc.args[0], "hello") &&
+		    uc.args[1] == stage + 1)
+			ksft_test_result_pass("stage %d passed\n", stage + 1);
+		else
+			ksft_test_result_fail(
+				"stage %d: Unexpected register values vmexit, got %lx",
+				stage + 1, (ulong)uc.args[1]);
 		return;
 	case UCALL_DONE:
+		ksft_test_result_pass("stage %d passed\n", stage + 1);
 		return;
 	case UCALL_ABORT:
 		REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx");
@@ -92,6 +97,9 @@ int main(void)
 	struct kvm_vm *vm;
 	uint64_t val;
 
+	ksft_print_header();
+	ksft_set_plan(5);
+
 	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 
 	val = 0;
@@ -149,5 +157,5 @@ int main(void)
 
 	kvm_vm_free(vm);
 
-	return 0;
+	ksft_finished();	/* Print results and exit() accordingly */
 }
-- 
cgit 


From 67b16f180541d5d1869a07afb844a244f9b6db0f Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 13 Oct 2022 11:58:46 +0200
Subject: KVM: selftests: Rename 'msr->available' to 'msr->fault_exepected' in
 hyperv_features test

It may not be clear what 'msr->available' means. The test actually
checks that accessing the particular MSR doesn't cause #GP, rename
the variable accordingly.

While on it, use 'true'/'false' instead of '1'/'0' for 'write'/
'fault_expected' as these are boolean.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20221013095849.705943-5-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../testing/selftests/kvm/x86_64/hyperv_features.c | 184 ++++++++++-----------
 1 file changed, 92 insertions(+), 92 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index 3163c3e8db0a..4cf1368af48a 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -15,7 +15,7 @@
 
 struct msr_data {
 	uint32_t idx;
-	bool available;
+	bool fault_expected;
 	bool write;
 	u64 write_val;
 };
@@ -38,10 +38,10 @@ static void guest_msr(struct msr_data *msr)
 	else
 		vector = wrmsr_safe(msr->idx, msr->write_val);
 
-	if (msr->available)
-		GUEST_ASSERT_2(!vector, msr->idx, vector);
-	else
+	if (msr->fault_expected)
 		GUEST_ASSERT_2(vector == GP_VECTOR, msr->idx, vector);
+	else
+		GUEST_ASSERT_2(!vector, msr->idx, vector);
 	GUEST_DONE();
 }
 
@@ -134,13 +134,13 @@ static void guest_test_msrs_access(void)
 			 * Only available when Hyper-V identification is set
 			 */
 			msr->idx = HV_X64_MSR_GUEST_OS_ID;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 1:
 			msr->idx = HV_X64_MSR_HYPERCALL;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 2:
 			feat->eax |= HV_MSR_HYPERCALL_AVAILABLE;
@@ -149,118 +149,118 @@ static void guest_test_msrs_access(void)
 			 * HV_X64_MSR_HYPERCALL available.
 			 */
 			msr->idx = HV_X64_MSR_GUEST_OS_ID;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = HYPERV_LINUX_OS_ID;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 		case 3:
 			msr->idx = HV_X64_MSR_GUEST_OS_ID;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 4:
 			msr->idx = HV_X64_MSR_HYPERCALL;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 
 		case 5:
 			msr->idx = HV_X64_MSR_VP_RUNTIME;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 6:
 			feat->eax |= HV_MSR_VP_RUNTIME_AVAILABLE;
 			msr->idx = HV_X64_MSR_VP_RUNTIME;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 7:
 			/* Read only */
 			msr->idx = HV_X64_MSR_VP_RUNTIME;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1;
-			msr->available = 0;
+			msr->fault_expected = true;
 			break;
 
 		case 8:
 			msr->idx = HV_X64_MSR_TIME_REF_COUNT;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 9:
 			feat->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE;
 			msr->idx = HV_X64_MSR_TIME_REF_COUNT;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 10:
 			/* Read only */
 			msr->idx = HV_X64_MSR_TIME_REF_COUNT;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1;
-			msr->available = 0;
+			msr->fault_expected = true;
 			break;
 
 		case 11:
 			msr->idx = HV_X64_MSR_VP_INDEX;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 12:
 			feat->eax |= HV_MSR_VP_INDEX_AVAILABLE;
 			msr->idx = HV_X64_MSR_VP_INDEX;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 13:
 			/* Read only */
 			msr->idx = HV_X64_MSR_VP_INDEX;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1;
-			msr->available = 0;
+			msr->fault_expected = true;
 			break;
 
 		case 14:
 			msr->idx = HV_X64_MSR_RESET;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 15:
 			feat->eax |= HV_MSR_RESET_AVAILABLE;
 			msr->idx = HV_X64_MSR_RESET;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 16:
 			msr->idx = HV_X64_MSR_RESET;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 0;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 
 		case 17:
 			msr->idx = HV_X64_MSR_REFERENCE_TSC;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 18:
 			feat->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
 			msr->idx = HV_X64_MSR_REFERENCE_TSC;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 19:
 			msr->idx = HV_X64_MSR_REFERENCE_TSC;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 0;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 
 		case 20:
 			msr->idx = HV_X64_MSR_EOM;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 21:
 			/*
@@ -268,146 +268,146 @@ static void guest_test_msrs_access(void)
 			 * capability enabled and guest visible CPUID bit unset.
 			 */
 			msr->idx = HV_X64_MSR_EOM;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 22:
 			feat->eax |= HV_MSR_SYNIC_AVAILABLE;
 			msr->idx = HV_X64_MSR_EOM;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 23:
 			msr->idx = HV_X64_MSR_EOM;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 0;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 
 		case 24:
 			msr->idx = HV_X64_MSR_STIMER0_CONFIG;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 25:
 			feat->eax |= HV_MSR_SYNTIMER_AVAILABLE;
 			msr->idx = HV_X64_MSR_STIMER0_CONFIG;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 26:
 			msr->idx = HV_X64_MSR_STIMER0_CONFIG;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 0;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 		case 27:
 			/* Direct mode test */
 			msr->idx = HV_X64_MSR_STIMER0_CONFIG;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1 << 12;
-			msr->available = 0;
+			msr->fault_expected = true;
 			break;
 		case 28:
 			feat->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
 			msr->idx = HV_X64_MSR_STIMER0_CONFIG;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1 << 12;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 
 		case 29:
 			msr->idx = HV_X64_MSR_EOI;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 30:
 			feat->eax |= HV_MSR_APIC_ACCESS_AVAILABLE;
 			msr->idx = HV_X64_MSR_EOI;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 
 		case 31:
 			msr->idx = HV_X64_MSR_TSC_FREQUENCY;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 32:
 			feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
 			msr->idx = HV_X64_MSR_TSC_FREQUENCY;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 33:
 			/* Read only */
 			msr->idx = HV_X64_MSR_TSC_FREQUENCY;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1;
-			msr->available = 0;
+			msr->fault_expected = true;
 			break;
 
 		case 34:
 			msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 35:
 			feat->eax |= HV_ACCESS_REENLIGHTENMENT;
 			msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 36:
 			msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 		case 37:
 			/* Can only write '0' */
 			msr->idx = HV_X64_MSR_TSC_EMULATION_STATUS;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1;
-			msr->available = 0;
+			msr->fault_expected = true;
 			break;
 
 		case 38:
 			msr->idx = HV_X64_MSR_CRASH_P0;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 39:
 			feat->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
 			msr->idx = HV_X64_MSR_CRASH_P0;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 40:
 			msr->idx = HV_X64_MSR_CRASH_P0;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 1;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 
 		case 41:
 			msr->idx = HV_X64_MSR_SYNDBG_STATUS;
-			msr->write = 0;
-			msr->available = 0;
+			msr->write = false;
+			msr->fault_expected = true;
 			break;
 		case 42:
 			feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
 			dbg->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
 			msr->idx = HV_X64_MSR_SYNDBG_STATUS;
-			msr->write = 0;
-			msr->available = 1;
+			msr->write = false;
+			msr->fault_expected = false;
 			break;
 		case 43:
 			msr->idx = HV_X64_MSR_SYNDBG_STATUS;
-			msr->write = 1;
+			msr->write = true;
 			msr->write_val = 0;
-			msr->available = 1;
+			msr->fault_expected = false;
 			break;
 
 		case 44:
-- 
cgit 


From 2f10428ace91d07a8c7bd6f127d66eaa9e3a1a9f Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 13 Oct 2022 11:58:47 +0200
Subject: KVM: selftests: Convert hyperv_features test to using
 KVM_X86_CPU_FEATURE()

hyperv_features test needs to set certain CPUID bits in Hyper-V feature
leaves but instead of open coding this, common KVM_X86_CPU_FEATURE()
infrastructure can be used.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20221013095849.705943-6-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../testing/selftests/kvm/include/x86_64/hyperv.h  | 141 ++++++++++++++-------
 .../testing/selftests/kvm/x86_64/hyperv_features.c |  67 +++++-----
 2 files changed, 127 insertions(+), 81 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
index 9218bb5f44bf..ab455c4efc66 100644
--- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h
+++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
@@ -85,61 +85,108 @@
 #define HV_X64_MSR_SYNDBG_OPTIONS		0x400000FF
 
 /* HYPERV_CPUID_FEATURES.EAX */
-#define HV_MSR_VP_RUNTIME_AVAILABLE		BIT(0)
-#define HV_MSR_TIME_REF_COUNT_AVAILABLE		BIT(1)
-#define HV_MSR_SYNIC_AVAILABLE			BIT(2)
-#define HV_MSR_SYNTIMER_AVAILABLE		BIT(3)
-#define HV_MSR_APIC_ACCESS_AVAILABLE		BIT(4)
-#define HV_MSR_HYPERCALL_AVAILABLE		BIT(5)
-#define HV_MSR_VP_INDEX_AVAILABLE		BIT(6)
-#define HV_MSR_RESET_AVAILABLE			BIT(7)
-#define HV_MSR_STAT_PAGES_AVAILABLE		BIT(8)
-#define HV_MSR_REFERENCE_TSC_AVAILABLE		BIT(9)
-#define HV_MSR_GUEST_IDLE_AVAILABLE		BIT(10)
-#define HV_ACCESS_FREQUENCY_MSRS		BIT(11)
-#define HV_ACCESS_REENLIGHTENMENT		BIT(13)
-#define HV_ACCESS_TSC_INVARIANT			BIT(15)
+#define HV_MSR_VP_RUNTIME_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 0)
+#define HV_MSR_TIME_REF_COUNT_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 1)
+#define HV_MSR_SYNIC_AVAILABLE			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 2)
+#define HV_MSR_SYNTIMER_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 3)
+#define HV_MSR_APIC_ACCESS_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 4)
+#define HV_MSR_HYPERCALL_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 5)
+#define HV_MSR_VP_INDEX_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 6)
+#define HV_MSR_RESET_AVAILABLE			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 7)
+#define HV_MSR_STAT_PAGES_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 8)
+#define HV_MSR_REFERENCE_TSC_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 9)
+#define HV_MSR_GUEST_IDLE_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 10)
+#define HV_ACCESS_FREQUENCY_MSRS		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 11)
+#define HV_ACCESS_REENLIGHTENMENT		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 13)
+#define HV_ACCESS_TSC_INVARIANT			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 15)
 
 /* HYPERV_CPUID_FEATURES.EBX */
-#define HV_CREATE_PARTITIONS			BIT(0)
-#define HV_ACCESS_PARTITION_ID			BIT(1)
-#define HV_ACCESS_MEMORY_POOL			BIT(2)
-#define HV_ADJUST_MESSAGE_BUFFERS		BIT(3)
-#define HV_POST_MESSAGES			BIT(4)
-#define HV_SIGNAL_EVENTS			BIT(5)
-#define HV_CREATE_PORT				BIT(6)
-#define HV_CONNECT_PORT				BIT(7)
-#define HV_ACCESS_STATS				BIT(8)
-#define HV_DEBUGGING				BIT(11)
-#define HV_CPU_MANAGEMENT			BIT(12)
-#define HV_ISOLATION				BIT(22)
+#define HV_CREATE_PARTITIONS		        \
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 0)
+#define HV_ACCESS_PARTITION_ID			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 1)
+#define HV_ACCESS_MEMORY_POOL			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 2)
+#define HV_ADJUST_MESSAGE_BUFFERS		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 3)
+#define HV_POST_MESSAGES			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 4)
+#define HV_SIGNAL_EVENTS			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 5)
+#define HV_CREATE_PORT				\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 6)
+#define HV_CONNECT_PORT				\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 7)
+#define HV_ACCESS_STATS				\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 8)
+#define HV_DEBUGGING				\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 11)
+#define HV_CPU_MANAGEMENT			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 12)
+#define HV_ISOLATION				\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 22)
 
 /* HYPERV_CPUID_FEATURES.EDX */
-#define HV_X64_MWAIT_AVAILABLE				BIT(0)
-#define HV_X64_GUEST_DEBUGGING_AVAILABLE		BIT(1)
-#define HV_X64_PERF_MONITOR_AVAILABLE			BIT(2)
-#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE	BIT(3)
-#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE		BIT(4)
-#define HV_X64_GUEST_IDLE_STATE_AVAILABLE		BIT(5)
-#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE		BIT(8)
-#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE		BIT(10)
-#define HV_FEATURE_DEBUG_MSRS_AVAILABLE			BIT(11)
-#define HV_STIMER_DIRECT_MODE_AVAILABLE			BIT(19)
+#define HV_X64_MWAIT_AVAILABLE				\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 0)
+#define HV_X64_GUEST_DEBUGGING_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 1)
+#define HV_X64_PERF_MONITOR_AVAILABLE			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 2)
+#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE	\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 3)
+#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 4)
+#define HV_X64_GUEST_IDLE_STATE_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 5)
+#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 8)
+#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 10)
+#define HV_FEATURE_DEBUG_MSRS_AVAILABLE			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 11)
+#define HV_STIMER_DIRECT_MODE_AVAILABLE			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 19)
 
 /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
-#define HV_X64_AS_SWITCH_RECOMMENDED			BIT(0)
-#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED		BIT(1)
-#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED		BIT(2)
-#define HV_X64_APIC_ACCESS_RECOMMENDED			BIT(3)
-#define HV_X64_SYSTEM_RESET_RECOMMENDED			BIT(4)
-#define HV_X64_RELAXED_TIMING_RECOMMENDED		BIT(5)
-#define HV_DEPRECATING_AEOI_RECOMMENDED			BIT(9)
-#define HV_X64_CLUSTER_IPI_RECOMMENDED			BIT(10)
-#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED		BIT(11)
-#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED		BIT(14)
+#define HV_X64_AS_SWITCH_RECOMMENDED			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 0)
+#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 1)
+#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 2)
+#define HV_X64_APIC_ACCESS_RECOMMENDED			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 3)
+#define HV_X64_SYSTEM_RESET_RECOMMENDED			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 4)
+#define HV_X64_RELAXED_TIMING_RECOMMENDED		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 5)
+#define HV_DEPRECATING_AEOI_RECOMMENDED			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 9)
+#define HV_X64_CLUSTER_IPI_RECOMMENDED			\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 10)
+#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 11)
+#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14)
 
 /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
-#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING	BIT(1)
+#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING	\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1)
 
 /* Hypercalls */
 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE	0x0002
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index 4cf1368af48a..daad3e3cc7bb 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -13,6 +13,14 @@
 #include "processor.h"
 #include "hyperv.h"
 
+/*
+ * HYPERV_CPUID_ENLIGHTMENT_INFO.EBX is not a 'feature' CPUID leaf
+ * but to activate the feature it is sufficient to set it to a non-zero
+ * value. Use BIT(0) for that.
+ */
+#define HV_PV_SPINLOCKS_TEST            \
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EBX, 0)
+
 struct msr_data {
 	uint32_t idx;
 	bool fault_expected;
@@ -89,7 +97,6 @@ static void vcpu_reset_hv_cpuid(struct kvm_vcpu *vcpu)
 static void guest_test_msrs_access(void)
 {
 	struct kvm_cpuid2 *prev_cpuid = NULL;
-	struct kvm_cpuid_entry2 *feat, *dbg;
 	struct kvm_vcpu *vcpu;
 	struct kvm_run *run;
 	struct kvm_vm *vm;
@@ -116,9 +123,6 @@ static void guest_test_msrs_access(void)
 			vcpu_init_cpuid(vcpu, prev_cpuid);
 		}
 
-		feat = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
-		dbg = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
-
 		vm_init_descriptor_tables(vm);
 		vcpu_init_descriptor_tables(vcpu);
 
@@ -143,7 +147,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 2:
-			feat->eax |= HV_MSR_HYPERCALL_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE);
 			/*
 			 * HV_X64_MSR_GUEST_OS_ID has to be written first to make
 			 * HV_X64_MSR_HYPERCALL available.
@@ -170,7 +174,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 6:
-			feat->eax |= HV_MSR_VP_RUNTIME_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_RUNTIME_AVAILABLE);
 			msr->idx = HV_X64_MSR_VP_RUNTIME;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -189,7 +193,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 9:
-			feat->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_TIME_REF_COUNT_AVAILABLE);
 			msr->idx = HV_X64_MSR_TIME_REF_COUNT;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -208,7 +212,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 12:
-			feat->eax |= HV_MSR_VP_INDEX_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_INDEX_AVAILABLE);
 			msr->idx = HV_X64_MSR_VP_INDEX;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -227,7 +231,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 15:
-			feat->eax |= HV_MSR_RESET_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_RESET_AVAILABLE);
 			msr->idx = HV_X64_MSR_RESET;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -245,7 +249,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 18:
-			feat->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_REFERENCE_TSC_AVAILABLE);
 			msr->idx = HV_X64_MSR_REFERENCE_TSC;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -272,7 +276,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 22:
-			feat->eax |= HV_MSR_SYNIC_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNIC_AVAILABLE);
 			msr->idx = HV_X64_MSR_EOM;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -290,7 +294,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 25:
-			feat->eax |= HV_MSR_SYNTIMER_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNTIMER_AVAILABLE);
 			msr->idx = HV_X64_MSR_STIMER0_CONFIG;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -309,7 +313,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 28:
-			feat->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_STIMER_DIRECT_MODE_AVAILABLE);
 			msr->idx = HV_X64_MSR_STIMER0_CONFIG;
 			msr->write = true;
 			msr->write_val = 1 << 12;
@@ -322,7 +326,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 30:
-			feat->eax |= HV_MSR_APIC_ACCESS_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_APIC_ACCESS_AVAILABLE);
 			msr->idx = HV_X64_MSR_EOI;
 			msr->write = true;
 			msr->write_val = 1;
@@ -335,7 +339,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 32:
-			feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
+			vcpu_set_cpuid_feature(vcpu, HV_ACCESS_FREQUENCY_MSRS);
 			msr->idx = HV_X64_MSR_TSC_FREQUENCY;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -354,7 +358,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 35:
-			feat->eax |= HV_ACCESS_REENLIGHTENMENT;
+			vcpu_set_cpuid_feature(vcpu, HV_ACCESS_REENLIGHTENMENT);
 			msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -379,7 +383,7 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 39:
-			feat->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE);
 			msr->idx = HV_X64_MSR_CRASH_P0;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -397,8 +401,8 @@ static void guest_test_msrs_access(void)
 			msr->fault_expected = true;
 			break;
 		case 42:
-			feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
-			dbg->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+			vcpu_set_cpuid_feature(vcpu, HV_FEATURE_DEBUG_MSRS_AVAILABLE);
+			vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING);
 			msr->idx = HV_X64_MSR_SYNDBG_STATUS;
 			msr->write = false;
 			msr->fault_expected = false;
@@ -445,7 +449,6 @@ static void guest_test_msrs_access(void)
 
 static void guest_test_hcalls_access(void)
 {
-	struct kvm_cpuid_entry2 *feat, *recomm, *dbg;
 	struct kvm_cpuid2 *prev_cpuid = NULL;
 	struct kvm_vcpu *vcpu;
 	struct kvm_run *run;
@@ -480,15 +483,11 @@ static void guest_test_hcalls_access(void)
 			vcpu_init_cpuid(vcpu, prev_cpuid);
 		}
 
-		feat = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
-		recomm = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO);
-		dbg = vcpu_get_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
-
 		run = vcpu->run;
 
 		switch (stage) {
 		case 0:
-			feat->eax |= HV_MSR_HYPERCALL_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE);
 			hcall->control = 0xbeef;
 			hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE;
 			break;
@@ -498,7 +497,7 @@ static void guest_test_hcalls_access(void)
 			hcall->expect = HV_STATUS_ACCESS_DENIED;
 			break;
 		case 2:
-			feat->ebx |= HV_POST_MESSAGES;
+			vcpu_set_cpuid_feature(vcpu, HV_POST_MESSAGES);
 			hcall->control = HVCALL_POST_MESSAGE;
 			hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT;
 			break;
@@ -508,7 +507,7 @@ static void guest_test_hcalls_access(void)
 			hcall->expect = HV_STATUS_ACCESS_DENIED;
 			break;
 		case 4:
-			feat->ebx |= HV_SIGNAL_EVENTS;
+			vcpu_set_cpuid_feature(vcpu, HV_SIGNAL_EVENTS);
 			hcall->control = HVCALL_SIGNAL_EVENT;
 			hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT;
 			break;
@@ -518,12 +517,12 @@ static void guest_test_hcalls_access(void)
 			hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE;
 			break;
 		case 6:
-			dbg->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+			vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING);
 			hcall->control = HVCALL_RESET_DEBUG_SESSION;
 			hcall->expect = HV_STATUS_ACCESS_DENIED;
 			break;
 		case 7:
-			feat->ebx |= HV_DEBUGGING;
+			vcpu_set_cpuid_feature(vcpu, HV_DEBUGGING);
 			hcall->control = HVCALL_RESET_DEBUG_SESSION;
 			hcall->expect = HV_STATUS_OPERATION_DENIED;
 			break;
@@ -533,7 +532,7 @@ static void guest_test_hcalls_access(void)
 			hcall->expect = HV_STATUS_ACCESS_DENIED;
 			break;
 		case 9:
-			recomm->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+			vcpu_set_cpuid_feature(vcpu, HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED);
 			hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE;
 			hcall->expect = HV_STATUS_SUCCESS;
 			break;
@@ -542,7 +541,7 @@ static void guest_test_hcalls_access(void)
 			hcall->expect = HV_STATUS_ACCESS_DENIED;
 			break;
 		case 11:
-			recomm->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
+			vcpu_set_cpuid_feature(vcpu, HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED);
 			hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX;
 			hcall->expect = HV_STATUS_SUCCESS;
 			break;
@@ -552,7 +551,7 @@ static void guest_test_hcalls_access(void)
 			hcall->expect = HV_STATUS_ACCESS_DENIED;
 			break;
 		case 13:
-			recomm->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
+			vcpu_set_cpuid_feature(vcpu, HV_X64_CLUSTER_IPI_RECOMMENDED);
 			hcall->control = HVCALL_SEND_IPI;
 			hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT;
 			break;
@@ -567,7 +566,7 @@ static void guest_test_hcalls_access(void)
 			hcall->expect = HV_STATUS_ACCESS_DENIED;
 			break;
 		case 16:
-			recomm->ebx = 0xfff;
+			vcpu_set_cpuid_feature(vcpu, HV_PV_SPINLOCKS_TEST);
 			hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT;
 			hcall->expect = HV_STATUS_SUCCESS;
 			break;
@@ -577,7 +576,7 @@ static void guest_test_hcalls_access(void)
 			hcall->ud_expected = true;
 			break;
 		case 18:
-			feat->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
+			vcpu_set_cpuid_feature(vcpu, HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE);
 			hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT;
 			hcall->ud_expected = false;
 			hcall->expect = HV_STATUS_SUCCESS;
-- 
cgit 


From 91a0b5478aab482d7133b8f069eb7d4590efcb7e Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 13 Oct 2022 11:58:48 +0200
Subject: KVM: selftests: Test that values written to Hyper-V MSRs are
 preserved

Enhance 'hyperv_features' selftest by adding a check that KVM
preserves values written to PV MSRs. Two MSRs are, however, 'special':
- HV_X64_MSR_EOI as it is a 'write-only' MSR,
- HV_X64_MSR_RESET as it always reads as '0'.
The later doesn't require any special handling right now because the
test never writes anything besides '0' to the MSR, leave a TODO node
about the fact.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20221013095849.705943-7-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../testing/selftests/kvm/x86_64/hyperv_features.c | 36 +++++++++++++++++-----
 1 file changed, 28 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index daad3e3cc7bb..b00240963974 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -34,22 +34,36 @@ struct hcall_data {
 	bool ud_expected;
 };
 
+static bool is_write_only_msr(uint32_t msr)
+{
+	return msr == HV_X64_MSR_EOI;
+}
+
 static void guest_msr(struct msr_data *msr)
 {
-	uint64_t ignored;
-	uint8_t vector;
+	uint8_t vector = 0;
+	uint64_t msr_val = 0;
 
 	GUEST_ASSERT(msr->idx);
 
-	if (!msr->write)
-		vector = rdmsr_safe(msr->idx, &ignored);
-	else
+	if (msr->write)
 		vector = wrmsr_safe(msr->idx, msr->write_val);
 
+	if (!vector && (!msr->write || !is_write_only_msr(msr->idx)))
+		vector = rdmsr_safe(msr->idx, &msr_val);
+
 	if (msr->fault_expected)
-		GUEST_ASSERT_2(vector == GP_VECTOR, msr->idx, vector);
+		GUEST_ASSERT_3(vector == GP_VECTOR, msr->idx, vector, GP_VECTOR);
 	else
-		GUEST_ASSERT_2(!vector, msr->idx, vector);
+		GUEST_ASSERT_3(!vector, msr->idx, vector, 0);
+
+	if (vector || is_write_only_msr(msr->idx))
+		goto done;
+
+	if (msr->write)
+		GUEST_ASSERT_3(msr_val == msr->write_val, msr->idx,
+			       msr_val, msr->write_val);
+done:
 	GUEST_DONE();
 }
 
@@ -239,6 +253,12 @@ static void guest_test_msrs_access(void)
 		case 16:
 			msr->idx = HV_X64_MSR_RESET;
 			msr->write = true;
+			/*
+			 * TODO: the test only writes '0' to HV_X64_MSR_RESET
+			 * at the moment, writing some other value there will
+			 * trigger real vCPU reset and the code is not prepared
+			 * to handle it yet.
+			 */
 			msr->write_val = 0;
 			msr->fault_expected = false;
 			break;
@@ -433,7 +453,7 @@ static void guest_test_msrs_access(void)
 
 		switch (get_ucall(vcpu, &uc)) {
 		case UCALL_ABORT:
-			REPORT_GUEST_ASSERT_2(uc, "MSR = %lx, vector = %lx");
+			REPORT_GUEST_ASSERT_3(uc, "MSR = %lx, arg1 = %lx, arg2 = %lx");
 			return;
 		case UCALL_DONE:
 			break;
-- 
cgit 


From bd827bd775375fdef13da4c8d3f237555938aa6d Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 13 Oct 2022 11:58:49 +0200
Subject: KVM: selftests: Test Hyper-V invariant TSC control

Add a test for the newly introduced Hyper-V invariant TSC control feature:
- HV_X64_MSR_TSC_INVARIANT_CONTROL is not available without
 HV_ACCESS_TSC_INVARIANT CPUID bit set and available with it.
- BIT(0) of HV_X64_MSR_TSC_INVARIANT_CONTROL controls the filtering of
architectural invariant TSC (CPUID.80000007H:EDX[8]) bit.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20221013095849.705943-8-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../testing/selftests/kvm/include/x86_64/hyperv.h  |  3 ++
 .../selftests/kvm/include/x86_64/processor.h       |  1 +
 .../testing/selftests/kvm/x86_64/hyperv_features.c | 47 ++++++++++++++++++++++
 3 files changed, 51 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
index ab455c4efc66..28eb99046475 100644
--- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h
+++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
@@ -335,4 +335,7 @@ struct hyperv_test_pages {
 struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm,
 						       vm_vaddr_t *p_hv_pages_gva);
 
+/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */
+#define HV_INVARIANT_TSC_EXPOSED               BIT_ULL(0)
+
 #endif /* !SELFTEST_KVM_HYPERV_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index b1a31de7108a..2a5f47d51388 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -137,6 +137,7 @@ struct kvm_x86_cpu_feature {
 #define	X86_FEATURE_GBPAGES		KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 26)
 #define	X86_FEATURE_RDTSCP		KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 27)
 #define	X86_FEATURE_LM			KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 29)
+#define	X86_FEATURE_INVTSC		KVM_X86_CPU_FEATURE(0x80000007, 0, EDX, 8)
 #define	X86_FEATURE_RDPRU		KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 4)
 #define	X86_FEATURE_AMD_IBPB		KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 12)
 #define	X86_FEATURE_NPT			KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 0)
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index b00240963974..258267df8e2a 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -63,6 +63,16 @@ static void guest_msr(struct msr_data *msr)
 	if (msr->write)
 		GUEST_ASSERT_3(msr_val == msr->write_val, msr->idx,
 			       msr_val, msr->write_val);
+
+	/* Invariant TSC bit appears when TSC invariant control MSR is written to */
+	if (msr->idx == HV_X64_MSR_TSC_INVARIANT_CONTROL) {
+		if (!this_cpu_has(HV_ACCESS_TSC_INVARIANT))
+			GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC));
+		else
+			GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC) ==
+				     !!(msr_val & HV_INVARIANT_TSC_EXPOSED));
+	}
+
 done:
 	GUEST_DONE();
 }
@@ -118,6 +128,7 @@ static void guest_test_msrs_access(void)
 	int stage = 0;
 	vm_vaddr_t msr_gva;
 	struct msr_data *msr;
+	bool has_invtsc = kvm_cpu_has(X86_FEATURE_INVTSC);
 
 	while (true) {
 		vm = vm_create_with_one_vcpu(&vcpu, guest_msr);
@@ -435,6 +446,42 @@ static void guest_test_msrs_access(void)
 			break;
 
 		case 44:
+			/* MSR is not available when CPUID feature bit is unset */
+			if (!has_invtsc)
+				continue;
+			msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+			msr->write = false;
+			msr->fault_expected = true;
+			break;
+		case 45:
+			/* MSR is vailable when CPUID feature bit is set */
+			if (!has_invtsc)
+				continue;
+			vcpu_set_cpuid_feature(vcpu, HV_ACCESS_TSC_INVARIANT);
+			msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+			msr->write = false;
+			msr->fault_expected = false;
+			break;
+		case 46:
+			/* Writing bits other than 0 is forbidden */
+			if (!has_invtsc)
+				continue;
+			msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+			msr->write = true;
+			msr->write_val = 0xdeadbeef;
+			msr->fault_expected = true;
+			break;
+		case 47:
+			/* Setting bit 0 enables the feature */
+			if (!has_invtsc)
+				continue;
+			msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+			msr->write = true;
+			msr->write_val = 1;
+			msr->fault_expected = false;
+			break;
+
+		default:
 			kvm_vm_free(vm);
 			return;
 		}
-- 
cgit 


From 7244eb669397f309c3d014264823cdc9cb3f8e6b Mon Sep 17 00:00:00 2001
From: "Daniel T. Lee" <danieltimlee@gmail.com>
Date: Sat, 24 Dec 2022 16:15:27 +0900
Subject: libbpf: Fix invalid return address register in s390

There is currently an invalid register mapping in the s390 return
address register. As the manual[1] states, the return address can be
found at r14. In bpf_tracing.h, the s390 registers were named
gprs(general purpose registers). This commit fixes the problem by
correcting the mistyped mapping.

[1]: https://uclibc.org/docs/psABI-s390x.pdf#page=14

Fixes: 3cc31d794097 ("libbpf: Normalize PT_REGS_xxx() macro definitions")
Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20221224071527.2292-7-danieltimlee@gmail.com
---
 tools/lib/bpf/bpf_tracing.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 2972dc25ff72..9c1b1689068d 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -137,7 +137,7 @@ struct pt_regs___s390 {
 #define __PT_PARM3_REG gprs[4]
 #define __PT_PARM4_REG gprs[5]
 #define __PT_PARM5_REG gprs[6]
-#define __PT_RET_REG grps[14]
+#define __PT_RET_REG gprs[14]
 #define __PT_FP_REG gprs[11]	/* Works only with CONFIG_FRAME_POINTER */
 #define __PT_RC_REG gprs[2]
 #define __PT_SP_REG gprs[15]
-- 
cgit 


From 678a1c036199011e65b203367900f002a28da004 Mon Sep 17 00:00:00 2001
From: Xin Liu <liuxin350@huawei.com>
Date: Sat, 24 Dec 2022 19:20:58 +0800
Subject: libbpf: Added the description of some API functions

Currently, many API functions are not described in the document.
Add add API description of the following four API functions:
  - libbpf_set_print;
  - bpf_object__open;
  - bpf_object__load;
  - bpf_object__close.

Signed-off-by: Xin Liu <liuxin350@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20221224112058.12038-1-liuxin350@huawei.com
---
 tools/lib/bpf/libbpf.h | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index eee883f007f9..898db26e42e9 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -96,6 +96,12 @@ enum libbpf_print_level {
 typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level,
 				 const char *, va_list ap);
 
+/**
+ * @brief **libbpf_set_print()** sets user-provided log callback function to
+ * be used for libbpf warnings and informational messages.
+ * @param fn The log print function. If NULL, libbpf won't print anything.
+ * @return Pointer to old print function.
+ */
 LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn);
 
 /* Hide internal to user */
@@ -174,6 +180,14 @@ struct bpf_object_open_opts {
 };
 #define bpf_object_open_opts__last_field kernel_log_level
 
+/**
+ * @brief **bpf_object__open()** creates a bpf_object by opening
+ * the BPF ELF object file pointed to by the passed path and loading it
+ * into memory.
+ * @param path BPF object file path.
+ * @return pointer to the new bpf_object; or NULL is returned on error,
+ * error code is stored in errno
+ */
 LIBBPF_API struct bpf_object *bpf_object__open(const char *path);
 
 /**
@@ -203,10 +217,21 @@ LIBBPF_API struct bpf_object *
 bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
 		     const struct bpf_object_open_opts *opts);
 
-/* Load/unload object into/from kernel */
+/**
+ * @brief **bpf_object__load()** loads BPF object into kernel.
+ * @param obj Pointer to a valid BPF object instance returned by
+ * **bpf_object__open*()** APIs
+ * @return 0, on success; negative error code, otherwise, error code is
+ * stored in errno
+ */
 LIBBPF_API int bpf_object__load(struct bpf_object *obj);
 
-LIBBPF_API void bpf_object__close(struct bpf_object *object);
+/**
+ * @brief **bpf_object__close()** closes a BPF object and releases all
+ * resources.
+ * @param obj Pointer to a valid BPF object
+ */
+LIBBPF_API void bpf_object__close(struct bpf_object *obj);
 
 /* pin_maps and unpin_maps can both be called with a NULL path, in which case
  * they will use the pin_path attribute of each map (and ignore all maps that
-- 
cgit 


From bb5747cfbc4b7fe29621ca6cd4a695d2723bf2e8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 29 Dec 2022 19:12:17 -0800
Subject: libbpf: Restore errno after pr_warn.

pr_warn calls into user-provided callback, which can clobber errno, so
`errno = saved_errno` should happen after pr_warn.

Fixes: 07453245620c ("libbpf: fix errno is overwritten after being closed.")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/libbpf_internal.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 98333a6c38e9..e4d05662a96c 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -544,8 +544,10 @@ static inline int ensure_good_fd(int fd)
 		saved_errno = errno;
 		close(old_fd);
 		errno = saved_errno;
-		if (fd < 0)
+		if (fd < 0) {
 			pr_warn("failed to dup FD %d to FD > 2: %d\n", old_fd, -saved_errno);
+			errno = saved_errno;
+		}
 	}
 	return fd;
 }
-- 
cgit 


From c48cafc241bff09d7ea682982ff96d532e64e2ed Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 17:06:46 +0000
Subject: kselftest/alsa: pcm - Drop recent coverage improvement changes

In preparation to adopting a better, more comprehensive approach to
adding the coverage that was just added using some changes from Jaroslav
which were sent at the same time the recently added improvements were
being applied drop what was applied.  This reverts:

  7d721baea138 "kselftest/alsa: Add more coverage of sample rates and channel counts"
  ee12040dd53a "kselftest/alsa: Provide more meaningful names for tests"
  ae95efd9754c "kselftest/alsa: Don't any configuration in the sample config"
  8370d9b00c92 Revert "kselftest/alsa: Report failures to set the requested channels as skips"
  f944f8b539ea "kselftest/alsa: Report failures to set the requested sample rate as skips"
  22eeb8f531c1 "kselftest/alsa: Refactor pcm-test to list the tests to run in a struct"

Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-alsa-pcm-test-hacks-v4-1-5a152e65b1e1@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 .../alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf       | 35 ++++-----
 tools/testing/selftests/alsa/pcm-test.c            | 88 +++++++---------------
 2 files changed, 42 insertions(+), 81 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf b/tools/testing/selftests/alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf
index 9eca985e0c08..0a83f35d43eb 100644
--- a/tools/testing/selftests/alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf
+++ b/tools/testing/selftests/alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf
@@ -39,25 +39,22 @@ card.hda {
 	#
 	pcm.0.0 {
 		PLAYBACK {
-			#
-			# Uncomment to override values for specific tests
-			#
-			#test_name1 {
-			#	access RW_INTERLEAVED
-			#	format S16_LE
-			#	rate 48000
-			#	channels 2
-			#	period_size 512
-			#	buffer_size 4096
-			#}
-			#test_name2 {
-			#	access RW_INTERLEAVED
-			#	format S16_LE
-			#	rate 48000
-			#	channels 2
-			#	period_size 24000
-			#	buffer_size 192000
-			#}
+			test.time1 {
+				access RW_INTERLEAVED	# can be omitted - default
+				format S16_LE		# can be omitted - default
+				rate 48000		# can be omitted - default
+				channels 2		# can be omitted - default
+				period_size 512
+				buffer_size 4096
+			}
+			test.time2 {
+				access RW_INTERLEAVED
+				format S16_LE
+				rate 48000
+				channels 2
+				period_size 24000
+				buffer_size 192000
+			}
 		}
 		CAPTURE {
 			# use default tests, check for the presence
diff --git a/tools/testing/selftests/alsa/pcm-test.c b/tools/testing/selftests/alsa/pcm-test.c
index f293c7d81009..6e7dfc395b98 100644
--- a/tools/testing/selftests/alsa/pcm-test.c
+++ b/tools/testing/selftests/alsa/pcm-test.c
@@ -37,15 +37,6 @@ struct pcm_data *pcm_list = NULL;
 int num_missing = 0;
 struct pcm_data *pcm_missing = NULL;
 
-struct time_test_def {
-	const char *cfg_prefix;
-	const char *format;
-	long rate;
-	long channels;
-	long period_size;
-	long buffer_size;
-};
-
 void timestamp_now(timestamp_t *tstamp)
 {
 	if (clock_gettime(CLOCK_MONOTONIC_RAW, tstamp))
@@ -229,7 +220,9 @@ static void find_pcms(void)
 }
 
 static void test_pcm_time1(struct pcm_data *data,
-			   const struct time_test_def *test)
+			   const char *cfg_prefix, const char *sformat,
+			   long srate, long schannels,
+			   long speriod_size, long sbuffer_size)
 {
 	char name[64], key[128], msg[256];
 	const char *cs;
@@ -241,32 +234,30 @@ static void test_pcm_time1(struct pcm_data *data,
 	snd_pcm_sframes_t frames;
 	long long ms;
 	long rate, channels, period_size, buffer_size;
-	unsigned int rchannels;
 	unsigned int rrate;
 	snd_pcm_uframes_t rperiod_size, rbuffer_size, start_threshold;
 	timestamp_t tstamp;
 	bool pass = false, automatic = true;
 	snd_pcm_hw_params_t *hw_params;
 	snd_pcm_sw_params_t *sw_params;
-	bool skip = false;
 
 	snd_pcm_hw_params_alloca(&hw_params);
 	snd_pcm_sw_params_alloca(&sw_params);
 
-	cs = conf_get_string(data->pcm_config, test->cfg_prefix, "format", test->format);
+	cs = conf_get_string(data->pcm_config, cfg_prefix, "format", sformat);
 	format = snd_pcm_format_value(cs);
 	if (format == SND_PCM_FORMAT_UNKNOWN)
 		ksft_exit_fail_msg("Wrong format '%s'\n", cs);
-	rate = conf_get_long(data->pcm_config, test->cfg_prefix, "rate", test->rate);
-	channels = conf_get_long(data->pcm_config, test->cfg_prefix, "channels", test->channels);
-	period_size = conf_get_long(data->pcm_config, test->cfg_prefix, "period_size", test->period_size);
-	buffer_size = conf_get_long(data->pcm_config, test->cfg_prefix, "buffer_size", test->buffer_size);
+	rate = conf_get_long(data->pcm_config, cfg_prefix, "rate", srate);
+	channels = conf_get_long(data->pcm_config, cfg_prefix, "channels", schannels);
+	period_size = conf_get_long(data->pcm_config, cfg_prefix, "period_size", speriod_size);
+	buffer_size = conf_get_long(data->pcm_config, cfg_prefix, "buffer_size", sbuffer_size);
 
-	automatic = strcmp(test->format, snd_pcm_format_name(format)) == 0 &&
-			test->rate == rate &&
-			test->channels == channels &&
-			test->period_size == period_size &&
-			test->buffer_size == buffer_size;
+	automatic = strcmp(sformat, snd_pcm_format_name(format)) == 0 &&
+			srate == rate &&
+			schannels == channels &&
+			speriod_size == period_size &&
+			sbuffer_size == buffer_size;
 
 	samples = malloc((rate * channels * snd_pcm_format_physical_width(format)) / 8);
 	if (!samples)
@@ -302,7 +293,7 @@ __format:
 		if (automatic && format == SND_PCM_FORMAT_S16_LE) {
 			format = SND_PCM_FORMAT_S32_LE;
 			ksft_print_msg("%s.%d.%d.%d.%s.%s format S16_LE -> S32_LE\n",
-					 test->cfg_prefix,
+					 cfg_prefix,
 					 data->card, data->device, data->subdevice,
 					 snd_pcm_stream_name(data->stream),
 					 snd_pcm_access_name(access));
@@ -311,17 +302,11 @@ __format:
 					   snd_pcm_format_name(format), snd_strerror(err));
 		goto __close;
 	}
-	rchannels = channels;
-	err = snd_pcm_hw_params_set_channels_near(handle, hw_params, &rchannels);
+	err = snd_pcm_hw_params_set_channels(handle, hw_params, channels);
 	if (err < 0) {
 		snprintf(msg, sizeof(msg), "snd_pcm_hw_params_set_channels %ld: %s", channels, snd_strerror(err));
 		goto __close;
 	}
-	if (rchannels != channels) {
-		snprintf(msg, sizeof(msg), "channels unsupported %ld != %ld", channels, rchannels);
-		skip = true;
-		goto __close;
-	}
 	rrate = rate;
 	err = snd_pcm_hw_params_set_rate_near(handle, hw_params, &rrate, 0);
 	if (err < 0) {
@@ -329,8 +314,7 @@ __format:
 		goto __close;
 	}
 	if (rrate != rate) {
-		snprintf(msg, sizeof(msg), "rate unsupported %ld != %ld", rate, rrate);
-		skip = true;
+		snprintf(msg, sizeof(msg), "rate mismatch %ld != %ld", rate, rrate);
 		goto __close;
 	}
 	rperiod_size = period_size;
@@ -378,7 +362,7 @@ __format:
 	}
 
 	ksft_print_msg("%s.%d.%d.%d.%s hw_params.%s.%s.%ld.%ld.%ld.%ld sw_params.%ld\n",
-			 test->cfg_prefix,
+			 cfg_prefix,
 			 data->card, data->device, data->subdevice,
 			 snd_pcm_stream_name(data->stream),
 			 snd_pcm_access_name(access),
@@ -426,40 +410,21 @@ __format:
 	msg[0] = '\0';
 	pass = true;
 __close:
-	if (!skip) {
-		ksft_test_result(pass, "%s.%d.%d.%d.%s%s%s\n",
-				 test->cfg_prefix,
-				 data->card, data->device, data->subdevice,
-				 snd_pcm_stream_name(data->stream),
-				 msg[0] ? " " : "", msg);
-	} else {
-		ksft_test_result_skip("%s.%d.%d.%d.%s%s%s\n",
-				      test->cfg_prefix,
-				      data->card, data->device,
-				      data->subdevice,
-				      snd_pcm_stream_name(data->stream),
-				      msg[0] ? " " : "", msg);
-	}
+	ksft_test_result(pass, "%s.%d.%d.%d.%s%s%s\n",
+			 cfg_prefix,
+			 data->card, data->device, data->subdevice,
+			 snd_pcm_stream_name(data->stream),
+			 msg[0] ? " " : "", msg);
 	free(samples);
 	if (handle)
 		snd_pcm_close(handle);
 }
 
-static const struct time_test_def time_tests[] = {
-	/* name          format     rate   chan  period  buffer */
-	{ "8k.1.big",    "S16_LE",   8000, 2,     8000,   32000 },
-	{ "8k.2.big",    "S16_LE",   8000, 2,     8000,   32000 },
-	{ "44k1.2.big",  "S16_LE",  44100, 2,    22050,  192000 },
-	{ "48k.2.small", "S16_LE",  48000, 2,      512,    4096 },
-	{ "48k.2.big",   "S16_LE",  48000, 2,    24000,  192000 },
-	{ "48k.6.big",   "S16_LE",  48000, 6,    48000,  576000 },
-	{ "96k.2.big",   "S16_LE",  96000, 2,    48000,  192000 },
-};
+#define TESTS_PER_PCM 2
 
 int main(void)
 {
 	struct pcm_data *pcm;
-	int i;
 
 	ksft_print_header();
 
@@ -467,7 +432,7 @@ int main(void)
 
 	find_pcms();
 
-	ksft_set_plan(num_missing + num_pcms * ARRAY_SIZE(time_tests));
+	ksft_set_plan(num_missing + num_pcms * TESTS_PER_PCM);
 
 	for (pcm = pcm_missing; pcm != NULL; pcm = pcm->next) {
 		ksft_test_result(false, "test.missing.%d.%d.%d.%s\n",
@@ -476,9 +441,8 @@ int main(void)
 	}
 
 	for (pcm = pcm_list; pcm != NULL; pcm = pcm->next) {
-		for (i = 0; i < ARRAY_SIZE(time_tests); i++) {
-			test_pcm_time1(pcm, &time_tests[i]);
-		}
+		test_pcm_time1(pcm, "test.time1", "S16_LE", 48000, 2, 512, 4096);
+		test_pcm_time1(pcm, "test.time2", "S16_LE", 48000, 2, 24000, 192000);
 	}
 
 	conf_free();
-- 
cgit 


From 348d09fcd1b6ba455141882daf1d50ff33cd0bf8 Mon Sep 17 00:00:00 2001
From: Jaroslav Kysela <perex@perex.cz>
Date: Tue, 27 Dec 2022 17:06:47 +0000
Subject: kselftest/alsa: pcm - move more configuration to configuration files

Obtain all test parameters from the configuration files. The defaults
are defined in the pcm-test.conf file. The test count and parameters
may be variable per specific hardware.

Also, handle alt_formats field now (with the fixes in the format loop).
It replaces the original "automatic" logic which is not so universal.

The code may be further extended to skip various tests based
on the configuration hints, if the exact PCM hardware parameters
are not available for the given hardware.

Signed-off-by: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-alsa-pcm-test-hacks-v4-2-5a152e65b1e1@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/Makefile              |   2 +-
 tools/testing/selftests/alsa/alsa-local.h          |   3 +
 tools/testing/selftests/alsa/conf.c                |  26 +++++-
 .../alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf       |   8 ++
 tools/testing/selftests/alsa/pcm-test.c            | 102 ++++++++++++++-------
 tools/testing/selftests/alsa/pcm-test.conf         |  16 ++++
 6 files changed, 121 insertions(+), 36 deletions(-)
 create mode 100644 tools/testing/selftests/alsa/pcm-test.conf

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile
index a8c0383878d3..77fba3e498cc 100644
--- a/tools/testing/selftests/alsa/Makefile
+++ b/tools/testing/selftests/alsa/Makefile
@@ -14,7 +14,7 @@ TEST_GEN_PROGS := mixer-test pcm-test
 
 TEST_GEN_PROGS_EXTENDED := libatest.so
 
-TEST_FILES := conf.d
+TEST_FILES := conf.d pcm-test.conf
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/alsa/alsa-local.h b/tools/testing/selftests/alsa/alsa-local.h
index 65f197ea9773..de030dc23bd1 100644
--- a/tools/testing/selftests/alsa/alsa-local.h
+++ b/tools/testing/selftests/alsa/alsa-local.h
@@ -12,6 +12,7 @@
 
 snd_config_t *get_alsalib_config(void);
 
+snd_config_t *conf_load_from_file(const char *filename);
 void conf_load(void);
 void conf_free(void);
 snd_config_t *conf_by_card(int card);
@@ -20,5 +21,7 @@ int conf_get_count(snd_config_t *root, const char *key1, const char *key2);
 const char *conf_get_string(snd_config_t *root, const char *key1, const char *key2, const char *def);
 long conf_get_long(snd_config_t *root, const char *key1, const char *key2, long def);
 int conf_get_bool(snd_config_t *root, const char *key1, const char *key2, int def);
+void conf_get_string_array(snd_config_t *root, const char *key1, const char *key2,
+			   const char **array, int array_size, const char *def);
 
 #endif /* __ALSA_LOCAL_H */
diff --git a/tools/testing/selftests/alsa/conf.c b/tools/testing/selftests/alsa/conf.c
index c7ffc8f04195..d7aafe5a1993 100644
--- a/tools/testing/selftests/alsa/conf.c
+++ b/tools/testing/selftests/alsa/conf.c
@@ -125,7 +125,7 @@ static int dump_config_tree(snd_config_t *top)
 	snd_output_close(out);
 }
 
-static snd_config_t *load(const char *filename)
+snd_config_t *conf_load_from_file(const char *filename)
 {
 	snd_config_t *dst;
 	snd_input_t *input;
@@ -235,7 +235,7 @@ static bool test_filename1(int card, const char *filename, const char *sysfs_car
 	snd_config_t *config, *sysfs_config, *card_config, *sysfs_card_config, *node;
 	snd_config_iterator_t i, next;
 
-	config = load(filename);
+	config = conf_load_from_file(filename);
 	if (snd_config_search(config, "sysfs", &sysfs_config) ||
 	    snd_config_get_type(sysfs_config) != SND_CONFIG_TYPE_COMPOUND)
 		ksft_exit_fail_msg("Missing global sysfs block in filename %s\n", filename);
@@ -446,3 +446,25 @@ int conf_get_bool(snd_config_t *root, const char *key1, const char *key2, int de
 		ksft_exit_fail_msg("key '%s'.'%s' is not an bool\n", key1, key2);
 	return !!ret;
 }
+
+void conf_get_string_array(snd_config_t *root, const char *key1, const char *key2,
+			   const char **array, int array_size, const char *def)
+{
+	snd_config_t *cfg;
+	char buf[16];
+	int ret, index;
+
+	ret = conf_get_by_keys(root, key1, key2, &cfg);
+	if (ret == -ENOENT)
+		cfg = NULL;
+	else if (ret < 0)
+		ksft_exit_fail_msg("key '%s'.'%s' search error: %s\n", key1, key2, snd_strerror(ret));
+	for (index = 0; index < array_size; index++) {
+		if (cfg == NULL) {
+			array[index] = def;
+		} else {
+			sprintf(buf, "%i", index);
+			array[index] = conf_get_string(cfg, buf, NULL, def);
+		}
+	}
+}
diff --git a/tools/testing/selftests/alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf b/tools/testing/selftests/alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf
index 0a83f35d43eb..5b40a916295d 100644
--- a/tools/testing/selftests/alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf
+++ b/tools/testing/selftests/alsa/conf.d/Lenovo_ThinkPad_P1_Gen2.conf
@@ -55,6 +55,14 @@ card.hda {
 				period_size 24000
 				buffer_size 192000
 			}
+			test.time3 {
+				access RW_INTERLEAVED
+				format S16_LE
+				rate 44100
+				channels 2
+				period_size 24000
+				buffer_size 192000
+			}
 		}
 		CAPTURE {
 			# use default tests, check for the presence
diff --git a/tools/testing/selftests/alsa/pcm-test.c b/tools/testing/selftests/alsa/pcm-test.c
index 6e7dfc395b98..e973b03ae1fd 100644
--- a/tools/testing/selftests/alsa/pcm-test.c
+++ b/tools/testing/selftests/alsa/pcm-test.c
@@ -31,7 +31,6 @@ struct pcm_data {
 	struct pcm_data *next;
 };
 
-int num_pcms = 0;
 struct pcm_data *pcm_list = NULL;
 
 int num_missing = 0;
@@ -200,7 +199,6 @@ static void find_pcms(void)
 					pcm_data->pcm_config = conf_get_subtree(card_config, key, NULL);
 					pcm_data->next = pcm_list;
 					pcm_list = pcm_data;
-					num_pcms++;
 				}
 			}
 		}
@@ -219,17 +217,15 @@ static void find_pcms(void)
 	snd_config_delete(config);
 }
 
-static void test_pcm_time1(struct pcm_data *data,
-			   const char *cfg_prefix, const char *sformat,
-			   long srate, long schannels,
-			   long speriod_size, long sbuffer_size)
+static void test_pcm_time(struct pcm_data *data, const char *test_name, snd_config_t *pcm_cfg)
 {
 	char name[64], key[128], msg[256];
 	const char *cs;
 	int i, err;
 	snd_pcm_t *handle = NULL;
 	snd_pcm_access_t access = SND_PCM_ACCESS_RW_INTERLEAVED;
-	snd_pcm_format_t format;
+	snd_pcm_format_t format, old_format;
+	const char *alt_formats[8];
 	unsigned char *samples = NULL;
 	snd_pcm_sframes_t frames;
 	long long ms;
@@ -237,27 +233,23 @@ static void test_pcm_time1(struct pcm_data *data,
 	unsigned int rrate;
 	snd_pcm_uframes_t rperiod_size, rbuffer_size, start_threshold;
 	timestamp_t tstamp;
-	bool pass = false, automatic = true;
+	bool pass = false;
 	snd_pcm_hw_params_t *hw_params;
 	snd_pcm_sw_params_t *sw_params;
 
 	snd_pcm_hw_params_alloca(&hw_params);
 	snd_pcm_sw_params_alloca(&sw_params);
 
-	cs = conf_get_string(data->pcm_config, cfg_prefix, "format", sformat);
+	cs = conf_get_string(pcm_cfg, "format", NULL, "S16_LE");
 	format = snd_pcm_format_value(cs);
 	if (format == SND_PCM_FORMAT_UNKNOWN)
 		ksft_exit_fail_msg("Wrong format '%s'\n", cs);
-	rate = conf_get_long(data->pcm_config, cfg_prefix, "rate", srate);
-	channels = conf_get_long(data->pcm_config, cfg_prefix, "channels", schannels);
-	period_size = conf_get_long(data->pcm_config, cfg_prefix, "period_size", speriod_size);
-	buffer_size = conf_get_long(data->pcm_config, cfg_prefix, "buffer_size", sbuffer_size);
-
-	automatic = strcmp(sformat, snd_pcm_format_name(format)) == 0 &&
-			srate == rate &&
-			schannels == channels &&
-			speriod_size == period_size &&
-			sbuffer_size == buffer_size;
+	conf_get_string_array(pcm_cfg, "alt_formats", NULL,
+				alt_formats, ARRAY_SIZE(alt_formats), NULL);
+	rate = conf_get_long(pcm_cfg, "rate", NULL, 48000);
+	channels = conf_get_long(pcm_cfg, "channels", NULL, 2);
+	period_size = conf_get_long(pcm_cfg, "period_size", NULL, 4096);
+	buffer_size = conf_get_long(pcm_cfg, "buffer_size", NULL, 16384);
 
 	samples = malloc((rate * channels * snd_pcm_format_physical_width(format)) / 8);
 	if (!samples)
@@ -287,16 +279,29 @@ static void test_pcm_time1(struct pcm_data *data,
 					   snd_pcm_access_name(access), snd_strerror(err));
 		goto __close;
 	}
+	i = -1;
 __format:
 	err = snd_pcm_hw_params_set_format(handle, hw_params, format);
 	if (err < 0) {
-		if (automatic && format == SND_PCM_FORMAT_S16_LE) {
-			format = SND_PCM_FORMAT_S32_LE;
-			ksft_print_msg("%s.%d.%d.%d.%s.%s format S16_LE -> S32_LE\n",
-					 cfg_prefix,
-					 data->card, data->device, data->subdevice,
-					 snd_pcm_stream_name(data->stream),
-					 snd_pcm_access_name(access));
+		i++;
+		if (i < ARRAY_SIZE(alt_formats) && alt_formats[i]) {
+			old_format = format;
+			format = snd_pcm_format_value(alt_formats[i]);
+			if (format != SND_PCM_FORMAT_UNKNOWN) {
+				ksft_print_msg("%s.%d.%d.%d.%s.%s format %s -> %s\n",
+						 test_name,
+						 data->card, data->device, data->subdevice,
+						 snd_pcm_stream_name(data->stream),
+						 snd_pcm_access_name(access),
+						 snd_pcm_format_name(old_format),
+						 snd_pcm_format_name(format));
+				samples = realloc(samples, (rate * channels *
+							    snd_pcm_format_physical_width(format)) / 8);
+				if (!samples)
+					ksft_exit_fail_msg("Out of memory\n");
+				snd_pcm_format_set_silence(format, samples, rate * channels);
+				goto __format;
+			}
 		}
 		snprintf(msg, sizeof(msg), "snd_pcm_hw_params_set_format %s: %s",
 					   snd_pcm_format_name(format), snd_strerror(err));
@@ -362,7 +367,7 @@ __format:
 	}
 
 	ksft_print_msg("%s.%d.%d.%d.%s hw_params.%s.%s.%ld.%ld.%ld.%ld sw_params.%ld\n",
-			 cfg_prefix,
+			 test_name,
 			 data->card, data->device, data->subdevice,
 			 snd_pcm_stream_name(data->stream),
 			 snd_pcm_access_name(access),
@@ -411,7 +416,7 @@ __format:
 	pass = true;
 __close:
 	ksft_test_result(pass, "%s.%d.%d.%d.%s%s%s\n",
-			 cfg_prefix,
+			 test_name,
 			 data->card, data->device, data->subdevice,
 			 snd_pcm_stream_name(data->stream),
 			 msg[0] ? " " : "", msg);
@@ -420,19 +425,35 @@ __close:
 		snd_pcm_close(handle);
 }
 
-#define TESTS_PER_PCM 2
-
 int main(void)
 {
 	struct pcm_data *pcm;
+	snd_config_t *global_config, *default_pcm_config, *cfg, *pcm_cfg;
+	snd_config_iterator_t i, next;
+	int num_pcm_tests = 0, num_tests;
+	const char *test_name, *test_type;
 
 	ksft_print_header();
 
+	global_config = conf_load_from_file("pcm-test.conf");
+	default_pcm_config = conf_get_subtree(global_config, "pcm", NULL);
+	if (default_pcm_config == NULL)
+		ksft_exit_fail_msg("default pcm test configuration (pcm compound) is missing\n");
+
 	conf_load();
 
 	find_pcms();
 
-	ksft_set_plan(num_missing + num_pcms * TESTS_PER_PCM);
+	for (pcm = pcm_list; pcm != NULL; pcm = pcm->next) {
+		cfg = pcm->pcm_config;
+		if (cfg == NULL)
+			cfg = default_pcm_config;
+		num_tests = conf_get_count(cfg, "test", NULL);
+		if (num_tests > 0)
+			num_pcm_tests += num_tests;
+	}
+
+	ksft_set_plan(num_missing + num_pcm_tests);
 
 	for (pcm = pcm_missing; pcm != NULL; pcm = pcm->next) {
 		ksft_test_result(false, "test.missing.%d.%d.%d.%s\n",
@@ -441,10 +462,25 @@ int main(void)
 	}
 
 	for (pcm = pcm_list; pcm != NULL; pcm = pcm->next) {
-		test_pcm_time1(pcm, "test.time1", "S16_LE", 48000, 2, 512, 4096);
-		test_pcm_time1(pcm, "test.time2", "S16_LE", 48000, 2, 24000, 192000);
+		cfg = pcm->pcm_config;
+		if (cfg == NULL)
+			cfg = default_pcm_config;
+		cfg = conf_get_subtree(cfg, "test", NULL);
+		if (cfg == NULL)
+			continue;
+		snd_config_for_each(i, next, cfg) {
+			pcm_cfg = snd_config_iterator_entry(i);
+			if (snd_config_get_id(pcm_cfg, &test_name) < 0)
+				ksft_exit_fail_msg("snd_config_get_id\n");
+			test_type = conf_get_string(pcm_cfg, "type", NULL, "time");
+			if (strcmp(test_type, "time") == 0)
+				test_pcm_time(pcm, test_name, pcm_cfg);
+			else
+				ksft_exit_fail_msg("unknown test type '%s'\n", test_type);
+		}
 	}
 
+	snd_config_delete(global_config);
 	conf_free();
 
 	ksft_exit_pass();
diff --git a/tools/testing/selftests/alsa/pcm-test.conf b/tools/testing/selftests/alsa/pcm-test.conf
new file mode 100644
index 000000000000..473a19251b49
--- /dev/null
+++ b/tools/testing/selftests/alsa/pcm-test.conf
@@ -0,0 +1,16 @@
+pcm.test.time1 {
+	format S16_LE
+	alt_formats [ S32_LE ]
+	rate 48000
+	channels 2
+	period_size 512
+	buffer_size 4096
+}
+pcm.test.time2 {
+	format S16_LE
+	alt_formats [ S32_LE ]
+	rate 48000
+	channels 2
+	period_size 24000
+	buffer_size 192000
+}
-- 
cgit 


From 7769f1abecf501902cf96e871739da7f97424a43 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 17:06:48 +0000
Subject: kselftest/alsa: pcm - Always run the default set of tests

Rather than allowing the system specific tests to replace the default set
of tests instead run them in addition to the standard set of tests. In
order to avoid name collisions in the reported tests we add an additional
test class element to the reported tests. Doing this means we always get a
consistent baseline of coverage no matter what card we run on but retain
the ability to specify specific coverage for a given system.

Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-alsa-pcm-test-hacks-v4-3-5a152e65b1e1@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/pcm-test.c | 83 +++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/pcm-test.c b/tools/testing/selftests/alsa/pcm-test.c
index e973b03ae1fd..afc616ddc820 100644
--- a/tools/testing/selftests/alsa/pcm-test.c
+++ b/tools/testing/selftests/alsa/pcm-test.c
@@ -36,6 +36,11 @@ struct pcm_data *pcm_list = NULL;
 int num_missing = 0;
 struct pcm_data *pcm_missing = NULL;
 
+enum test_class {
+	TEST_CLASS_DEFAULT,
+	TEST_CLASS_SYSTEM,
+};
+
 void timestamp_now(timestamp_t *tstamp)
 {
 	if (clock_gettime(CLOCK_MONOTONIC_RAW, tstamp))
@@ -217,7 +222,8 @@ static void find_pcms(void)
 	snd_config_delete(config);
 }
 
-static void test_pcm_time(struct pcm_data *data, const char *test_name, snd_config_t *pcm_cfg)
+static void test_pcm_time(struct pcm_data *data, enum test_class class,
+			  const char *test_name, snd_config_t *pcm_cfg)
 {
 	char name[64], key[128], msg[256];
 	const char *cs;
@@ -236,6 +242,19 @@ static void test_pcm_time(struct pcm_data *data, const char *test_name, snd_conf
 	bool pass = false;
 	snd_pcm_hw_params_t *hw_params;
 	snd_pcm_sw_params_t *sw_params;
+	const char *test_class_name;
+
+	switch (class) {
+	case TEST_CLASS_DEFAULT:
+		test_class_name = "default";
+		break;
+	case TEST_CLASS_SYSTEM:
+		test_class_name = "system";
+		break;
+	default:
+		ksft_exit_fail_msg("Unknown test class %d\n", class);
+		break;
+	}
 
 	snd_pcm_hw_params_alloca(&hw_params);
 	snd_pcm_sw_params_alloca(&sw_params);
@@ -366,8 +385,8 @@ __format:
 		goto __close;
 	}
 
-	ksft_print_msg("%s.%d.%d.%d.%s hw_params.%s.%s.%ld.%ld.%ld.%ld sw_params.%ld\n",
-			 test_name,
+	ksft_print_msg("%s.%s.%d.%d.%d.%s hw_params.%s.%s.%ld.%ld.%ld.%ld sw_params.%ld\n",
+		         test_class_name, test_name,
 			 data->card, data->device, data->subdevice,
 			 snd_pcm_stream_name(data->stream),
 			 snd_pcm_access_name(access),
@@ -415,8 +434,9 @@ __format:
 	msg[0] = '\0';
 	pass = true;
 __close:
-	ksft_test_result(pass, "%s.%d.%d.%d.%s%s%s\n",
-			 test_name,
+
+	ksft_test_result(pass, "%s.%s.%d.%d.%d.%s%s%s\n",
+			 test_class_name, test_name,
 			 data->card, data->device, data->subdevice,
 			 snd_pcm_stream_name(data->stream),
 			 msg[0] ? " " : "", msg);
@@ -425,13 +445,37 @@ __close:
 		snd_pcm_close(handle);
 }
 
+void run_time_tests(struct pcm_data *pcm, enum test_class class,
+		    snd_config_t *cfg)
+{
+	const char *test_name, *test_type;
+	snd_config_t *pcm_cfg;
+	snd_config_iterator_t i, next;
+
+	if (!cfg)
+		return;
+
+	cfg = conf_get_subtree(cfg, "test", NULL);
+	if (cfg == NULL)
+		return;
+
+	snd_config_for_each(i, next, cfg) {
+		pcm_cfg = snd_config_iterator_entry(i);
+		if (snd_config_get_id(pcm_cfg, &test_name) < 0)
+			ksft_exit_fail_msg("snd_config_get_id\n");
+		test_type = conf_get_string(pcm_cfg, "type", NULL, "time");
+		if (strcmp(test_type, "time") == 0)
+			test_pcm_time(pcm, class, test_name, pcm_cfg);
+		else
+			ksft_exit_fail_msg("unknown test type '%s'\n", test_type);
+	}
+}
+
 int main(void)
 {
 	struct pcm_data *pcm;
 	snd_config_t *global_config, *default_pcm_config, *cfg, *pcm_cfg;
-	snd_config_iterator_t i, next;
-	int num_pcm_tests = 0, num_tests;
-	const char *test_name, *test_type;
+	int num_pcm_tests = 0, num_tests, num_std_pcm_tests;
 
 	ksft_print_header();
 
@@ -444,10 +488,13 @@ int main(void)
 
 	find_pcms();
 
+	num_std_pcm_tests = conf_get_count(default_pcm_config, "test", NULL);
+
 	for (pcm = pcm_list; pcm != NULL; pcm = pcm->next) {
+		num_pcm_tests += num_std_pcm_tests;
 		cfg = pcm->pcm_config;
 		if (cfg == NULL)
-			cfg = default_pcm_config;
+			continue;
 		num_tests = conf_get_count(cfg, "test", NULL);
 		if (num_tests > 0)
 			num_pcm_tests += num_tests;
@@ -462,22 +509,8 @@ int main(void)
 	}
 
 	for (pcm = pcm_list; pcm != NULL; pcm = pcm->next) {
-		cfg = pcm->pcm_config;
-		if (cfg == NULL)
-			cfg = default_pcm_config;
-		cfg = conf_get_subtree(cfg, "test", NULL);
-		if (cfg == NULL)
-			continue;
-		snd_config_for_each(i, next, cfg) {
-			pcm_cfg = snd_config_iterator_entry(i);
-			if (snd_config_get_id(pcm_cfg, &test_name) < 0)
-				ksft_exit_fail_msg("snd_config_get_id\n");
-			test_type = conf_get_string(pcm_cfg, "type", NULL, "time");
-			if (strcmp(test_type, "time") == 0)
-				test_pcm_time(pcm, test_name, pcm_cfg);
-			else
-				ksft_exit_fail_msg("unknown test type '%s'\n", test_type);
-		}
+		run_time_tests(pcm, TEST_CLASS_DEFAULT, default_pcm_config);
+		run_time_tests(pcm, TEST_CLASS_SYSTEM, pcm->pcm_config);
 	}
 
 	snd_config_delete(global_config);
-- 
cgit 


From 34fb956cbdba8e726d72da9796a8a1b65337d9db Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 17:06:49 +0000
Subject: kselftest/alsa: pcm - skip tests when we fail to set params

Since we don't know what the capabilities of an unknown card is any of our
standard tests may fail due to not being supported by the system. Set a
flag once we've configured the stream, just before we start data, to say
that the system accepted our stream configuration.

Since there shouldn't be a use case for tests that are specified for the
individual system failing for those tests we also add a new test which
fails if we are unable to configure the settings specified in the system
specific configuration file.

Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-alsa-pcm-test-hacks-v4-4-5a152e65b1e1@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/pcm-test.c | 39 ++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/pcm-test.c b/tools/testing/selftests/alsa/pcm-test.c
index afc616ddc820..5fbb3ff517aa 100644
--- a/tools/testing/selftests/alsa/pcm-test.c
+++ b/tools/testing/selftests/alsa/pcm-test.c
@@ -243,6 +243,7 @@ static void test_pcm_time(struct pcm_data *data, enum test_class class,
 	snd_pcm_hw_params_t *hw_params;
 	snd_pcm_sw_params_t *sw_params;
 	const char *test_class_name;
+	bool skip = true;
 
 	switch (class) {
 	case TEST_CLASS_DEFAULT:
@@ -395,6 +396,9 @@ __format:
 			 (long)rperiod_size, (long)rbuffer_size,
 			 (long)start_threshold);
 
+	/* Set all the params, actually run the test */
+	skip = false;
+
 	timestamp_now(&tstamp);
 	for (i = 0; i < 4; i++) {
 		if (data->stream == SND_PCM_STREAM_PLAYBACK) {
@@ -434,12 +438,34 @@ __format:
 	msg[0] = '\0';
 	pass = true;
 __close:
+	switch (class) {
+	case TEST_CLASS_SYSTEM:
+		test_class_name = "system";
+		/*
+		 * Anything specified as specific to this system
+		 * should always be supported.
+		 */
+		ksft_test_result(!skip, "%s.%s.%d.%d.%d.%s.params\n",
+				 test_class_name, test_name,
+				 data->card, data->device, data->subdevice,
+				 snd_pcm_stream_name(data->stream));
+		break;
+	default:
+		break;
+	}
 
-	ksft_test_result(pass, "%s.%s.%d.%d.%d.%s%s%s\n",
-			 test_class_name, test_name,
-			 data->card, data->device, data->subdevice,
-			 snd_pcm_stream_name(data->stream),
-			 msg[0] ? " " : "", msg);
+	if (!skip)
+		ksft_test_result(pass, "%s.%s.%d.%d.%d.%s%s%s\n",
+				 test_class_name, test_name,
+				 data->card, data->device, data->subdevice,
+				 snd_pcm_stream_name(data->stream),
+				 msg[0] ? " " : "", msg);
+	else
+		ksft_test_result_skip("%s.%s.%d.%d.%d.%s%s%s\n",
+				 test_class_name, test_name,
+				 data->card, data->device, data->subdevice,
+				 snd_pcm_stream_name(data->stream),
+				 msg[0] ? " " : "", msg);
 	free(samples);
 	if (handle)
 		snd_pcm_close(handle);
@@ -495,7 +521,8 @@ int main(void)
 		cfg = pcm->pcm_config;
 		if (cfg == NULL)
 			continue;
-		num_tests = conf_get_count(cfg, "test", NULL);
+		/* Setting params is reported as a separate test */
+		num_tests = conf_get_count(cfg, "test", NULL) * 2;
 		if (num_tests > 0)
 			num_pcm_tests += num_tests;
 	}
-- 
cgit 


From 8acb452467f5a9671756937e55baa1776344e80e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 17:06:50 +0000
Subject: kselftest/alsa: pcm - Support optional description for tests

In order to help with the comprehensibility of tests it is useful for us to
document what the test is attempting to cover. We could just do this through
comments in the configuration files but in order to aid people looking at
the output of the program in logs let's provide support for an optional
'description' directive which we log prior to running each of the tests.

Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-alsa-pcm-test-hacks-v4-5-5a152e65b1e1@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/pcm-test.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/pcm-test.c b/tools/testing/selftests/alsa/pcm-test.c
index 5fbb3ff517aa..57d3f6dcb46b 100644
--- a/tools/testing/selftests/alsa/pcm-test.c
+++ b/tools/testing/selftests/alsa/pcm-test.c
@@ -244,6 +244,11 @@ static void test_pcm_time(struct pcm_data *data, enum test_class class,
 	snd_pcm_sw_params_t *sw_params;
 	const char *test_class_name;
 	bool skip = true;
+	const char *desc;
+
+	desc = conf_get_string(pcm_cfg, "description", NULL, NULL);
+	if (desc)
+		ksft_print_msg("%s\n", desc);
 
 	switch (class) {
 	case TEST_CLASS_DEFAULT:
-- 
cgit 


From b8680e2128742d4b678a68f86122b3c570d3dfda Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 17:06:51 +0000
Subject: kselftest/alsa: pcm - Provide descriptions for the default tests

Help people understand what the standard tests are trying to cover by
providing descriptions which both serve as comments in the file and log
messages in the program's output.

Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-alsa-pcm-test-hacks-v4-6-5a152e65b1e1@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/pcm-test.conf | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/pcm-test.conf b/tools/testing/selftests/alsa/pcm-test.conf
index 473a19251b49..1662a8c7073e 100644
--- a/tools/testing/selftests/alsa/pcm-test.conf
+++ b/tools/testing/selftests/alsa/pcm-test.conf
@@ -1,4 +1,5 @@
 pcm.test.time1 {
+	description "48kHz stereo small periods"
 	format S16_LE
 	alt_formats [ S32_LE ]
 	rate 48000
@@ -7,6 +8,7 @@ pcm.test.time1 {
 	buffer_size 4096
 }
 pcm.test.time2 {
+	description "48kHz stereo large periods"
 	format S16_LE
 	alt_formats [ S32_LE ]
 	rate 48000
-- 
cgit 


From 777ad8835e43155101b6b8f09ea433ffbd1fc028 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 17:06:52 +0000
Subject: kselftest/alsa: pcm - Add more coverage by default

Add more coverage to our standard test cases:

 - 8kHz mono and stereo to verify that the most common mono format is
   clocked correctly.
 - 44.1kHz stereo to verify that this different clock base is generated
   accurately.
 - 48kHz 6 channel to verify that 6 channel is clocked correctly.
 - 96kHz stereo since that is a common audiophile rate.

Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-alsa-pcm-test-hacks-v4-7-5a152e65b1e1@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/pcm-test.conf | 47 +++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/pcm-test.conf b/tools/testing/selftests/alsa/pcm-test.conf
index 1662a8c7073e..71bd3f78a6f2 100644
--- a/tools/testing/selftests/alsa/pcm-test.conf
+++ b/tools/testing/selftests/alsa/pcm-test.conf
@@ -1,4 +1,31 @@
 pcm.test.time1 {
+	description "8kHz mono large periods"
+	format S16_LE
+	alt_formats [ S32_LE ]
+	rate 8000
+	channels 1
+	period_size 8000
+	buffer_size 32000
+}
+pcm.test.time2 {
+	description "8kHz stereo large periods"
+	format S16_LE
+	alt_formats [ S32_LE ]
+	rate 8000
+	channels 2
+	period_size 8000
+	buffer_size 32000
+}
+pcm.test.time3 {
+	description "44.1kHz stereo large periods"
+	format S16_LE
+	alt_formats [ S32_LE ]
+	rate 44100
+	channels 2
+	period_size 22500
+	buffer_size 192000
+}
+pcm.test.time4 {
 	description "48kHz stereo small periods"
 	format S16_LE
 	alt_formats [ S32_LE ]
@@ -7,7 +34,7 @@ pcm.test.time1 {
 	period_size 512
 	buffer_size 4096
 }
-pcm.test.time2 {
+pcm.test.time5 {
 	description "48kHz stereo large periods"
 	format S16_LE
 	alt_formats [ S32_LE ]
@@ -16,3 +43,21 @@ pcm.test.time2 {
 	period_size 24000
 	buffer_size 192000
 }
+pcm.test.time6 {
+	description "48kHz 6 channel large periods"
+	format S16_LE
+	alt_formats [ S32_LE ]
+	rate 48000
+	channels 2
+	period_size 48000
+	buffer_size 576000
+}
+pcm.test.time7 {
+	description "96kHz stereo large periods"
+	format S16_LE
+	alt_formats [ S32_LE ]
+	rate 96000
+	channels 2
+	period_size 48000
+	buffer_size 192000
+}
-- 
cgit 


From 00883922ab404c0fc921709d8c2cec86f49c32f2 Mon Sep 17 00:00:00 2001
From: Hengqi Chen <hengqi.chen@gmail.com>
Date: Sat, 31 Dec 2022 18:07:57 +0800
Subject: libbpf: Add LoongArch support to bpf_tracing.h

Add PT_REGS macros for LoongArch ([0]).

  [0]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html

Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Huacai Chen <chenhuacai@loongson.cn>
Link: https://lore.kernel.org/bpf/20221231100757.3177034-1-hengqi.chen@gmail.com
---
 tools/lib/bpf/bpf_tracing.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 9c1b1689068d..bdb0f6b5be84 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -32,6 +32,9 @@
 #elif defined(__TARGET_ARCH_arc)
 	#define bpf_target_arc
 	#define bpf_target_defined
+#elif defined(__TARGET_ARCH_loongarch)
+	#define bpf_target_loongarch
+	#define bpf_target_defined
 #else
 
 /* Fall back to what the compiler says */
@@ -62,6 +65,9 @@
 #elif defined(__arc__)
 	#define bpf_target_arc
 	#define bpf_target_defined
+#elif defined(__loongarch__)
+	#define bpf_target_loongarch
+	#define bpf_target_defined
 #endif /* no compiler target */
 
 #endif
@@ -258,6 +264,23 @@ struct pt_regs___arm64 {
 /* arc does not select ARCH_HAS_SYSCALL_WRAPPER. */
 #define PT_REGS_SYSCALL_REGS(ctx) ctx
 
+#elif defined(bpf_target_loongarch)
+
+/* https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html */
+
+#define __PT_PARM1_REG regs[4]
+#define __PT_PARM2_REG regs[5]
+#define __PT_PARM3_REG regs[6]
+#define __PT_PARM4_REG regs[7]
+#define __PT_PARM5_REG regs[8]
+#define __PT_RET_REG regs[1]
+#define __PT_FP_REG regs[22]
+#define __PT_RC_REG regs[4]
+#define __PT_SP_REG regs[3]
+#define __PT_IP_REG csr_era
+/* loongarch does not select ARCH_HAS_SYSCALL_WRAPPER. */
+#define PT_REGS_SYSCALL_REGS(ctx) ctx
+
 #endif
 
 #if defined(bpf_target_defined)
-- 
cgit 


From acd3b7768048fe338248cdf43ccfbf8c084a6bc1 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Sat, 31 Dec 2022 23:14:36 +0800
Subject: libbpf: Return -ENODATA for missing btf section

As discussed before, return -ENODATA (No data available) would be more
meaningful than ENOENT (No such file or directory).

Suggested-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Changbin Du <changbin.du@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20221231151436.6541-1-changbin.du@gmail.com
---
 tools/lib/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index b03250007018..64841117fbb2 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1004,7 +1004,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
 
 	if (!btf_data) {
 		pr_warn("failed to find '%s' ELF section in %s\n", BTF_ELF_SEC, path);
-		err = -ENOENT;
+		err = -ENODATA;
 		goto done;
 	}
 	btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf);
-- 
cgit 


From 3e4c07b9f8217b016753985856238ac7684bc11f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 20 Nov 2022 19:56:27 -0800
Subject: torture: make kvm-find-errors.sh check for compressed vmlinux files

Under some conditions, a given run's vmlinux file will be compressed,
so that it is named vmlinux.xz rather than vmlinux.  in such cases,
kvm-find-errors.sh will complain about the nonexistence of vmlinux.
This commit therefore causes kvm-find-errors.sh to check for vmlinux.xz
as well as for vmlinux.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
index 88983cba7956..d4dc059843a4 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
@@ -36,7 +36,7 @@ do
 	then
 		egrep "error:|warning:|^ld: .*undefined reference to" < $i > $i.diags
 		files="$files $i.diags $i"
-	elif ! test -f ${scenariobasedir}/vmlinux && ! test -f "${rundir}/re-run"
+	elif ! test -f ${scenariobasedir}/vmlinux && ! test -f ${scenariobasedir}/vmlinux.xz && ! test -f "${rundir}/re-run"
 	then
 		echo No ${scenariobasedir}/vmlinux file > $i.diags
 		files="$files $i.diags $i"
-- 
cgit 


From eeb4dd9e530de52fdf43283c50f371f0771725c9 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Wed, 23 Nov 2022 09:03:28 +0800
Subject: selftests: rcutorture: Use "grep -E" instead of "egrep"

The latest version of grep is deprecating the egrep command, so that
its output contains warnings as follows:

	egrep: warning: egrep is obsolescent; using grep -E

Fix this using "grep -E" instead.

  sed -i "s/egrep/grep -E/g" `grep egrep -rwl tools/testing/selftests/rcutorture`

Here are the steps to install the latest grep:

  wget http://ftp.gnu.org/gnu/grep/grep-3.8.tar.gz
  tar xf grep-3.8.tar.gz
  cd grep-3.8 && ./configure && make
  sudo make install
  export PATH=/usr/local/bin:$PATH

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/console-badness.sh |  2 +-
 tools/testing/selftests/rcutorture/bin/kvm-build.sh       |  4 ++--
 tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh |  4 ++--
 tools/testing/selftests/rcutorture/bin/kvm.sh             |  4 ++--
 tools/testing/selftests/rcutorture/bin/parse-console.sh   | 10 +++++-----
 5 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh
index 69f8a5958cef..aad51e7c0183 100755
--- a/tools/testing/selftests/rcutorture/bin/console-badness.sh
+++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh
@@ -10,7 +10,7 @@
 #
 # Authors: Paul E. McKenney <paulmck@kernel.org>
 
-egrep 'Badness|WARNING:|Warn|BUG|===========|BUG: KCSAN:|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' |
+grep -E 'Badness|WARNING:|Warn|BUG|===========|BUG: KCSAN:|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' |
 grep -v 'ODEBUG: ' |
 grep -v 'This means that this is a DEBUG kernel and it is' |
 grep -v 'Warning: unable to open an initial console' |
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
index e28a82851f7c..11f8d232b0ee 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
@@ -44,10 +44,10 @@ fi
 ncpus="`getconf _NPROCESSORS_ONLN`"
 make -j$((2 * ncpus)) $TORTURE_KMAKE_ARG > $resdir/Make.out 2>&1
 retval=$?
-if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | egrep -q "Stop|Error|error:|warning:" || egrep -q "Stop|Error|error:" < $resdir/Make.out
+if test $retval -ne 0 || grep "rcu[^/]*": < $resdir/Make.out | grep -E -q "Stop|Error|error:|warning:" || grep -E -q "Stop|Error|error:" < $resdir/Make.out
 then
 	echo Kernel build error
-	egrep "Stop|Error|error:|warning:" < $resdir/Make.out
+	grep -E "Stop|Error|error:|warning:" < $resdir/Make.out
 	echo Run aborted.
 	exit 3
 fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
index d4dc059843a4..28981007465b 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
@@ -32,9 +32,9 @@ for i in ${rundir}/*/Make.out
 do
 	scenariodir="`dirname $i`"
 	scenariobasedir="`echo ${scenariodir} | sed -e 's/\.[0-9]*$//'`"
-	if egrep -q "error:|warning:|^ld: .*undefined reference to" < $i
+	if grep -E -q "error:|warning:|^ld: .*undefined reference to" < $i
 	then
-		egrep "error:|warning:|^ld: .*undefined reference to" < $i > $i.diags
+		grep -E "error:|warning:|^ld: .*undefined reference to" < $i > $i.diags
 		files="$files $i.diags $i"
 	elif ! test -f ${scenariobasedir}/vmlinux && ! test -f ${scenariobasedir}/vmlinux.xz && ! test -f "${rundir}/re-run"
 	then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 7710b1e1cdda..3025a949bc99 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -585,7 +585,7 @@ awk < $T/cfgcpu.pack \
 echo kvm-end-run-stats.sh "$resdir/$ds" "$starttime" >> $T/script
 
 # Extract the tests and their batches from the script.
-egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
+grep -E 'Start batch|Starting build\.' $T/script | grep -v ">>" |
 	sed -e 's/:.*$//' -e 's/^echo //' -e 's/-ovf//' |
 	awk '
 	/^----Start/ {
@@ -622,7 +622,7 @@ then
 elif test "$dryrun" = sched
 then
 	# Extract the test run schedule from the script.
-	egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" |
+	grep -E 'Start batch|Starting build\.' $T/script | grep -v ">>" |
 		sed -e 's/:.*$//' -e 's/^echo //'
 	nbuilds="`grep 'Starting build\.' $T/script |
 		  grep -v ">>" | sed -e 's/:.*$//' -e 's/^echo //' |
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 822eb037a057..9ab0f6bc172c 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -65,7 +65,7 @@ then
 	fi
 
 	grep --binary-files=text 'torture:.*ver:' $file |
-	egrep --binary-files=text -v '\(null\)|rtc: 000000000* ' |
+	grep -E --binary-files=text -v '\(null\)|rtc: 000000000* ' |
 	sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' |
 	sed -e 's/^.*ver: //' |
 	awk '
@@ -128,17 +128,17 @@ then
 	then
 		summary="$summary  Badness: $n_badness"
 	fi
-	n_warn=`grep -v 'Warning: unable to open an initial console' $file | grep -v 'Warning: Failed to add ttynull console. No stdin, stdout, and stderr for the init process' | egrep -c 'WARNING:|Warn'`
+	n_warn=`grep -v 'Warning: unable to open an initial console' $file | grep -v 'Warning: Failed to add ttynull console. No stdin, stdout, and stderr for the init process' | grep -E -c 'WARNING:|Warn'`
 	if test "$n_warn" -ne 0
 	then
 		summary="$summary  Warnings: $n_warn"
 	fi
-	n_bugs=`egrep -c '\bBUG|Oops:' $file`
+	n_bugs=`grep -E -c '\bBUG|Oops:' $file`
 	if test "$n_bugs" -ne 0
 	then
 		summary="$summary  Bugs: $n_bugs"
 	fi
-	n_kcsan=`egrep -c 'BUG: KCSAN: ' $file`
+	n_kcsan=`grep -E -c 'BUG: KCSAN: ' $file`
 	if test "$n_kcsan" -ne 0
 	then
 		if test "$n_bugs" = "$n_kcsan"
@@ -158,7 +158,7 @@ then
 	then
 		summary="$summary  lockdep: $n_badness"
 	fi
-	n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $file`
+	n_stalls=`grep -E -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $file`
 	if test "$n_stalls" -ne 0
 	then
 		summary="$summary  Stalls: $n_stalls"
-- 
cgit 


From ac71c3dd11e809cb732ae34efa6d9fc29d4664e1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 14 Dec 2022 16:37:27 -0800
Subject: torture: Permit double-quoted-string Kconfig options

Currently, the presence of any quoted-string Kconfig option in the
scenario files or the CFcommon file (aside from the special-cased
CONFIG_INITRAMFS_SOURCE option) will result in an "improperly set"
diagnostic.  This commit updates configcheck.sh to strip double quotes
in order to permit string-valued Kconfig options to be handled correctly.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/configcheck.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/configcheck.sh b/tools/testing/selftests/rcutorture/bin/configcheck.sh
index 83fac1852ab2..b92dfeb7fbbf 100755
--- a/tools/testing/selftests/rcutorture/bin/configcheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/configcheck.sh
@@ -10,10 +10,9 @@
 T="`mktemp -d ${TMPDIR-/tmp}/configcheck.sh.XXXXXX`"
 trap 'rm -rf $T' 0
 
-cat $1 > $T/.config
+sed -e 's/"//g' < $1 > $T/.config
 
-cat $2 | sed -e 's/\(.*\)=n/# \1 is not set/' -e 's/^#CHECK#//' |
-grep -v '^CONFIG_INITRAMFS_SOURCE' |
+sed -e 's/"//g' -e 's/\(.*\)=n/# \1 is not set/' -e 's/^#CHECK#//' < $2 |
 awk	'
 {
 		print "if grep -q \"" $0 "\" < '"$T/.config"'";
-- 
cgit 


From 5a6cd56ad79a490d40ead5df0dfc8502e8081db0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 14 Dec 2022 20:53:00 -0800
Subject: rcu: Permit string-valued Kconfig options in kvm.sh

This commit upgrades the kvm.sh script's --kconfig parameter to accept
string-valued Kconfig options with double-quoted string values.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 3025a949bc99..62f3b0f56e4d 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -186,7 +186,7 @@ do
 		fi
 		;;
 	--kconfig|--kconfigs)
-		checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$'
+		checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\)*$' '^error$'
 		TORTURE_KCONFIG_ARG="`echo "$TORTURE_KCONFIG_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`"
 		shift
 		;;
-- 
cgit 


From ebd50e2947de9d2675b800a6a29748d0ed7d7fd4 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 16 Nov 2022 15:48:01 -0500
Subject: tools: memory-model: Add rmw-sequences to the LKMM

Viktor (as relayed by Jonas) has pointed out a weakness in the Linux
Kernel Memory Model.  Namely, the memory ordering properties of atomic
operations are not monotonic: An atomic op with full-barrier semantics
does not always provide ordering as strong as one with release-barrier
semantics.

The following litmus test illustrates the problem:

--------------------------------------------------
C atomics-not-monotonic

{}

P0(int *x, atomic_t *y)
{
	WRITE_ONCE(*x, 1);
	smp_wmb();
	atomic_set(y, 1);
}

P1(atomic_t *y)
{
	int r1;

	r1 = atomic_inc_return(y);
}

P2(int *x, atomic_t *y)
{
	int r2;
	int r3;

	r2 = atomic_read(y);
	smp_rmb();
	r3 = READ_ONCE(*x);
}

exists (2:r2=2 /\ 2:r3=0)
--------------------------------------------------

The litmus test is allowed as shown with atomic_inc_return(), which
has full-barrier semantics.  But if the operation is changed to
atomic_inc_return_release(), which only has release-barrier semantics,
the litmus test is forbidden.  Clearly this violates monotonicity.

The reason is because the LKMM treats full-barrier atomic ops as if
they were written:

	mb();
	load();
	store();
	mb();

(where the load() and store() are the two parts of an atomic RMW op),
whereas it treats release-barrier atomic ops as if they were written:

	load();
	release_barrier();
	store();

The difference is that here the release barrier orders the load part
of the atomic op before the store part with A-cumulativity, whereas
the mb()'s above do not.  This means that release-barrier atomics can
effectively extend the cumul-fence relation but full-barrier atomics
cannot.

To resolve this problem we introduce the rmw-sequence relation,
representing an arbitrarily long sequence of atomic RMW operations in
which each operation reads from the previous one, and explicitly allow
it to extend cumul-fence.  This modification of the memory model is
sound; it holds for PPC because of B-cumulativity, it holds for TSO
and ARM64 because of other-multicopy atomicity, and we can assume that
atomic ops on all other architectures will be implemented so as to
make it hold for them.

For similar reasons we also allow rmw-sequence to extend the
w-post-bounded relation, which is analogous to cumul-fence in some
ways.

Reported-by: Viktor Vafeiadis <viktor@mpi-sws.org>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Reviewed-by: Jonas Oberhauser <jonas.oberhauser@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/memory-model/Documentation/explanation.txt | 30 ++++++++++++++++++++++++
 tools/memory-model/linux-kernel.cat              |  5 ++--
 2 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt
index 11a1d2d4f681..e901b47236c3 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -1007,6 +1007,36 @@ order.  Equivalently,
 where the rmw relation links the read and write events making up each
 atomic update.  This is what the LKMM's "atomic" axiom says.
 
+Atomic rmw updates play one more role in the LKMM: They can form "rmw
+sequences".  An rmw sequence is simply a bunch of atomic updates where
+each update reads from the previous one.  Written using events, it
+looks like this:
+
+	Z0 ->rf Y1 ->rmw Z1 ->rf ... ->rf Yn ->rmw Zn,
+
+where Z0 is some store event and n can be any number (even 0, in the
+degenerate case).  We write this relation as: Z0 ->rmw-sequence Zn.
+Note that this implies Z0 and Zn are stores to the same variable.
+
+Rmw sequences have a special property in the LKMM: They can extend the
+cumul-fence relation.  That is, if we have:
+
+	U ->cumul-fence X -> rmw-sequence Y
+
+then also U ->cumul-fence Y.  Thinking about this in terms of the
+operational model, U ->cumul-fence X says that the store U propagates
+to each CPU before the store X does.  Then the fact that X and Y are
+linked by an rmw sequence means that U also propagates to each CPU
+before Y does.  In an analogous way, rmw sequences can also extend
+the w-post-bounded relation defined below in the PLAIN ACCESSES AND
+DATA RACES section.
+
+(The notion of rmw sequences in the LKMM is similar to, but not quite
+the same as, that of release sequences in the C11 memory model.  They
+were added to the LKMM to fix an obscure bug; without them, atomic
+updates with full-barrier semantics did not always guarantee ordering
+at least as strong as atomic updates with release-barrier semantics.)
+
 
 THE PRESERVED PROGRAM ORDER RELATION: ppo
 -----------------------------------------
diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat
index d70315fddef6..07f884f9b2bf 100644
--- a/tools/memory-model/linux-kernel.cat
+++ b/tools/memory-model/linux-kernel.cat
@@ -74,8 +74,9 @@ let ppo = to-r | to-w | fence | (po-unlock-lock-po & int)
 
 (* Propagation: Ordering from release operations and strong fences. *)
 let A-cumul(r) = (rfe ; [Marked])? ; r
+let rmw-sequence = (rf ; rmw)*
 let cumul-fence = [Marked] ; (A-cumul(strong-fence | po-rel) | wmb |
-	po-unlock-lock-po) ; [Marked]
+	po-unlock-lock-po) ; [Marked] ; rmw-sequence
 let prop = [Marked] ; (overwrite & ext)? ; cumul-fence* ;
 	[Marked] ; rfe? ; [Marked]
 
@@ -174,7 +175,7 @@ let vis = cumul-fence* ; rfe? ; [Marked] ;
 let w-pre-bounded = [Marked] ; (addr | fence)?
 let r-pre-bounded = [Marked] ; (addr | nonrw-fence |
 	([R4rmb] ; fencerel(Rmb) ; [~Noreturn]))?
-let w-post-bounded = fence? ; [Marked]
+let w-post-bounded = fence? ; [Marked] ; rmw-sequence
 let r-post-bounded = (nonrw-fence | ([~Noreturn] ; fencerel(Rmb) ; [R4rmb]))? ;
 	[Marked]
 
-- 
cgit 


From 9ba7d3b3b826ef47c1b7b8dbc2d57da868168128 Mon Sep 17 00:00:00 2001
From: Jonas Oberhauser <jonas.oberhauser@huawei.com>
Date: Fri, 2 Dec 2022 13:51:00 +0100
Subject: tools: memory-model: Make plain accesses carry dependencies

As reported by Viktor, plain accesses in LKMM are weaker than
accesses to registers: the latter carry dependencies but the former
do not. This is exemplified in the following snippet:

  int r = READ_ONCE(*x);
  WRITE_ONCE(*y, r);

Here a data dependency links the READ_ONCE() to the WRITE_ONCE(),
preserving their order, because the model treats r as a register.
If r is turned into a memory location accessed by plain accesses,
however, the link is broken and the order between READ_ONCE() and
WRITE_ONCE() is no longer preserved.

This is too conservative, since any optimizations on plain
accesses that might break dependencies are also possible on
registers; it also contradicts the intuitive notion of "dependency"
as the data stored by the WRITE_ONCE() does depend on the data read
by the READ_ONCE(), independently of whether r is a register or a
memory location.

This is resolved by redefining all dependencies to include
dependencies carried by memory accesses; a dependency is said to be
carried by memory accesses (in the model: carry-dep) from one load
to another load if the initial load is followed by an arbitrarily
long sequence alternating between stores and loads of the same
thread, where the data of each store depends on the previous load,
and is read by the next load.

Any dependency linking the final load in the sequence to another
access also links the initial load in the sequence to that access.

More deep details can be found in this LKML discussion:

https://lore.kernel.org/lkml/d86295788ad14a02874ab030ddb8a6f8@huawei.com/

Reported-by: Viktor Vafeiadis <viktor@mpi-sws.org>
Signed-off-by: Jonas Oberhauser <jonas.oberhauser@huawei.com>
Reviewed-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/memory-model/Documentation/explanation.txt |  9 ++++++-
 tools/memory-model/linux-kernel.bell             |  6 +++++
 tools/memory-model/litmus-tests/dep+plain.litmus | 31 ++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 tools/memory-model/litmus-tests/dep+plain.litmus

(limited to 'tools')

diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt
index e901b47236c3..8e7085238470 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -2575,7 +2575,7 @@ smp_store_release() -- which is basically how the Linux kernel treats
 them.
 
 Although we said that plain accesses are not linked by the ppo
-relation, they do contribute to it indirectly.  Namely, when there is
+relation, they do contribute to it indirectly.  Firstly, when there is
 an address dependency from a marked load R to a plain store W,
 followed by smp_wmb() and then a marked store W', the LKMM creates a
 ppo link from R to W'.  The reasoning behind this is perhaps a little
@@ -2584,6 +2584,13 @@ for this source code in which W' could execute before R.  Just as with
 pre-bounding by address dependencies, it is possible for the compiler
 to undermine this relation if sufficient care is not taken.
 
+Secondly, plain accesses can carry dependencies: If a data dependency
+links a marked load R to a store W, and the store is read by a load R'
+from the same thread, then the data loaded by R' depends on the data
+loaded originally by R. Thus, if R' is linked to any access X by a
+dependency, R is also linked to access X by the same dependency, even
+if W' or R' (or both!) are plain.
+
 There are a few oddball fences which need special treatment:
 smp_mb__before_atomic(), smp_mb__after_atomic(), and
 smp_mb__after_spinlock().  The LKMM uses fence events with special
diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell
index 5be86b1025e8..70a9073dec3e 100644
--- a/tools/memory-model/linux-kernel.bell
+++ b/tools/memory-model/linux-kernel.bell
@@ -82,3 +82,9 @@ flag ~empty different-values(srcu-rscs) as srcu-bad-nesting
 let Marked = (~M) | IW | Once | Release | Acquire | domain(rmw) | range(rmw) |
 		LKR | LKW | UL | LF | RL | RU
 let Plain = M \ Marked
+
+(* Redefine dependencies to include those carried through plain accesses *)
+let carry-dep = (data ; rfi)*
+let addr = carry-dep ; addr
+let ctrl = carry-dep ; ctrl
+let data = carry-dep ; data
diff --git a/tools/memory-model/litmus-tests/dep+plain.litmus b/tools/memory-model/litmus-tests/dep+plain.litmus
new file mode 100644
index 000000000000..ebf84daa9a59
--- /dev/null
+++ b/tools/memory-model/litmus-tests/dep+plain.litmus
@@ -0,0 +1,31 @@
+C dep+plain
+
+(*
+ * Result: Never
+ *
+ * This litmus test demonstrates that in LKMM, plain accesses
+ * carry dependencies much like accesses to registers:
+ * The data stored to *z1 and *z2 by P0() originates from P0()'s
+ * READ_ONCE(), and therefore using that data to compute the
+ * conditional of P0()'s if-statement creates a control dependency
+ * from that READ_ONCE() to P0()'s WRITE_ONCE().
+ *)
+
+{}
+
+P0(int *x, int *y, int *z1, int *z2)
+{
+	int a = READ_ONCE(*x);
+	*z1 = a;
+	*z2 = *z1;
+	if (*z2 == 1)
+		WRITE_ONCE(*y, 1);
+}
+
+P1(int *x, int *y)
+{
+	int r = smp_load_acquire(y);
+	smp_store_release(x, r);
+}
+
+exists (x=1 /\ y=1)
-- 
cgit 


From 4a20bc3e207488064e08fc5d7220d6acf95c80dd Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 8 Dec 2022 09:02:00 -0800
Subject: cxl/pci: Move tracepoint definitions to drivers/cxl/core/

CXL is using tracepoints for reporting RAS capability register payloads
for AER events, and has plans to use tracepoints for the output payload
of Get Poison List and Get Event Records commands. For organization
purposes it would be nice to keep those all under a single + local CXL
trace system. This also organization also potentially helps in the
future when CXL drivers expand beyond generic memory expanders, however
that would also entail a move away from the expander-specific
cxl_dev_state context, save that for later.

Note that the powerpc-specific drivers/misc/cxl/ also defines a 'cxl'
trace system, however, it is unlikely that a single platform will ever
load both drivers simultaneously.

Cc: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/167051869176.436579.9728373544811641087.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/Makefile  |   3 ++
 drivers/cxl/core/pci.c     | 112 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/core/trace.c   |   5 ++
 drivers/cxl/core/trace.h   | 109 +++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/cxl.h          |   2 +
 drivers/cxl/cxlpci.h       |   3 ++
 drivers/cxl/pci.c          | 111 --------------------------------------------
 include/trace/events/cxl.h | 112 ---------------------------------------------
 tools/testing/cxl/Kbuild   |   2 +
 9 files changed, 236 insertions(+), 223 deletions(-)
 create mode 100644 drivers/cxl/core/trace.c
 create mode 100644 drivers/cxl/core/trace.h
 delete mode 100644 include/trace/events/cxl.h

(limited to 'tools')

diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index 79c7257f4107..ca4ae31d8f57 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -3,6 +3,8 @@ obj-$(CONFIG_CXL_BUS) += cxl_core.o
 obj-$(CONFIG_CXL_SUSPEND) += suspend.o
 
 ccflags-y += -I$(srctree)/drivers/cxl
+CFLAGS_trace.o = -DTRACE_INCLUDE_PATH=. -I$(src)
+
 cxl_core-y := port.o
 cxl_core-y += pmem.o
 cxl_core-y += regs.o
@@ -10,4 +12,5 @@ cxl_core-y += memdev.o
 cxl_core-y += mbox.o
 cxl_core-y += pci.o
 cxl_core-y += hdm.o
+cxl_core-$(CONFIG_TRACING) += trace.o
 cxl_core-$(CONFIG_CXL_REGION) += region.o
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 57764e9cd19d..1d1492440287 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -9,6 +9,7 @@
 #include <cxlmem.h>
 #include <cxl.h>
 #include "core.h"
+#include "trace.h"
 
 /**
  * DOC: cxl core pci
@@ -622,3 +623,114 @@ void read_cdat_data(struct cxl_port *port)
 	}
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
+
+void cxl_cor_error_detected(struct pci_dev *pdev)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
+	struct device *dev = &cxlmd->dev;
+	void __iomem *addr;
+	u32 status;
+
+	if (!cxlds->regs.ras)
+		return;
+
+	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
+	status = readl(addr);
+	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
+		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
+		trace_cxl_aer_correctable_error(dev, status);
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
+
+/* CXL spec rev3.0 8.2.4.16.1 */
+static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
+{
+	void __iomem *addr;
+	u32 *log_addr;
+	int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);
+
+	addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET;
+	log_addr = log;
+
+	for (i = 0; i < log_u32_size; i++) {
+		*log_addr = readl(addr);
+		log_addr++;
+		addr += sizeof(u32);
+	}
+}
+
+/*
+ * Log the state of the RAS status registers and prepare them to log the
+ * next error status. Return 1 if reset needed.
+ */
+static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
+{
+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
+	struct device *dev = &cxlmd->dev;
+	u32 hl[CXL_HEADERLOG_SIZE_U32];
+	void __iomem *addr;
+	u32 status;
+	u32 fe;
+
+	if (!cxlds->regs.ras)
+		return false;
+
+	addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
+	status = readl(addr);
+	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
+		return false;
+
+	/* If multiple errors, log header points to first error from ctrl reg */
+	if (hweight32(status) > 1) {
+		addr = cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET;
+		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, readl(addr)));
+	} else {
+		fe = status;
+	}
+
+	header_log_copy(cxlds, hl);
+	trace_cxl_aer_uncorrectable_error(dev, status, fe, hl);
+	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
+
+	return true;
+}
+
+pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
+				    pci_channel_state_t state)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
+	struct device *dev = &cxlmd->dev;
+	bool ue;
+
+	/*
+	 * A frozen channel indicates an impending reset which is fatal to
+	 * CXL.mem operation, and will likely crash the system. On the off
+	 * chance the situation is recoverable dump the status of the RAS
+	 * capability registers and bounce the active state of the memdev.
+	 */
+	ue = cxl_report_and_clear(cxlds);
+
+	switch (state) {
+	case pci_channel_io_normal:
+		if (ue) {
+			device_release_driver(dev);
+			return PCI_ERS_RESULT_NEED_RESET;
+		}
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	case pci_channel_io_frozen:
+		dev_warn(&pdev->dev,
+			 "%s: frozen state error detected, disable CXL.mem\n",
+			 dev_name(dev));
+		device_release_driver(dev);
+		return PCI_ERS_RESULT_NEED_RESET;
+	case pci_channel_io_perm_failure:
+		dev_warn(&pdev->dev,
+			 "failure state error detected, request disconnect\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_error_detected, CXL);
diff --git a/drivers/cxl/core/trace.c b/drivers/cxl/core/trace.c
new file mode 100644
index 000000000000..29ae7ce81dc5
--- /dev/null
+++ b/drivers/cxl/core/trace.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
new file mode 100644
index 000000000000..20ca2fe2ca8e
--- /dev/null
+++ b/drivers/cxl/core/trace.h
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cxl
+
+#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _CXL_EVENTS_H
+
+#include <cxl.h>
+#include <linux/tracepoint.h>
+
+#define CXL_RAS_UC_CACHE_DATA_PARITY	BIT(0)
+#define CXL_RAS_UC_CACHE_ADDR_PARITY	BIT(1)
+#define CXL_RAS_UC_CACHE_BE_PARITY	BIT(2)
+#define CXL_RAS_UC_CACHE_DATA_ECC	BIT(3)
+#define CXL_RAS_UC_MEM_DATA_PARITY	BIT(4)
+#define CXL_RAS_UC_MEM_ADDR_PARITY	BIT(5)
+#define CXL_RAS_UC_MEM_BE_PARITY	BIT(6)
+#define CXL_RAS_UC_MEM_DATA_ECC		BIT(7)
+#define CXL_RAS_UC_REINIT_THRESH	BIT(8)
+#define CXL_RAS_UC_RSVD_ENCODE		BIT(9)
+#define CXL_RAS_UC_POISON		BIT(10)
+#define CXL_RAS_UC_RECV_OVERFLOW	BIT(11)
+#define CXL_RAS_UC_INTERNAL_ERR		BIT(14)
+#define CXL_RAS_UC_IDE_TX_ERR		BIT(15)
+#define CXL_RAS_UC_IDE_RX_ERR		BIT(16)
+
+#define show_uc_errs(status)	__print_flags(status, " | ",		  \
+	{ CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" },	  \
+	{ CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" },	  \
+	{ CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" }, \
+	{ CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" },		  \
+	{ CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" },	  \
+	{ CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error" },	  \
+	{ CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error" },  \
+	{ CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" },		  \
+	{ CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" },		  \
+	{ CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" },	  \
+	{ CXL_RAS_UC_POISON, "Received Poison From Peer" },		  \
+	{ CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" },		  \
+	{ CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" },	  \
+	{ CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" },			  \
+	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
+)
+
+TRACE_EVENT(cxl_aer_uncorrectable_error,
+	TP_PROTO(const struct device *dev, u32 status, u32 fe, u32 *hl),
+	TP_ARGS(dev, status, fe, hl),
+	TP_STRUCT__entry(
+		__string(dev_name, dev_name(dev))
+		__field(u32, status)
+		__field(u32, first_error)
+		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
+	),
+	TP_fast_assign(
+		__assign_str(dev_name, dev_name(dev));
+		__entry->status = status;
+		__entry->first_error = fe;
+		/*
+		 * Embed the 512B headerlog data for user app retrieval and
+		 * parsing, but no need to print this in the trace buffer.
+		 */
+		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
+	),
+	TP_printk("%s: status: '%s' first_error: '%s'",
+		  __get_str(dev_name),
+		  show_uc_errs(__entry->status),
+		  show_uc_errs(__entry->first_error)
+	)
+);
+
+#define CXL_RAS_CE_CACHE_DATA_ECC	BIT(0)
+#define CXL_RAS_CE_MEM_DATA_ECC		BIT(1)
+#define CXL_RAS_CE_CRC_THRESH		BIT(2)
+#define CLX_RAS_CE_RETRY_THRESH		BIT(3)
+#define CXL_RAS_CE_CACHE_POISON		BIT(4)
+#define CXL_RAS_CE_MEM_POISON		BIT(5)
+#define CXL_RAS_CE_PHYS_LAYER_ERR	BIT(6)
+
+#define show_ce_errs(status)	__print_flags(status, " | ",			\
+	{ CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" },			\
+	{ CXL_RAS_CE_MEM_DATA_ECC, "Memory Data ECC Error" },			\
+	{ CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" },				\
+	{ CLX_RAS_CE_RETRY_THRESH, "Retry Threshold" },				\
+	{ CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer" },		\
+	{ CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer" },		\
+	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" }	\
+)
+
+TRACE_EVENT(cxl_aer_correctable_error,
+	TP_PROTO(const struct device *dev, u32 status),
+	TP_ARGS(dev, status),
+	TP_STRUCT__entry(
+		__string(dev_name, dev_name(dev))
+		__field(u32, status)
+	),
+	TP_fast_assign(
+		__assign_str(dev_name, dev_name(dev));
+		__entry->status = status;
+	),
+	TP_printk("%s: status: '%s'",
+		  __get_str(dev_name), show_ce_errs(__entry->status)
+	)
+);
+
+#endif /* _CXL_EVENTS_H */
+
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 1b1cf459ac77..aa3af3bb73b2 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -140,6 +140,8 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
 #define CXL_RAS_CAP_CONTROL_FE_MASK GENMASK(5, 0)
 #define CXL_RAS_HEADER_LOG_OFFSET 0x18
 #define CXL_RAS_CAPABILITY_LENGTH 0x58
+#define CXL_HEADERLOG_SIZE SZ_512
+#define CXL_HEADERLOG_SIZE_U32 SZ_512 / sizeof(u32)
 
 /* CXL 2.0 8.2.8.1 Device Capabilities Array Register */
 #define CXLDEV_CAP_ARRAY_OFFSET 0x0
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 920909791bb9..77dbdb980b12 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -66,4 +66,7 @@ int devm_cxl_port_enumerate_dports(struct cxl_port *port);
 struct cxl_dev_state;
 int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm);
 void read_cdat_data(struct cxl_port *port);
+void cxl_cor_error_detected(struct pci_dev *pdev);
+pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
+				    pci_channel_state_t state);
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 33083a522fd1..3a66aadb4df0 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -14,8 +14,6 @@
 #include "cxlmem.h"
 #include "cxlpci.h"
 #include "cxl.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/cxl.h>
 
 /**
  * DOC: cxl pci
@@ -514,96 +512,6 @@ static const struct pci_device_id cxl_mem_pci_tbl[] = {
 };
 MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
 
-/* CXL spec rev3.0 8.2.4.16.1 */
-static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
-{
-	void __iomem *addr;
-	u32 *log_addr;
-	int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);
-
-	addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET;
-	log_addr = log;
-
-	for (i = 0; i < log_u32_size; i++) {
-		*log_addr = readl(addr);
-		log_addr++;
-		addr += sizeof(u32);
-	}
-}
-
-/*
- * Log the state of the RAS status registers and prepare them to log the
- * next error status. Return 1 if reset needed.
- */
-static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
-{
-	struct cxl_memdev *cxlmd = cxlds->cxlmd;
-	struct device *dev = &cxlmd->dev;
-	u32 hl[CXL_HEADERLOG_SIZE_U32];
-	void __iomem *addr;
-	u32 status;
-	u32 fe;
-
-	if (!cxlds->regs.ras)
-		return false;
-
-	addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
-	status = readl(addr);
-	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
-		return false;
-
-	/* If multiple errors, log header points to first error from ctrl reg */
-	if (hweight32(status) > 1) {
-		addr = cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET;
-		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, readl(addr)));
-	} else {
-		fe = status;
-	}
-
-	header_log_copy(cxlds, hl);
-	trace_cxl_aer_uncorrectable_error(dev, status, fe, hl);
-	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
-
-	return true;
-}
-
-static pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-					   pci_channel_state_t state)
-{
-	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct cxl_memdev *cxlmd = cxlds->cxlmd;
-	struct device *dev = &cxlmd->dev;
-	bool ue;
-
-	/*
-	 * A frozen channel indicates an impending reset which is fatal to
-	 * CXL.mem operation, and will likely crash the system. On the off
-	 * chance the situation is recoverable dump the status of the RAS
-	 * capability registers and bounce the active state of the memdev.
-	 */
-	ue = cxl_report_and_clear(cxlds);
-
-	switch (state) {
-	case pci_channel_io_normal:
-		if (ue) {
-			device_release_driver(dev);
-			return PCI_ERS_RESULT_NEED_RESET;
-		}
-		return PCI_ERS_RESULT_CAN_RECOVER;
-	case pci_channel_io_frozen:
-		dev_warn(&pdev->dev,
-			 "%s: frozen state error detected, disable CXL.mem\n",
-			 dev_name(dev));
-		device_release_driver(dev);
-		return PCI_ERS_RESULT_NEED_RESET;
-	case pci_channel_io_perm_failure:
-		dev_warn(&pdev->dev,
-			 "failure state error detected, request disconnect\n");
-		return PCI_ERS_RESULT_DISCONNECT;
-	}
-	return PCI_ERS_RESULT_NEED_RESET;
-}
-
 static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev)
 {
 	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
@@ -628,25 +536,6 @@ static void cxl_error_resume(struct pci_dev *pdev)
 		 dev->driver ? "successful" : "failed");
 }
 
-static void cxl_cor_error_detected(struct pci_dev *pdev)
-{
-	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct cxl_memdev *cxlmd = cxlds->cxlmd;
-	struct device *dev = &cxlmd->dev;
-	void __iomem *addr;
-	u32 status;
-
-	if (!cxlds->regs.ras)
-		return;
-
-	addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
-	status = readl(addr);
-	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
-		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
-		trace_cxl_aer_correctable_error(dev, status);
-	}
-}
-
 static const struct pci_error_handlers cxl_error_handlers = {
 	.error_detected	= cxl_error_detected,
 	.slot_reset	= cxl_slot_reset,
diff --git a/include/trace/events/cxl.h b/include/trace/events/cxl.h
deleted file mode 100644
index ad085a2534ef..000000000000
--- a/include/trace/events/cxl.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM cxl
-
-#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _CXL_EVENTS_H
-
-#include <linux/tracepoint.h>
-
-#define CXL_HEADERLOG_SIZE		SZ_512
-#define CXL_HEADERLOG_SIZE_U32		SZ_512 / sizeof(u32)
-
-#define CXL_RAS_UC_CACHE_DATA_PARITY	BIT(0)
-#define CXL_RAS_UC_CACHE_ADDR_PARITY	BIT(1)
-#define CXL_RAS_UC_CACHE_BE_PARITY	BIT(2)
-#define CXL_RAS_UC_CACHE_DATA_ECC	BIT(3)
-#define CXL_RAS_UC_MEM_DATA_PARITY	BIT(4)
-#define CXL_RAS_UC_MEM_ADDR_PARITY	BIT(5)
-#define CXL_RAS_UC_MEM_BE_PARITY	BIT(6)
-#define CXL_RAS_UC_MEM_DATA_ECC		BIT(7)
-#define CXL_RAS_UC_REINIT_THRESH	BIT(8)
-#define CXL_RAS_UC_RSVD_ENCODE		BIT(9)
-#define CXL_RAS_UC_POISON		BIT(10)
-#define CXL_RAS_UC_RECV_OVERFLOW	BIT(11)
-#define CXL_RAS_UC_INTERNAL_ERR		BIT(14)
-#define CXL_RAS_UC_IDE_TX_ERR		BIT(15)
-#define CXL_RAS_UC_IDE_RX_ERR		BIT(16)
-
-#define show_uc_errs(status)	__print_flags(status, " | ",		  \
-	{ CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" },	  \
-	{ CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" },	  \
-	{ CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" }, \
-	{ CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" },		  \
-	{ CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" },	  \
-	{ CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error" },	  \
-	{ CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error" },  \
-	{ CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" },		  \
-	{ CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" },		  \
-	{ CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" },	  \
-	{ CXL_RAS_UC_POISON, "Received Poison From Peer" },		  \
-	{ CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" },		  \
-	{ CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" },	  \
-	{ CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" },			  \
-	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
-)
-
-TRACE_EVENT(cxl_aer_uncorrectable_error,
-	TP_PROTO(const struct device *dev, u32 status, u32 fe, u32 *hl),
-	TP_ARGS(dev, status, fe, hl),
-	TP_STRUCT__entry(
-		__string(dev_name, dev_name(dev))
-		__field(u32, status)
-		__field(u32, first_error)
-		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
-	),
-	TP_fast_assign(
-		__assign_str(dev_name, dev_name(dev));
-		__entry->status = status;
-		__entry->first_error = fe;
-		/*
-		 * Embed the 512B headerlog data for user app retrieval and
-		 * parsing, but no need to print this in the trace buffer.
-		 */
-		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
-	),
-	TP_printk("%s: status: '%s' first_error: '%s'",
-		  __get_str(dev_name),
-		  show_uc_errs(__entry->status),
-		  show_uc_errs(__entry->first_error)
-	)
-);
-
-#define CXL_RAS_CE_CACHE_DATA_ECC	BIT(0)
-#define CXL_RAS_CE_MEM_DATA_ECC		BIT(1)
-#define CXL_RAS_CE_CRC_THRESH		BIT(2)
-#define CLX_RAS_CE_RETRY_THRESH		BIT(3)
-#define CXL_RAS_CE_CACHE_POISON		BIT(4)
-#define CXL_RAS_CE_MEM_POISON		BIT(5)
-#define CXL_RAS_CE_PHYS_LAYER_ERR	BIT(6)
-
-#define show_ce_errs(status)	__print_flags(status, " | ",			\
-	{ CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" },			\
-	{ CXL_RAS_CE_MEM_DATA_ECC, "Memory Data ECC Error" },			\
-	{ CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" },				\
-	{ CLX_RAS_CE_RETRY_THRESH, "Retry Threshold" },				\
-	{ CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer" },		\
-	{ CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer" },		\
-	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" }	\
-)
-
-TRACE_EVENT(cxl_aer_correctable_error,
-	TP_PROTO(const struct device *dev, u32 status),
-	TP_ARGS(dev, status),
-	TP_STRUCT__entry(
-		__string(dev_name, dev_name(dev))
-		__field(u32, status)
-	),
-	TP_fast_assign(
-		__assign_str(dev_name, dev_name(dev));
-		__entry->status = status;
-	),
-	TP_printk("%s: status: '%s'",
-		  __get_str(dev_name), show_ce_errs(__entry->status)
-	)
-);
-
-#endif /* _CXL_EVENTS_H */
-
-/* This part must be outside protection */
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE cxl
-#include <trace/define_trace.h>
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 0805f08af8b3..12af1c9270ff 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -17,6 +17,7 @@ CXL_SRC := $(DRIVERS)/cxl
 CXL_CORE_SRC := $(DRIVERS)/cxl/core
 ccflags-y := -I$(srctree)/drivers/cxl/
 ccflags-y += -D__mock=__weak
+ccflags-y += -DTRACE_INCLUDE_PATH=$(CXL_CORE_SRC) -I$(srctree)/drivers/cxl/core/
 
 obj-m += cxl_acpi.o
 
@@ -49,6 +50,7 @@ cxl_core-y += $(CXL_CORE_SRC)/memdev.o
 cxl_core-y += $(CXL_CORE_SRC)/mbox.o
 cxl_core-y += $(CXL_CORE_SRC)/pci.o
 cxl_core-y += $(CXL_CORE_SRC)/hdm.o
+cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
 cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
 cxl_core-y += config_check.o
 
-- 
cgit 


From 5d1dd961e74334a2178264193ea813d44ce5e725 Mon Sep 17 00:00:00 2001
From: "Borislav Petkov (AMD)" <bp@alien8.de>
Date: Thu, 22 Dec 2022 12:42:24 +0100
Subject: x86/alternatives: Add alt_instr.flags

Add a struct alt_instr.flags field which will contain different flags
controlling alternatives patching behavior.

The initial idea was to be able to specify it as a separate macro
parameter but that would mean touching all possible invocations of the
alternatives macros and thus a lot of churn.

What is more, as PeterZ suggested, being able to say ALT_NOT(feature) is
very readable and explains exactly what is meant.

So make the feature field a u32 where the patching flags are the upper
u16 part of the dword quantity while the lower u16 word is the feature.

The highest feature number currently is 0x26a (i.e., word 19) so there
is plenty of space. If that becomes insufficient, the field can be
extended to u64 which will then make struct alt_instr of the nice size
of 16 bytes (14 bytes currently).

There should be no functional changes resulting from this.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/Y6RCoJEtxxZWwotd@zn.tnic
---
 arch/x86/include/asm/alternative.h            | 132 +++++++++++++++-----------
 arch/x86/kernel/alternative.c                 |  14 ++-
 tools/objtool/arch/x86/include/arch/special.h |   6 +-
 3 files changed, 85 insertions(+), 67 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 7659217f4d49..e2975a32d443 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -6,8 +6,10 @@
 #include <linux/stringify.h>
 #include <asm/asm.h>
 
-#define ALTINSTR_FLAG_INV	(1 << 15)
-#define ALT_NOT(feat)		((feat) | ALTINSTR_FLAG_INV)
+#define ALT_FLAGS_SHIFT		16
+
+#define ALT_FLAG_NOT		BIT(0)
+#define ALT_NOT(feature)	((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature))
 
 #ifndef __ASSEMBLY__
 
@@ -59,10 +61,27 @@
 	".long 999b - .\n\t"					\
 	".popsection\n\t"
 
+/*
+ * The patching flags are part of the upper bits of the @ft_flags parameter when
+ * specifying them. The split is currently like this:
+ *
+ * [31... flags ...16][15... CPUID feature bit ...0]
+ *
+ * but since this is all hidden in the macros argument being split, those fields can be
+ * extended in the future to fit in a u64 or however the need arises.
+ */
 struct alt_instr {
 	s32 instr_offset;	/* original instruction */
 	s32 repl_offset;	/* offset to replacement instruction */
-	u16 cpuid;		/* cpuid bit set for replacement */
+
+	union {
+		struct {
+			u32 cpuid: 16;	/* CPUID bit set for replacement */
+			u32 flags: 16;	/* patching control flags */
+		};
+		u32 ft_flags;
+	};
+
 	u8  instrlen;		/* length of original instruction */
 	u8  replacementlen;	/* length of new instruction */
 } __packed;
@@ -182,10 +201,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
 		" - (" alt_slen ")), 0x90\n"							\
 	alt_end_marker ":\n"
 
-#define ALTINSTR_ENTRY(feature, num)					      \
+#define ALTINSTR_ENTRY(ft_flags, num)					      \
 	" .long 661b - .\n"				/* label           */ \
 	" .long " b_replacement(num)"f - .\n"		/* new instruction */ \
-	" .word " __stringify(feature) "\n"		/* feature bit     */ \
+	" .4byte " __stringify(ft_flags) "\n"		/* feature + flags */ \
 	" .byte " alt_total_slen "\n"			/* source len      */ \
 	" .byte " alt_rlen(num) "\n"			/* replacement len */
 
@@ -194,20 +213,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n"
 
 /* alternative assembly primitive: */
-#define ALTERNATIVE(oldinstr, newinstr, feature)			\
+#define ALTERNATIVE(oldinstr, newinstr, ft_flags)			\
 	OLDINSTR(oldinstr, 1)						\
 	".pushsection .altinstructions,\"a\"\n"				\
-	ALTINSTR_ENTRY(feature, 1)					\
+	ALTINSTR_ENTRY(ft_flags, 1)					\
 	".popsection\n"							\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr, 1)				\
 	".popsection\n"
 
-#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
+#define ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
 	OLDINSTR_2(oldinstr, 1, 2)					\
 	".pushsection .altinstructions,\"a\"\n"				\
-	ALTINSTR_ENTRY(feature1, 1)					\
-	ALTINSTR_ENTRY(feature2, 2)					\
+	ALTINSTR_ENTRY(ft_flags1, 1)					\
+	ALTINSTR_ENTRY(ft_flags2, 2)					\
 	".popsection\n"							\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr1, 1)				\
@@ -215,21 +234,22 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	".popsection\n"
 
 /* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
-#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
 	ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS,	\
-		      newinstr_yes, feature)
-
-#define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \
-	OLDINSTR_3(oldinsn, 1, 2, 3)						\
-	".pushsection .altinstructions,\"a\"\n"					\
-	ALTINSTR_ENTRY(feat1, 1)						\
-	ALTINSTR_ENTRY(feat2, 2)						\
-	ALTINSTR_ENTRY(feat3, 3)						\
-	".popsection\n"								\
-	".pushsection .altinstr_replacement, \"ax\"\n"				\
-	ALTINSTR_REPLACEMENT(newinsn1, 1)					\
-	ALTINSTR_REPLACEMENT(newinsn2, 2)					\
-	ALTINSTR_REPLACEMENT(newinsn3, 3)					\
+		      newinstr_yes, ft_flags)
+
+#define ALTERNATIVE_3(oldinsn, newinsn1, ft_flags1, newinsn2, ft_flags2, \
+			newinsn3, ft_flags3)				\
+	OLDINSTR_3(oldinsn, 1, 2, 3)					\
+	".pushsection .altinstructions,\"a\"\n"				\
+	ALTINSTR_ENTRY(ft_flags1, 1)					\
+	ALTINSTR_ENTRY(ft_flags2, 2)					\
+	ALTINSTR_ENTRY(ft_flags3, 3)					\
+	".popsection\n"							\
+	".pushsection .altinstr_replacement, \"ax\"\n"			\
+	ALTINSTR_REPLACEMENT(newinsn1, 1)				\
+	ALTINSTR_REPLACEMENT(newinsn2, 2)				\
+	ALTINSTR_REPLACEMENT(newinsn3, 3)				\
 	".popsection\n"
 
 /*
@@ -244,14 +264,14 @@ static inline int alternatives_text_reserved(void *start, void *end)
  * For non barrier like inlines please define new variants
  * without volatile and memory clobber.
  */
-#define alternative(oldinstr, newinstr, feature)			\
-	asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+#define alternative(oldinstr, newinstr, ft_flags)			\
+	asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) : : : "memory")
 
-#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
-	asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+#define alternative_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
+	asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) ::: "memory")
 
-#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \
-	asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory")
+#define alternative_ternary(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
+	asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) ::: "memory")
 
 /*
  * Alternative inline assembly with input.
@@ -261,8 +281,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
  * Argument numbers start with 1.
  * Leaving an unused argument 0 to keep API compatibility.
  */
-#define alternative_input(oldinstr, newinstr, feature, input...)	\
-	asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature)	\
+#define alternative_input(oldinstr, newinstr, ft_flags, input...)	\
+	asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags)	\
 		: : "i" (0), ## input)
 
 /*
@@ -273,20 +293,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
  * Otherwise, if CPU has feature1, newinstr1 is used.
  * Otherwise, oldinstr is used.
  */
-#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2,	     \
-			   feature2, input...)				     \
-	asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1,     \
-		newinstr2, feature2)					     \
+#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2,	     \
+			   ft_flags2, input...)				     \
+	asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1,     \
+		newinstr2, ft_flags2)					     \
 		: : "i" (0), ## input)
 
 /* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, newinstr, feature, output, input...)	\
-	asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature)	\
+#define alternative_io(oldinstr, newinstr, ft_flags, output, input...)	\
+	asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags)	\
 		: output : "i" (0), ## input)
 
 /* Like alternative_io, but for replacing a direct call with another one. */
-#define alternative_call(oldfunc, newfunc, feature, output, input...)	\
-	asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
+#define alternative_call(oldfunc, newfunc, ft_flags, output, input...)	\
+	asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", ft_flags) \
 		: output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
 
 /*
@@ -295,10 +315,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
  * Otherwise, if CPU has feature1, function1 is used.
  * Otherwise, old function is used.
  */
-#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2,   \
+#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2,   \
 			   output, input...)				      \
-	asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
-		"call %P[new2]", feature2)				      \
+	asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", ft_flags1,\
+		"call %P[new2]", ft_flags2)				      \
 		: output, ASM_CALL_CONSTRAINT				      \
 		: [old] "i" (oldfunc), [new1] "i" (newfunc1),		      \
 		  [new2] "i" (newfunc2), ## input)
@@ -347,10 +367,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
  * enough information for the alternatives patching code to patch an
  * instruction. See apply_alternatives().
  */
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstr_entry orig alt ft_flags orig_len alt_len
 	.long \orig - .
 	.long \alt - .
-	.word \feature
+	.4byte \ft_flags
 	.byte \orig_len
 	.byte \alt_len
 .endm
@@ -361,7 +381,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
  * @newinstr. ".skip" directive takes care of proper instruction padding
  * in case @newinstr is longer than @oldinstr.
  */
-.macro ALTERNATIVE oldinstr, newinstr, feature
+.macro ALTERNATIVE oldinstr, newinstr, ft_flags
 140:
 	\oldinstr
 141:
@@ -369,7 +389,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 142:
 
 	.pushsection .altinstructions,"a"
-	altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f
+	altinstr_entry 140b,143f,\ft_flags,142b-140b,144f-143f
 	.popsection
 
 	.pushsection .altinstr_replacement,"ax"
@@ -399,7 +419,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
  * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
  * @feature2, it replaces @oldinstr with @feature2.
  */
-.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+.macro ALTERNATIVE_2 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2
 140:
 	\oldinstr
 141:
@@ -408,8 +428,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
 142:
 
 	.pushsection .altinstructions,"a"
-	altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
-	altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
+	altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f
+	altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f
 	.popsection
 
 	.pushsection .altinstr_replacement,"ax"
@@ -421,7 +441,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	.popsection
 .endm
 
-.macro ALTERNATIVE_3 oldinstr, newinstr1, feature1, newinstr2, feature2, newinstr3, feature3
+.macro ALTERNATIVE_3 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, newinstr3, ft_flags3
 140:
 	\oldinstr
 141:
@@ -430,9 +450,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
 142:
 
 	.pushsection .altinstructions,"a"
-	altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
-	altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
-	altinstruction_entry 140b,145f,\feature3,142b-140b,146f-145f
+	altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f
+	altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f
+	altinstr_entry 140b,145f,\ft_flags3,142b-140b,146f-145f
 	.popsection
 
 	.pushsection .altinstr_replacement,"ax"
@@ -447,9 +467,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
 .endm
 
 /* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
-#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
 	ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS,	\
-	newinstr_yes, feature
+	newinstr_yes, ft_flags
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7d8c3cbde368..9a45aa1c775c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -282,27 +282,25 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 	 */
 	for (a = start; a < end; a++) {
 		int insn_buff_sz = 0;
-		/* Mask away "NOT" flag bit for feature to test. */
-		u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
 
 		instr = (u8 *)&a->instr_offset + a->instr_offset;
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 		BUG_ON(a->instrlen > sizeof(insn_buff));
-		BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
+		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
 
 		/*
 		 * Patch if either:
 		 * - feature is present
-		 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
+		 * - feature not present but ALT_FLAG_NOT is set to mean,
 		 *   patch if feature is *NOT* present.
 		 */
-		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
+		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT))
 			goto next;
 
 		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
-			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
-			feature >> 5,
-			feature & 0x1f,
+			(a->flags & ALT_FLAG_NOT) ? "!" : "",
+			a->cpuid >> 5,
+			a->cpuid & 0x1f,
 			instr, instr, a->instrlen,
 			replacement, a->replacementlen);
 
diff --git a/tools/objtool/arch/x86/include/arch/special.h b/tools/objtool/arch/x86/include/arch/special.h
index f2918f789a0a..ca8131352994 100644
--- a/tools/objtool/arch/x86/include/arch/special.h
+++ b/tools/objtool/arch/x86/include/arch/special.h
@@ -11,11 +11,11 @@
 #define JUMP_NEW_OFFSET		4
 #define JUMP_KEY_OFFSET		8
 
-#define ALT_ENTRY_SIZE		12
+#define ALT_ENTRY_SIZE		14
 #define ALT_ORIG_OFFSET		0
 #define ALT_NEW_OFFSET		4
 #define ALT_FEATURE_OFFSET	8
-#define ALT_ORIG_LEN_OFFSET	10
-#define ALT_NEW_LEN_OFFSET	11
+#define ALT_ORIG_LEN_OFFSET	12
+#define ALT_NEW_LEN_OFFSET	13
 
 #endif /* _X86_ARCH_SPECIAL_H */
-- 
cgit 


From 8c149eb011be23679b3320d3939f4e3c8271969c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 13 Dec 2022 08:44:24 -0800
Subject: tools/testing/cxl: Prevent cxl_test from confusing production modules

The cxl_test machinery builds modified versions of the modules in
drivers/cxl/ and intercepts some of their calls to allow cxl_test to
inject mock CXL topologies for test.

However, if cxl_test attempts the same with production modules,
fireworks ensue as Luis discovered [1]. Prevent that scenario by
arranging for cxl_test to check for a "watermark" symbol in each of the
modules it expects to be modified before the test can run. This turns
undefined runtime behavior or crashes into a safer failure to load the
cxl_test module.

Link: http://lore.kernel.org/r/20221209062919.1096779-1-mcgrof@kernel.org [1]
Reported-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/cxl/Kbuild          |  6 ++++++
 tools/testing/cxl/cxl_acpi_test.c |  6 ++++++
 tools/testing/cxl/cxl_core_test.c |  6 ++++++
 tools/testing/cxl/cxl_mem_test.c  |  6 ++++++
 tools/testing/cxl/cxl_pmem_test.c |  6 ++++++
 tools/testing/cxl/cxl_port_test.c |  6 ++++++
 tools/testing/cxl/test/cxl.c      |  8 ++++++++
 tools/testing/cxl/watermark.h     | 25 +++++++++++++++++++++++++
 8 files changed, 69 insertions(+)
 create mode 100644 tools/testing/cxl/cxl_acpi_test.c
 create mode 100644 tools/testing/cxl/cxl_core_test.c
 create mode 100644 tools/testing/cxl/cxl_mem_test.c
 create mode 100644 tools/testing/cxl/cxl_pmem_test.c
 create mode 100644 tools/testing/cxl/cxl_port_test.c
 create mode 100644 tools/testing/cxl/watermark.h

(limited to 'tools')

diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 12af1c9270ff..37f77ac9b917 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -24,22 +24,27 @@ obj-m += cxl_acpi.o
 cxl_acpi-y := $(CXL_SRC)/acpi.o
 cxl_acpi-y += mock_acpi.o
 cxl_acpi-y += config_check.o
+cxl_acpi-y += cxl_acpi_test.o
 
 obj-m += cxl_pmem.o
 
 cxl_pmem-y := $(CXL_SRC)/pmem.o
 cxl_pmem-y += $(CXL_SRC)/security.o
 cxl_pmem-y += config_check.o
+cxl_pmem-y += cxl_pmem_test.o
 
 obj-m += cxl_port.o
 
 cxl_port-y := $(CXL_SRC)/port.o
 cxl_port-y += config_check.o
+cxl_port-y += cxl_port_test.o
+
 
 obj-m += cxl_mem.o
 
 cxl_mem-y := $(CXL_SRC)/mem.o
 cxl_mem-y += config_check.o
+cxl_mem-y += cxl_mem_test.o
 
 obj-m += cxl_core.o
 
@@ -53,5 +58,6 @@ cxl_core-y += $(CXL_CORE_SRC)/hdm.o
 cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
 cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
 cxl_core-y += config_check.o
+cxl_core-y += cxl_core_test.o
 
 obj-m += test/
diff --git a/tools/testing/cxl/cxl_acpi_test.c b/tools/testing/cxl/cxl_acpi_test.c
new file mode 100644
index 000000000000..8602dc27c81c
--- /dev/null
+++ b/tools/testing/cxl/cxl_acpi_test.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+
+#include "watermark.h"
+
+cxl_test_watermark(cxl_acpi);
diff --git a/tools/testing/cxl/cxl_core_test.c b/tools/testing/cxl/cxl_core_test.c
new file mode 100644
index 000000000000..464a9255e4d6
--- /dev/null
+++ b/tools/testing/cxl/cxl_core_test.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+
+#include "watermark.h"
+
+cxl_test_watermark(cxl_core);
diff --git a/tools/testing/cxl/cxl_mem_test.c b/tools/testing/cxl/cxl_mem_test.c
new file mode 100644
index 000000000000..ba7fb8a44288
--- /dev/null
+++ b/tools/testing/cxl/cxl_mem_test.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+
+#include "watermark.h"
+
+cxl_test_watermark(cxl_mem);
diff --git a/tools/testing/cxl/cxl_pmem_test.c b/tools/testing/cxl/cxl_pmem_test.c
new file mode 100644
index 000000000000..3fd884fae537
--- /dev/null
+++ b/tools/testing/cxl/cxl_pmem_test.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+
+#include "watermark.h"
+
+cxl_test_watermark(cxl_pmem);
diff --git a/tools/testing/cxl/cxl_port_test.c b/tools/testing/cxl/cxl_port_test.c
new file mode 100644
index 000000000000..be183917a9f6
--- /dev/null
+++ b/tools/testing/cxl/cxl_port_test.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+
+#include "watermark.h"
+
+cxl_test_watermark(cxl_port);
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 30ee680d38ff..920bd969c554 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -9,6 +9,8 @@
 #include <linux/pci.h>
 #include <linux/mm.h>
 #include <cxlmem.h>
+
+#include "../watermark.h"
 #include "mock.h"
 
 static int interleave_arithmetic;
@@ -1119,6 +1121,12 @@ static __init int cxl_test_init(void)
 {
 	int rc, i;
 
+	cxl_acpi_test();
+	cxl_core_test();
+	cxl_mem_test();
+	cxl_pmem_test();
+	cxl_port_test();
+
 	register_cxl_mock_ops(&cxl_mock_ops);
 
 	cxl_mock_pool = gen_pool_create(ilog2(SZ_2M), NUMA_NO_NODE);
diff --git a/tools/testing/cxl/watermark.h b/tools/testing/cxl/watermark.h
new file mode 100644
index 000000000000..9d81d4a5f6be
--- /dev/null
+++ b/tools/testing/cxl/watermark.h
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#ifndef _TEST_CXL_WATERMARK_H_
+#define _TEST_CXL_WATERMARK_H_
+#include <linux/module.h>
+#include <linux/printk.h>
+
+int cxl_acpi_test(void);
+int cxl_core_test(void);
+int cxl_mem_test(void);
+int cxl_pmem_test(void);
+int cxl_port_test(void);
+
+/*
+ * dummy routine for cxl_test to validate it is linking to the properly
+ * mocked module and not the standard one from the base tree.
+ */
+#define cxl_test_watermark(x)				\
+int x##_test(void)					\
+{							\
+	pr_debug("%s for cxl_test\n", KBUILD_MODNAME);	\
+	return 0;					\
+}							\
+EXPORT_SYMBOL(x##_test)
+#endif /* _TEST_CXL_WATERMARK_H_ */
-- 
cgit 


From 6d0c4b11e743b84eb57898783e9adecb9083b269 Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Thu, 5 Jan 2023 22:36:33 +0800
Subject: libbpf: Poison strlcpy()

Since commit 9fc205b413b3("libbpf: Add sane strncpy alternative and use
it internally") introduce libbpf_strlcpy(), thus add strlcpy() to a poison
list to prevent accidental use of it.

Signed-off-by: Rong Tao <rongtao@cestc.cn>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/tencent_5695A257C4D16B4413036BA1DAACDECB0B07@qq.com
---
 tools/lib/bpf/libbpf_internal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index e4d05662a96c..fbaf68335394 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -20,8 +20,8 @@
 /* make sure libbpf doesn't use kernel-only integer typedefs */
 #pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
 
-/* prevent accidental re-addition of reallocarray() */
-#pragma GCC poison reallocarray
+/* prevent accidental re-addition of reallocarray()/strlcpy() */
+#pragma GCC poison reallocarray strlcpy
 
 #include "libbpf.h"
 #include "btf.h"
-- 
cgit 


From d74f87f37672e71457bfcc14eca5eeb1d61b6438 Mon Sep 17 00:00:00 2001
From: Michal Clapinski <mclapinski@google.com>
Date: Wed, 7 Dec 2022 17:43:38 +0100
Subject: selftests/membarrier: Test MEMBARRIER_CMD_GET_REGISTRATIONS

Keep track of previously issued registrations and compare the result
with MEMBARRIER_CMD_GET_REGISTRATIONS return value.

Signed-off-by: Michal Clapinski <mclapinski@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/20221207164338.1535591-3-mclapinski@google.com
---
 .../selftests/membarrier/membarrier_test_impl.h    | 33 ++++++++++++++++++++++
 .../membarrier/membarrier_test_multi_thread.c      |  2 +-
 .../membarrier/membarrier_test_single_thread.c     |  6 +++-
 3 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/membarrier/membarrier_test_impl.h b/tools/testing/selftests/membarrier/membarrier_test_impl.h
index 186be69f0a59..af89855adb7b 100644
--- a/tools/testing/selftests/membarrier/membarrier_test_impl.h
+++ b/tools/testing/selftests/membarrier/membarrier_test_impl.h
@@ -9,11 +9,38 @@
 
 #include "../kselftest.h"
 
+static int registrations;
+
 static int sys_membarrier(int cmd, int flags)
 {
 	return syscall(__NR_membarrier, cmd, flags);
 }
 
+static int test_membarrier_get_registrations(int cmd)
+{
+	int ret, flags = 0;
+	const char *test_name =
+		"sys membarrier MEMBARRIER_CMD_GET_REGISTRATIONS";
+
+	registrations |= cmd;
+
+	ret = sys_membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0);
+	if (ret < 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	} else if (ret != registrations) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, ret = %d, registrations = %d\n",
+			test_name, flags, ret, registrations);
+	}
+	ksft_test_result_pass(
+		"%s test: flags = %d, ret = %d, registrations = %d\n",
+		test_name, flags, ret, registrations);
+
+	return 0;
+}
+
 static int test_membarrier_cmd_fail(void)
 {
 	int cmd = -1, flags = 0;
@@ -113,6 +140,8 @@ static int test_membarrier_register_private_expedited_success(void)
 	ksft_test_result_pass(
 		"%s test: flags = %d\n",
 		test_name, flags);
+
+	test_membarrier_get_registrations(cmd);
 	return 0;
 }
 
@@ -170,6 +199,8 @@ static int test_membarrier_register_private_expedited_sync_core_success(void)
 	ksft_test_result_pass(
 		"%s test: flags = %d\n",
 		test_name, flags);
+
+	test_membarrier_get_registrations(cmd);
 	return 0;
 }
 
@@ -204,6 +235,8 @@ static int test_membarrier_register_global_expedited_success(void)
 	ksft_test_result_pass(
 		"%s test: flags = %d\n",
 		test_name, flags);
+
+	test_membarrier_get_registrations(cmd);
 	return 0;
 }
 
diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
index ac5613e5b0eb..a9cc17facfb3 100644
--- a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
+++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
@@ -62,7 +62,7 @@ static int test_mt_membarrier(void)
 int main(int argc, char **argv)
 {
 	ksft_print_header();
-	ksft_set_plan(13);
+	ksft_set_plan(16);
 
 	test_membarrier_query();
 
diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
index c1c963902854..4cdc8b1d124c 100644
--- a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
+++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
@@ -12,7 +12,9 @@
 int main(int argc, char **argv)
 {
 	ksft_print_header();
-	ksft_set_plan(13);
+	ksft_set_plan(18);
+
+	test_membarrier_get_registrations(/*cmd=*/0);
 
 	test_membarrier_query();
 
@@ -20,5 +22,7 @@ int main(int argc, char **argv)
 
 	test_membarrier_success();
 
+	test_membarrier_get_registrations(/*cmd=*/0);
+
 	return ksft_exit_pass();
 }
-- 
cgit 


From 3da73f102309fe29150e5c35acd20dd82063ff67 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006@gmail.com>
Date: Mon, 5 Dec 2022 12:06:42 +0400
Subject: objtool: Fix memory leak in create_static_call_sections()

strdup() allocates memory for key_name. We need to release the memory in
the following error paths. Add free() to avoid memory leak.

Fixes: 1e7e47883830 ("x86/static_call: Add inline static call implementation for x86-64")
Signed-off-by: Miaoqian Lin <linmq006@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20221205080642.558583-1-linmq006@gmail.com
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 tools/objtool/check.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 4350be739f4f..cab1a162781c 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -679,6 +679,7 @@ static int create_static_call_sections(struct objtool_file *file)
 		if (strncmp(key_name, STATIC_CALL_TRAMP_PREFIX_STR,
 			    STATIC_CALL_TRAMP_PREFIX_LEN)) {
 			WARN("static_call: trampoline name malformed: %s", key_name);
+			free(key_name);
 			return -1;
 		}
 		tmp = key_name + STATIC_CALL_TRAMP_PREFIX_LEN - STATIC_CALL_KEY_PREFIX_LEN;
@@ -688,6 +689,7 @@ static int create_static_call_sections(struct objtool_file *file)
 		if (!key_sym) {
 			if (!opts.module) {
 				WARN("static_call: can't find static_call_key symbol: %s", tmp);
+				free(key_name);
 				return -1;
 			}
 
-- 
cgit 


From 4a753ca5013d465694d6639fbe1378831a399bda Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Fri, 6 Jan 2023 10:57:24 -0800
Subject: selftest: mptcp: exit from copyfd_io_poll() when receive SIGUSR1

For now, mptcp_connect won't exit after receiving the 'SIGUSR1' signal
if '-r' is set. Fix this by skipping poll and sleep in copyfd_io_poll()
if 'quit' is set.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/mptcp_connect.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index 8a8266957bc5..b25a31445ded 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -627,7 +627,7 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd,
 		char rbuf[8192];
 		ssize_t len;
 
-		if (fds.events == 0)
+		if (fds.events == 0 || quit)
 			break;
 
 		switch (poll(&fds, 1, poll_timeout)) {
@@ -733,7 +733,7 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd,
 	}
 
 	/* leave some time for late join/announce */
-	if (cfg_remove)
+	if (cfg_remove && !quit)
 		usleep(cfg_wait);
 
 	return 0;
-- 
cgit 


From e04a30f788091076ff874ede84d33817d50d5937 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Fri, 6 Jan 2023 10:57:25 -0800
Subject: selftest: mptcp: add test for mptcp socket in use

Add the function chk_msk_inuse() to diag.sh, which is used to check the
statistics of mptcp socket in use. As mptcp socket in listen state will
be closed randomly after 'accept', we need to get the count of listening
mptcp socket through 'ss' command.

All tests pass.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/diag.sh | 56 ++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh
index 24bcd7b9bdb2..ef628b16fe9b 100755
--- a/tools/testing/selftests/net/mptcp/diag.sh
+++ b/tools/testing/selftests/net/mptcp/diag.sh
@@ -17,6 +17,11 @@ flush_pids()
 	sleep 1.1
 
 	ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGUSR1 &>/dev/null
+
+	for _ in $(seq 10); do
+		[ -z "$(ip netns pids "${ns}")" ] && break
+		sleep 0.1
+	done
 }
 
 cleanup()
@@ -37,15 +42,20 @@ if [ $? -ne 0 ];then
 	exit $ksft_skip
 fi
 
+get_msk_inuse()
+{
+	ip netns exec $ns cat /proc/net/protocols | awk '$1~/^MPTCP$/{print $3}'
+}
+
 __chk_nr()
 {
-	local condition="$1"
+	local command="$1"
 	local expected=$2
 	local msg nr
 
 	shift 2
 	msg=$*
-	nr=$(ss -inmHMN $ns | $condition)
+	nr=$(eval $command)
 
 	printf "%-50s" "$msg"
 	if [ $nr != $expected ]; then
@@ -57,9 +67,17 @@ __chk_nr()
 	test_cnt=$((test_cnt+1))
 }
 
+__chk_msk_nr()
+{
+	local condition=$1
+	shift 1
+
+	__chk_nr "ss -inmHMN $ns | $condition" $*
+}
+
 chk_msk_nr()
 {
-	__chk_nr "grep -c token:" $*
+	__chk_msk_nr "grep -c token:" $*
 }
 
 wait_msk_nr()
@@ -97,12 +115,12 @@ wait_msk_nr()
 
 chk_msk_fallback_nr()
 {
-		__chk_nr "grep -c fallback" $*
+		__chk_msk_nr "grep -c fallback" $*
 }
 
 chk_msk_remote_key_nr()
 {
-		__chk_nr "grep -c remote_key" $*
+		__chk_msk_nr "grep -c remote_key" $*
 }
 
 __chk_listen()
@@ -142,6 +160,26 @@ chk_msk_listen()
 	nr=$(ss -Ml $filter | wc -l)
 }
 
+chk_msk_inuse()
+{
+	local expected=$1
+	local listen_nr
+
+	shift 1
+
+	listen_nr=$(ss -N "${ns}" -Ml | grep -c LISTEN)
+	expected=$((expected + listen_nr))
+
+	for _ in $(seq 10); do
+		if [ $(get_msk_inuse) -eq $expected ];then
+			break
+		fi
+		sleep 0.1
+	done
+
+	__chk_nr get_msk_inuse $expected $*
+}
+
 # $1: ns, $2: port
 wait_local_port_listen()
 {
@@ -195,8 +233,10 @@ wait_connected $ns 10000
 chk_msk_nr 2 "after MPC handshake "
 chk_msk_remote_key_nr 2 "....chk remote_key"
 chk_msk_fallback_nr 0 "....chk no fallback"
+chk_msk_inuse 2 "....chk 2 msk in use"
 flush_pids
 
+chk_msk_inuse 0 "....chk 0 msk in use after flush"
 
 echo "a" | \
 	timeout ${timeout_test} \
@@ -211,8 +251,11 @@ echo "b" | \
 				127.0.0.1 >/dev/null &
 wait_connected $ns 10001
 chk_msk_fallback_nr 1 "check fallback"
+chk_msk_inuse 1 "....chk 1 msk in use"
 flush_pids
 
+chk_msk_inuse 0 "....chk 0 msk in use after flush"
+
 NR_CLIENTS=100
 for I in `seq 1 $NR_CLIENTS`; do
 	echo "a" | \
@@ -232,6 +275,9 @@ for I in `seq 1 $NR_CLIENTS`; do
 done
 
 wait_msk_nr $((NR_CLIENTS*2)) "many msk socket present"
+chk_msk_inuse $((NR_CLIENTS*2)) "....chk many msk in use"
 flush_pids
 
+chk_msk_inuse 0 "....chk 0 msk in use after flush"
+
 exit $ret
-- 
cgit 


From 18a5a09d90a75f40d5dcd3bd2ff9052845f36c37 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 9 Jan 2023 09:09:07 +0100
Subject: nolibc: add support for s390

Use arch-x86_64 as a template. Not really different, but
we have our own mmap syscall which takes a structure instead
of discrete arguments.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-s390.h | 222 +++++++++++++++++++++++++++++++++++++++
 tools/include/nolibc/arch.h      |   2 +
 tools/include/nolibc/sys.h       |   2 +
 3 files changed, 226 insertions(+)
 create mode 100644 tools/include/nolibc/arch-s390.h

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h
new file mode 100644
index 000000000000..76bc8fdaf922
--- /dev/null
+++ b/tools/include/nolibc/arch-s390.h
@@ -0,0 +1,222 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * s390 specific definitions for NOLIBC
+ */
+
+#ifndef _NOLIBC_ARCH_S390_H
+#define _NOLIBC_ARCH_S390_H
+#include <asm/unistd.h>
+
+/* O_* macros for fcntl/open are architecture-specific */
+#define O_RDONLY            0
+#define O_WRONLY            1
+#define O_RDWR              2
+#define O_CREAT          0x40
+#define O_EXCL           0x80
+#define O_NOCTTY        0x100
+#define O_TRUNC         0x200
+#define O_APPEND        0x400
+#define O_NONBLOCK      0x800
+#define O_DIRECTORY   0x10000
+
+/* The struct returned by the stat() syscall, equivalent to stat64(). The
+ * syscall returns 116 bytes and stops in the middle of __unused.
+ */
+
+struct sys_stat_struct {
+	unsigned long	st_dev;
+	unsigned long	st_ino;
+	unsigned long	st_nlink;
+	unsigned int	st_mode;
+	unsigned int	st_uid;
+	unsigned int	st_gid;
+	unsigned int	__pad1;
+	unsigned long	st_rdev;
+	unsigned long	st_size;
+	unsigned long	st_atime;
+	unsigned long	st_atime_nsec;
+	unsigned long	st_mtime;
+	unsigned long	st_mtime_nsec;
+	unsigned long	st_ctime;
+	unsigned long	st_ctime_nsec;
+	unsigned long	st_blksize;
+	long		st_blocks;
+	unsigned long	__unused[3];
+};
+
+/* Syscalls for s390:
+ *   - registers are 64-bit
+ *   - syscall number is passed in r1
+ *   - arguments are in r2-r7
+ *   - the system call is performed by calling the svc instruction
+ *   - syscall return value is in r2
+ *   - r1 and r2 are clobbered, others are preserved.
+ *
+ * Link s390 ABI: https://github.com/IBM/s390x-abi
+ *
+ */
+
+#define my_syscall0(num)						\
+({									\
+	register long _num __asm__ ("1") = (num);			\
+	register long _rc __asm__ ("2");				\
+									\
+	__asm__  volatile (						\
+		"svc 0\n"						\
+		: "=d"(_rc)						\
+		: "d"(_num)						\
+		: "memory", "cc"					\
+		);							\
+	_rc;								\
+})
+
+#define my_syscall1(num, arg1)						\
+({									\
+	register long _num __asm__ ("1") = (num);			\
+	register long _arg1 __asm__ ("2") = (long)(arg1);		\
+									\
+	__asm__  volatile (						\
+		"svc 0\n"						\
+		: "+d"(_arg1)						\
+		: "d"(_num)						\
+		: "memory", "cc"					\
+		);							\
+	_arg1;								\
+})
+
+#define my_syscall2(num, arg1, arg2)					\
+({									\
+	register long _num __asm__ ("1") = (num);			\
+	register long _arg1 __asm__ ("2") = (long)(arg1);		\
+	register long _arg2 __asm__ ("3") = (long)(arg2);		\
+									\
+	__asm__  volatile (						\
+		"svc 0\n"						\
+		: "+d"(_arg1)						\
+		: "d"(_arg2), "d"(_num)					\
+		: "memory", "cc"					\
+		);							\
+	_arg1;								\
+})
+
+#define my_syscall3(num, arg1, arg2, arg3)				\
+({									\
+	register long _num __asm__ ("1") = (num);			\
+	register long _arg1 __asm__ ("2") = (long)(arg1);		\
+	register long _arg2 __asm__ ("3") = (long)(arg2);		\
+	register long _arg3 __asm__ ("4") = (long)(arg3);		\
+									\
+	__asm__  volatile (						\
+		"svc 0\n"						\
+		: "+d"(_arg1)						\
+		: "d"(_arg2), "d"(_arg3), "d"(_num)			\
+		: "memory", "cc"					\
+		);							\
+	_arg1;								\
+})
+
+#define my_syscall4(num, arg1, arg2, arg3, arg4)			\
+({									\
+	register long _num __asm__ ("1") = (num);			\
+	register long _arg1 __asm__ ("2") = (long)(arg1);		\
+	register long _arg2 __asm__ ("3") = (long)(arg2);		\
+	register long _arg3 __asm__ ("4") = (long)(arg3);		\
+	register long _arg4 __asm__ ("5") = (long)(arg4);		\
+									\
+	__asm__  volatile (						\
+		"svc 0\n"						\
+		: "+d"(_arg1)						\
+		: "d"(_arg2), "d"(_arg3), "d"(_arg4), "d"(_num)		\
+		: "memory", "cc"					\
+		);							\
+	_arg1;								\
+})
+
+#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)			\
+({									\
+	register long _num __asm__ ("1") = (num);			\
+	register long _arg1 __asm__ ("2") = (long)(arg1);		\
+	register long _arg2 __asm__ ("3") = (long)(arg2);		\
+	register long _arg3 __asm__ ("4") = (long)(arg3);		\
+	register long _arg4 __asm__ ("5") = (long)(arg4);		\
+	register long _arg5 __asm__ ("6") = (long)(arg5);		\
+									\
+	__asm__  volatile (						\
+		"svc 0\n"						\
+		: "+d"(_arg1)						\
+		: "d"(_arg2), "d"(_arg3), "d"(_arg4), "d"(_arg5),	\
+		  "d"(_num)						\
+		: "memory", "cc"					\
+		);							\
+	_arg1;								\
+})
+
+#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6)		\
+({									\
+	register long _num __asm__ ("1") = (num);			\
+	register long _arg1 __asm__ ("2") = (long)(arg1);		\
+	register long _arg2 __asm__ ("3") = (long)(arg2);		\
+	register long _arg3 __asm__ ("4") = (long)(arg3);		\
+	register long _arg4 __asm__ ("5") = (long)(arg4);		\
+	register long _arg5 __asm__ ("6") = (long)(arg5);		\
+	register long _arg6 __asm__ ("7") = (long)(arg6);		\
+									\
+	__asm__  volatile (						\
+		"svc 0\n"						\
+		: "+d"(_arg1)						\
+		: "d"(_arg2), "d"(_arg3), "d"(_arg4), "d"(_arg5),	\
+		  "d"(_arg6), "d"(_num)					\
+		: "memory", "cc"					\
+		);							\
+	_arg1;								\
+})
+
+/* startup code */
+__asm__ (".section .text\n"
+	 ".weak _start\n"
+	 "_start:\n"
+	 "lg	%r2,0(%r15)\n"		/* argument count */
+	 "la	%r3,8(%r15)\n"		/* argument pointers */
+
+	 "xgr	%r0,%r0\n"		/* r0 will be our NULL value */
+	 /* search for envp */
+	 "lgr	%r4,%r3\n"		/* start at argv */
+	 "0:\n"
+	 "clg	%r0,0(%r4)\n"		/* entry zero? */
+	 "la	%r4,8(%r4)\n"		/* advance pointer */
+	 "jnz	0b\n"			/* no -> test next pointer */
+					/* yes -> r4 now contains start of envp */
+
+	 "aghi	%r15,-160\n"		/* allocate new stackframe */
+	 "xc	0(8,%r15),0(%r15)\n"	/* clear backchain */
+	 "brasl	%r14,main\n"		/* ret value of main is arg to exit */
+	 "lghi	%r1,1\n"		/* __NR_exit */
+	 "svc	0\n"
+	 "");
+
+struct s390_mmap_arg_struct {
+	unsigned long addr;
+	unsigned long len;
+	unsigned long prot;
+	unsigned long flags;
+	unsigned long fd;
+	unsigned long offset;
+};
+
+static __attribute__((unused))
+void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd,
+	       off_t offset)
+{
+	struct s390_mmap_arg_struct args = {
+		.addr = (unsigned long)addr,
+		.len = (unsigned long)length,
+		.prot = prot,
+		.flags = flags,
+		.fd = fd,
+		.offset = (unsigned long)offset
+	};
+
+	return (void *)my_syscall1(__NR_mmap, &args);
+}
+#define sys_mmap sys_mmap
+#endif // _NOLIBC_ARCH_S390_H
diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h
index 4c6992321b0d..78b067a4fa47 100644
--- a/tools/include/nolibc/arch.h
+++ b/tools/include/nolibc/arch.h
@@ -27,6 +27,8 @@
 #include "arch-mips.h"
 #elif defined(__riscv)
 #include "arch-riscv.h"
+#elif defined(__s390x__)
+#include "arch-s390.h"
 #endif
 
 #endif /* _NOLIBC_ARCH_H */
diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index 78473d34e27c..a42d7c405bdc 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -686,6 +686,7 @@ int mknod(const char *path, mode_t mode, dev_t dev)
 #define MAP_FAILED ((void *)-1)
 #endif
 
+#ifndef sys_mmap
 static __attribute__((unused))
 void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd,
 	       off_t offset)
@@ -707,6 +708,7 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd,
 	return (void *)my_syscall6(n, addr, length, prot, flags, fd, offset);
 #endif
 }
+#endif
 
 static __attribute__((unused))
 void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
-- 
cgit 


From 0043e6f21dbeb039f3e204c40471071a9c43f928 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 9 Jan 2023 09:09:08 +0100
Subject: selftests/nolibc: add s390 support

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 22f1e1d73fa8..2bf613ee363d 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -19,6 +19,7 @@ IMAGE_arm64   = arch/arm64/boot/Image
 IMAGE_arm     = arch/arm/boot/zImage
 IMAGE_mips    = vmlinuz
 IMAGE_riscv   = arch/riscv/boot/Image
+IMAGE_s390    = arch/s390/boot/bzImage
 IMAGE         = $(IMAGE_$(ARCH))
 IMAGE_NAME    = $(notdir $(IMAGE))
 
@@ -29,6 +30,7 @@ DEFCONFIG_arm64   = defconfig
 DEFCONFIG_arm     = multi_v7_defconfig
 DEFCONFIG_mips    = malta_defconfig
 DEFCONFIG_riscv   = defconfig
+DEFCONFIG_s390    = defconfig
 DEFCONFIG         = $(DEFCONFIG_$(ARCH))
 
 # optional tests to run (default = all)
@@ -41,6 +43,7 @@ QEMU_ARCH_arm64   = aarch64
 QEMU_ARCH_arm     = arm
 QEMU_ARCH_mips    = mipsel  # works with malta_defconfig
 QEMU_ARCH_riscv   = riscv64
+QEMU_ARCH_s390    = s390x
 QEMU_ARCH         = $(QEMU_ARCH_$(ARCH))
 
 # QEMU_ARGS : some arch-specific args to pass to qemu
@@ -50,6 +53,7 @@ QEMU_ARGS_arm64   = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TE
 QEMU_ARGS_arm     = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_mips    = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_riscv   = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_s390    = -M s390-ccw-virtio -m 1G -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS         = $(QEMU_ARGS_$(ARCH))
 
 # OUTPUT is only set when run from the main makefile, otherwise
@@ -62,7 +66,8 @@ else
 Q=@
 endif
 
-CFLAGS  ?= -Os -fno-ident -fno-asynchronous-unwind-tables
+CFLAGS_s390 = -m64
+CFLAGS  ?= -Os -fno-ident -fno-asynchronous-unwind-tables $(CFLAGS_$(ARCH))
 LDFLAGS := -s
 
 help:
-- 
cgit 


From 16d4b2bd799da6338bd4fe8db2a93c5fa70b50f3 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 9 Jan 2023 09:09:09 +0100
Subject: rcutorture: add support for s390

Add the required values to identify_qemu() and
identify_bootimage().

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/functions.sh | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index 66d0414d8e4b..b52d5069563c 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -159,6 +159,9 @@ identify_boot_image () {
 		qemu-system-aarch64)
 			echo arch/arm64/boot/Image
 			;;
+		qemu-system-s390x)
+			echo arch/s390/boot/bzImage
+			;;
 		*)
 			echo vmlinux
 			;;
@@ -184,6 +187,9 @@ identify_qemu () {
 	elif echo $u | grep -q aarch64
 	then
 		echo qemu-system-aarch64
+	elif echo $u | grep -q 'IBM S/390'
+	then
+		echo qemu-system-s390x
 	elif uname -a | grep -q ppc64
 	then
 		echo qemu-system-ppc64
-- 
cgit 


From 28ef4c3753a4e57a347b2bf5f598645de966c137 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 9 Jan 2023 09:09:10 +0100
Subject: rcutorture: build initrd for rcutorture with nolibc

This reduces the size of init from ~600KB to ~1KB.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/mkinitrd.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
index 70d62fd0d31d..71f0dfbb2a6d 100755
--- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
+++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
@@ -64,7 +64,7 @@ ___EOF___
 # build using nolibc on supported archs (smaller executable) and fall
 # back to regular glibc on other ones.
 if echo -e "#if __x86_64__||__i386__||__i486__||__i586__||__i686__" \
-           "||__ARM_EABI__||__aarch64__\nyes\n#endif" \
+           "||__ARM_EABI__||__aarch64__||__s390x__\nyes\n#endif" \
    | ${CROSS_COMPILE}gcc -E -nostdlib -xc - \
    | grep -q '^yes'; then
 	# architecture supported by nolibc
-- 
cgit 


From 7f8548589661d5edbde4c343e4971117585da2f5 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:13 +0100
Subject: tools/nolibc: make compiler and assembler agree on the section around
 _start

The out-of-block asm() statement carrying _start does not allow the
compiler to know what section the assembly code is being emitted to,
and there's no easy way to push/pop the current section and restore
it. It sometimes causes issues depending on the include files ordering
and compiler optimizations. For example if a variable is declared
immediately before the asm() block and another one after, the compiler
assumes that the current section is still .bss and doesn't re-emit it,
making the second variable appear inside the .text section instead.
Forcing .bss at the end of the _start block doesn't work either because
at certain optimizations the compiler may reorder blocks and will make
some real code appear just after this block.

A significant number of solutions were attempted, but many of them were
still sensitive to section reordering. In the end, the best way to make
sure the compiler and assembler agree on the current section is to place
this code inside a function. Here the function is directly called _start
and configured not to emit a frame-pointer, hence to have no prologue.
If some future architectures would still emit some prologue, another
working approach consists in naming the function differently and placing
the _start label inside the asm statement. But the current solution is
simpler.

It was tested with nolibc-test at -O,-O0,-O2,-O3,-Os for arm,arm64,i386,
mips,riscv,s390 and x86_64.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-aarch64.h | 29 +++++++++++----------
 tools/include/nolibc/arch-arm.h     | 40 +++++++++++------------------
 tools/include/nolibc/arch-i386.h    | 38 ++++++++++++++-------------
 tools/include/nolibc/arch-mips.h    | 51 ++++++++++++++++++++-----------------
 tools/include/nolibc/arch-riscv.h   | 36 +++++++++++++-------------
 tools/include/nolibc/arch-s390.h    | 44 +++++++++++++++++---------------
 tools/include/nolibc/arch-x86_64.h  | 30 ++++++++++++----------
 7 files changed, 135 insertions(+), 133 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h
index f68baf8f395f..4d263661411f 100644
--- a/tools/include/nolibc/arch-aarch64.h
+++ b/tools/include/nolibc/arch-aarch64.h
@@ -182,18 +182,19 @@ struct sys_stat_struct {
 })
 
 /* startup code */
-__asm__ (".section .text\n"
-    ".weak _start\n"
-    "_start:\n"
-    "ldr x0, [sp]\n"              // argc (x0) was in the stack
-    "add x1, sp, 8\n"             // argv (x1) = sp
-    "lsl x2, x0, 3\n"             // envp (x2) = 8*argc ...
-    "add x2, x2, 8\n"             //           + 8 (skip null)
-    "add x2, x2, x1\n"            //           + argv
-    "and sp, x1, -16\n"           // sp must be 16-byte aligned in the callee
-    "bl main\n"                   // main() returns the status code, we'll exit with it.
-    "mov x8, 93\n"                // NR_exit == 93
-    "svc #0\n"
-    "");
-
+void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
+{
+	__asm__ volatile (
+		"ldr x0, [sp]\n"     // argc (x0) was in the stack
+		"add x1, sp, 8\n"    // argv (x1) = sp
+		"lsl x2, x0, 3\n"    // envp (x2) = 8*argc ...
+		"add x2, x2, 8\n"    //           + 8 (skip null)
+		"add x2, x2, x1\n"   //           + argv
+		"and sp, x1, -16\n"  // sp must be 16-byte aligned in the callee
+		"bl main\n"          // main() returns the status code, we'll exit with it.
+		"mov x8, 93\n"       // NR_exit == 93
+		"svc #0\n"
+	);
+	__builtin_unreachable();
+}
 #endif // _NOLIBC_ARCH_AARCH64_H
diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h
index f31be8e967d6..875b21975137 100644
--- a/tools/include/nolibc/arch-arm.h
+++ b/tools/include/nolibc/arch-arm.h
@@ -175,30 +175,20 @@ struct sys_stat_struct {
 })
 
 /* startup code */
-__asm__ (".section .text\n"
-    ".weak _start\n"
-    "_start:\n"
-#if defined(__THUMBEB__) || defined(__THUMBEL__)
-    /* We enter here in 32-bit mode but if some previous functions were in
-     * 16-bit mode, the assembler cannot know, so we need to tell it we're in
-     * 32-bit now, then switch to 16-bit (is there a better way to do it than
-     * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that
-     * it generates correct instructions. Note that we do not support thumb1.
-     */
-    ".code 32\n"
-    "add     r0, pc, #1\n"
-    "bx      r0\n"
-    ".code 16\n"
-#endif
-    "pop {%r0}\n"                 // argc was in the stack
-    "mov %r1, %sp\n"              // argv = sp
-    "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ...
-    "add %r2, %r2, $4\n"          //        ... + 4
-    "and %r3, %r1, $-8\n"         // AAPCS : sp must be 8-byte aligned in the
-    "mov %sp, %r3\n"              //         callee, an bl doesn't push (lr=pc)
-    "bl main\n"                   // main() returns the status code, we'll exit with it.
-    "movs r7, $1\n"               // NR_exit == 1
-    "svc $0x00\n"
-    "");
+void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
+{
+	__asm__ volatile (
+		"pop {%r0}\n"                 // argc was in the stack
+		"mov %r1, %sp\n"              // argv = sp
+		"add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ...
+		"add %r2, %r2, $4\n"          //        ... + 4
+		"and %r3, %r1, $-8\n"         // AAPCS : sp must be 8-byte aligned in the
+		"mov %sp, %r3\n"              //         callee, an bl doesn't push (lr=pc)
+		"bl main\n"                   // main() returns the status code, we'll exit with it.
+		"movs r7, $1\n"               // NR_exit == 1
+		"svc $0x00\n"
+	  );
+	__builtin_unreachable();
+}
 
 #endif // _NOLIBC_ARCH_ARM_H
diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h
index d7e7212346e2..b1bed2d87f74 100644
--- a/tools/include/nolibc/arch-i386.h
+++ b/tools/include/nolibc/arch-i386.h
@@ -197,23 +197,25 @@ struct sys_stat_struct {
  * 2) The deepest stack frame should be set to zero
  *
  */
-__asm__ (".section .text\n"
-    ".weak _start\n"
-    "_start:\n"
-    "pop %eax\n"                // argc   (first arg, %eax)
-    "mov %esp, %ebx\n"          // argv[] (second arg, %ebx)
-    "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx)
-    "xor %ebp, %ebp\n"          // zero the stack frame
-    "and $-16, %esp\n"          // x86 ABI : esp must be 16-byte aligned before
-    "sub $4, %esp\n"            // the call instruction (args are aligned)
-    "push %ecx\n"               // push all registers on the stack so that we
-    "push %ebx\n"               // support both regparm and plain stack modes
-    "push %eax\n"
-    "call main\n"               // main() returns the status code in %eax
-    "mov %eax, %ebx\n"          // retrieve exit code (32-bit int)
-    "movl $1, %eax\n"           // NR_exit == 1
-    "int $0x80\n"               // exit now
-    "hlt\n"                     // ensure it does not
-    "");
+void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
+{
+	__asm__ volatile (
+		"pop %eax\n"                // argc   (first arg, %eax)
+		"mov %esp, %ebx\n"          // argv[] (second arg, %ebx)
+		"lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx)
+		"xor %ebp, %ebp\n"          // zero the stack frame
+		"and $-16, %esp\n"          // x86 ABI : esp must be 16-byte aligned before
+		"sub $4, %esp\n"            // the call instruction (args are aligned)
+		"push %ecx\n"               // push all registers on the stack so that we
+		"push %ebx\n"               // support both regparm and plain stack modes
+		"push %eax\n"
+		"call main\n"               // main() returns the status code in %eax
+		"mov %eax, %ebx\n"          // retrieve exit code (32-bit int)
+		"movl $1, %eax\n"           // NR_exit == 1
+		"int $0x80\n"               // exit now
+		"hlt\n"                     // ensure it does not
+	);
+	__builtin_unreachable();
+}
 
 #endif // _NOLIBC_ARCH_I386_H
diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h
index 7380093ba9e7..11270ef25ea5 100644
--- a/tools/include/nolibc/arch-mips.h
+++ b/tools/include/nolibc/arch-mips.h
@@ -189,29 +189,32 @@ struct sys_stat_struct {
 })
 
 /* startup code, note that it's called __start on MIPS */
-__asm__ (".section .text\n"
-    ".weak __start\n"
-    ".set nomips16\n"
-    ".set push\n"
-    ".set    noreorder\n"
-    ".option pic0\n"
-    ".ent __start\n"
-    "__start:\n"
-    "lw $a0,($sp)\n"              // argc was in the stack
-    "addiu  $a1, $sp, 4\n"        // argv = sp + 4
-    "sll $a2, $a0, 2\n"           // a2 = argc * 4
-    "add   $a2, $a2, $a1\n"       // envp = argv + 4*argc ...
-    "addiu $a2, $a2, 4\n"         //        ... + 4
-    "li $t0, -8\n"
-    "and $sp, $sp, $t0\n"         // sp must be 8-byte aligned
-    "addiu $sp,$sp,-16\n"         // the callee expects to save a0..a3 there!
-    "jal main\n"                  // main() returns the status code, we'll exit with it.
-    "nop\n"                       // delayed slot
-    "move $a0, $v0\n"             // retrieve 32-bit exit code from v0
-    "li $v0, 4001\n"              // NR_exit == 4001
-    "syscall\n"
-    ".end __start\n"
-    ".set pop\n"
-    "");
+void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) __start(void)
+{
+	__asm__ volatile (
+		//".set nomips16\n"
+		".set push\n"
+		".set    noreorder\n"
+		".option pic0\n"
+		//".ent __start\n"
+		//"__start:\n"
+		"lw $a0,($sp)\n"        // argc was in the stack
+		"addiu  $a1, $sp, 4\n"  // argv = sp + 4
+		"sll $a2, $a0, 2\n"     // a2 = argc * 4
+		"add   $a2, $a2, $a1\n" // envp = argv + 4*argc ...
+		"addiu $a2, $a2, 4\n"   //        ... + 4
+		"li $t0, -8\n"
+		"and $sp, $sp, $t0\n"   // sp must be 8-byte aligned
+		"addiu $sp,$sp,-16\n"   // the callee expects to save a0..a3 there!
+		"jal main\n"            // main() returns the status code, we'll exit with it.
+		"nop\n"                 // delayed slot
+		"move $a0, $v0\n"       // retrieve 32-bit exit code from v0
+		"li $v0, 4001\n"        // NR_exit == 4001
+		"syscall\n"
+		//".end __start\n"
+		".set pop\n"
+	);
+	__builtin_unreachable();
+}
 
 #endif // _NOLIBC_ARCH_MIPS_H
diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h
index a3bdd9803f8c..bee769e6885c 100644
--- a/tools/include/nolibc/arch-riscv.h
+++ b/tools/include/nolibc/arch-riscv.h
@@ -183,22 +183,24 @@ struct sys_stat_struct {
 })
 
 /* startup code */
-__asm__ (".section .text\n"
-    ".weak _start\n"
-    "_start:\n"
-    ".option push\n"
-    ".option norelax\n"
-    "lla   gp, __global_pointer$\n"
-    ".option pop\n"
-    "lw    a0, 0(sp)\n"          // argc (a0) was in the stack
-    "add   a1, sp, "SZREG"\n"    // argv (a1) = sp
-    "slli  a2, a0, "PTRLOG"\n"   // envp (a2) = SZREG*argc ...
-    "add   a2, a2, "SZREG"\n"    //             + SZREG (skip null)
-    "add   a2,a2,a1\n"           //             + argv
-    "andi  sp,a1,-16\n"          // sp must be 16-byte aligned
-    "call  main\n"               // main() returns the status code, we'll exit with it.
-    "li a7, 93\n"                // NR_exit == 93
-    "ecall\n"
-    "");
+void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
+{
+	__asm__ volatile (
+		".option push\n"
+		".option norelax\n"
+		"lla   gp, __global_pointer$\n"
+		".option pop\n"
+		"lw    a0, 0(sp)\n"          // argc (a0) was in the stack
+		"add   a1, sp, "SZREG"\n"    // argv (a1) = sp
+		"slli  a2, a0, "PTRLOG"\n"   // envp (a2) = SZREG*argc ...
+		"add   a2, a2, "SZREG"\n"    //             + SZREG (skip null)
+		"add   a2,a2,a1\n"           //             + argv
+		"andi  sp,a1,-16\n"          // sp must be 16-byte aligned
+		"call  main\n"               // main() returns the status code, we'll exit with it.
+		"li a7, 93\n"                // NR_exit == 93
+		"ecall\n"
+	);
+	__builtin_unreachable();
+}
 
 #endif // _NOLIBC_ARCH_RISCV_H
diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h
index 76bc8fdaf922..2c0b8847c050 100644
--- a/tools/include/nolibc/arch-s390.h
+++ b/tools/include/nolibc/arch-s390.h
@@ -172,27 +172,29 @@ struct sys_stat_struct {
 })
 
 /* startup code */
-__asm__ (".section .text\n"
-	 ".weak _start\n"
-	 "_start:\n"
-	 "lg	%r2,0(%r15)\n"		/* argument count */
-	 "la	%r3,8(%r15)\n"		/* argument pointers */
-
-	 "xgr	%r0,%r0\n"		/* r0 will be our NULL value */
-	 /* search for envp */
-	 "lgr	%r4,%r3\n"		/* start at argv */
-	 "0:\n"
-	 "clg	%r0,0(%r4)\n"		/* entry zero? */
-	 "la	%r4,8(%r4)\n"		/* advance pointer */
-	 "jnz	0b\n"			/* no -> test next pointer */
-					/* yes -> r4 now contains start of envp */
-
-	 "aghi	%r15,-160\n"		/* allocate new stackframe */
-	 "xc	0(8,%r15),0(%r15)\n"	/* clear backchain */
-	 "brasl	%r14,main\n"		/* ret value of main is arg to exit */
-	 "lghi	%r1,1\n"		/* __NR_exit */
-	 "svc	0\n"
-	 "");
+void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
+{
+	__asm__ volatile (
+		"lg	%r2,0(%r15)\n"		/* argument count */
+		"la	%r3,8(%r15)\n"		/* argument pointers */
+
+		"xgr	%r0,%r0\n"		/* r0 will be our NULL value */
+		/* search for envp */
+		"lgr	%r4,%r3\n"		/* start at argv */
+		"0:\n"
+		"clg	%r0,0(%r4)\n"		/* entry zero? */
+		"la	%r4,8(%r4)\n"		/* advance pointer */
+		"jnz	0b\n"			/* no -> test next pointer */
+						/* yes -> r4 now contains start of envp */
+
+		"aghi	%r15,-160\n"		/* allocate new stackframe */
+		"xc	0(8,%r15),0(%r15)\n"	/* clear backchain */
+		"brasl	%r14,main\n"		/* ret value of main is arg to exit */
+		"lghi	%r1,1\n"		/* __NR_exit */
+		"svc	0\n"
+	);
+	__builtin_unreachable();
+}
 
 struct s390_mmap_arg_struct {
 	unsigned long addr;
diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h
index 0e1e9eb8545d..c70a84612a9e 100644
--- a/tools/include/nolibc/arch-x86_64.h
+++ b/tools/include/nolibc/arch-x86_64.h
@@ -197,19 +197,21 @@ struct sys_stat_struct {
  * 2) The deepest stack frame should be zero (the %rbp).
  *
  */
-__asm__ (".section .text\n"
-    ".weak _start\n"
-    "_start:\n"
-    "pop %rdi\n"                // argc   (first arg, %rdi)
-    "mov %rsp, %rsi\n"          // argv[] (second arg, %rsi)
-    "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx)
-    "xor %ebp, %ebp\n"          // zero the stack frame
-    "and $-16, %rsp\n"          // x86 ABI : esp must be 16-byte aligned before call
-    "call main\n"               // main() returns the status code, we'll exit with it.
-    "mov %eax, %edi\n"          // retrieve exit code (32 bit)
-    "mov $60, %eax\n"           // NR_exit == 60
-    "syscall\n"                 // really exit
-    "hlt\n"                     // ensure it does not return
-    "");
+void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
+{
+	__asm__ volatile (
+		"pop %rdi\n"                // argc   (first arg, %rdi)
+		"mov %rsp, %rsi\n"          // argv[] (second arg, %rsi)
+		"lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx)
+		"xor %ebp, %ebp\n"          // zero the stack frame
+		"and $-16, %rsp\n"          // x86 ABI : esp must be 16-byte aligned before call
+		"call main\n"               // main() returns the status code, we'll exit with it.
+		"mov %eax, %edi\n"          // retrieve exit code (32 bit)
+		"mov $60, %eax\n"           // NR_exit == 60
+		"syscall\n"                 // really exit
+		"hlt\n"                     // ensure it does not return
+	);
+	__builtin_unreachable();
+}
 
 #endif // _NOLIBC_ARCH_X86_64_H
-- 
cgit 


From 20470dfd656ef71145d78102b540f73a11748182 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:14 +0100
Subject: tools/nolibc: enable support for thumb1 mode for ARM

Passing -mthumb to the kernel.org arm toolchain failed to build because it
defaults to armv5 hence thumb1, which has a fairly limited instruction set
compared to thumb2 enabled with armv7 that is much more complete. It's not
very difficult to adjust the instructions to also build on thumb1, it only
adds a total of 3 instructions, so it's worth doing it at least to ease use
by casual testers. It was verified that the adjusted code now builds and
works fine for armv5, thumb1, armv7 and thumb2, as long as frame pointers
are not used.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-arm.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h
index 875b21975137..e4ba77b0310f 100644
--- a/tools/include/nolibc/arch-arm.h
+++ b/tools/include/nolibc/arch-arm.h
@@ -180,10 +180,16 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 	__asm__ volatile (
 		"pop {%r0}\n"                 // argc was in the stack
 		"mov %r1, %sp\n"              // argv = sp
-		"add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ...
-		"add %r2, %r2, $4\n"          //        ... + 4
-		"and %r3, %r1, $-8\n"         // AAPCS : sp must be 8-byte aligned in the
-		"mov %sp, %r3\n"              //         callee, an bl doesn't push (lr=pc)
+
+		"add %r2, %r0, $1\n"          // envp = (argc + 1) ...
+		"lsl %r2, %r2, $2\n"          //        * 4        ...
+		"add %r2, %r2, %r1\n"         //        + argv
+
+		"mov %r3, $8\n"               // AAPCS : sp must be 8-byte aligned in the
+		"neg %r3, %r3\n"              //         callee, and bl doesn't push (lr=pc)
+		"and %r3, %r3, %r1\n"         // so we do sp = r1(=sp) & r3(=-8);
+		"mov %sp, %r3\n"              //
+
 		"bl main\n"                   // main() returns the status code, we'll exit with it.
 		"movs r7, $1\n"               // NR_exit == 1
 		"svc $0x00\n"
-- 
cgit 


From 5a51b6de5968885347e559d0dac8307a38005806 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:15 +0100
Subject: tools/nolibc: support thumb mode with frame pointers on ARM

In Thumb mode, register r7 is normally used to store the frame pointer.
By default when optimizing at -Os there's no frame pointer so this works
fine. But if no optimization is set, then build errors occur, indicating
that r7 cannot not be used. It's difficult to cheat because it's the
compiler that is complaining, not the assembler, so it's not even possible
to report that the register was clobbered. The solution consists in saving
and restoring r7 around the syscall, but this slightly inflates the code.
The syscall number is passed via r6 which is never used by syscalls.

The current patch adds a few macroes which do that only in Thumb mode,
and which continue to directly assign the syscall number to register r7
in ARM mode. Now this always builds and works for all modes (tested on
Arm, Thumbv1, Thumbv2 modes, at -Os, -O0, -O0 -fomit-frame-pointer).
The code is very slightly inflated in thumb-mode without frame-pointers
compared to previously (e.g. 7928 vs 7864 bytes for nolibc-test) but at
least it's always operational. And it's possible to disable this mechanism
by setting NOLIBC_OMIT_FRAME_POINTER.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-arm.h | 60 ++++++++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h
index e4ba77b0310f..ef94df2d93d5 100644
--- a/tools/include/nolibc/arch-arm.h
+++ b/tools/include/nolibc/arch-arm.h
@@ -70,20 +70,44 @@ struct sys_stat_struct {
  *     don't have to experience issues with register constraints.
  *   - the syscall number is always specified last in order to allow to force
  *     some registers before (gcc refuses a %-register at the last position).
+ *   - in thumb mode without -fomit-frame-pointer, r7 is also used to store the
+ *     frame pointer, and we cannot directly assign it as a register variable,
+ *     nor can we clobber it. Instead we assign the r6 register and swap it
+ *     with r7 before calling svc, and r6 is marked as clobbered.
+ *     We're just using any regular register which we assign to r7 after saving
+ *     it.
  *
  * Also, ARM supports the old_select syscall if newselect is not available
  */
 #define __ARCH_WANT_SYS_OLD_SELECT
 
+#if (defined(__THUMBEB__) || defined(__THUMBEL__)) && \
+    !defined(NOLIBC_OMIT_FRAME_POINTER)
+/* swap r6,r7 needed in Thumb mode since we can't use nor clobber r7 */
+#define _NOLIBC_SYSCALL_REG         "r6"
+#define _NOLIBC_THUMB_SET_R7        "eor r7, r6\neor r6, r7\neor r7, r6\n"
+#define _NOLIBC_THUMB_RESTORE_R7    "mov r7, r6\n"
+
+#else  /* we're in ARM mode */
+/* in Arm mode we can directly use r7 */
+#define _NOLIBC_SYSCALL_REG         "r7"
+#define _NOLIBC_THUMB_SET_R7        ""
+#define _NOLIBC_THUMB_RESTORE_R7    ""
+
+#endif /* end THUMB */
+
 #define my_syscall0(num)                                                      \
 ({                                                                            \
-	register long _num __asm__ ("r7") = (num);                            \
+	register long _num  __asm__(_NOLIBC_SYSCALL_REG) = (num);             \
 	register long _arg1 __asm__ ("r0");                                   \
 	                                                                      \
 	__asm__  volatile (                                                   \
+		_NOLIBC_THUMB_SET_R7                                          \
 		"svc #0\n"                                                    \
-		: "=r"(_arg1)                                                 \
-		: "r"(_num)                                                   \
+		_NOLIBC_THUMB_RESTORE_R7                                      \
+		: "=r"(_arg1), "=r"(_num)                                     \
+		: "r"(_arg1),                                                 \
+		  "r"(_num)                                                   \
 		: "memory", "cc", "lr"                                        \
 	);                                                                    \
 	_arg1;                                                                \
@@ -91,12 +115,14 @@ struct sys_stat_struct {
 
 #define my_syscall1(num, arg1)                                                \
 ({                                                                            \
-	register long _num __asm__ ("r7") = (num);                            \
+	register long _num  __asm__(_NOLIBC_SYSCALL_REG) = (num);             \
 	register long _arg1 __asm__ ("r0") = (long)(arg1);                    \
 	                                                                      \
 	__asm__  volatile (                                                   \
+		_NOLIBC_THUMB_SET_R7                                          \
 		"svc #0\n"                                                    \
-		: "=r"(_arg1)                                                 \
+		_NOLIBC_THUMB_RESTORE_R7                                      \
+		: "=r"(_arg1), "=r" (_num)                                    \
 		: "r"(_arg1),                                                 \
 		  "r"(_num)                                                   \
 		: "memory", "cc", "lr"                                        \
@@ -106,13 +132,15 @@ struct sys_stat_struct {
 
 #define my_syscall2(num, arg1, arg2)                                          \
 ({                                                                            \
-	register long _num __asm__ ("r7") = (num);                            \
+	register long _num  __asm__(_NOLIBC_SYSCALL_REG) = (num);             \
 	register long _arg1 __asm__ ("r0") = (long)(arg1);                    \
 	register long _arg2 __asm__ ("r1") = (long)(arg2);                    \
 	                                                                      \
 	__asm__  volatile (                                                   \
+		_NOLIBC_THUMB_SET_R7                                          \
 		"svc #0\n"                                                    \
-		: "=r"(_arg1)                                                 \
+		_NOLIBC_THUMB_RESTORE_R7                                      \
+		: "=r"(_arg1), "=r" (_num)                                    \
 		: "r"(_arg1), "r"(_arg2),                                     \
 		  "r"(_num)                                                   \
 		: "memory", "cc", "lr"                                        \
@@ -122,14 +150,16 @@ struct sys_stat_struct {
 
 #define my_syscall3(num, arg1, arg2, arg3)                                    \
 ({                                                                            \
-	register long _num __asm__ ("r7") = (num);                            \
+	register long _num  __asm__(_NOLIBC_SYSCALL_REG) = (num);             \
 	register long _arg1 __asm__ ("r0") = (long)(arg1);                    \
 	register long _arg2 __asm__ ("r1") = (long)(arg2);                    \
 	register long _arg3 __asm__ ("r2") = (long)(arg3);                    \
 	                                                                      \
 	__asm__  volatile (                                                   \
+		_NOLIBC_THUMB_SET_R7                                          \
 		"svc #0\n"                                                    \
-		: "=r"(_arg1)                                                 \
+		_NOLIBC_THUMB_RESTORE_R7                                      \
+		: "=r"(_arg1), "=r" (_num)                                    \
 		: "r"(_arg1), "r"(_arg2), "r"(_arg3),                         \
 		  "r"(_num)                                                   \
 		: "memory", "cc", "lr"                                        \
@@ -139,15 +169,17 @@ struct sys_stat_struct {
 
 #define my_syscall4(num, arg1, arg2, arg3, arg4)                              \
 ({                                                                            \
-	register long _num __asm__ ("r7") = (num);                            \
+	register long _num  __asm__(_NOLIBC_SYSCALL_REG) = (num);             \
 	register long _arg1 __asm__ ("r0") = (long)(arg1);                    \
 	register long _arg2 __asm__ ("r1") = (long)(arg2);                    \
 	register long _arg3 __asm__ ("r2") = (long)(arg3);                    \
 	register long _arg4 __asm__ ("r3") = (long)(arg4);                    \
 	                                                                      \
 	__asm__  volatile (                                                   \
+		_NOLIBC_THUMB_SET_R7                                          \
 		"svc #0\n"                                                    \
-		: "=r"(_arg1)                                                 \
+		_NOLIBC_THUMB_RESTORE_R7                                      \
+		: "=r"(_arg1), "=r" (_num)                                    \
 		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4),             \
 		  "r"(_num)                                                   \
 		: "memory", "cc", "lr"                                        \
@@ -157,7 +189,7 @@ struct sys_stat_struct {
 
 #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)                        \
 ({                                                                            \
-	register long _num __asm__ ("r7") = (num);                            \
+	register long _num  __asm__(_NOLIBC_SYSCALL_REG) = (num);             \
 	register long _arg1 __asm__ ("r0") = (long)(arg1);                    \
 	register long _arg2 __asm__ ("r1") = (long)(arg2);                    \
 	register long _arg3 __asm__ ("r2") = (long)(arg3);                    \
@@ -165,8 +197,10 @@ struct sys_stat_struct {
 	register long _arg5 __asm__ ("r4") = (long)(arg5);                    \
 	                                                                      \
 	__asm__  volatile (                                                   \
+		_NOLIBC_THUMB_SET_R7                                          \
 		"svc #0\n"                                                    \
-		: "=r" (_arg1)                                                \
+		_NOLIBC_THUMB_RESTORE_R7                                      \
+		: "=r"(_arg1), "=r" (_num)                                    \
 		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
 		  "r"(_num)                                                   \
 		: "memory", "cc", "lr"                                        \
-- 
cgit 


From d5b48f958b36e38ee1a9bebb522bdd86114c34ae Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:16 +0100
Subject: tools/nolibc: remove local definitions of O_* flags for open/fcntl

The historic nolibc code did not include asm/fcntl.h and had to define
the various O_RDWR etc macros in each arch-specific file (since such
values differ between certain archs). This was found at least once to
induce bugs due to wrong definitions. Let's get rid of all of them and
include asm/nolibc.h from sys.h instead. This was verified to work
properly on all supported architectures.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-aarch64.h | 12 ------------
 tools/include/nolibc/arch-arm.h     | 12 ------------
 tools/include/nolibc/arch-i386.h    | 12 ------------
 tools/include/nolibc/arch-mips.h    | 12 ------------
 tools/include/nolibc/arch-riscv.h   | 12 ------------
 tools/include/nolibc/arch-s390.h    | 12 ------------
 tools/include/nolibc/arch-x86_64.h  | 12 ------------
 tools/include/nolibc/sys.h          |  1 +
 8 files changed, 1 insertion(+), 84 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h
index 4d263661411f..f480993159ec 100644
--- a/tools/include/nolibc/arch-aarch64.h
+++ b/tools/include/nolibc/arch-aarch64.h
@@ -7,18 +7,6 @@
 #ifndef _NOLIBC_ARCH_AARCH64_H
 #define _NOLIBC_ARCH_AARCH64_H
 
-/* O_* macros for fcntl/open are architecture-specific */
-#define O_RDONLY            0
-#define O_WRONLY            1
-#define O_RDWR              2
-#define O_CREAT          0x40
-#define O_EXCL           0x80
-#define O_NOCTTY        0x100
-#define O_TRUNC         0x200
-#define O_APPEND        0x400
-#define O_NONBLOCK      0x800
-#define O_DIRECTORY    0x4000
-
 /* The struct returned by the newfstatat() syscall. Differs slightly from the
  * x86_64's stat one by field ordering, so be careful.
  */
diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h
index ef94df2d93d5..48bd95492c87 100644
--- a/tools/include/nolibc/arch-arm.h
+++ b/tools/include/nolibc/arch-arm.h
@@ -7,18 +7,6 @@
 #ifndef _NOLIBC_ARCH_ARM_H
 #define _NOLIBC_ARCH_ARM_H
 
-/* O_* macros for fcntl/open are architecture-specific */
-#define O_RDONLY            0
-#define O_WRONLY            1
-#define O_RDWR              2
-#define O_CREAT          0x40
-#define O_EXCL           0x80
-#define O_NOCTTY        0x100
-#define O_TRUNC         0x200
-#define O_APPEND        0x400
-#define O_NONBLOCK      0x800
-#define O_DIRECTORY    0x4000
-
 /* The struct returned by the stat() syscall, 32-bit only, the syscall returns
  * exactly 56 bytes (stops before the unused array). In big endian, the format
  * differs as devices are returned as short only.
diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h
index b1bed2d87f74..ef2a836ee667 100644
--- a/tools/include/nolibc/arch-i386.h
+++ b/tools/include/nolibc/arch-i386.h
@@ -7,18 +7,6 @@
 #ifndef _NOLIBC_ARCH_I386_H
 #define _NOLIBC_ARCH_I386_H
 
-/* O_* macros for fcntl/open are architecture-specific */
-#define O_RDONLY            0
-#define O_WRONLY            1
-#define O_RDWR              2
-#define O_CREAT          0x40
-#define O_EXCL           0x80
-#define O_NOCTTY        0x100
-#define O_TRUNC         0x200
-#define O_APPEND        0x400
-#define O_NONBLOCK      0x800
-#define O_DIRECTORY   0x10000
-
 /* The struct returned by the stat() syscall, 32-bit only, the syscall returns
  * exactly 56 bytes (stops before the unused array).
  */
diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h
index 11270ef25ea5..a3764f6d267e 100644
--- a/tools/include/nolibc/arch-mips.h
+++ b/tools/include/nolibc/arch-mips.h
@@ -7,18 +7,6 @@
 #ifndef _NOLIBC_ARCH_MIPS_H
 #define _NOLIBC_ARCH_MIPS_H
 
-/* O_* macros for fcntl/open are architecture-specific */
-#define O_RDONLY            0
-#define O_WRONLY            1
-#define O_RDWR              2
-#define O_APPEND       0x0008
-#define O_NONBLOCK     0x0080
-#define O_CREAT        0x0100
-#define O_TRUNC        0x0200
-#define O_EXCL         0x0400
-#define O_NOCTTY       0x0800
-#define O_DIRECTORY   0x10000
-
 /* The struct returned by the stat() syscall. 88 bytes are returned by the
  * syscall.
  */
diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h
index bee769e6885c..c2b5db383d96 100644
--- a/tools/include/nolibc/arch-riscv.h
+++ b/tools/include/nolibc/arch-riscv.h
@@ -7,18 +7,6 @@
 #ifndef _NOLIBC_ARCH_RISCV_H
 #define _NOLIBC_ARCH_RISCV_H
 
-/* O_* macros for fcntl/open are architecture-specific */
-#define O_RDONLY            0
-#define O_WRONLY            1
-#define O_RDWR              2
-#define O_CREAT          0x40
-#define O_EXCL           0x80
-#define O_NOCTTY        0x100
-#define O_TRUNC         0x200
-#define O_APPEND        0x400
-#define O_NONBLOCK      0x800
-#define O_DIRECTORY   0x10000
-
 struct sys_stat_struct {
 	unsigned long	st_dev;		/* Device.  */
 	unsigned long	st_ino;		/* File serial number.  */
diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h
index 2c0b8847c050..b58f64d47b82 100644
--- a/tools/include/nolibc/arch-s390.h
+++ b/tools/include/nolibc/arch-s390.h
@@ -7,18 +7,6 @@
 #define _NOLIBC_ARCH_S390_H
 #include <asm/unistd.h>
 
-/* O_* macros for fcntl/open are architecture-specific */
-#define O_RDONLY            0
-#define O_WRONLY            1
-#define O_RDWR              2
-#define O_CREAT          0x40
-#define O_EXCL           0x80
-#define O_NOCTTY        0x100
-#define O_TRUNC         0x200
-#define O_APPEND        0x400
-#define O_NONBLOCK      0x800
-#define O_DIRECTORY   0x10000
-
 /* The struct returned by the stat() syscall, equivalent to stat64(). The
  * syscall returns 116 bytes and stops in the middle of __unused.
  */
diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h
index c70a84612a9e..8d482505c347 100644
--- a/tools/include/nolibc/arch-x86_64.h
+++ b/tools/include/nolibc/arch-x86_64.h
@@ -7,18 +7,6 @@
 #ifndef _NOLIBC_ARCH_X86_64_H
 #define _NOLIBC_ARCH_X86_64_H
 
-/* O_* macros for fcntl/open are architecture-specific */
-#define O_RDONLY            0
-#define O_WRONLY            1
-#define O_RDWR              2
-#define O_CREAT          0x40
-#define O_EXCL           0x80
-#define O_NOCTTY        0x100
-#define O_TRUNC         0x200
-#define O_APPEND        0x400
-#define O_NONBLOCK      0x800
-#define O_DIRECTORY   0x10000
-
 /* The struct returned by the stat() syscall, equivalent to stat64(). The
  * syscall returns 116 bytes and stops in the middle of __unused.
  */
diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index a42d7c405bdc..47bf67668860 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -11,6 +11,7 @@
 #include "std.h"
 
 /* system includes */
+#include <asm/fcntl.h>   // for O_*
 #include <asm/unistd.h>
 #include <asm/signal.h>  // for SIGCHLD
 #include <asm/ioctls.h>
-- 
cgit 


From 1caa1154c3e9ad071a07c02431a700ff5df94392 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:17 +0100
Subject: tools/nolibc: make errno a weak symbol instead of a static one

Till now errno was declared static so that it could be eliminated if
unused. While the goal is commendable for tiny executables as it allows
to eliminate any data and bss segments when not used, this comes with
some limitations, one of which being that the errno symbol seen in
different units are not the same. Even though this has never been a
real issue given the nature of the programs involved till now, it
happens that referencing the same symbol from multiple units can also
be achieved using weak symbols, with a difference being that only one
of them will be used for all of them. Compared to weak symbols, static
basically have no benefit for regular programs since there are always
at least a few variables in most of these, so the bss segment cannot
be eliminated. E.g:

  $ size nolibc-test-static-errno
     text    data     bss     dec     hex filename
    11531       0      48   11579    2d3b nolibc-test-static-errno

Furthermore, the weak symbol doesn't use bss storage at all, resulting
in a slightly section:

  $ size nolibc-test-weak-errno
     text    data     bss     dec     hex filename
    11531       0      40   11571    2d33 nolibc-test-weak-errno

This patch thus converts errno from static to weak.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/errno.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/errno.h b/tools/include/nolibc/errno.h
index 9dc4919c769b..a44486ff0477 100644
--- a/tools/include/nolibc/errno.h
+++ b/tools/include/nolibc/errno.h
@@ -9,11 +9,9 @@
 
 #include <asm/errno.h>
 
-/* this way it will be removed if unused */
-static int errno;
-
 #ifndef NOLIBC_IGNORE_ERRNO
 #define SET_ERRNO(v) do { errno = (v); } while (0)
+int errno __attribute__((weak));
 #else
 #define SET_ERRNO(v) do { } while (0)
 #endif
-- 
cgit 


From 89dc50921c87e2b4a4612188c5a90abebc02b60d Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:18 +0100
Subject: tools/nolibc: export environ as a weak symbol on x86_64

The environ is retrieved from the _start code and is easy to store at
this moment. Let's declare the variable weak and store the value into
it. By not being static it will be visible to all units. By being weak,
if some programs already declared it, they will continue to be able to
use it. This was tested both with environ inherited from _start and
extracted from envp.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-x86_64.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h
index 8d482505c347..683702a16a61 100644
--- a/tools/include/nolibc/arch-x86_64.h
+++ b/tools/include/nolibc/arch-x86_64.h
@@ -178,6 +178,8 @@ struct sys_stat_struct {
 	_ret;                                                                 \
 })
 
+char **environ __attribute__((weak));
+
 /* startup code */
 /*
  * x86-64 System V ABI mandates:
@@ -191,6 +193,7 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"pop %rdi\n"                // argc   (first arg, %rdi)
 		"mov %rsp, %rsi\n"          // argv[] (second arg, %rsi)
 		"lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx)
+		"mov %rdx, environ\n"       // save environ
 		"xor %ebp, %ebp\n"          // zero the stack frame
 		"and $-16, %rsp\n"          // x86 ABI : esp must be 16-byte aligned before call
 		"call main\n"               // main() returns the status code, we'll exit with it.
-- 
cgit 


From 52e423f5b93e6d30fd5d311f068094b768caffb5 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:19 +0100
Subject: tools/nolibc: export environ as a weak symbol on i386

The environ is retrieved from the _start code and is easy to store at
this moment. Let's declare the variable weak and store the value into
it. By not being static it will be visible to all units. By being weak,
if some programs already declared it, they will continue to be able to
use it. This was tested both with environ inherited from _start and
extracted from envp.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-i386.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h
index ef2a836ee667..60b586120727 100644
--- a/tools/include/nolibc/arch-i386.h
+++ b/tools/include/nolibc/arch-i386.h
@@ -178,6 +178,8 @@ struct sys_stat_struct {
 	_eax;							\
 })
 
+char **environ __attribute__((weak));
+
 /* startup code */
 /*
  * i386 System V ABI mandates:
@@ -191,6 +193,7 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"pop %eax\n"                // argc   (first arg, %eax)
 		"mov %esp, %ebx\n"          // argv[] (second arg, %ebx)
 		"lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx)
+		"mov %ecx, environ\n"       // save environ
 		"xor %ebp, %ebp\n"          // zero the stack frame
 		"and $-16, %esp\n"          // x86 ABI : esp must be 16-byte aligned before
 		"sub $4, %esp\n"            // the call instruction (args are aligned)
-- 
cgit 


From 9b8688c6eaddcdf0db600953d57beaeafaae1155 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:20 +0100
Subject: tools/nolibc: export environ as a weak symbol on arm64

The environ is retrieved from the _start code and is easy to store at
this moment. Let's declare the variable weak and store the value into
it. By not being static it will be visible to all units. By being weak,
if some programs already declared it, they will continue to be able to
use it. This was tested both with environ inherited from _start and
extracted from envp.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-aarch64.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h
index f480993159ec..2e3d9adc4c4c 100644
--- a/tools/include/nolibc/arch-aarch64.h
+++ b/tools/include/nolibc/arch-aarch64.h
@@ -169,6 +169,8 @@ struct sys_stat_struct {
 	_arg1;                                                                \
 })
 
+char **environ __attribute__((weak));
+
 /* startup code */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 {
@@ -178,6 +180,8 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"lsl x2, x0, 3\n"    // envp (x2) = 8*argc ...
 		"add x2, x2, 8\n"    //           + 8 (skip null)
 		"add x2, x2, x1\n"   //           + argv
+		"adrp x3, environ\n"          // x3 = &environ (high bits)
+		"str x2, [x3, #:lo12:environ]\n" // store envp into environ
 		"and sp, x1, -16\n"  // sp must be 16-byte aligned in the callee
 		"bl main\n"          // main() returns the status code, we'll exit with it.
 		"mov x8, 93\n"       // NR_exit == 93
-- 
cgit 


From a6f29a2c41f3d283e78e00dc8974e6efacaccf37 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:21 +0100
Subject: tools/nolibc: export environ as a weak symbol on arm

The environ is retrieved from the _start code and is easy to store at
this moment. Let's declare the variable weak and store the value into
it. By not being static it will be visible to all units. By being weak,
if some programs already declared it, they will continue to be able to
use it. This was tested in arm and thumb1 and thumb2 modes, and for each
mode, both with environ inherited from _start and extracted from envp.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-arm.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h
index 48bd95492c87..79666b590e87 100644
--- a/tools/include/nolibc/arch-arm.h
+++ b/tools/include/nolibc/arch-arm.h
@@ -196,6 +196,8 @@ struct sys_stat_struct {
 	_arg1;                                                                \
 })
 
+char **environ __attribute__((weak));
+
 /* startup code */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 {
@@ -206,6 +208,8 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"add %r2, %r0, $1\n"          // envp = (argc + 1) ...
 		"lsl %r2, %r2, $2\n"          //        * 4        ...
 		"add %r2, %r2, %r1\n"         //        + argv
+		"ldr %r3, 1f\n"               // r3 = &environ (see below)
+		"str %r2, [r3]\n"             // store envp into environ
 
 		"mov %r3, $8\n"               // AAPCS : sp must be 8-byte aligned in the
 		"neg %r3, %r3\n"              //         callee, and bl doesn't push (lr=pc)
@@ -215,7 +219,10 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"bl main\n"                   // main() returns the status code, we'll exit with it.
 		"movs r7, $1\n"               // NR_exit == 1
 		"svc $0x00\n"
-	  );
+		".align 2\n"                  // below are the pointers to a few variables
+		"1:\n"
+		".word environ\n"
+	);
 	__builtin_unreachable();
 }
 
-- 
cgit 


From 8f7fafebd1da984c427d53b614e48ebecb456824 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:22 +0100
Subject: tools/nolibc: export environ as a weak symbol on mips

The environ is retrieved from the _start code and is easy to store at
this moment. Let's declare the variable weak and store the value into
it. By not being static it will be visible to all units. By being weak,
if some programs already declared it, they will continue to be able to
use it. This was tested with mips24kc (BE) both with environ inherited
from _start and extracted from envp.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-mips.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h
index a3764f6d267e..7d22f7bc38b3 100644
--- a/tools/include/nolibc/arch-mips.h
+++ b/tools/include/nolibc/arch-mips.h
@@ -176,6 +176,8 @@ struct sys_stat_struct {
 	_arg4 ? -_num : _num;                                                 \
 })
 
+char **environ __attribute__((weak));
+
 /* startup code, note that it's called __start on MIPS */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) __start(void)
 {
@@ -191,6 +193,9 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) __start(void)
 		"sll $a2, $a0, 2\n"     // a2 = argc * 4
 		"add   $a2, $a2, $a1\n" // envp = argv + 4*argc ...
 		"addiu $a2, $a2, 4\n"   //        ... + 4
+		"lui $a3, %hi(environ)\n"     // load environ into a3 (hi)
+		"addiu $a3, %lo(environ)\n"   // load environ into a3 (lo)
+		"sw $a2,($a3)\n"              // store envp(a2) into environ
 		"li $t0, -8\n"
 		"and $sp, $sp, $t0\n"   // sp must be 8-byte aligned
 		"addiu $sp,$sp,-16\n"   // the callee expects to save a0..a3 there!
-- 
cgit 


From 758f33379569d817558a4310c47b42834b8e4e57 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:23 +0100
Subject: tools/nolibc: export environ as a weak symbol on riscv

The environ is retrieved from the _start code and is easy to store at
this moment. Let's declare the variable weak and store the value into
it. By not being static it will be visible to all units. By being weak,
if some programs already declared it, they will continue to be able to
use it. This was tested on riscv64 both with environ inherited from
_start and extracted from envp.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-riscv.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h
index c2b5db383d96..1608e6bd94b9 100644
--- a/tools/include/nolibc/arch-riscv.h
+++ b/tools/include/nolibc/arch-riscv.h
@@ -170,6 +170,8 @@ struct sys_stat_struct {
 	_arg1;                                                                \
 })
 
+char **environ __attribute__((weak));
+
 /* startup code */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 {
@@ -183,6 +185,8 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"slli  a2, a0, "PTRLOG"\n"   // envp (a2) = SZREG*argc ...
 		"add   a2, a2, "SZREG"\n"    //             + SZREG (skip null)
 		"add   a2,a2,a1\n"           //             + argv
+		"lui a3, %hi(environ)\n"     // a3 = &environ (high bits)
+		"sd a2,%lo(environ)(a3)\n"   // store envp(a2) into environ
 		"andi  sp,a1,-16\n"          // sp must be 16-byte aligned
 		"call  main\n"               // main() returns the status code, we'll exit with it.
 		"li a7, 93\n"                // NR_exit == 93
-- 
cgit 


From 9e5bdc613d06fdc6fed73267767c09a0dc0c6aac Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 10 Jan 2023 08:24:24 +0100
Subject: tools/nolibc: export environ as a weak symbol on s390

The environ is retrieved from the _start code and is easy to store at
this moment. Let's declare the variable weak and store the value into
it. By not being static it will be visible to all units. By being weak,
if some programs already declared it, they will continue to be able to
use it. This was tested on s390 both with environ inherited from
_start and extracted from envp.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-s390.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h
index b58f64d47b82..039b454e79f0 100644
--- a/tools/include/nolibc/arch-s390.h
+++ b/tools/include/nolibc/arch-s390.h
@@ -159,6 +159,8 @@ struct sys_stat_struct {
 	_arg1;								\
 })
 
+char **environ __attribute__((weak));
+
 /* startup code */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 {
@@ -174,6 +176,8 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"la	%r4,8(%r4)\n"		/* advance pointer */
 		"jnz	0b\n"			/* no -> test next pointer */
 						/* yes -> r4 now contains start of envp */
+		"larl	%r1,environ\n"
+		"stg	%r4,0(%r1)\n"
 
 		"aghi	%r15,-160\n"		/* allocate new stackframe */
 		"xc	0(8,%r15),0(%r15)\n"	/* clear backchain */
-- 
cgit 


From 2ab4aa487b93d9d994b4a20382848a4041f00d78 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:25 +0100
Subject: tools/nolibc: add auxiliary vector retrieval for i386

In the _start block we now iterate over envp to find the auxiliary
vector after the NULL. The pointer is saved into an _auxv variable
that is marked as weak so that it's accessible from multiple units.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-i386.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h
index 60b586120727..e8d0cf545bf1 100644
--- a/tools/include/nolibc/arch-i386.h
+++ b/tools/include/nolibc/arch-i386.h
@@ -179,6 +179,7 @@ struct sys_stat_struct {
 })
 
 char **environ __attribute__((weak));
+const unsigned long *_auxv __attribute__((weak));
 
 /* startup code */
 /*
@@ -195,6 +196,12 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx)
 		"mov %ecx, environ\n"       // save environ
 		"xor %ebp, %ebp\n"          // zero the stack frame
+		"mov %ecx, %edx\n"          // search for auxv (follows NULL after last env)
+		"0:\n"
+		"add $4, %edx\n"            // search for auxv using edx, it follows the
+		"cmp -4(%edx), %ebp\n"      // ... NULL after last env (ebp is zero here)
+		"jnz 0b\n"
+		"mov %edx, _auxv\n"         // save it into _auxv
 		"and $-16, %esp\n"          // x86 ABI : esp must be 16-byte aligned before
 		"sub $4, %esp\n"            // the call instruction (args are aligned)
 		"push %ecx\n"               // push all registers on the stack so that we
-- 
cgit 


From 1cce162ab4a565c8a88ffac1d9f78c80e160d6a1 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:26 +0100
Subject: tools/nolibc: add auxiliary vector retrieval for x86_64

In the _start block we now iterate over envp to find the auxiliary
vector after the NULL. The pointer is saved into an _auxv variable
that is marked as weak so that it's accessible from multiple units.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-x86_64.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h
index 683702a16a61..17f6751208e7 100644
--- a/tools/include/nolibc/arch-x86_64.h
+++ b/tools/include/nolibc/arch-x86_64.h
@@ -179,6 +179,7 @@ struct sys_stat_struct {
 })
 
 char **environ __attribute__((weak));
+const unsigned long *_auxv __attribute__((weak));
 
 /* startup code */
 /*
@@ -195,6 +196,12 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx)
 		"mov %rdx, environ\n"       // save environ
 		"xor %ebp, %ebp\n"          // zero the stack frame
+		"mov %rdx, %rax\n"          // search for auxv (follows NULL after last env)
+		"0:\n"
+		"add $8, %rax\n"            // search for auxv using rax, it follows the
+		"cmp -8(%rax), %rbp\n"      // ... NULL after last env (rbp is zero here)
+		"jnz 0b\n"
+		"mov %rax, _auxv\n"         // save it into _auxv
 		"and $-16, %rsp\n"          // x86 ABI : esp must be 16-byte aligned before call
 		"call main\n"               // main() returns the status code, we'll exit with it.
 		"mov %eax, %edi\n"          // retrieve exit code (32 bit)
-- 
cgit 


From 2a39a53245d2b5eae83c10a67581e8dc6b629b85 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:27 +0100
Subject: tools/nolibc: add auxiliary vector retrieval for arm64

In the _start block we now iterate over envp to find the auxiliary
vector after the NULL. The pointer is saved into an _auxv variable
that is marked as weak so that it's accessible from multiple units.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-aarch64.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h
index 2e3d9adc4c4c..383baddef701 100644
--- a/tools/include/nolibc/arch-aarch64.h
+++ b/tools/include/nolibc/arch-aarch64.h
@@ -170,6 +170,7 @@ struct sys_stat_struct {
 })
 
 char **environ __attribute__((weak));
+const unsigned long *_auxv __attribute__((weak));
 
 /* startup code */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
@@ -182,6 +183,12 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"add x2, x2, x1\n"   //           + argv
 		"adrp x3, environ\n"          // x3 = &environ (high bits)
 		"str x2, [x3, #:lo12:environ]\n" // store envp into environ
+		"mov x4, x2\n"       // search for auxv (follows NULL after last env)
+		"0:\n"
+		"ldr x5, [x4], 8\n"  // x5 = *x4; x4 += 8
+		"cbnz x5, 0b\n"      // and stop at NULL after last env
+		"adrp x3, _auxv\n"   // x3 = &_auxv (high bits)
+		"str x4, [x3, #:lo12:_auxv]\n" // store x4 into _auxv
 		"and sp, x1, -16\n"  // sp must be 16-byte aligned in the callee
 		"bl main\n"          // main() returns the status code, we'll exit with it.
 		"mov x8, 93\n"       // NR_exit == 93
-- 
cgit 


From 59ea1876242163da431026921070282b2d651197 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:28 +0100
Subject: tools/nolibc: add auxiliary vector retrieval for arm

In the _start block we now iterate over envp to find the auxiliary
vector after the NULL. The pointer is saved into an _auxv variable
that is marked as weak so that it's accessible from multiple units.

Signed-off-by: Willy Tarreau <w@1wt.eu>

It was tested in arm, thumb1 and thumb2 modes.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-arm.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h
index 79666b590e87..42499f23e73c 100644
--- a/tools/include/nolibc/arch-arm.h
+++ b/tools/include/nolibc/arch-arm.h
@@ -197,6 +197,7 @@ struct sys_stat_struct {
 })
 
 char **environ __attribute__((weak));
+const unsigned long *_auxv __attribute__((weak));
 
 /* startup code */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
@@ -211,6 +212,16 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"ldr %r3, 1f\n"               // r3 = &environ (see below)
 		"str %r2, [r3]\n"             // store envp into environ
 
+		"mov r4, r2\n"                // search for auxv (follows NULL after last env)
+		"0:\n"
+		"mov r5, r4\n"                // r5 = r4
+		"add r4, r4, #4\n"            // r4 += 4
+		"ldr r5,[r5]\n"               // r5 = *r5 = *(r4-4)
+		"cmp r5, #0\n"                // and stop at NULL after last env
+		"bne 0b\n"
+		"ldr %r3, 2f\n"               // r3 = &_auxv (low bits)
+		"str r4, [r3]\n"              // store r4 into _auxv
+
 		"mov %r3, $8\n"               // AAPCS : sp must be 8-byte aligned in the
 		"neg %r3, %r3\n"              //         callee, and bl doesn't push (lr=pc)
 		"and %r3, %r3, %r1\n"         // so we do sp = r1(=sp) & r3(=-8);
@@ -222,6 +233,8 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		".align 2\n"                  // below are the pointers to a few variables
 		"1:\n"
 		".word environ\n"
+		"2:\n"
+		".word _auxv\n"
 	);
 	__builtin_unreachable();
 }
-- 
cgit 


From 041fa97cb332e59311aa861fac3be178abad6a9b Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:29 +0100
Subject: tools/nolibc: add auxiliary vector retrieval for riscv

In the _start block we now iterate over envp to find the auxiliary
vector after the NULL. The pointer is saved into an _auxv variable
that is marked as weak so that it's accessible from multiple units.
It was tested on riscv64 only.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-riscv.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h
index 1608e6bd94b9..e197fcb10ac0 100644
--- a/tools/include/nolibc/arch-riscv.h
+++ b/tools/include/nolibc/arch-riscv.h
@@ -171,6 +171,7 @@ struct sys_stat_struct {
 })
 
 char **environ __attribute__((weak));
+const unsigned long *_auxv __attribute__((weak));
 
 /* startup code */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
@@ -185,6 +186,15 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"slli  a2, a0, "PTRLOG"\n"   // envp (a2) = SZREG*argc ...
 		"add   a2, a2, "SZREG"\n"    //             + SZREG (skip null)
 		"add   a2,a2,a1\n"           //             + argv
+
+		"add   a3, a2, zero\n"       // iterate a3 over envp to find auxv (after NULL)
+		"0:\n"                       // do {
+		"ld    a4, 0(a3)\n"          //   a4 = *a3;
+		"add   a3, a3, "SZREG"\n"    //   a3 += sizeof(void*);
+		"bne   a4, zero, 0b\n"       // } while (a4);
+		"lui   a4, %hi(_auxv)\n"     // a4 = &_auxv (high bits)
+		"sd    a3, %lo(_auxv)(a4)\n" // store a3 into _auxv
+
 		"lui a3, %hi(environ)\n"     // a3 = &environ (high bits)
 		"sd a2,%lo(environ)(a3)\n"   // store envp(a2) into environ
 		"andi  sp,a1,-16\n"          // sp must be 16-byte aligned
-- 
cgit 


From d01869cf1ee4ad6e02e853062eeced883b3b4504 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 10 Jan 2023 08:24:30 +0100
Subject: tools/nolibc: add auxiliary vector retrieval for mips

In the _start block we now iterate over envp to find the auxiliary
vector after the NULL. The pointer is saved into an _auxv variable
that is marked as weak so that it's accessible from multiple units.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-mips.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h
index 7d22f7bc38b3..bf83432d23ed 100644
--- a/tools/include/nolibc/arch-mips.h
+++ b/tools/include/nolibc/arch-mips.h
@@ -177,6 +177,7 @@ struct sys_stat_struct {
 })
 
 char **environ __attribute__((weak));
+const unsigned long *_auxv __attribute__((weak));
 
 /* startup code, note that it's called __start on MIPS */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) __start(void)
@@ -196,6 +197,16 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) __start(void)
 		"lui $a3, %hi(environ)\n"     // load environ into a3 (hi)
 		"addiu $a3, %lo(environ)\n"   // load environ into a3 (lo)
 		"sw $a2,($a3)\n"              // store envp(a2) into environ
+
+		"move $t0, $a2\n"             // iterate t0 over envp, look for NULL
+		"0:"                          // do {
+		"lw $a3, ($t0)\n"             //   a3=*(t0);
+		"bne $a3, $0, 0b\n"           // } while (a3);
+		"addiu $t0, $t0, 4\n"         // delayed slot: t0+=4;
+		"lui $a3, %hi(_auxv)\n"       // load _auxv into a3 (hi)
+		"addiu $a3, %lo(_auxv)\n"     // load _auxv into a3 (lo)
+		"sw $t0, ($a3)\n"             // store t0 into _auxv
+
 		"li $t0, -8\n"
 		"and $sp, $sp, $t0\n"   // sp must be 8-byte aligned
 		"addiu $sp,$sp,-16\n"   // the callee expects to save a0..a3 there!
-- 
cgit 


From 241c4b4e02f25ceab21df8a28e1ac689a477c30c Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 10 Jan 2023 08:24:31 +0100
Subject: tools/nolibc: add auxiliary vector retrieval for s390

In the _start block we now iterate over envp to find the auxiliary
vector after the NULL. The pointer is saved into an _auxv variable
that is marked as weak so that it's accessible from multiple units.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-s390.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h
index 039b454e79f0..6b0e54ed543d 100644
--- a/tools/include/nolibc/arch-s390.h
+++ b/tools/include/nolibc/arch-s390.h
@@ -160,6 +160,7 @@ struct sys_stat_struct {
 })
 
 char **environ __attribute__((weak));
+const unsigned long *_auxv __attribute__((weak));
 
 /* startup code */
 void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
@@ -179,6 +180,15 @@ void __attribute__((weak,noreturn,optimize("omit-frame-pointer"))) _start(void)
 		"larl	%r1,environ\n"
 		"stg	%r4,0(%r1)\n"
 
+		/* search for auxv */
+		"lgr	%r5,%r4\n"		/* start at envp */
+		"1:\n"
+		"clg	%r0,0(%r5)\n"		/* entry zero? */
+		"la	%r5,8(%r5)\n"		/* advance pointer */
+		"jnz	1b\n"			/* no -> test next pointer */
+		"larl	%r1,_auxv\n"		/* yes -> store value in _auxv */
+		"stg	%r5,0(%r1)\n"
+
 		"aghi	%r15,-160\n"		/* allocate new stackframe */
 		"xc	0(8,%r15),0(%r15)\n"	/* clear backchain */
 		"brasl	%r14,main\n"		/* ret value of main is arg to exit */
-- 
cgit 


From c61a078015f325fe38a7371ab0b12d4444f5fba4 Mon Sep 17 00:00:00 2001
From: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Date: Tue, 10 Jan 2023 08:24:32 +0100
Subject: nolibc/stdlib: Implement `getauxval(3)` function

Previous commits save the address of the auxiliary vector into a global
variable @_auxv. This commit creates a new function 'getauxval()' as a
helper function to get the auxv value based on the given key.

The behavior of this function is identic with the function documented
in 'man 3 getauxval'. This function is also needed to implement
'getpagesize()' function that we will wire up in the next patches.

Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/stdlib.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h
index a24000d1e822..894c955d027e 100644
--- a/tools/include/nolibc/stdlib.h
+++ b/tools/include/nolibc/stdlib.h
@@ -12,6 +12,7 @@
 #include "types.h"
 #include "sys.h"
 #include "string.h"
+#include <linux/auxvec.h>
 
 struct nolibc_heap {
 	size_t	len;
@@ -108,6 +109,32 @@ char *getenv(const char *name)
 	return _getenv(name, environ);
 }
 
+static __attribute__((unused))
+unsigned long getauxval(unsigned long type)
+{
+	const unsigned long *auxv = _auxv;
+	unsigned long ret;
+
+	if (!auxv)
+		return 0;
+
+	while (1) {
+		if (!auxv[0] && !auxv[1]) {
+			ret = 0;
+			break;
+		}
+
+		if (auxv[0] == type) {
+			ret = auxv[1];
+			break;
+		}
+
+		auxv += 2;
+	}
+
+	return ret;
+}
+
 static __attribute__((unused))
 void *malloc(size_t len)
 {
-- 
cgit 


From 7efd762e97c87f07b0a93e428e5ce3436715f538 Mon Sep 17 00:00:00 2001
From: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Date: Tue, 10 Jan 2023 08:24:33 +0100
Subject: nolibc/sys: Implement `getpagesize(2)` function

This function returns the page size used by the running kernel. The
page size value is taken from the auxiliary vector at 'AT_PAGESZ' key.

'getpagesize(2)' is assumed as a syscall becuase the manpage placement
of this function is in entry 2 ('man 2 getpagesize') despite there is
no real 'getpagesize(2)' syscall in the Linux syscall table. Define
this function in 'sys.h'.

Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/sys.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index 47bf67668860..b5f8cd35c03b 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -19,6 +19,7 @@
 #include <linux/fs.h>
 #include <linux/loop.h>
 #include <linux/time.h>
+#include <linux/auxvec.h>
 
 #include "arch.h"
 #include "errno.h"
@@ -499,6 +500,26 @@ pid_t gettid(void)
 	return sys_gettid();
 }
 
+static unsigned long getauxval(unsigned long key);
+
+/*
+ * long getpagesize(void);
+ */
+
+static __attribute__((unused))
+long getpagesize(void)
+{
+	long ret;
+
+	ret = getauxval(AT_PAGESZ);
+	if (!ret) {
+		SET_ERRNO(ENOENT);
+		return -1;
+	}
+
+	return ret;
+}
+
 
 /*
  * int gettimeofday(struct timeval *tv, struct timezone *tz);
-- 
cgit 


From a290296ab8326e04a9fefd698ca9367dc72c0a87 Mon Sep 17 00:00:00 2001
From: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Date: Tue, 10 Jan 2023 08:24:34 +0100
Subject: selftests/nolibc: Add `getpagesize(2)` selftest

Test the getpagesize() function. Make sure it returns the correct
value.

Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 30 ++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index f14f5076fb6d..c4a0c915139c 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -442,6 +442,35 @@ int test_getdents64(const char *dir)
 	return ret;
 }
 
+static int test_getpagesize(void)
+{
+	long x = getpagesize();
+	int c;
+
+	if (x < 0)
+		return x;
+
+#if defined(__x86_64__) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
+	/*
+	 * x86 family is always 4K page.
+	 */
+	c = (x == 4096);
+#elif defined(__aarch64__)
+	/*
+	 * Linux aarch64 supports three values of page size: 4K, 16K, and 64K
+	 * which are selected at kernel compilation time.
+	 */
+	c = (x == 4096 || x == (16 * 1024) || x == (64 * 1024));
+#else
+	/*
+	 * Assuming other architectures must have at least 4K page.
+	 */
+	c = (x >= 4096);
+#endif
+
+	return !c;
+}
+
 /* Run syscall tests between IDs <min> and <max>.
  * Return 0 on success, non-zero on failure.
  */
@@ -502,6 +531,7 @@ int run_syscall(int min, int max)
 		CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break;
 		CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break;
 #endif
+		CASE_TEST(getpagesize);       EXPECT_SYSZR(1, test_getpagesize()); break;
 		CASE_TEST(ioctl_tiocinq);     EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break;
 		CASE_TEST(ioctl_tiocinq);     EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break;
 		CASE_TEST(link_root1);        EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break;
-- 
cgit 


From af0e26beaa693cc0cb0472b80a33a5831103d22f Mon Sep 17 00:00:00 2001
From: James Hilliard <james.hilliard1@gmail.com>
Date: Mon, 9 Jan 2023 18:45:04 -0700
Subject: bpftool: Add missing quotes to libbpf bootstrap submake vars

When passing compiler variables like CC=$(HOSTCC) to a submake
we must ensure the variable is quoted in order to handle cases
where $(HOSTCC) may be multiple binaries.

For example when using ccache $HOSTCC may be:
"/usr/bin/ccache /usr/bin/gcc"

If we pass CC without quotes like CC=$(HOSTCC) only the first
"/usr/bin/ccache" part will be assigned to the CC variable which
will cause an error due to dropping the "/usr/bin/gcc" part of
the variable in the submake invocation.

This fixes errors such as:
/usr/bin/ccache: invalid option -- 'd'

Signed-off-by: James Hilliard <james.hilliard1@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20230110014504.3120711-1-james.hilliard1@gmail.com
---
 tools/bpf/bpftool/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index ab20ecc5acce..d40e31bc4c9d 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -53,7 +53,7 @@ $(LIBBPF_INTERNAL_HDRS): $(LIBBPF_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_HDRS_
 $(LIBBPF_BOOTSTRAP): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_BOOTSTRAP_OUTPUT)
 	$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \
 		DESTDIR=$(LIBBPF_BOOTSTRAP_DESTDIR:/=) prefix= \
-		ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) AR=$(HOSTAR) $@ install_headers
+		ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" $@ install_headers
 
 $(LIBBPF_BOOTSTRAP_INTERNAL_HDRS): $(LIBBPF_BOOTSTRAP_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_BOOTSTRAP_HDRS_DIR)
 	$(call QUIET_INSTALL, $@)
-- 
cgit 


From 75514e4c661962cbbad869a1b5855dbce0f1aba2 Mon Sep 17 00:00:00 2001
From: Chethan Suresh <chethan.suresh@sony.com>
Date: Mon, 9 Jan 2023 08:07:42 +0530
Subject: bpftool: fix output for skipping kernel config check

When bpftool feature does not find kernel config
files under default path or wrong format,
do not output CONFIG_XYZ is not set.
Skip kernel config check and continue.

Signed-off-by: Chethan Suresh <chethan.suresh@sony.com>
Signed-off-by: Kenta Tada <Kenta.Tada@sony.com>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/r/20230109023742.29657-1-chethan.suresh@sony.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/bpf/bpftool/feature.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index 36cf0f1517c9..da16e6a27ccc 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -486,16 +486,16 @@ static void probe_kernel_image_config(const char *define_prefix)
 		}
 	}
 
-end_parse:
-	if (file)
-		gzclose(file);
-
 	for (i = 0; i < ARRAY_SIZE(options); i++) {
 		if (define_prefix && !options[i].macro_dump)
 			continue;
 		print_kernel_option(options[i].name, values[i], define_prefix);
 		free(values[i]);
 	}
+
+end_parse:
+	if (file)
+		gzclose(file);
 }
 
 static bool probe_bpf_syscall(const char *define_prefix)
-- 
cgit 


From 6920b08661e3ad829206078b5c9879b24aea8dfc Mon Sep 17 00:00:00 2001
From: Ludovic L'Hours <ludovic.lhours@gmail.com>
Date: Sun, 8 Jan 2023 19:20:18 +0100
Subject: libbpf: Fix map creation flags sanitization

As BPF_F_MMAPABLE flag is now conditionnaly set (by map_is_mmapable),
it should not be toggled but disabled if not supported by kernel.

Fixes: 4fcac46c7e10 ("libbpf: only add BPF_F_MMAPABLE flag for data maps with global vars")
Signed-off-by: Ludovic L'Hours <ludovic.lhours@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20230108182018.24433-1-ludovic.lhours@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/lib/bpf/libbpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a5c67a3c93c5..f8dfee32c2bc 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7355,7 +7355,7 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj)
 		if (!bpf_map__is_internal(m))
 			continue;
 		if (!kernel_supports(obj, FEAT_ARRAY_MMAP))
-			m->def.map_flags ^= BPF_F_MMAPABLE;
+			m->def.map_flags &= ~BPF_F_MMAPABLE;
 	}
 
 	return 0;
-- 
cgit 


From 2d0b2ae2871ae6d42a9f0a4280e0fb5bff8d38b8 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:12 +0100
Subject: selftests/xsk: print correct payload for packet dump

Print the correct payload when the packet dump option is selected. The
network to host conversion was forgotten and the payload was
erronously declared to be an int instead of an unsigned int.

Fixes: facb7cb2e909 ("selftests/bpf: Xsk selftests - SKB POLL, NOPOLL")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-2-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xskxceiver.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 162d3a516f2c..2ff43b22180f 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -767,7 +767,7 @@ static void pkt_dump(void *pkt, u32 len)
 	struct ethhdr *ethhdr;
 	struct udphdr *udphdr;
 	struct iphdr *iphdr;
-	int payload, i;
+	u32 payload, i;
 
 	ethhdr = pkt;
 	iphdr = pkt + sizeof(*ethhdr);
@@ -792,7 +792,7 @@ static void pkt_dump(void *pkt, u32 len)
 	fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source));
 	fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest));
 	/*extract L5 frame */
-	payload = *((uint32_t *)(pkt + PKT_HDR_SIZE));
+	payload = ntohl(*((u32 *)(pkt + PKT_HDR_SIZE)));
 
 	fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload);
 	fprintf(stdout, "---------------------------------------\n");
-- 
cgit 


From 5adaf52776a4178434c0670a8e05e529a2b4aafa Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:13 +0100
Subject: selftests/xsk: do not close unused file descriptors

Do not close descriptors that have never been used. File descriptor
fields that are not in use are erroneously marked with the number 0,
which is a valid fd. Mark unused fds with -1 instead and do not close
these when deleting the socket.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-3-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xsk.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index 39d349509ba4..5e4a6552ed37 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -35,6 +35,8 @@
 #include "xsk.h"
 #include "bpf_util.h"
 
+#define FD_NOT_USED (-1)
+
 #ifndef SOL_XDP
  #define SOL_XDP 283
 #endif
@@ -583,6 +585,9 @@ static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
 {
 	struct xsk_ctx *ctx = xsk->ctx;
 
+	if (ctx->xsks_map_fd == FD_NOT_USED)
+		return;
+
 	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
 	close(ctx->xsks_map_fd);
 }
@@ -941,6 +946,9 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
 	ctx->umem = umem;
 	ctx->queue_id = queue_id;
 	bpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
+	ctx->prog_fd = FD_NOT_USED;
+	ctx->link_fd = FD_NOT_USED;
+	ctx->xsks_map_fd = FD_NOT_USED;
 
 	ctx->fill = fill;
 	ctx->comp = comp;
@@ -1221,8 +1229,9 @@ void xsk_socket__delete(struct xsk_socket *xsk)
 
 	if (ctx->refcount == 1) {
 		xsk_delete_bpf_maps(xsk);
-		close(ctx->prog_fd);
-		if (ctx->has_bpf_link)
+		if (ctx->prog_fd != FD_NOT_USED)
+			close(ctx->prog_fd);
+		if (ctx->has_bpf_link && ctx->link_fd != FD_NOT_USED)
 			close(ctx->link_fd);
 	}
 
-- 
cgit 


From 1e04f23bccf967e00609ab3b210961db7a1886e4 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:14 +0100
Subject: selftests/xsk: submit correct number of frames in populate_fill_ring

Submit the correct number of frames in the function
xsk_populate_fill_ring(). For the tests that set the flag
use_addr_for_fill, uninitialized buffers were sent to the fill ring
following the correct ones. This has no impact on the tests, since
they only use the ones that were initialized. But for correctness,
this should be fixed.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-4-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xskxceiver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 2ff43b22180f..a239e975ab66 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -1272,7 +1272,7 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream
 
 		*xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr;
 	}
-	xsk_ring_prod__submit(&umem->fq, buffers_to_fill);
+	xsk_ring_prod__submit(&umem->fq, i);
 }
 
 static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
-- 
cgit 


From 085dcccfb7d3dc52ed708fc588587f319541bc83 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:15 +0100
Subject: selftests/xsk: print correct error codes when exiting

Print the correct error codes when exiting the test suite due to some
terminal error. Some of these had a switched sign and some of them
printed zero instead of errno.

Fixes: facb7cb2e909 ("selftests/bpf: Xsk selftests - SKB POLL, NOPOLL")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-5-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xskxceiver.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index a239e975ab66..72578cebfbf7 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -350,7 +350,7 @@ static bool ifobj_zc_avail(struct ifobject *ifobject)
 	umem = calloc(1, sizeof(struct xsk_umem_info));
 	if (!umem) {
 		munmap(bufs, umem_sz);
-		exit_with_error(-ENOMEM);
+		exit_with_error(ENOMEM);
 	}
 	umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
 	ret = xsk_configure_umem(umem, bufs, umem_sz);
@@ -936,7 +936,7 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds)
 		if (ifobj->use_poll) {
 			ret = poll(fds, 1, POLL_TMOUT);
 			if (ret < 0)
-				exit_with_error(-ret);
+				exit_with_error(errno);
 
 			if (!ret) {
 				if (!is_umem_valid(test->ifobj_tx))
@@ -963,7 +963,7 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds)
 				if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
 					ret = poll(fds, 1, POLL_TMOUT);
 					if (ret < 0)
-						exit_with_error(-ret);
+						exit_with_error(errno);
 				}
 				ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
 			}
@@ -1015,7 +1015,7 @@ static int __send_pkts(struct ifobject *ifobject, u32 *pkt_nb, struct pollfd *fd
 			if (timeout) {
 				if (ret < 0) {
 					ksft_print_msg("ERROR: [%s] Poll error %d\n",
-						       __func__, ret);
+						       __func__, errno);
 					return TEST_FAILURE;
 				}
 				if (ret == 0)
@@ -1024,7 +1024,7 @@ static int __send_pkts(struct ifobject *ifobject, u32 *pkt_nb, struct pollfd *fd
 			}
 			if (ret <= 0) {
 				ksft_print_msg("ERROR: [%s] Poll error %d\n",
-					       __func__, ret);
+					       __func__, errno);
 				return TEST_FAILURE;
 			}
 		}
@@ -1323,18 +1323,18 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
 	if (ifobject->xdp_flags & XDP_FLAGS_SKB_MODE) {
 		if (opts.attach_mode != XDP_ATTACHED_SKB) {
 			ksft_print_msg("ERROR: [%s] XDP prog not in SKB mode\n");
-			exit_with_error(-EINVAL);
+			exit_with_error(EINVAL);
 		}
 	} else if (ifobject->xdp_flags & XDP_FLAGS_DRV_MODE) {
 		if (opts.attach_mode != XDP_ATTACHED_DRV) {
 			ksft_print_msg("ERROR: [%s] XDP prog not in DRV mode\n");
-			exit_with_error(-EINVAL);
+			exit_with_error(EINVAL);
 		}
 	}
 
 	ret = xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd);
 	if (ret)
-		exit_with_error(-ret);
+		exit_with_error(errno);
 }
 
 static void *worker_testapp_validate_tx(void *arg)
@@ -1541,7 +1541,7 @@ static void swap_xsk_resources(struct ifobject *ifobj_tx, struct ifobject *ifobj
 
 	ret = xsk_socket__update_xskmap(ifobj_rx->xsk->xsk, ifobj_rx->xsk_map_fd);
 	if (ret)
-		exit_with_error(-ret);
+		exit_with_error(errno);
 }
 
 static void testapp_bpf_res(struct test_spec *test)
-- 
cgit 


From a4ca62277b6aeda31dfd686740462982b6a480b9 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:16 +0100
Subject: selftests/xsk: remove unused variable outstanding_tx

Remove the unused variable outstanding_tx.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-6-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xsk.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index 5e4a6552ed37..b166edfff86d 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -86,7 +86,6 @@ struct xsk_ctx {
 struct xsk_socket {
 	struct xsk_ring_cons *rx;
 	struct xsk_ring_prod *tx;
-	__u64 outstanding_tx;
 	struct xsk_ctx *ctx;
 	struct xsk_socket_config config;
 	int fd;
@@ -1021,7 +1020,6 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
 	if (err)
 		goto out_xsk_alloc;
 
-	xsk->outstanding_tx = 0;
 	ifindex = if_nametoindex(ifname);
 	if (!ifindex) {
 		err = -errno;
-- 
cgit 


From 703bfd37101310ad5a6be09d5ff38c0e949ee8db Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:17 +0100
Subject: selftests/xsk: add debug option for creating netdevs

Add a new option to the test_xsk.sh script that only creates the two
veth netdevs and the extra namespace, then exits without running any
tests. The failed test can then be executed in the debugger without
having to create the netdevs and namespace manually. For ease-of-use,
the veth netdevs to use are printed so they can be copied into the
debugger.

Here is an example how to use it:

> sudo ./test_xsk.sh -d

veth10 veth11

> gdb xskxceiver

In gdb:

run -i veth10 -i veth11

And now the test cases can be debugged with gdb.

If you want to debug the test suite on a real NIC in loopback mode,
there is no need to use this feature as you already know the netdev of
your NIC.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-7-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_xsk.sh | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh
index d821fd098504..cb315d85148b 100755
--- a/tools/testing/selftests/bpf/test_xsk.sh
+++ b/tools/testing/selftests/bpf/test_xsk.sh
@@ -74,6 +74,9 @@
 # Run and dump packet contents:
 #   sudo ./test_xsk.sh -D
 #
+# Set up veth interfaces and leave them up so xskxceiver can be launched in a debugger:
+#   sudo ./test_xsk.sh -d
+#
 # Run test suite for physical device in loopback mode
 #   sudo ./test_xsk.sh -i IFACE
 
@@ -81,11 +84,12 @@
 
 ETH=""
 
-while getopts "vDi:" flag
+while getopts "vDi:d" flag
 do
 	case "${flag}" in
 		v) verbose=1;;
 		D) dump_pkts=1;;
+		d) debug=1;;
 		i) ETH=${OPTARG};;
 	esac
 done
@@ -174,6 +178,11 @@ statusList=()
 
 TEST_NAME="XSK_SELFTESTS_${VETH0}_SOFTIRQ"
 
+if [[ $debug -eq 1 ]]; then
+    echo "-i" ${VETH0} "-i" ${VETH1},${NS1}
+    exit
+fi
+
 exec_xskxceiver
 
 if [ -z $ETH ]; then
-- 
cgit 


From efe620e5ba03330a71a85b40e68ee1b497c789ed Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:18 +0100
Subject: selftests/xsk: replace asm acquire/release implementations

Replace our own homegrown assembly store/release and load/acquire
implementations with the HW agnositic atomic APIs C11 offers. This to
make the code more portable, easier to read, and reduce the
maintenance burden.

The original code used load-acquire and store-release barriers
hand-coded in assembly. Since C11, these kind of operations are
offered as built-ins in gcc and llvm. The load-acquire operation
prevents hoisting of non-atomic memory operations to before this
operation and it corresponds to the __ATOMIC_ACQUIRE operation in the
built-in atomics. The store-release operation prevents hoisting of
non-atomic memory operations to after this operation and it
corresponds to the __ATOMIC_RELEASE operation in the built-in atomics.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-8-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xsk.h | 80 ++-------------------------------------
 1 file changed, 4 insertions(+), 76 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h
index 997723b0bfb2..24ee765aded3 100644
--- a/tools/testing/selftests/bpf/xsk.h
+++ b/tools/testing/selftests/bpf/xsk.h
@@ -23,77 +23,6 @@
 extern "C" {
 #endif
 
-/* This whole API has been deprecated and moved to libxdp that can be found at
- * https://github.com/xdp-project/xdp-tools. The APIs are exactly the same so
- * it should just be linking with libxdp instead of libbpf for this set of
- * functionality. If not, please submit a bug report on the aforementioned page.
- */
-
-/* Load-Acquire Store-Release barriers used by the XDP socket
- * library. The following macros should *NOT* be considered part of
- * the xsk.h API, and is subject to change anytime.
- *
- * LIBRARY INTERNAL
- */
-
-#define __XSK_READ_ONCE(x) (*(volatile typeof(x) *)&x)
-#define __XSK_WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v)
-
-#if defined(__i386__) || defined(__x86_64__)
-# define libbpf_smp_store_release(p, v)					\
-	do {								\
-		asm volatile("" : : : "memory");			\
-		__XSK_WRITE_ONCE(*p, v);				\
-	} while (0)
-# define libbpf_smp_load_acquire(p)					\
-	({								\
-		typeof(*p) ___p1 = __XSK_READ_ONCE(*p);			\
-		asm volatile("" : : : "memory");			\
-		___p1;							\
-	})
-#elif defined(__aarch64__)
-# define libbpf_smp_store_release(p, v)					\
-		asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory")
-# define libbpf_smp_load_acquire(p)					\
-	({								\
-		typeof(*p) ___p1;					\
-		asm volatile ("ldar %w0, %1"				\
-			      : "=r" (___p1) : "Q" (*p) : "memory");	\
-		___p1;							\
-	})
-#elif defined(__riscv)
-# define libbpf_smp_store_release(p, v)					\
-	do {								\
-		asm volatile ("fence rw,w" : : : "memory");		\
-		__XSK_WRITE_ONCE(*p, v);				\
-	} while (0)
-# define libbpf_smp_load_acquire(p)					\
-	({								\
-		typeof(*p) ___p1 = __XSK_READ_ONCE(*p);			\
-		asm volatile ("fence r,rw" : : : "memory");		\
-		___p1;							\
-	})
-#endif
-
-#ifndef libbpf_smp_store_release
-#define libbpf_smp_store_release(p, v)					\
-	do {								\
-		__sync_synchronize();					\
-		__XSK_WRITE_ONCE(*p, v);				\
-	} while (0)
-#endif
-
-#ifndef libbpf_smp_load_acquire
-#define libbpf_smp_load_acquire(p)					\
-	({								\
-		typeof(*p) ___p1 = __XSK_READ_ONCE(*p);			\
-		__sync_synchronize();					\
-		___p1;							\
-	})
-#endif
-
-/* LIBRARY INTERNAL -- END */
-
 /* Do not access these members directly. Use the functions below. */
 #define DEFINE_XSK_RING(name) \
 struct name { \
@@ -168,7 +97,7 @@ static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
 	 * this function. Without this optimization it whould have been
 	 * free_entries = r->cached_prod - r->cached_cons + r->size.
 	 */
-	r->cached_cons = libbpf_smp_load_acquire(r->consumer);
+	r->cached_cons = __atomic_load_n(r->consumer, __ATOMIC_ACQUIRE);
 	r->cached_cons += r->size;
 
 	return r->cached_cons - r->cached_prod;
@@ -179,7 +108,7 @@ static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb)
 	__u32 entries = r->cached_prod - r->cached_cons;
 
 	if (entries == 0) {
-		r->cached_prod = libbpf_smp_load_acquire(r->producer);
+		r->cached_prod = __atomic_load_n(r->producer, __ATOMIC_ACQUIRE);
 		entries = r->cached_prod - r->cached_cons;
 	}
 
@@ -202,7 +131,7 @@ static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb)
 	/* Make sure everything has been written to the ring before indicating
 	 * this to the kernel by writing the producer pointer.
 	 */
-	libbpf_smp_store_release(prod->producer, *prod->producer + nb);
+	__atomic_store_n(prod->producer, *prod->producer + nb, __ATOMIC_RELEASE);
 }
 
 static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx)
@@ -227,8 +156,7 @@ static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb)
 	/* Make sure data has been read before indicating we are done
 	 * with the entries by updating the consumer pointer.
 	 */
-	libbpf_smp_store_release(cons->consumer, *cons->consumer + nb);
-
+	__atomic_store_n(cons->consumer, *cons->consumer + nb, __ATOMIC_RELEASE);
 }
 
 static inline void *xsk_umem__get_data(void *umem_area, __u64 addr)
-- 
cgit 


From 64aef77d750ede548699d81c959936289d7d7819 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:19 +0100
Subject: selftests/xsk: remove namespaces

Remove the namespaces used as they fill no function. This will
simplify the code for speeding up the tests in the following commits.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-9-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_xsk.sh    | 33 ++++++++-------------
 tools/testing/selftests/bpf/xsk_prereqs.sh | 12 ++------
 tools/testing/selftests/bpf/xskxceiver.c   | 46 ++++--------------------------
 tools/testing/selftests/bpf/xskxceiver.h   |  3 --
 4 files changed, 20 insertions(+), 74 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh
index cb315d85148b..b077cf58f825 100755
--- a/tools/testing/selftests/bpf/test_xsk.sh
+++ b/tools/testing/selftests/bpf/test_xsk.sh
@@ -24,8 +24,6 @@
 #      -----------     |     ----------
 #      |  vethX  | --------- |  vethY |
 #      -----------   peer    ----------
-#           |          |          |
-#      namespaceX      |     namespaceY
 #
 # AF_XDP is an address family optimized for high performance packet processing,
 # it is XDP’s user-space interface.
@@ -39,10 +37,9 @@
 # Prerequisites setup by script:
 #
 #   Set up veth interfaces as per the topology shown ^^:
-#   * setup two veth interfaces and one namespace
-#   ** veth<xxxx> in root namespace
-#   ** veth<yyyy> in af_xdp<xxxx> namespace
-#   ** namespace af_xdp<xxxx>
+#   * setup two veth interfaces
+#   ** veth<xxxx>
+#   ** veth<yyyy>
 #   *** xxxx and yyyy are randomly generated 4 digit numbers used to avoid
 #       conflict with any existing interface
 #   * tests the veth and xsk layers of the topology
@@ -103,28 +100,25 @@ VETH0_POSTFIX=$(cat ${URANDOM} | tr -dc '0-9' | fold -w 256 | head -n 1 | head -
 VETH0=ve${VETH0_POSTFIX}
 VETH1_POSTFIX=$(cat ${URANDOM} | tr -dc '0-9' | fold -w 256 | head -n 1 | head --bytes 4)
 VETH1=ve${VETH1_POSTFIX}
-NS0=root
-NS1=af_xdp${VETH1_POSTFIX}
 MTU=1500
 
 trap ctrl_c INT
 
 function ctrl_c() {
-        cleanup_exit ${VETH0} ${VETH1} ${NS1}
+        cleanup_exit ${VETH0} ${VETH1}
 	exit 1
 }
 
 setup_vethPairs() {
 	if [[ $verbose -eq 1 ]]; then
-	        echo "setting up ${VETH0}: namespace: ${NS0}"
+	        echo "setting up ${VETH0}"
 	fi
-	ip netns add ${NS1}
 	ip link add ${VETH0} numtxqueues 4 numrxqueues 4 type veth peer name ${VETH1} numtxqueues 4 numrxqueues 4
 	if [ -f /proc/net/if_inet6 ]; then
 		echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6
 	fi
 	if [[ $verbose -eq 1 ]]; then
-	        echo "setting up ${VETH1}: namespace: ${NS1}"
+	        echo "setting up ${VETH1}"
 	fi
 
 	if [[ $busy_poll -eq 1 ]]; then
@@ -134,18 +128,15 @@ setup_vethPairs() {
 		echo 200000 > /sys/class/net/${VETH1}/gro_flush_timeout
 	fi
 
-	ip link set ${VETH1} netns ${NS1}
-	ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU}
+	ip link set ${VETH1} mtu ${MTU}
 	ip link set ${VETH0} mtu ${MTU}
-	ip netns exec ${NS1} ip link set ${VETH1} up
-	ip netns exec ${NS1} ip link set dev lo up
+	ip link set ${VETH1} up
 	ip link set ${VETH0} up
 }
 
 if [ ! -z $ETH ]; then
 	VETH0=${ETH}
 	VETH1=${ETH}
-	NS1=""
 else
 	validate_root_exec
 	validate_veth_support ${VETH0}
@@ -155,7 +146,7 @@ else
 	retval=$?
 	if [ $retval -ne 0 ]; then
 		test_status $retval "${TEST_NAME}"
-		cleanup_exit ${VETH0} ${VETH1} ${NS1}
+		cleanup_exit ${VETH0} ${VETH1}
 		exit $retval
 	fi
 fi
@@ -179,14 +170,14 @@ statusList=()
 TEST_NAME="XSK_SELFTESTS_${VETH0}_SOFTIRQ"
 
 if [[ $debug -eq 1 ]]; then
-    echo "-i" ${VETH0} "-i" ${VETH1},${NS1}
+    echo "-i" ${VETH0} "-i" ${VETH1}
     exit
 fi
 
 exec_xskxceiver
 
 if [ -z $ETH ]; then
-	cleanup_exit ${VETH0} ${VETH1} ${NS1}
+	cleanup_exit ${VETH0} ${VETH1}
 fi
 TEST_NAME="XSK_SELFTESTS_${VETH0}_BUSY_POLL"
 busy_poll=1
@@ -199,7 +190,7 @@ exec_xskxceiver
 ## END TESTS
 
 if [ -z $ETH ]; then
-	cleanup_exit ${VETH0} ${VETH1} ${NS1}
+	cleanup_exit ${VETH0} ${VETH1}
 fi
 
 failures=0
diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh
index a0b71723a818..ae697a10a056 100755
--- a/tools/testing/selftests/bpf/xsk_prereqs.sh
+++ b/tools/testing/selftests/bpf/xsk_prereqs.sh
@@ -55,21 +55,13 @@ test_exit()
 
 clear_configs()
 {
-	if [ $(ip netns show | grep $3 &>/dev/null; echo $?;) == 0 ]; then
-		[ $(ip netns exec $3 ip link show $2 &>/dev/null; echo $?;) == 0 ] &&
-			{ ip netns exec $3 ip link del $2; }
-		ip netns del $3
-	fi
-	#Once we delete a veth pair node, the entire veth pair is removed,
-	#this is just to be cautious just incase the NS does not exist then
-	#veth node inside NS won't get removed so we explicitly remove it
 	[ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] &&
 		{ ip link del $1; }
 }
 
 cleanup_exit()
 {
-	clear_configs $1 $2 $3
+	clear_configs $1 $2
 }
 
 validate_ip_utility()
@@ -83,7 +75,7 @@ exec_xskxceiver()
 	        ARGS+="-b "
 	fi
 
-	./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${ARGS}
+	./${XSKOBJ} -i ${VETH0} -i ${VETH1} ${ARGS}
 
 	retval=$?
 	test_status $retval "${TEST_NAME}"
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 72578cebfbf7..4c8f32e1c431 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -55,12 +55,11 @@
  * Flow:
  * -----
  * - Single process spawns two threads: Tx and Rx
- * - Each of these two threads attach to a veth interface within their assigned
- *   namespaces
- * - Each thread Creates one AF_XDP socket connected to a unique umem for each
+ * - Each of these two threads attach to a veth interface
+ * - Each thread creates one AF_XDP socket connected to a unique umem for each
  *   veth interface
- * - Tx thread Transmits 10k packets from veth<xxxx> to veth<yyyy>
- * - Rx thread verifies if all 10k packets were received and delivered in-order,
+ * - Tx thread Transmits a number of packets from veth<xxxx> to veth<yyyy>
+ * - Rx thread verifies if all packets were received and delivered in-order,
  *   and have the right content
  *
  * Enable/disable packet dump mode:
@@ -399,28 +398,6 @@ static void usage(const char *prog)
 	ksft_print_msg(str, prog);
 }
 
-static int switch_namespace(const char *nsname)
-{
-	char fqns[26] = "/var/run/netns/";
-	int nsfd;
-
-	if (!nsname || strlen(nsname) == 0)
-		return -1;
-
-	strncat(fqns, nsname, sizeof(fqns) - strlen(fqns) - 1);
-	nsfd = open(fqns, O_RDONLY);
-
-	if (nsfd == -1)
-		exit_with_error(errno);
-
-	if (setns(nsfd, 0) == -1)
-		exit_with_error(errno);
-
-	print_verbose("NS switched: %s\n", nsname);
-
-	return nsfd;
-}
-
 static bool validate_interface(struct ifobject *ifobj)
 {
 	if (!strcmp(ifobj->ifname, ""))
@@ -438,8 +415,6 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj
 	opterr = 0;
 
 	for (;;) {
-		char *sptr, *token;
-
 		c = getopt_long(argc, argv, "i:Dvb", long_options, &option_index);
 		if (c == -1)
 			break;
@@ -453,11 +428,8 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj
 			else
 				break;
 
-			sptr = strndupa(optarg, strlen(optarg));
-			memcpy(ifobj->ifname, strsep(&sptr, ","), MAX_INTERFACE_NAME_CHARS);
-			token = strsep(&sptr, ",");
-			if (token)
-				memcpy(ifobj->nsname, token, MAX_INTERFACES_NAMESPACE_CHARS);
+			memcpy(ifobj->ifname, optarg,
+			       min_t(size_t, MAX_INTERFACE_NAME_CHARS, strlen(optarg)));
 			interface_nb++;
 			break;
 		case 'D':
@@ -1283,8 +1255,6 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
 	int ret, ifindex;
 	void *bufs;
 
-	ifobject->ns_fd = switch_namespace(ifobject->nsname);
-
 	if (ifobject->umem->unaligned_mode)
 		mmap_flags |= MAP_HUGETLB;
 
@@ -1843,8 +1813,6 @@ static struct ifobject *ifobject_create(void)
 	if (!ifobj->umem)
 		goto out_umem;
 
-	ifobj->ns_fd = -1;
-
 	return ifobj;
 
 out_umem:
@@ -1856,8 +1824,6 @@ out_xsk_arr:
 
 static void ifobject_delete(struct ifobject *ifobj)
 {
-	if (ifobj->ns_fd != -1)
-		close(ifobj->ns_fd);
 	free(ifobj->umem);
 	free(ifobj->xsk_arr);
 	free(ifobj);
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index edb76d2def9f..dcb908f5bb4c 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -30,7 +30,6 @@
 #define TEST_CONTINUE 1
 #define MAX_INTERFACES 2
 #define MAX_INTERFACE_NAME_CHARS 16
-#define MAX_INTERFACES_NAMESPACE_CHARS 16
 #define MAX_SOCKETS 2
 #define MAX_TEST_NAME_SIZE 32
 #define MAX_TEARDOWN_ITER 10
@@ -133,14 +132,12 @@ typedef void *(*thread_func_t)(void *arg);
 
 struct ifobject {
 	char ifname[MAX_INTERFACE_NAME_CHARS];
-	char nsname[MAX_INTERFACES_NAMESPACE_CHARS];
 	struct xsk_socket_info *xsk;
 	struct xsk_socket_info *xsk_arr;
 	struct xsk_umem_info *umem;
 	thread_func_t func_ptr;
 	validation_func_t validation_func;
 	struct pkt_stream *pkt_stream;
-	int ns_fd;
 	int xsk_map_fd;
 	u32 dst_ip;
 	u32 src_ip;
-- 
cgit 


From aa61d81f397c65b9951b0da2538657154564564f Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:20 +0100
Subject: selftests/xsk: load and attach XDP program only once per mode

Load and attach the XDP program only once per XDP mode that is being
executed. Today, the XDP program is loaded and attached for every
test, then unloaded, which takes a long time on real NICs, since they
have to reconfigure their HW, in contrast to veth. The test suite now
completes in 21 seconds, instead of 207 seconds previously on my
machine. This is a speed-up of around 10x.

This is accomplished by moving the XDP loading from the worker threads
to the main thread and replacing the XDP loading interfaces of xsk.c
that was taken from the xsk support in libbpf, with something more
explicit that is more useful for these tests. Instead, the relevant
file descriptors and ifindexes are just passed down to the new
functions.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-10-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xsk.c        |  92 ++++++++++++-------
 tools/testing/selftests/bpf/xsk.h        |   7 +-
 tools/testing/selftests/bpf/xskxceiver.c | 147 +++++++++++++++++++------------
 tools/testing/selftests/bpf/xskxceiver.h |   3 +
 4 files changed, 162 insertions(+), 87 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index b166edfff86d..1dd953541812 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -51,6 +51,8 @@
 
 #define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
 
+#define XSKMAP_SIZE 1
+
 enum xsk_prog {
 	XSK_PROG_FALLBACK,
 	XSK_PROG_REDIRECT_FLAGS,
@@ -387,10 +389,9 @@ static enum xsk_prog get_xsk_prog(void)
 	return detected;
 }
 
-static int xsk_load_xdp_prog(struct xsk_socket *xsk)
+static int __xsk_load_xdp_prog(int xsk_map_fd)
 {
 	static const int log_buf_size = 16 * 1024;
-	struct xsk_ctx *ctx = xsk->ctx;
 	char log_buf[log_buf_size];
 	int prog_fd;
 
@@ -418,7 +419,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 		/* *(u32 *)(r10 - 4) = r2 */
 		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
 		/* r1 = xskmap[] */
-		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
+		BPF_LD_MAP_FD(BPF_REG_1, xsk_map_fd),
 		/* r3 = XDP_PASS */
 		BPF_MOV64_IMM(BPF_REG_3, 2),
 		/* call bpf_redirect_map */
@@ -430,7 +431,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 		/* r2 += -4 */
 		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
 		/* r1 = xskmap[] */
-		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
+		BPF_LD_MAP_FD(BPF_REG_1, xsk_map_fd),
 		/* call bpf_map_lookup_elem */
 		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
 		/* r1 = r0 */
@@ -442,7 +443,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 		/* r2 = *(u32 *)(r10 - 4) */
 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
 		/* r1 = xskmap[] */
-		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
+		BPF_LD_MAP_FD(BPF_REG_1, xsk_map_fd),
 		/* r3 = 0 */
 		BPF_MOV64_IMM(BPF_REG_3, 0),
 		/* call bpf_redirect_map */
@@ -461,7 +462,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 		/* r2 = *(u32 *)(r1 + 16) */
 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
 		/* r1 = xskmap[] */
-		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
+		BPF_LD_MAP_FD(BPF_REG_1, xsk_map_fd),
 		/* r3 = XDP_PASS */
 		BPF_MOV64_IMM(BPF_REG_3, 2),
 		/* call bpf_redirect_map */
@@ -480,13 +481,40 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 
 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
 				progs[option], insns_cnt[option], &opts);
-	if (prog_fd < 0) {
+	if (prog_fd < 0)
 		pr_warn("BPF log buffer:\n%s", log_buf);
-		return prog_fd;
+
+	return prog_fd;
+}
+
+int xsk_attach_xdp_program(int ifindex, int prog_fd, u32 xdp_flags)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
+	__u32 prog_id = 0;
+	int link_fd;
+	int err;
+
+	err = bpf_xdp_query_id(ifindex, xdp_flags, &prog_id);
+	if (err) {
+		pr_warn("getting XDP prog id failed\n");
+		return err;
 	}
 
-	ctx->prog_fd = prog_fd;
-	return 0;
+	/* If there's a netlink-based XDP prog loaded on interface, bail out
+	 * and ask user to do the removal by himself
+	 */
+	if (prog_id) {
+		pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
+		return -EINVAL;
+	}
+
+	opts.flags = xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);
+
+	link_fd = bpf_link_create(prog_fd, ifindex, BPF_XDP, &opts);
+	if (link_fd < 0)
+		pr_warn("bpf_link_create failed: %s\n", strerror(errno));
+
+	return link_fd;
 }
 
 static int xsk_create_bpf_link(struct xsk_socket *xsk)
@@ -775,7 +803,7 @@ static int xsk_init_xdp_res(struct xsk_socket *xsk,
 	if (err)
 		return err;
 
-	err = xsk_load_xdp_prog(xsk);
+	err = __xsk_load_xdp_prog(*xsks_map_fd);
 	if (err)
 		goto err_load_xdp_prog;
 
@@ -871,6 +899,22 @@ int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd)
 	return __xsk_setup_xdp_prog(xsk, xsks_map_fd);
 }
 
+int xsk_load_xdp_program(int *xsk_map_fd, int *prog_fd)
+{
+	*xsk_map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map", sizeof(int), sizeof(int),
+				     XSKMAP_SIZE, NULL);
+	if (*xsk_map_fd < 0)
+		return *xsk_map_fd;
+
+	*prog_fd = __xsk_load_xdp_prog(*xsk_map_fd);
+	if (*prog_fd < 0) {
+		close(*xsk_map_fd);
+		return *prog_fd;
+	}
+
+	return 0;
+}
+
 static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
 				   __u32 queue_id)
 {
@@ -917,7 +961,7 @@ out_free:
 
 static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
 				      struct xsk_umem *umem, int ifindex,
-				      const char *ifname, __u32 queue_id,
+				      __u32 queue_id,
 				      struct xsk_ring_prod *fill,
 				      struct xsk_ring_cons *comp)
 {
@@ -944,7 +988,6 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
 	ctx->refcount = 1;
 	ctx->umem = umem;
 	ctx->queue_id = queue_id;
-	bpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
 	ctx->prog_fd = FD_NOT_USED;
 	ctx->link_fd = FD_NOT_USED;
 	ctx->xsks_map_fd = FD_NOT_USED;
@@ -991,7 +1034,7 @@ int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
 }
 
 int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
-			      const char *ifname,
+			      int ifindex,
 			      __u32 queue_id, struct xsk_umem *umem,
 			      struct xsk_ring_cons *rx,
 			      struct xsk_ring_prod *tx,
@@ -1005,7 +1048,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
 	struct xdp_mmap_offsets off;
 	struct xsk_socket *xsk;
 	struct xsk_ctx *ctx;
-	int err, ifindex;
+	int err;
 
 	if (!umem || !xsk_ptr || !(rx || tx))
 		return -EFAULT;
@@ -1020,12 +1063,6 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
 	if (err)
 		goto out_xsk_alloc;
 
-	ifindex = if_nametoindex(ifname);
-	if (!ifindex) {
-		err = -errno;
-		goto out_xsk_alloc;
-	}
-
 	if (umem->refcount++ > 0) {
 		xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
 		if (xsk->fd < 0) {
@@ -1045,8 +1082,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
 			goto out_socket;
 		}
 
-		ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
-				     fill, comp);
+		ctx = xsk_create_ctx(xsk, umem, ifindex, queue_id, fill, comp);
 		if (!ctx) {
 			err = -ENOMEM;
 			goto out_socket;
@@ -1144,12 +1180,6 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
 		goto out_mmap_tx;
 	}
 
-	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
-		err = __xsk_setup_xdp_prog(xsk, NULL);
-		if (err)
-			goto out_mmap_tx;
-	}
-
 	*xsk_ptr = xsk;
 	umem->fill_save = NULL;
 	umem->comp_save = NULL;
@@ -1173,7 +1203,7 @@ out_xsk_alloc:
 	return err;
 }
 
-int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
+int xsk_socket__create(struct xsk_socket **xsk_ptr, int ifindex,
 		       __u32 queue_id, struct xsk_umem *umem,
 		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
 		       const struct xsk_socket_config *usr_config)
@@ -1181,7 +1211,7 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
 	if (!umem)
 		return -EFAULT;
 
-	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
+	return xsk_socket__create_shared(xsk_ptr, ifindex, queue_id, umem,
 					 rx, tx, umem->fill_save,
 					 umem->comp_save, usr_config);
 }
diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h
index 24ee765aded3..7a5aeacd261b 100644
--- a/tools/testing/selftests/bpf/xsk.h
+++ b/tools/testing/selftests/bpf/xsk.h
@@ -204,6 +204,9 @@ int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd);
 /* Flags for the libbpf_flags field. */
 #define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0)
 
+int xsk_load_xdp_program(int *xsk_map_fd, int *prog_fd);
+int xsk_attach_xdp_program(int ifindex, int prog_fd, u32 xdp_flags);
+
 struct xsk_socket_config {
 	__u32 rx_size;
 	__u32 tx_size;
@@ -219,13 +222,13 @@ int xsk_umem__create(struct xsk_umem **umem,
 		     struct xsk_ring_cons *comp,
 		     const struct xsk_umem_config *config);
 int xsk_socket__create(struct xsk_socket **xsk,
-		       const char *ifname, __u32 queue_id,
+		       int ifindex, __u32 queue_id,
 		       struct xsk_umem *umem,
 		       struct xsk_ring_cons *rx,
 		       struct xsk_ring_prod *tx,
 		       const struct xsk_socket_config *config);
 int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
-			      const char *ifname,
+			      int ifindex,
 			      __u32 queue_id, struct xsk_umem *umem,
 			      struct xsk_ring_cons *rx,
 			      struct xsk_ring_prod *tx,
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 4c8f32e1c431..0c0974c209cd 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -268,6 +268,11 @@ static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr)
 	    udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, IPPROTO_UDP, (u16 *)udp_hdr);
 }
 
+static u32 mode_to_xdp_flags(enum test_mode mode)
+{
+	return (mode == TEST_MODE_SKB) ? XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE;
+}
+
 static int xsk_configure_umem(struct xsk_umem_info *umem, void *buffer, u64 size)
 {
 	struct xsk_umem_config cfg = {
@@ -329,7 +334,7 @@ static int __xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_i
 
 	txr = ifobject->tx_on ? &xsk->tx : NULL;
 	rxr = ifobject->rx_on ? &xsk->rx : NULL;
-	return xsk_socket__create(&xsk->xsk, ifobject->ifname, 0, umem->umem, rxr, txr, &cfg);
+	return xsk_socket__create(&xsk->xsk, ifobject->ifindex, 0, umem->umem, rxr, txr, &cfg);
 }
 
 static bool ifobj_zc_avail(struct ifobject *ifobject)
@@ -359,8 +364,7 @@ static bool ifobj_zc_avail(struct ifobject *ifobject)
 	xsk = calloc(1, sizeof(struct xsk_socket_info));
 	if (!xsk)
 		goto out;
-	ifobject->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
-	ifobject->xdp_flags |= XDP_FLAGS_DRV_MODE;
+	ifobject->xdp_flags = XDP_FLAGS_DRV_MODE;
 	ifobject->bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY;
 	ifobject->rx_on = true;
 	xsk->rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
@@ -430,6 +434,11 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj
 
 			memcpy(ifobj->ifname, optarg,
 			       min_t(size_t, MAX_INTERFACE_NAME_CHARS, strlen(optarg)));
+
+			ifobj->ifindex = if_nametoindex(ifobj->ifname);
+			if (!ifobj->ifindex)
+				exit_with_error(errno);
+
 			interface_nb++;
 			break;
 		case 'D':
@@ -510,12 +519,6 @@ static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
 	for (i = 0; i < MAX_INTERFACES; i++) {
 		struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx;
 
-		ifobj->xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
-		if (mode == TEST_MODE_SKB)
-			ifobj->xdp_flags |= XDP_FLAGS_SKB_MODE;
-		else
-			ifobj->xdp_flags |= XDP_FLAGS_DRV_MODE;
-
 		ifobj->bind_flags = XDP_USE_NEED_WAKEUP;
 		if (mode == TEST_MODE_ZC)
 			ifobj->bind_flags |= XDP_ZEROCOPY;
@@ -1252,7 +1255,8 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
 	u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size;
 	int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
 	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
-	int ret, ifindex;
+	u32 queue_id = 0;
+	int ret, fd;
 	void *bufs;
 
 	if (ifobject->umem->unaligned_mode)
@@ -1278,31 +1282,8 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
 	if (!ifobject->rx_on)
 		return;
 
-	ifindex = if_nametoindex(ifobject->ifname);
-	if (!ifindex)
-		exit_with_error(errno);
-
-	ret = xsk_setup_xdp_prog_xsk(ifobject->xsk->xsk, &ifobject->xsk_map_fd);
-	if (ret)
-		exit_with_error(-ret);
-
-	ret = bpf_xdp_query(ifindex, ifobject->xdp_flags, &opts);
-	if (ret)
-		exit_with_error(-ret);
-
-	if (ifobject->xdp_flags & XDP_FLAGS_SKB_MODE) {
-		if (opts.attach_mode != XDP_ATTACHED_SKB) {
-			ksft_print_msg("ERROR: [%s] XDP prog not in SKB mode\n");
-			exit_with_error(EINVAL);
-		}
-	} else if (ifobject->xdp_flags & XDP_FLAGS_DRV_MODE) {
-		if (opts.attach_mode != XDP_ATTACHED_DRV) {
-			ksft_print_msg("ERROR: [%s] XDP prog not in DRV mode\n");
-			exit_with_error(EINVAL);
-		}
-	}
-
-	ret = xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd);
+	fd = xsk_socket__fd(ifobject->xsk->xsk);
+	ret = bpf_map_update_elem(ifobject->xsk_map_fd, &queue_id, &fd, 0);
 	if (ret)
 		exit_with_error(errno);
 }
@@ -1336,15 +1317,19 @@ static void *worker_testapp_validate_rx(void *arg)
 {
 	struct test_spec *test = (struct test_spec *)arg;
 	struct ifobject *ifobject = test->ifobj_rx;
+	int id = 0, err, fd = xsk_socket__fd(ifobject->xsk->xsk);
 	struct pollfd fds = { };
-	int id = 0;
-	int err;
+	u32 queue_id = 0;
 
 	if (test->current_step == 1) {
 		thread_common_ops(test, ifobject);
 	} else {
 		bpf_map_delete_elem(ifobject->xsk_map_fd, &id);
-		xsk_socket__update_xskmap(ifobject->xsk->xsk, ifobject->xsk_map_fd);
+		err = bpf_map_update_elem(ifobject->xsk_map_fd, &queue_id, &fd, 0);
+		if (err) {
+			printf("Error: Failed to update xskmap, error %s\n", strerror(err));
+			exit_with_error(err);
+		}
 	}
 
 	fds.fd = xsk_socket__fd(ifobject->xsk->xsk);
@@ -1413,7 +1398,10 @@ static int testapp_validate_traffic_single_thread(struct test_spec *test, struct
 	pthread_join(t0, NULL);
 
 	if (test->total_steps == test->current_step || test->fail) {
+		u32 queue_id = 0;
+
 		xsk_socket__delete(ifobj->xsk->xsk);
+		bpf_map_delete_elem(ifobj->xsk_map_fd, &queue_id);
 		testapp_clean_xsk_umem(ifobj);
 	}
 
@@ -1502,14 +1490,14 @@ static void testapp_bidi(struct test_spec *test)
 
 static void swap_xsk_resources(struct ifobject *ifobj_tx, struct ifobject *ifobj_rx)
 {
-	int ret;
+	int ret, queue_id = 0, fd = xsk_socket__fd(ifobj_rx->xsk->xsk);
 
 	xsk_socket__delete(ifobj_tx->xsk->xsk);
 	xsk_socket__delete(ifobj_rx->xsk->xsk);
 	ifobj_tx->xsk = &ifobj_tx->xsk_arr[1];
 	ifobj_rx->xsk = &ifobj_rx->xsk_arr[1];
 
-	ret = xsk_socket__update_xskmap(ifobj_rx->xsk->xsk, ifobj_rx->xsk_map_fd);
+	ret = bpf_map_update_elem(ifobj_rx->xsk_map_fd, &queue_id, &fd, 0);
 	if (ret)
 		exit_with_error(errno);
 }
@@ -1673,8 +1661,9 @@ static void testapp_invalid_desc(struct test_spec *test)
 
 static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *src_mac,
 		       const char *dst_ip, const char *src_ip, const u16 dst_port,
-		       const u16 src_port, thread_func_t func_ptr)
+		       const u16 src_port, thread_func_t func_ptr, bool load_xdp)
 {
+	int xsk_map_fd, prog_fd, err;
 	struct in_addr ip;
 
 	memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN);
@@ -1690,6 +1679,24 @@ static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *
 	ifobj->src_port = src_port;
 
 	ifobj->func_ptr = func_ptr;
+
+	if (!load_xdp)
+		return;
+
+	err = xsk_load_xdp_program(&xsk_map_fd, &prog_fd);
+	if (err) {
+		printf("Error loading XDP program\n");
+		exit_with_error(err);
+	}
+
+	ifobj->xsk_map_fd = xsk_map_fd;
+	ifobj->prog_fd = prog_fd;
+	ifobj->xdp_flags = mode_to_xdp_flags(TEST_MODE_SKB);
+	ifobj->link_fd = xsk_attach_xdp_program(ifobj->ifindex, prog_fd, ifobj->xdp_flags);
+	if (ifobj->link_fd < 0) {
+		printf("Error attaching XDP program\n");
+		exit_with_error(ifobj->link_fd);
+	}
 }
 
 static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_type type)
@@ -1824,12 +1831,15 @@ out_xsk_arr:
 
 static void ifobject_delete(struct ifobject *ifobj)
 {
+	close(ifobj->prog_fd);
+	close(ifobj->xsk_map_fd);
+
 	free(ifobj->umem);
 	free(ifobj->xsk_arr);
 	free(ifobj);
 }
 
-static bool is_xdp_supported(struct ifobject *ifobject)
+static bool is_xdp_supported(int ifindex)
 {
 	int flags = XDP_FLAGS_DRV_MODE;
 
@@ -1838,7 +1848,6 @@ static bool is_xdp_supported(struct ifobject *ifobject)
 		BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
 		BPF_EXIT_INSN()
 	};
-	int ifindex = if_nametoindex(ifobject->ifname);
 	int prog_fd, insn_cnt = ARRAY_SIZE(insns);
 	int err;
 
@@ -1858,6 +1867,29 @@ static bool is_xdp_supported(struct ifobject *ifobject)
 	return true;
 }
 
+static void change_to_drv_mode(struct ifobject *ifobj)
+{
+	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
+	int ret;
+
+	close(ifobj->link_fd);
+	ifobj->link_fd = xsk_attach_xdp_program(ifobj->ifindex, ifobj->prog_fd,
+						XDP_FLAGS_DRV_MODE);
+	if (ifobj->link_fd < 0) {
+		ksft_print_msg("Error attaching XDP program\n");
+		exit_with_error(-ifobj->link_fd);
+	}
+
+	ret = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &opts);
+	if (ret)
+		exit_with_error(errno);
+
+	if (opts.attach_mode != XDP_ATTACHED_DRV) {
+		ksft_print_msg("ERROR: XDP prog not in DRV mode\n");
+		exit_with_error(EINVAL);
+	}
+}
+
 int main(int argc, char **argv)
 {
 	struct pkt_stream *rx_pkt_stream_default;
@@ -1866,7 +1898,7 @@ int main(int argc, char **argv)
 	int modes = TEST_MODE_SKB + 1;
 	u32 i, j, failed_tests = 0;
 	struct test_spec test;
-	bool shared_umem;
+	bool shared_netdev;
 
 	/* Use libbpf 1.0 API mode */
 	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
@@ -1881,27 +1913,27 @@ int main(int argc, char **argv)
 	setlocale(LC_ALL, "");
 
 	parse_command_line(ifobj_tx, ifobj_rx, argc, argv);
-	shared_umem = !strcmp(ifobj_tx->ifname, ifobj_rx->ifname);
 
-	ifobj_tx->shared_umem = shared_umem;
-	ifobj_rx->shared_umem = shared_umem;
+	shared_netdev = (ifobj_tx->ifindex == ifobj_rx->ifindex);
+	ifobj_tx->shared_umem = shared_netdev;
+	ifobj_rx->shared_umem = shared_netdev;
 
 	if (!validate_interface(ifobj_tx) || !validate_interface(ifobj_rx)) {
 		usage(basename(argv[0]));
 		ksft_exit_xfail();
 	}
 
-	init_iface(ifobj_tx, MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2,
-		   worker_testapp_validate_tx);
-	init_iface(ifobj_rx, MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1,
-		   worker_testapp_validate_rx);
-
-	if (is_xdp_supported(ifobj_tx)) {
+	if (is_xdp_supported(ifobj_tx->ifindex)) {
 		modes++;
 		if (ifobj_zc_avail(ifobj_tx))
 			modes++;
 	}
 
+	init_iface(ifobj_rx, MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2,
+		   worker_testapp_validate_rx, true);
+	init_iface(ifobj_tx, MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1,
+		   worker_testapp_validate_tx, !shared_netdev);
+
 	test_spec_init(&test, ifobj_tx, ifobj_rx, 0);
 	tx_pkt_stream_default = pkt_stream_generate(ifobj_tx->umem, DEFAULT_PKT_CNT, PKT_SIZE);
 	rx_pkt_stream_default = pkt_stream_generate(ifobj_rx->umem, DEFAULT_PKT_CNT, PKT_SIZE);
@@ -1912,7 +1944,13 @@ int main(int argc, char **argv)
 
 	ksft_set_plan(modes * TEST_TYPE_MAX);
 
-	for (i = 0; i < modes; i++)
+	for (i = 0; i < modes; i++) {
+		if (i == TEST_MODE_DRV) {
+			change_to_drv_mode(ifobj_rx);
+			if (!shared_netdev)
+				change_to_drv_mode(ifobj_tx);
+		}
+
 		for (j = 0; j < TEST_TYPE_MAX; j++) {
 			test_spec_init(&test, ifobj_tx, ifobj_rx, i);
 			run_pkt_test(&test, i, j);
@@ -1921,6 +1959,7 @@ int main(int argc, char **argv)
 			if (test.fail)
 				failed_tests++;
 		}
+	}
 
 	pkt_stream_delete(tx_pkt_stream_default);
 	pkt_stream_delete(rx_pkt_stream_default);
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index dcb908f5bb4c..b2ba877b1966 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -139,6 +139,9 @@ struct ifobject {
 	validation_func_t validation_func;
 	struct pkt_stream *pkt_stream;
 	int xsk_map_fd;
+	int prog_fd;
+	int link_fd;
+	int ifindex;
 	u32 dst_ip;
 	u32 src_ip;
 	u32 xdp_flags;
-- 
cgit 


From 6b3c0821caa49538c49262b041bae59bad523c7c Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:21 +0100
Subject: selftests/xsk: remove unnecessary code in control path

Remove unnecessary code in the control path. This is located in the
file xsk.c that was moved from libbpf when the xsk support there was
deprecated. Some of the code there is not needed anymore as the
selftests are only guaranteed to run on the kernel it is shipped
with. Therefore, all the code that has to deal with compatibility of
older kernels can be dropped and also any other function that is not
of any use for the tests.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-11-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xsk.c        | 617 +------------------------------
 tools/testing/selftests/bpf/xsk.h        |   9 -
 tools/testing/selftests/bpf/xskxceiver.c |   8 -
 3 files changed, 3 insertions(+), 631 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index 1dd953541812..9ed31d280e48 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -35,8 +35,6 @@
 #include "xsk.h"
 #include "bpf_util.h"
 
-#define FD_NOT_USED (-1)
-
 #ifndef SOL_XDP
  #define SOL_XDP 283
 #endif
@@ -53,11 +51,6 @@
 
 #define XSKMAP_SIZE 1
 
-enum xsk_prog {
-	XSK_PROG_FALLBACK,
-	XSK_PROG_REDIRECT_FLAGS,
-};
-
 struct xsk_umem {
 	struct xsk_ring_prod *fill_save;
 	struct xsk_ring_cons *comp_save;
@@ -78,11 +71,6 @@ struct xsk_ctx {
 	int refcount;
 	int ifindex;
 	struct list_head list;
-	int prog_fd;
-	int link_fd;
-	int xsks_map_fd;
-	char ifname[IFNAMSIZ];
-	bool has_bpf_link;
 };
 
 struct xsk_socket {
@@ -93,27 +81,6 @@ struct xsk_socket {
 	int fd;
 };
 
-struct xsk_nl_info {
-	bool xdp_prog_attached;
-	int ifindex;
-	int fd;
-};
-
-/* Up until and including Linux 5.3 */
-struct xdp_ring_offset_v1 {
-	__u64 producer;
-	__u64 consumer;
-	__u64 desc;
-};
-
-/* Up until and including Linux 5.3 */
-struct xdp_mmap_offsets_v1 {
-	struct xdp_ring_offset_v1 rx;
-	struct xdp_ring_offset_v1 tx;
-	struct xdp_ring_offset_v1 fr;
-	struct xdp_ring_offset_v1 cr;
-};
-
 int xsk_umem__fd(const struct xsk_umem *umem)
 {
 	return umem ? umem->fd : -EINVAL;
@@ -156,55 +123,17 @@ static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
 	if (!usr_cfg) {
 		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
 		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
-		cfg->libbpf_flags = 0;
-		cfg->xdp_flags = 0;
 		cfg->bind_flags = 0;
 		return 0;
 	}
 
-	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
-		return -EINVAL;
-
 	cfg->rx_size = usr_cfg->rx_size;
 	cfg->tx_size = usr_cfg->tx_size;
-	cfg->libbpf_flags = usr_cfg->libbpf_flags;
-	cfg->xdp_flags = usr_cfg->xdp_flags;
 	cfg->bind_flags = usr_cfg->bind_flags;
 
 	return 0;
 }
 
-static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
-{
-	struct xdp_mmap_offsets_v1 off_v1;
-
-	/* getsockopt on a kernel <= 5.3 has no flags fields.
-	 * Copy over the offsets to the correct places in the >=5.4 format
-	 * and put the flags where they would have been on that kernel.
-	 */
-	memcpy(&off_v1, off, sizeof(off_v1));
-
-	off->rx.producer = off_v1.rx.producer;
-	off->rx.consumer = off_v1.rx.consumer;
-	off->rx.desc = off_v1.rx.desc;
-	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);
-
-	off->tx.producer = off_v1.tx.producer;
-	off->tx.consumer = off_v1.tx.consumer;
-	off->tx.desc = off_v1.tx.desc;
-	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);
-
-	off->fr.producer = off_v1.fr.producer;
-	off->fr.consumer = off_v1.fr.consumer;
-	off->fr.desc = off_v1.fr.desc;
-	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);
-
-	off->cr.producer = off_v1.cr.producer;
-	off->cr.consumer = off_v1.cr.consumer;
-	off->cr.desc = off_v1.cr.desc;
-	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
-}
-
 static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
 {
 	socklen_t optlen;
@@ -218,11 +147,6 @@ static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
 	if (optlen == sizeof(*off))
 		return 0;
 
-	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
-		xsk_mmap_offsets_v1(off);
-		return 0;
-	}
-
 	return -EINVAL;
 }
 
@@ -343,122 +267,19 @@ out_umem_alloc:
 	return err;
 }
 
-struct xsk_umem_config_v1 {
-	__u32 fill_size;
-	__u32 comp_size;
-	__u32 frame_size;
-	__u32 frame_headroom;
-};
-
-static enum xsk_prog get_xsk_prog(void)
-{
-	enum xsk_prog detected = XSK_PROG_FALLBACK;
-	char data_in = 0, data_out;
-	struct bpf_insn insns[] = {
-		BPF_LD_MAP_FD(BPF_REG_1, 0),
-		BPF_MOV64_IMM(BPF_REG_2, 0),
-		BPF_MOV64_IMM(BPF_REG_3, XDP_PASS),
-		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
-		BPF_EXIT_INSN(),
-	};
-	LIBBPF_OPTS(bpf_test_run_opts, opts,
-		.data_in = &data_in,
-		.data_size_in = 1,
-		.data_out = &data_out,
-	);
-
-	int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);
-
-	map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
-	if (map_fd < 0)
-		return detected;
-
-	insns[0].imm = map_fd;
-
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
-	if (prog_fd < 0) {
-		close(map_fd);
-		return detected;
-	}
-
-	ret = bpf_prog_test_run_opts(prog_fd, &opts);
-	if (!ret && opts.retval == XDP_PASS)
-		detected = XSK_PROG_REDIRECT_FLAGS;
-	close(prog_fd);
-	close(map_fd);
-	return detected;
-}
-
 static int __xsk_load_xdp_prog(int xsk_map_fd)
 {
 	static const int log_buf_size = 16 * 1024;
 	char log_buf[log_buf_size];
 	int prog_fd;
 
-	/* This is the fallback C-program:
-	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
-	 * {
-	 *     int ret, index = ctx->rx_queue_index;
-	 *
-	 *     // A set entry here means that the correspnding queue_id
-	 *     // has an active AF_XDP socket bound to it.
-	 *     ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
-	 *     if (ret > 0)
-	 *         return ret;
-	 *
-	 *     // Fallback for pre-5.3 kernels, not supporting default
-	 *     // action in the flags parameter.
-	 *     if (bpf_map_lookup_elem(&xsks_map, &index))
-	 *         return bpf_redirect_map(&xsks_map, index, 0);
-	 *     return XDP_PASS;
-	 * }
-	 */
-	struct bpf_insn prog[] = {
-		/* r2 = *(u32 *)(r1 + 16) */
-		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
-		/* *(u32 *)(r10 - 4) = r2 */
-		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
-		/* r1 = xskmap[] */
-		BPF_LD_MAP_FD(BPF_REG_1, xsk_map_fd),
-		/* r3 = XDP_PASS */
-		BPF_MOV64_IMM(BPF_REG_3, 2),
-		/* call bpf_redirect_map */
-		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
-		/* if w0 != 0 goto pc+13 */
-		BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
-		/* r2 = r10 */
-		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
-		/* r2 += -4 */
-		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
-		/* r1 = xskmap[] */
-		BPF_LD_MAP_FD(BPF_REG_1, xsk_map_fd),
-		/* call bpf_map_lookup_elem */
-		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
-		/* r1 = r0 */
-		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
-		/* r0 = XDP_PASS */
-		BPF_MOV64_IMM(BPF_REG_0, 2),
-		/* if r1 == 0 goto pc+5 */
-		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
-		/* r2 = *(u32 *)(r10 - 4) */
-		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
-		/* r1 = xskmap[] */
-		BPF_LD_MAP_FD(BPF_REG_1, xsk_map_fd),
-		/* r3 = 0 */
-		BPF_MOV64_IMM(BPF_REG_3, 0),
-		/* call bpf_redirect_map */
-		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
-		/* The jumps are to this instruction */
-		BPF_EXIT_INSN(),
-	};
-
 	/* This is the post-5.3 kernel C-program:
 	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
 	 * {
 	 *     return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
 	 * }
 	 */
-	struct bpf_insn prog_redirect_flags[] = {
+	struct bpf_insn prog[] = {
 		/* r2 = *(u32 *)(r1 + 16) */
 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
 		/* r1 = xskmap[] */
@@ -469,18 +290,14 @@ static int __xsk_load_xdp_prog(int xsk_map_fd)
 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
 		BPF_EXIT_INSN(),
 	};
-	size_t insns_cnt[] = {ARRAY_SIZE(prog),
-			      ARRAY_SIZE(prog_redirect_flags),
-	};
-	struct bpf_insn *progs[] = {prog, prog_redirect_flags};
-	enum xsk_prog option = get_xsk_prog();
+	size_t insns_cnt = ARRAY_SIZE(prog);
 	LIBBPF_OPTS(bpf_prog_load_opts, opts,
 		.log_buf = log_buf,
 		.log_size = log_buf_size,
 	);
 
 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
-				progs[option], insns_cnt[option], &opts);
+				prog, insns_cnt, &opts);
 	if (prog_fd < 0)
 		pr_warn("BPF log buffer:\n%s", log_buf);
 
@@ -517,388 +334,6 @@ int xsk_attach_xdp_program(int ifindex, int prog_fd, u32 xdp_flags)
 	return link_fd;
 }
 
-static int xsk_create_bpf_link(struct xsk_socket *xsk)
-{
-	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
-	struct xsk_ctx *ctx = xsk->ctx;
-	__u32 prog_id = 0;
-	int link_fd;
-	int err;
-
-	err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
-	if (err) {
-		pr_warn("getting XDP prog id failed\n");
-		return err;
-	}
-
-	/* if there's a netlink-based XDP prog loaded on interface, bail out
-	 * and ask user to do the removal by himself
-	 */
-	if (prog_id) {
-		pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
-		return -EINVAL;
-	}
-
-	opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);
-
-	link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
-	if (link_fd < 0) {
-		pr_warn("bpf_link_create failed: %s\n", strerror(errno));
-		return link_fd;
-	}
-
-	ctx->link_fd = link_fd;
-	return 0;
-}
-
-static int xsk_get_max_queues(struct xsk_socket *xsk)
-{
-	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
-	struct xsk_ctx *ctx = xsk->ctx;
-	struct ifreq ifr = {};
-	int fd, err, ret;
-
-	fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
-	if (fd < 0)
-		return -errno;
-
-	ifr.ifr_data = (void *)&channels;
-	bpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ);
-	err = ioctl(fd, SIOCETHTOOL, &ifr);
-	if (err && errno != EOPNOTSUPP) {
-		ret = -errno;
-		goto out;
-	}
-
-	if (err) {
-		/* If the device says it has no channels, then all traffic
-		 * is sent to a single stream, so max queues = 1.
-		 */
-		ret = 1;
-	} else {
-		/* Take the max of rx, tx, combined. Drivers return
-		 * the number of channels in different ways.
-		 */
-		ret = max(channels.max_rx, channels.max_tx);
-		ret = max(ret, (int)channels.max_combined);
-	}
-
-out:
-	close(fd);
-	return ret;
-}
-
-static int xsk_create_bpf_maps(struct xsk_socket *xsk)
-{
-	struct xsk_ctx *ctx = xsk->ctx;
-	int max_queues;
-	int fd;
-
-	max_queues = xsk_get_max_queues(xsk);
-	if (max_queues < 0)
-		return max_queues;
-
-	fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map",
-			    sizeof(int), sizeof(int), max_queues, NULL);
-	if (fd < 0)
-		return fd;
-
-	ctx->xsks_map_fd = fd;
-
-	return 0;
-}
-
-static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
-{
-	struct xsk_ctx *ctx = xsk->ctx;
-
-	if (ctx->xsks_map_fd == FD_NOT_USED)
-		return;
-
-	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
-	close(ctx->xsks_map_fd);
-}
-
-static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
-{
-	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
-	__u32 map_len = sizeof(struct bpf_map_info);
-	struct bpf_prog_info prog_info = {};
-	struct xsk_ctx *ctx = xsk->ctx;
-	struct bpf_map_info map_info;
-	int fd, err;
-
-	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
-	if (err)
-		return err;
-
-	num_maps = prog_info.nr_map_ids;
-
-	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
-	if (!map_ids)
-		return -ENOMEM;
-
-	memset(&prog_info, 0, prog_len);
-	prog_info.nr_map_ids = num_maps;
-	prog_info.map_ids = (__u64)(unsigned long)map_ids;
-
-	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
-	if (err)
-		goto out_map_ids;
-
-	ctx->xsks_map_fd = -1;
-
-	for (i = 0; i < prog_info.nr_map_ids; i++) {
-		fd = bpf_map_get_fd_by_id(map_ids[i]);
-		if (fd < 0)
-			continue;
-
-		memset(&map_info, 0, map_len);
-		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
-		if (err) {
-			close(fd);
-			continue;
-		}
-
-		if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
-			ctx->xsks_map_fd = fd;
-			break;
-		}
-
-		close(fd);
-	}
-
-	if (ctx->xsks_map_fd == -1)
-		err = -ENOENT;
-
-out_map_ids:
-	free(map_ids);
-	return err;
-}
-
-static int xsk_set_bpf_maps(struct xsk_socket *xsk)
-{
-	struct xsk_ctx *ctx = xsk->ctx;
-
-	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
-				   &xsk->fd, 0);
-}
-
-static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
-{
-	struct bpf_link_info link_info;
-	__u32 link_len;
-	__u32 id = 0;
-	int err;
-	int fd;
-
-	while (true) {
-		err = bpf_link_get_next_id(id, &id);
-		if (err) {
-			if (errno == ENOENT) {
-				err = 0;
-				break;
-			}
-			pr_warn("can't get next link: %s\n", strerror(errno));
-			break;
-		}
-
-		fd = bpf_link_get_fd_by_id(id);
-		if (fd < 0) {
-			if (errno == ENOENT)
-				continue;
-			pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
-			err = -errno;
-			break;
-		}
-
-		link_len = sizeof(struct bpf_link_info);
-		memset(&link_info, 0, link_len);
-		err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
-		if (err) {
-			pr_warn("can't get link info: %s\n", strerror(errno));
-			close(fd);
-			break;
-		}
-		if (link_info.type == BPF_LINK_TYPE_XDP) {
-			if (link_info.xdp.ifindex == ifindex) {
-				*link_fd = fd;
-				if (prog_id)
-					*prog_id = link_info.prog_id;
-				break;
-			}
-		}
-		close(fd);
-	}
-
-	return err;
-}
-
-static bool xsk_probe_bpf_link(void)
-{
-	LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE);
-	struct bpf_insn insns[2] = {
-		BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
-		BPF_EXIT_INSN()
-	};
-	int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns);
-	int ifindex_lo = 1;
-	bool ret = false;
-	int err;
-
-	err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
-	if (err)
-		return ret;
-
-	if (link_fd >= 0)
-		return true;
-
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
-	if (prog_fd < 0)
-		return ret;
-
-	link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
-	close(prog_fd);
-
-	if (link_fd >= 0) {
-		ret = true;
-		close(link_fd);
-	}
-
-	return ret;
-}
-
-static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
-{
-	char ifname[IFNAMSIZ];
-	struct xsk_ctx *ctx;
-	char *interface;
-
-	ctx = calloc(1, sizeof(*ctx));
-	if (!ctx)
-		return -ENOMEM;
-
-	interface = if_indextoname(ifindex, &ifname[0]);
-	if (!interface) {
-		free(ctx);
-		return -errno;
-	}
-
-	ctx->ifindex = ifindex;
-	bpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
-
-	xsk->ctx = ctx;
-	xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
-
-	return 0;
-}
-
-static int xsk_init_xdp_res(struct xsk_socket *xsk,
-			    int *xsks_map_fd)
-{
-	struct xsk_ctx *ctx = xsk->ctx;
-	int err;
-
-	err = xsk_create_bpf_maps(xsk);
-	if (err)
-		return err;
-
-	err = __xsk_load_xdp_prog(*xsks_map_fd);
-	if (err)
-		goto err_load_xdp_prog;
-
-	if (ctx->has_bpf_link)
-		err = xsk_create_bpf_link(xsk);
-	else
-		err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd,
-				     xsk->config.xdp_flags, NULL);
-
-	if (err)
-		goto err_attach_xdp_prog;
-
-	if (!xsk->rx)
-		return err;
-
-	err = xsk_set_bpf_maps(xsk);
-	if (err)
-		goto err_set_bpf_maps;
-
-	return err;
-
-err_set_bpf_maps:
-	if (ctx->has_bpf_link)
-		close(ctx->link_fd);
-	else
-		bpf_xdp_detach(ctx->ifindex, 0, NULL);
-err_attach_xdp_prog:
-	close(ctx->prog_fd);
-err_load_xdp_prog:
-	xsk_delete_bpf_maps(xsk);
-	return err;
-}
-
-static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
-{
-	struct xsk_ctx *ctx = xsk->ctx;
-	int err;
-
-	ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
-	if (ctx->prog_fd < 0) {
-		err = -errno;
-		goto err_prog_fd;
-	}
-	err = xsk_lookup_bpf_maps(xsk);
-	if (err)
-		goto err_lookup_maps;
-
-	if (!xsk->rx)
-		return err;
-
-	err = xsk_set_bpf_maps(xsk);
-	if (err)
-		goto err_set_maps;
-
-	return err;
-
-err_set_maps:
-	close(ctx->xsks_map_fd);
-err_lookup_maps:
-	close(ctx->prog_fd);
-err_prog_fd:
-	if (ctx->has_bpf_link)
-		close(ctx->link_fd);
-	return err;
-}
-
-static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
-{
-	struct xsk_socket *xsk = _xdp;
-	struct xsk_ctx *ctx = xsk->ctx;
-	__u32 prog_id = 0;
-	int err;
-
-	if (ctx->has_bpf_link)
-		err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
-	else
-		err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
-
-	if (err)
-		return err;
-
-	err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
-			 xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);
-
-	if (!err && xsks_map_fd)
-		*xsks_map_fd = ctx->xsks_map_fd;
-
-	return err;
-}
-
-int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd)
-{
-	return __xsk_setup_xdp_prog(xsk, xsks_map_fd);
-}
-
 int xsk_load_xdp_program(int *xsk_map_fd, int *prog_fd)
 {
 	*xsk_map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map", sizeof(int), sizeof(int),
@@ -988,51 +423,13 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
 	ctx->refcount = 1;
 	ctx->umem = umem;
 	ctx->queue_id = queue_id;
-	ctx->prog_fd = FD_NOT_USED;
-	ctx->link_fd = FD_NOT_USED;
-	ctx->xsks_map_fd = FD_NOT_USED;
 
 	ctx->fill = fill;
 	ctx->comp = comp;
 	list_add(&ctx->list, &umem->ctx_list);
-	ctx->has_bpf_link = xsk_probe_bpf_link();
 	return ctx;
 }
 
-static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
-{
-	free(xsk->ctx);
-	free(xsk);
-}
-
-int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
-{
-	xsk->ctx->xsks_map_fd = fd;
-	return xsk_set_bpf_maps(xsk);
-}
-
-int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
-{
-	struct xsk_socket *xsk;
-	int res;
-
-	xsk = calloc(1, sizeof(*xsk));
-	if (!xsk)
-		return -ENOMEM;
-
-	res = xsk_create_xsk_struct(ifindex, xsk);
-	if (res) {
-		free(xsk);
-		return -EINVAL;
-	}
-
-	res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);
-
-	xsk_destroy_xsk_struct(xsk);
-
-	return res;
-}
-
 int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
 			      int ifindex,
 			      __u32 queue_id, struct xsk_umem *umem,
@@ -1255,14 +652,6 @@ void xsk_socket__delete(struct xsk_socket *xsk)
 	ctx = xsk->ctx;
 	umem = ctx->umem;
 
-	if (ctx->refcount == 1) {
-		xsk_delete_bpf_maps(xsk);
-		if (ctx->prog_fd != FD_NOT_USED)
-			close(ctx->prog_fd);
-		if (ctx->has_bpf_link && ctx->link_fd != FD_NOT_USED)
-			close(ctx->link_fd);
-	}
-
 	xsk_put_ctx(ctx, true);
 
 	err = xsk_get_mmap_offsets(xsk->fd, &off);
diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h
index 7a5aeacd261b..bd5b55ad9f8a 100644
--- a/tools/testing/selftests/bpf/xsk.h
+++ b/tools/testing/selftests/bpf/xsk.h
@@ -197,21 +197,12 @@ struct xsk_umem_config {
 	__u32 flags;
 };
 
-int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd);
-int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd);
-int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd);
-
-/* Flags for the libbpf_flags field. */
-#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0)
-
 int xsk_load_xdp_program(int *xsk_map_fd, int *prog_fd);
 int xsk_attach_xdp_program(int ifindex, int prog_fd, u32 xdp_flags);
 
 struct xsk_socket_config {
 	__u32 rx_size;
 	__u32 tx_size;
-	__u32 libbpf_flags;
-	__u32 xdp_flags;
 	__u16 bind_flags;
 };
 
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 0c0974c209cd..693f8a63f718 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -102,12 +102,6 @@
 #include <linux/filter.h>
 #include "../kselftest.h"
 
-/* AF_XDP APIs were moved into libxdp and marked as deprecated in libbpf.
- * Until xskxceiver is either moved or re-writed into libxdp, suppress
- * deprecation warnings in this file
- */
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-
 static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62";
 static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61";
 static const char *IP1 = "192.168.100.162";
@@ -326,8 +320,6 @@ static int __xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_i
 	xsk->umem = umem;
 	cfg.rx_size = xsk->rxqsize;
 	cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
-	cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
-	cfg.xdp_flags = ifobject->xdp_flags;
 	cfg.bind_flags = ifobject->bind_flags;
 	if (shared)
 		cfg.bind_flags |= XDP_SHARED_UMEM;
-- 
cgit 


From f0a249df1b071d6f7177cc615d688a3a5d48423a Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:22 +0100
Subject: selftests/xsk: get rid of built-in XDP program

Get rid of the built-in XDP program that was part of the old libbpf
code in xsk.c and replace it with an eBPF program build using the
framework by all the other bpf selftests. This will form the base for
adding more programs in later commits.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-12-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile              |  6 +-
 tools/testing/selftests/bpf/progs/xsk_xdp_progs.c | 19 +++++
 tools/testing/selftests/bpf/xsk.c                 | 88 +++++------------------
 tools/testing/selftests/bpf/xsk.h                 |  6 +-
 tools/testing/selftests/bpf/xskxceiver.c          | 72 +++++++++++--------
 tools/testing/selftests/bpf/xskxceiver.h          |  7 +-
 6 files changed, 92 insertions(+), 106 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/xsk_xdp_progs.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 205e8c3c346a..22533a18705e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -240,7 +240,6 @@ $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS)
 $(OUTPUT)/test_maps: $(TESTING_HELPERS)
 $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS)
 $(OUTPUT)/xsk.o: $(BPFOBJ)
-$(OUTPUT)/xskxceiver: $(OUTPUT)/xsk.o
 
 BPFTOOL ?= $(DEFAULT_BPFTOOL)
 $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
@@ -383,6 +382,7 @@ linked_maps.skel.h-deps := linked_maps1.bpf.o linked_maps2.bpf.o
 test_subskeleton.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib.bpf.o test_subskeleton.bpf.o
 test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib.bpf.o
 test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o
+xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o
 
 LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
 
@@ -576,6 +576,10 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT)
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
 
+$(OUTPUT)/xskxceiver: xskxceiver.c $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT)
+	$(call msg,BINARY,,$@)
+	$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
+
 # Make sure we are able to include and link libbpf against c++.
 $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
 	$(call msg,CXX,,$@)
diff --git a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c
new file mode 100644
index 000000000000..698176882ac6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Intel */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_XSKMAP);
+	__uint(max_entries, 1);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+} xsk SEC(".maps");
+
+SEC("xdp") int xsk_def_prog(struct xdp_md *xdp)
+{
+	return bpf_redirect_map(&xsk, 0, XDP_DROP);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index 9ed31d280e48..dc6b47280ec4 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -267,87 +267,37 @@ out_umem_alloc:
 	return err;
 }
 
-static int __xsk_load_xdp_prog(int xsk_map_fd)
+int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags)
 {
-	static const int log_buf_size = 16 * 1024;
-	char log_buf[log_buf_size];
 	int prog_fd;
 
-	/* This is the post-5.3 kernel C-program:
-	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
-	 * {
-	 *     return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
-	 * }
-	 */
-	struct bpf_insn prog[] = {
-		/* r2 = *(u32 *)(r1 + 16) */
-		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
-		/* r1 = xskmap[] */
-		BPF_LD_MAP_FD(BPF_REG_1, xsk_map_fd),
-		/* r3 = XDP_PASS */
-		BPF_MOV64_IMM(BPF_REG_3, 2),
-		/* call bpf_redirect_map */
-		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
-		BPF_EXIT_INSN(),
-	};
-	size_t insns_cnt = ARRAY_SIZE(prog);
-	LIBBPF_OPTS(bpf_prog_load_opts, opts,
-		.log_buf = log_buf,
-		.log_size = log_buf_size,
-	);
-
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
-				prog, insns_cnt, &opts);
-	if (prog_fd < 0)
-		pr_warn("BPF log buffer:\n%s", log_buf);
-
-	return prog_fd;
+	prog_fd = bpf_program__fd(prog);
+	return bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL);
 }
 
-int xsk_attach_xdp_program(int ifindex, int prog_fd, u32 xdp_flags)
+void xsk_detach_xdp_program(int ifindex, u32 xdp_flags)
 {
-	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
-	__u32 prog_id = 0;
-	int link_fd;
-	int err;
-
-	err = bpf_xdp_query_id(ifindex, xdp_flags, &prog_id);
-	if (err) {
-		pr_warn("getting XDP prog id failed\n");
-		return err;
-	}
-
-	/* If there's a netlink-based XDP prog loaded on interface, bail out
-	 * and ask user to do the removal by himself
-	 */
-	if (prog_id) {
-		pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
-		return -EINVAL;
-	}
-
-	opts.flags = xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);
+	bpf_xdp_detach(ifindex, xdp_flags, NULL);
+}
 
-	link_fd = bpf_link_create(prog_fd, ifindex, BPF_XDP, &opts);
-	if (link_fd < 0)
-		pr_warn("bpf_link_create failed: %s\n", strerror(errno));
+void xsk_clear_xskmap(struct bpf_map *map)
+{
+	u32 index = 0;
+	int map_fd;
 
-	return link_fd;
+	map_fd = bpf_map__fd(map);
+	bpf_map_delete_elem(map_fd, &index);
 }
 
-int xsk_load_xdp_program(int *xsk_map_fd, int *prog_fd)
+int xsk_update_xskmap(struct bpf_map *map, struct xsk_socket *xsk)
 {
-	*xsk_map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map", sizeof(int), sizeof(int),
-				     XSKMAP_SIZE, NULL);
-	if (*xsk_map_fd < 0)
-		return *xsk_map_fd;
-
-	*prog_fd = __xsk_load_xdp_prog(*xsk_map_fd);
-	if (*prog_fd < 0) {
-		close(*xsk_map_fd);
-		return *prog_fd;
-	}
+	int map_fd, sock_fd;
+	u32 index = 0;
 
-	return 0;
+	map_fd = bpf_map__fd(map);
+	sock_fd = xsk_socket__fd(xsk);
+
+	return bpf_map_update_elem(map_fd, &index, &sock_fd, 0);
 }
 
 static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h
index bd5b55ad9f8a..5624d31b8db7 100644
--- a/tools/testing/selftests/bpf/xsk.h
+++ b/tools/testing/selftests/bpf/xsk.h
@@ -197,8 +197,10 @@ struct xsk_umem_config {
 	__u32 flags;
 };
 
-int xsk_load_xdp_program(int *xsk_map_fd, int *prog_fd);
-int xsk_attach_xdp_program(int ifindex, int prog_fd, u32 xdp_flags);
+int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags);
+void xsk_detach_xdp_program(int ifindex, u32 xdp_flags);
+int xsk_update_xskmap(struct bpf_map *map, struct xsk_socket *xsk);
+void xsk_clear_xskmap(struct bpf_map *map);
 
 struct xsk_socket_config {
 	__u32 rx_size;
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 693f8a63f718..d69100267f70 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -1207,7 +1207,7 @@ static void thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobje
 {
 	xsk_configure_socket(test, ifobject, test->ifobj_rx->umem, true);
 	ifobject->xsk = &ifobject->xsk_arr[0];
-	ifobject->xsk_map_fd = test->ifobj_rx->xsk_map_fd;
+	ifobject->xskmap = test->ifobj_rx->xskmap;
 	memcpy(ifobject->umem, test->ifobj_rx->umem, sizeof(struct xsk_umem_info));
 }
 
@@ -1247,9 +1247,8 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
 	u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size;
 	int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
 	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
-	u32 queue_id = 0;
-	int ret, fd;
 	void *bufs;
+	int ret;
 
 	if (ifobject->umem->unaligned_mode)
 		mmap_flags |= MAP_HUGETLB;
@@ -1274,8 +1273,7 @@ static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject)
 	if (!ifobject->rx_on)
 		return;
 
-	fd = xsk_socket__fd(ifobject->xsk->xsk);
-	ret = bpf_map_update_elem(ifobject->xsk_map_fd, &queue_id, &fd, 0);
+	ret = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk);
 	if (ret)
 		exit_with_error(errno);
 }
@@ -1309,18 +1307,17 @@ static void *worker_testapp_validate_rx(void *arg)
 {
 	struct test_spec *test = (struct test_spec *)arg;
 	struct ifobject *ifobject = test->ifobj_rx;
-	int id = 0, err, fd = xsk_socket__fd(ifobject->xsk->xsk);
 	struct pollfd fds = { };
-	u32 queue_id = 0;
+	int err;
 
 	if (test->current_step == 1) {
 		thread_common_ops(test, ifobject);
 	} else {
-		bpf_map_delete_elem(ifobject->xsk_map_fd, &id);
-		err = bpf_map_update_elem(ifobject->xsk_map_fd, &queue_id, &fd, 0);
+		xsk_clear_xskmap(ifobject->xskmap);
+		err = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk);
 		if (err) {
-			printf("Error: Failed to update xskmap, error %s\n", strerror(err));
-			exit_with_error(err);
+			printf("Error: Failed to update xskmap, error %s\n", strerror(-err));
+			exit_with_error(-err);
 		}
 	}
 
@@ -1390,10 +1387,8 @@ static int testapp_validate_traffic_single_thread(struct test_spec *test, struct
 	pthread_join(t0, NULL);
 
 	if (test->total_steps == test->current_step || test->fail) {
-		u32 queue_id = 0;
-
 		xsk_socket__delete(ifobj->xsk->xsk);
-		bpf_map_delete_elem(ifobj->xsk_map_fd, &queue_id);
+		xsk_clear_xskmap(ifobj->xskmap);
 		testapp_clean_xsk_umem(ifobj);
 	}
 
@@ -1482,14 +1477,14 @@ static void testapp_bidi(struct test_spec *test)
 
 static void swap_xsk_resources(struct ifobject *ifobj_tx, struct ifobject *ifobj_rx)
 {
-	int ret, queue_id = 0, fd = xsk_socket__fd(ifobj_rx->xsk->xsk);
+	int ret;
 
 	xsk_socket__delete(ifobj_tx->xsk->xsk);
 	xsk_socket__delete(ifobj_rx->xsk->xsk);
 	ifobj_tx->xsk = &ifobj_tx->xsk_arr[1];
 	ifobj_rx->xsk = &ifobj_rx->xsk_arr[1];
 
-	ret = bpf_map_update_elem(ifobj_rx->xsk_map_fd, &queue_id, &fd, 0);
+	ret = xsk_update_xskmap(ifobj_rx->xskmap, ifobj_rx->xsk->xsk);
 	if (ret)
 		exit_with_error(errno);
 }
@@ -1651,12 +1646,26 @@ static void testapp_invalid_desc(struct test_spec *test)
 	pkt_stream_restore_default(test);
 }
 
+static int xsk_load_xdp_programs(struct ifobject *ifobj)
+{
+	ifobj->xdp_progs = xsk_xdp_progs__open_and_load();
+	if (libbpf_get_error(ifobj->xdp_progs))
+		return libbpf_get_error(ifobj->xdp_progs);
+
+	return 0;
+}
+
+static void xsk_unload_xdp_programs(struct ifobject *ifobj)
+{
+	xsk_xdp_progs__destroy(ifobj->xdp_progs);
+}
+
 static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *src_mac,
 		       const char *dst_ip, const char *src_ip, const u16 dst_port,
 		       const u16 src_port, thread_func_t func_ptr, bool load_xdp)
 {
-	int xsk_map_fd, prog_fd, err;
 	struct in_addr ip;
+	int err;
 
 	memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN);
 	memcpy(ifobj->src_mac, src_mac, ETH_ALEN);
@@ -1675,20 +1684,20 @@ static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *
 	if (!load_xdp)
 		return;
 
-	err = xsk_load_xdp_program(&xsk_map_fd, &prog_fd);
+	err = xsk_load_xdp_programs(ifobj);
 	if (err) {
 		printf("Error loading XDP program\n");
 		exit_with_error(err);
 	}
 
-	ifobj->xsk_map_fd = xsk_map_fd;
-	ifobj->prog_fd = prog_fd;
 	ifobj->xdp_flags = mode_to_xdp_flags(TEST_MODE_SKB);
-	ifobj->link_fd = xsk_attach_xdp_program(ifobj->ifindex, prog_fd, ifobj->xdp_flags);
-	if (ifobj->link_fd < 0) {
+	err = xsk_attach_xdp_program(ifobj->xdp_progs->progs.xsk_def_prog, ifobj->ifindex,
+				     ifobj->xdp_flags);
+	if (err) {
 		printf("Error attaching XDP program\n");
-		exit_with_error(ifobj->link_fd);
+		exit_with_error(-err);
 	}
+	ifobj->xskmap = ifobj->xdp_progs->maps.xsk;
 }
 
 static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_type type)
@@ -1823,9 +1832,6 @@ out_xsk_arr:
 
 static void ifobject_delete(struct ifobject *ifobj)
 {
-	close(ifobj->prog_fd);
-	close(ifobj->xsk_map_fd);
-
 	free(ifobj->umem);
 	free(ifobj->xsk_arr);
 	free(ifobj);
@@ -1864,13 +1870,15 @@ static void change_to_drv_mode(struct ifobject *ifobj)
 	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
 	int ret;
 
-	close(ifobj->link_fd);
-	ifobj->link_fd = xsk_attach_xdp_program(ifobj->ifindex, ifobj->prog_fd,
-						XDP_FLAGS_DRV_MODE);
-	if (ifobj->link_fd < 0) {
+	xsk_detach_xdp_program(ifobj->ifindex, ifobj->xdp_flags);
+	ifobj->xdp_flags = XDP_FLAGS_DRV_MODE;
+	ret = xsk_attach_xdp_program(ifobj->xdp_progs->progs.xsk_def_prog, ifobj->ifindex,
+				     ifobj->xdp_flags);
+	if (ret) {
 		ksft_print_msg("Error attaching XDP program\n");
-		exit_with_error(-ifobj->link_fd);
+		exit_with_error(-ret);
 	}
+	ifobj->xskmap = ifobj->xdp_progs->maps.xsk;
 
 	ret = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &opts);
 	if (ret)
@@ -1955,6 +1963,8 @@ int main(int argc, char **argv)
 
 	pkt_stream_delete(tx_pkt_stream_default);
 	pkt_stream_delete(rx_pkt_stream_default);
+	xsk_unload_xdp_programs(ifobj_tx);
+	xsk_unload_xdp_programs(ifobj_rx);
 	ifobject_delete(ifobj_tx);
 	ifobject_delete(ifobj_rx);
 
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index b2ba877b1966..70b3e5d1d40c 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -5,6 +5,8 @@
 #ifndef XSKXCEIVER_H_
 #define XSKXCEIVER_H_
 
+#include "xsk_xdp_progs.skel.h"
+
 #ifndef SOL_XDP
 #define SOL_XDP 283
 #endif
@@ -138,9 +140,8 @@ struct ifobject {
 	thread_func_t func_ptr;
 	validation_func_t validation_func;
 	struct pkt_stream *pkt_stream;
-	int xsk_map_fd;
-	int prog_fd;
-	int link_fd;
+	struct xsk_xdp_progs *xdp_progs;
+	struct bpf_map *xskmap;
 	int ifindex;
 	u32 dst_ip;
 	u32 src_ip;
-- 
cgit 


From 80bea9acabb7690042f8efe25f208ffc67032aac Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:23 +0100
Subject: selftests/xsk: add test when some packets are XDP_DROPed

Add a new test where some of the packets are not passed to the AF_XDP
socket and instead get a XDP_DROP verdict. This is important as it
tests the recycling mechanism of the buffer pool. If a packet is not
sent to the AF_XDP socket, the buffer the packet resides in is instead
recycled so it can be used again without the round-trip to user
space. The test introduces a new XDP program that drops every other
packet.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-13-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/xsk_xdp_progs.c | 11 ++++++++
 tools/testing/selftests/bpf/xskxceiver.c          | 31 +++++++++++++++++++++++
 tools/testing/selftests/bpf/xskxceiver.h          |  1 +
 3 files changed, 43 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c
index 698176882ac6..744a01d0e57d 100644
--- a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c
+++ b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c
@@ -11,9 +11,20 @@ struct {
 	__uint(value_size, sizeof(int));
 } xsk SEC(".maps");
 
+static unsigned int idx;
+
 SEC("xdp") int xsk_def_prog(struct xdp_md *xdp)
 {
 	return bpf_redirect_map(&xsk, 0, XDP_DROP);
 }
 
+SEC("xdp") int xsk_xdp_drop(struct xdp_md *xdp)
+{
+	/* Drop every other packet */
+	if (idx++ % 2)
+		return XDP_DROP;
+
+	return bpf_redirect_map(&xsk, 0, XDP_DROP);
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index d69100267f70..a33f11b4c598 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -1646,6 +1646,34 @@ static void testapp_invalid_desc(struct test_spec *test)
 	pkt_stream_restore_default(test);
 }
 
+static void testapp_xdp_drop(struct test_spec *test)
+{
+	struct ifobject *ifobj = test->ifobj_rx;
+	int err;
+
+	test_spec_set_name(test, "XDP_DROP_HALF");
+	xsk_detach_xdp_program(ifobj->ifindex, ifobj->xdp_flags);
+	err = xsk_attach_xdp_program(ifobj->xdp_progs->progs.xsk_xdp_drop, ifobj->ifindex,
+				     ifobj->xdp_flags);
+	if (err) {
+		printf("Error attaching XDP_DROP program\n");
+		test->fail = true;
+		return;
+	}
+
+	pkt_stream_receive_half(test);
+	testapp_validate_traffic(test);
+
+	pkt_stream_restore_default(test);
+	xsk_detach_xdp_program(ifobj->ifindex, ifobj->xdp_flags);
+	err = xsk_attach_xdp_program(ifobj->xdp_progs->progs.xsk_def_prog, ifobj->ifindex,
+				     ifobj->xdp_flags);
+	if (err) {
+		printf("Error restoring default XDP program\n");
+		exit_with_error(-err);
+	}
+}
+
 static int xsk_load_xdp_programs(struct ifobject *ifobj)
 {
 	ifobj->xdp_progs = xsk_xdp_progs__open_and_load();
@@ -1796,6 +1824,9 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
 	case TEST_TYPE_HEADROOM:
 		testapp_headroom(test);
 		break;
+	case TEST_TYPE_XDP_DROP_HALF:
+		testapp_xdp_drop(test);
+		break;
 	default:
 		break;
 	}
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index 70b3e5d1d40c..a4daa5134fc0 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -87,6 +87,7 @@ enum test_type {
 	TEST_TYPE_STATS_RX_FULL,
 	TEST_TYPE_STATS_FILL_EMPTY,
 	TEST_TYPE_BPF_RES,
+	TEST_TYPE_XDP_DROP_HALF,
 	TEST_TYPE_MAX
 };
 
-- 
cgit 


From 7f881984073a717cf09f82882b7b8062da6773d7 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:24 +0100
Subject: selftests/xsk: merge dual and single thread dispatchers

Make the thread dispatching code common by unifying the dual and
single thread dispatcher code. This so we do not have to add code in
two places in upcoming commits.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-14-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xskxceiver.c | 120 ++++++++++++++-----------------
 1 file changed, 54 insertions(+), 66 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index a33f11b4c598..11e4f29d40f7 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -1356,83 +1356,59 @@ static void handler(int signum)
 	pthread_exit(NULL);
 }
 
-static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj,
-						  enum test_type type)
+static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *ifobj1,
+				      struct ifobject *ifobj2)
 {
-	bool old_shared_umem = ifobj->shared_umem;
-	pthread_t t0;
+	pthread_t t0, t1;
 
-	if (pthread_barrier_init(&barr, NULL, 2))
-		exit_with_error(errno);
+	if (ifobj2)
+		if (pthread_barrier_init(&barr, NULL, 2))
+			exit_with_error(errno);
 
 	test->current_step++;
-	if (type == TEST_TYPE_POLL_RXQ_TMOUT)
-		pkt_stream_reset(ifobj->pkt_stream);
+	pkt_stream_reset(ifobj1->pkt_stream);
 	pkts_in_flight = 0;
 
-	test->ifobj_rx->shared_umem = false;
-	test->ifobj_tx->shared_umem = false;
-
 	signal(SIGUSR1, handler);
-	/* Spawn thread */
-	pthread_create(&t0, NULL, ifobj->func_ptr, test);
+	/*Spawn RX thread */
+	pthread_create(&t0, NULL, ifobj1->func_ptr, test);
 
-	if (type != TEST_TYPE_POLL_TXQ_TMOUT)
+	if (ifobj2) {
 		pthread_barrier_wait(&barr);
+		if (pthread_barrier_destroy(&barr))
+			exit_with_error(errno);
 
-	if (pthread_barrier_destroy(&barr))
-		exit_with_error(errno);
+		/*Spawn TX thread */
+		pthread_create(&t1, NULL, ifobj2->func_ptr, test);
 
-	pthread_kill(t0, SIGUSR1);
-	pthread_join(t0, NULL);
+		pthread_join(t1, NULL);
+	}
+
+	if (!ifobj2)
+		pthread_kill(t0, SIGUSR1);
+	else
+		pthread_join(t0, NULL);
 
 	if (test->total_steps == test->current_step || test->fail) {
-		xsk_socket__delete(ifobj->xsk->xsk);
-		xsk_clear_xskmap(ifobj->xskmap);
-		testapp_clean_xsk_umem(ifobj);
+		if (ifobj2)
+			xsk_socket__delete(ifobj2->xsk->xsk);
+		xsk_socket__delete(ifobj1->xsk->xsk);
+		testapp_clean_xsk_umem(ifobj1);
+		if (ifobj2 && !ifobj2->shared_umem)
+			testapp_clean_xsk_umem(ifobj2);
 	}
 
-	test->ifobj_rx->shared_umem = old_shared_umem;
-	test->ifobj_tx->shared_umem = old_shared_umem;
-
 	return !!test->fail;
 }
 
 static int testapp_validate_traffic(struct test_spec *test)
 {
-	struct ifobject *ifobj_tx = test->ifobj_tx;
-	struct ifobject *ifobj_rx = test->ifobj_rx;
-	pthread_t t0, t1;
-
-	if (pthread_barrier_init(&barr, NULL, 2))
-		exit_with_error(errno);
-
-	test->current_step++;
-	pkt_stream_reset(ifobj_rx->pkt_stream);
-	pkts_in_flight = 0;
-
-	/*Spawn RX thread */
-	pthread_create(&t0, NULL, ifobj_rx->func_ptr, test);
-
-	pthread_barrier_wait(&barr);
-	if (pthread_barrier_destroy(&barr))
-		exit_with_error(errno);
-
-	/*Spawn TX thread */
-	pthread_create(&t1, NULL, ifobj_tx->func_ptr, test);
-
-	pthread_join(t1, NULL);
-	pthread_join(t0, NULL);
-
-	if (test->total_steps == test->current_step || test->fail) {
-		xsk_socket__delete(ifobj_tx->xsk->xsk);
-		xsk_socket__delete(ifobj_rx->xsk->xsk);
-		testapp_clean_xsk_umem(ifobj_rx);
-		if (!ifobj_tx->shared_umem)
-			testapp_clean_xsk_umem(ifobj_tx);
-	}
+	return __testapp_validate_traffic(test, test->ifobj_rx, test->ifobj_tx);
+}
 
-	return !!test->fail;
+static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj)
+{
+	return __testapp_validate_traffic(test, ifobj, NULL);
 }
 
 static void testapp_teardown(struct test_spec *test)
@@ -1674,6 +1650,26 @@ static void testapp_xdp_drop(struct test_spec *test)
 	}
 }
 
+static void testapp_poll_txq_tmout(struct test_spec *test)
+{
+	test_spec_set_name(test, "POLL_TXQ_FULL");
+
+	test->ifobj_tx->use_poll = true;
+	/* create invalid frame by set umem frame_size and pkt length equal to 2048 */
+	test->ifobj_tx->umem->frame_size = 2048;
+	pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048);
+	testapp_validate_traffic_single_thread(test, test->ifobj_tx);
+
+	pkt_stream_restore_default(test);
+}
+
+static void testapp_poll_rxq_tmout(struct test_spec *test)
+{
+	test_spec_set_name(test, "POLL_RXQ_EMPTY");
+	test->ifobj_rx->use_poll = true;
+	testapp_validate_traffic_single_thread(test, test->ifobj_rx);
+}
+
 static int xsk_load_xdp_programs(struct ifobject *ifobj)
 {
 	ifobj->xdp_progs = xsk_xdp_progs__open_and_load();
@@ -1784,18 +1780,10 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
 		testapp_validate_traffic(test);
 		break;
 	case TEST_TYPE_POLL_TXQ_TMOUT:
-		test_spec_set_name(test, "POLL_TXQ_FULL");
-		test->ifobj_tx->use_poll = true;
-		/* create invalid frame by set umem frame_size and pkt length equal to 2048 */
-		test->ifobj_tx->umem->frame_size = 2048;
-		pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048);
-		testapp_validate_traffic_single_thread(test, test->ifobj_tx, type);
-		pkt_stream_restore_default(test);
+		testapp_poll_txq_tmout(test);
 		break;
 	case TEST_TYPE_POLL_RXQ_TMOUT:
-		test_spec_set_name(test, "POLL_RXQ_EMPTY");
-		test->ifobj_rx->use_poll = true;
-		testapp_validate_traffic_single_thread(test, test->ifobj_rx, type);
+		testapp_poll_rxq_tmout(test);
 		break;
 	case TEST_TYPE_ALIGNED_INV_DESC:
 		test_spec_set_name(test, "ALIGNED_INV_DESC");
-- 
cgit 


From e67b2554f301b3e12322230ebb81ac7b7892f1a4 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:25 +0100
Subject: selftests/xsk: automatically restore packet stream

Automatically restore the default packet stream if needed at the end
of each test. This so that test writers do not forget to do this.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-15-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xskxceiver.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 11e4f29d40f7..66863504c76a 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -1501,8 +1501,6 @@ static void testapp_stats_tx_invalid_descs(struct test_spec *test)
 	pkt_stream_replace_half(test, XSK_UMEM__INVALID_FRAME_SIZE, 0);
 	test->ifobj_tx->validation_func = validate_tx_invalid_descs;
 	testapp_validate_traffic(test);
-
-	pkt_stream_restore_default(test);
 }
 
 static void testapp_stats_rx_full(struct test_spec *test)
@@ -1518,8 +1516,6 @@ static void testapp_stats_rx_full(struct test_spec *test)
 	test->ifobj_rx->release_rx = false;
 	test->ifobj_rx->validation_func = validate_rx_full;
 	testapp_validate_traffic(test);
-
-	pkt_stream_restore_default(test);
 }
 
 static void testapp_stats_fill_empty(struct test_spec *test)
@@ -1534,8 +1530,6 @@ static void testapp_stats_fill_empty(struct test_spec *test)
 	test->ifobj_rx->use_fill_ring = false;
 	test->ifobj_rx->validation_func = validate_fill_empty;
 	testapp_validate_traffic(test);
-
-	pkt_stream_restore_default(test);
 }
 
 /* Simple test */
@@ -1568,7 +1562,6 @@ static bool testapp_unaligned(struct test_spec *test)
 	test->ifobj_rx->pkt_stream->use_addr_for_fill = true;
 	testapp_validate_traffic(test);
 
-	pkt_stream_restore_default(test);
 	return true;
 }
 
@@ -1578,7 +1571,6 @@ static void testapp_single_pkt(struct test_spec *test)
 
 	pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts));
 	testapp_validate_traffic(test);
-	pkt_stream_restore_default(test);
 }
 
 static void testapp_invalid_desc(struct test_spec *test)
@@ -1619,7 +1611,6 @@ static void testapp_invalid_desc(struct test_spec *test)
 
 	pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts));
 	testapp_validate_traffic(test);
-	pkt_stream_restore_default(test);
 }
 
 static void testapp_xdp_drop(struct test_spec *test)
@@ -1640,7 +1631,6 @@ static void testapp_xdp_drop(struct test_spec *test)
 	pkt_stream_receive_half(test);
 	testapp_validate_traffic(test);
 
-	pkt_stream_restore_default(test);
 	xsk_detach_xdp_program(ifobj->ifindex, ifobj->xdp_flags);
 	err = xsk_attach_xdp_program(ifobj->xdp_progs->progs.xsk_def_prog, ifobj->ifindex,
 				     ifobj->xdp_flags);
@@ -1659,8 +1649,6 @@ static void testapp_poll_txq_tmout(struct test_spec *test)
 	test->ifobj_tx->umem->frame_size = 2048;
 	pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048);
 	testapp_validate_traffic_single_thread(test, test->ifobj_tx);
-
-	pkt_stream_restore_default(test);
 }
 
 static void testapp_poll_rxq_tmout(struct test_spec *test)
@@ -1766,8 +1754,6 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
 		test->ifobj_rx->umem->frame_size = 2048;
 		pkt_stream_replace(test, DEFAULT_PKT_CNT, PKT_SIZE);
 		testapp_validate_traffic(test);
-
-		pkt_stream_restore_default(test);
 		break;
 	case TEST_TYPE_RX_POLL:
 		test->ifobj_rx->use_poll = true;
@@ -1822,6 +1808,7 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
 	if (!test->fail)
 		ksft_test_result_pass("PASS: %s %s%s\n", mode_string(test), busy_poll_string(test),
 				      test->name);
+	pkt_stream_restore_default(test);
 }
 
 static struct ifobject *ifobject_create(void)
-- 
cgit 


From 7d8319a7cc668607aa054ea2da0bb730538a510b Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jan 2023 10:35:26 +0100
Subject: selftests/xsk: automatically switch XDP programs

Implement automatic switching of XDP programs and execution modes if
needed by a test. This makes it much simpler to write a test as it
only has to say what XDP program it needs if it is not the default
one. This also makes it possible to remove the initial explicit
attachment of the XDP program as well as the explicit mode switch in
the code. These are now handled by the same code that just checks if a
switch is necessary, so no special cases are needed.

The default XDP program for all tests is one that sends all packets to
the AF_XDP socket. If you need another one, please use the new
function test_spec_set_xdp_prog() to specify what XDP programs and
maps to use for this test.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/r/20230111093526.11682-16-magnus.karlsson@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xsk.c        |  19 +++++
 tools/testing/selftests/bpf/xsk.h        |   1 +
 tools/testing/selftests/bpf/xskxceiver.c | 137 ++++++++++++++++---------------
 tools/testing/selftests/bpf/xskxceiver.h |   7 +-
 4 files changed, 96 insertions(+), 68 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index dc6b47280ec4..687d83e707f8 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -267,6 +267,25 @@ out_umem_alloc:
 	return err;
 }
 
+bool xsk_is_in_mode(u32 ifindex, int mode)
+{
+	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
+	int ret;
+
+	ret = bpf_xdp_query(ifindex, mode, &opts);
+	if (ret) {
+		printf("XDP mode query returned error %s\n", strerror(errno));
+		return false;
+	}
+
+	if (mode == XDP_FLAGS_DRV_MODE)
+		return opts.attach_mode == XDP_ATTACHED_DRV;
+	else if (mode == XDP_FLAGS_SKB_MODE)
+		return opts.attach_mode == XDP_ATTACHED_SKB;
+
+	return false;
+}
+
 int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags)
 {
 	int prog_fd;
diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h
index 5624d31b8db7..04ed8b544712 100644
--- a/tools/testing/selftests/bpf/xsk.h
+++ b/tools/testing/selftests/bpf/xsk.h
@@ -201,6 +201,7 @@ int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags)
 void xsk_detach_xdp_program(int ifindex, u32 xdp_flags);
 int xsk_update_xskmap(struct bpf_map *map, struct xsk_socket *xsk);
 void xsk_clear_xskmap(struct bpf_map *map);
+bool xsk_is_in_mode(u32 ifindex, int mode);
 
 struct xsk_socket_config {
 	__u32 rx_size;
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 66863504c76a..a17655107a94 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -96,6 +96,8 @@
 #include <time.h>
 #include <unistd.h>
 #include <stdatomic.h>
+
+#include "xsk_xdp_progs.skel.h"
 #include "xsk.h"
 #include "xskxceiver.h"
 #include <bpf/bpf.h>
@@ -356,7 +358,6 @@ static bool ifobj_zc_avail(struct ifobject *ifobject)
 	xsk = calloc(1, sizeof(struct xsk_socket_info));
 	if (!xsk)
 		goto out;
-	ifobject->xdp_flags = XDP_FLAGS_DRV_MODE;
 	ifobject->bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY;
 	ifobject->rx_on = true;
 	xsk->rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
@@ -493,6 +494,10 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
 	test->total_steps = 1;
 	test->nb_sockets = 1;
 	test->fail = false;
+	test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog;
+	test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk;
+	test->xdp_prog_tx = ifobj_tx->xdp_progs->progs.xsk_def_prog;
+	test->xskmap_tx = ifobj_tx->xdp_progs->maps.xsk;
 }
 
 static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
@@ -532,6 +537,16 @@ static void test_spec_set_name(struct test_spec *test, const char *name)
 	strncpy(test->name, name, MAX_TEST_NAME_SIZE);
 }
 
+static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *xdp_prog_rx,
+				   struct bpf_program *xdp_prog_tx, struct bpf_map *xskmap_rx,
+				   struct bpf_map *xskmap_tx)
+{
+	test->xdp_prog_rx = xdp_prog_rx;
+	test->xdp_prog_tx = xdp_prog_tx;
+	test->xskmap_rx = xskmap_rx;
+	test->xskmap_tx = xskmap_tx;
+}
+
 static void pkt_stream_reset(struct pkt_stream *pkt_stream)
 {
 	if (pkt_stream)
@@ -1356,6 +1371,47 @@ static void handler(int signum)
 	pthread_exit(NULL);
 }
 
+static bool xdp_prog_changed(struct test_spec *test, struct ifobject *ifobj)
+{
+	return ifobj->xdp_prog != test->xdp_prog_rx || ifobj->mode != test->mode;
+}
+
+static void xsk_reattach_xdp(struct ifobject *ifobj, struct bpf_program *xdp_prog,
+			     struct bpf_map *xskmap, enum test_mode mode)
+{
+	int err;
+
+	xsk_detach_xdp_program(ifobj->ifindex, mode_to_xdp_flags(ifobj->mode));
+	err = xsk_attach_xdp_program(xdp_prog, ifobj->ifindex, mode_to_xdp_flags(mode));
+	if (err) {
+		printf("Error attaching XDP program\n");
+		exit_with_error(-err);
+	}
+
+	if (ifobj->mode != mode && (mode == TEST_MODE_DRV || mode == TEST_MODE_ZC))
+		if (!xsk_is_in_mode(ifobj->ifindex, XDP_FLAGS_DRV_MODE)) {
+			ksft_print_msg("ERROR: XDP prog not in DRV mode\n");
+			exit_with_error(EINVAL);
+		}
+
+	ifobj->xdp_prog = xdp_prog;
+	ifobj->xskmap = xskmap;
+	ifobj->mode = mode;
+}
+
+static void xsk_attach_xdp_progs(struct test_spec *test, struct ifobject *ifobj_rx,
+				 struct ifobject *ifobj_tx)
+{
+	if (xdp_prog_changed(test, ifobj_rx))
+		xsk_reattach_xdp(ifobj_rx, test->xdp_prog_rx, test->xskmap_rx, test->mode);
+
+	if (!ifobj_tx || ifobj_tx->shared_umem)
+		return;
+
+	if (xdp_prog_changed(test, ifobj_tx))
+		xsk_reattach_xdp(ifobj_tx, test->xdp_prog_tx, test->xskmap_tx, test->mode);
+}
+
 static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *ifobj1,
 				      struct ifobject *ifobj2)
 {
@@ -1403,7 +1459,11 @@ static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *i
 
 static int testapp_validate_traffic(struct test_spec *test)
 {
-	return __testapp_validate_traffic(test, test->ifobj_rx, test->ifobj_tx);
+	struct ifobject *ifobj_rx = test->ifobj_rx;
+	struct ifobject *ifobj_tx = test->ifobj_tx;
+
+	xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx);
+	return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx);
 }
 
 static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj)
@@ -1446,7 +1506,7 @@ static void testapp_bidi(struct test_spec *test)
 
 	print_verbose("Switching Tx/Rx vectors\n");
 	swap_directions(&test->ifobj_rx, &test->ifobj_tx);
-	testapp_validate_traffic(test);
+	__testapp_validate_traffic(test, test->ifobj_rx, test->ifobj_tx);
 
 	swap_directions(&test->ifobj_rx, &test->ifobj_tx);
 }
@@ -1615,29 +1675,15 @@ static void testapp_invalid_desc(struct test_spec *test)
 
 static void testapp_xdp_drop(struct test_spec *test)
 {
-	struct ifobject *ifobj = test->ifobj_rx;
-	int err;
+	struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs;
+	struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs;
 
 	test_spec_set_name(test, "XDP_DROP_HALF");
-	xsk_detach_xdp_program(ifobj->ifindex, ifobj->xdp_flags);
-	err = xsk_attach_xdp_program(ifobj->xdp_progs->progs.xsk_xdp_drop, ifobj->ifindex,
-				     ifobj->xdp_flags);
-	if (err) {
-		printf("Error attaching XDP_DROP program\n");
-		test->fail = true;
-		return;
-	}
+	test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_drop, skel_tx->progs.xsk_xdp_drop,
+			       skel_rx->maps.xsk, skel_tx->maps.xsk);
 
 	pkt_stream_receive_half(test);
 	testapp_validate_traffic(test);
-
-	xsk_detach_xdp_program(ifobj->ifindex, ifobj->xdp_flags);
-	err = xsk_attach_xdp_program(ifobj->xdp_progs->progs.xsk_def_prog, ifobj->ifindex,
-				     ifobj->xdp_flags);
-	if (err) {
-		printf("Error restoring default XDP program\n");
-		exit_with_error(-err);
-	}
 }
 
 static void testapp_poll_txq_tmout(struct test_spec *test)
@@ -1674,7 +1720,7 @@ static void xsk_unload_xdp_programs(struct ifobject *ifobj)
 
 static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *src_mac,
 		       const char *dst_ip, const char *src_ip, const u16 dst_port,
-		       const u16 src_port, thread_func_t func_ptr, bool load_xdp)
+		       const u16 src_port, thread_func_t func_ptr)
 {
 	struct in_addr ip;
 	int err;
@@ -1693,23 +1739,11 @@ static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *
 
 	ifobj->func_ptr = func_ptr;
 
-	if (!load_xdp)
-		return;
-
 	err = xsk_load_xdp_programs(ifobj);
 	if (err) {
 		printf("Error loading XDP program\n");
 		exit_with_error(err);
 	}
-
-	ifobj->xdp_flags = mode_to_xdp_flags(TEST_MODE_SKB);
-	err = xsk_attach_xdp_program(ifobj->xdp_progs->progs.xsk_def_prog, ifobj->ifindex,
-				     ifobj->xdp_flags);
-	if (err) {
-		printf("Error attaching XDP program\n");
-		exit_with_error(-err);
-	}
-	ifobj->xskmap = ifobj->xdp_progs->maps.xsk;
 }
 
 static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_type type)
@@ -1871,31 +1905,6 @@ static bool is_xdp_supported(int ifindex)
 	return true;
 }
 
-static void change_to_drv_mode(struct ifobject *ifobj)
-{
-	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
-	int ret;
-
-	xsk_detach_xdp_program(ifobj->ifindex, ifobj->xdp_flags);
-	ifobj->xdp_flags = XDP_FLAGS_DRV_MODE;
-	ret = xsk_attach_xdp_program(ifobj->xdp_progs->progs.xsk_def_prog, ifobj->ifindex,
-				     ifobj->xdp_flags);
-	if (ret) {
-		ksft_print_msg("Error attaching XDP program\n");
-		exit_with_error(-ret);
-	}
-	ifobj->xskmap = ifobj->xdp_progs->maps.xsk;
-
-	ret = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &opts);
-	if (ret)
-		exit_with_error(errno);
-
-	if (opts.attach_mode != XDP_ATTACHED_DRV) {
-		ksft_print_msg("ERROR: XDP prog not in DRV mode\n");
-		exit_with_error(EINVAL);
-	}
-}
-
 int main(int argc, char **argv)
 {
 	struct pkt_stream *rx_pkt_stream_default;
@@ -1936,9 +1945,9 @@ int main(int argc, char **argv)
 	}
 
 	init_iface(ifobj_rx, MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2,
-		   worker_testapp_validate_rx, true);
+		   worker_testapp_validate_rx);
 	init_iface(ifobj_tx, MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1,
-		   worker_testapp_validate_tx, !shared_netdev);
+		   worker_testapp_validate_tx);
 
 	test_spec_init(&test, ifobj_tx, ifobj_rx, 0);
 	tx_pkt_stream_default = pkt_stream_generate(ifobj_tx->umem, DEFAULT_PKT_CNT, PKT_SIZE);
@@ -1951,12 +1960,6 @@ int main(int argc, char **argv)
 	ksft_set_plan(modes * TEST_TYPE_MAX);
 
 	for (i = 0; i < modes; i++) {
-		if (i == TEST_MODE_DRV) {
-			change_to_drv_mode(ifobj_rx);
-			if (!shared_netdev)
-				change_to_drv_mode(ifobj_tx);
-		}
-
 		for (j = 0; j < TEST_TYPE_MAX; j++) {
 			test_spec_init(&test, ifobj_tx, ifobj_rx, i);
 			run_pkt_test(&test, i, j);
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index a4daa5134fc0..3e8ec7d8ec32 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -143,10 +143,11 @@ struct ifobject {
 	struct pkt_stream *pkt_stream;
 	struct xsk_xdp_progs *xdp_progs;
 	struct bpf_map *xskmap;
+	struct bpf_program *xdp_prog;
+	enum test_mode mode;
 	int ifindex;
 	u32 dst_ip;
 	u32 src_ip;
-	u32 xdp_flags;
 	u32 bind_flags;
 	u16 src_port;
 	u16 dst_port;
@@ -166,6 +167,10 @@ struct test_spec {
 	struct ifobject *ifobj_rx;
 	struct pkt_stream *tx_pkt_stream_default;
 	struct pkt_stream *rx_pkt_stream_default;
+	struct bpf_program *xdp_prog_rx;
+	struct bpf_program *xdp_prog_tx;
+	struct bpf_map *xskmap_rx;
+	struct bpf_map *xskmap_tx;
 	u16 total_steps;
 	u16 current_step;
 	u16 nb_sockets;
-- 
cgit 


From c91b0bf3e590a2dd0f2af4bb048abc0542eabf2e Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Wed, 4 Jan 2023 11:35:42 -0500
Subject: selftests/rseq: Revert "selftests/rseq: Add mm_numa_cid to test
 script"

The mm_numa_cid related rseq patches from the series were not picked up
into the tip tree, so enabling the mm_numa_cid test needs to be
reverted.

This reverts commit b344b8f2d88dbf095caf97ac57fd3645843fa70f.

Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/oe-lkp/202301040903.2dd1e25b-oliver.sang@intel.com
---
 tools/testing/selftests/rseq/run_param_test.sh | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh
index 603b3b69d20c..8d31426ab41f 100755
--- a/tools/testing/selftests/rseq/run_param_test.sh
+++ b/tools/testing/selftests/rseq/run_param_test.sh
@@ -47,11 +47,6 @@ function do_tests()
 		./param_test_mm_cid ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
 		echo "Running mm_cid compare-twice test ${TEST_NAME[$i]}"
 		./param_test_mm_cid_compare_twice ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
-
-		echo "Running mm_numa_cid test ${TEST_NAME[$i]}"
-		./param_test_mm_numa_cid ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
-		echo "Running mm_numa_cid compare-twice test ${TEST_NAME[$i]}"
-		./param_test_mm_numa_cid_compare_twice ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} ${@} ${EXTRA_ARGS} || exit 1
 		let "i++"
 	done
 }
-- 
cgit 


From 5c338112e48ab1937f99e4a8990a82c999fda126 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
Date: Tue, 10 Jan 2023 10:15:15 +0000
Subject: test/vsock: rework message bounds test

This updates message bound test making it more complex. Instead of
sending 1 bytes messages with one MSG_EOR bit, it sends messages of
random length(one half of messages are smaller than page size, second
half are bigger) with random number of MSG_EOR bits set. Receiver
also don't know total number of messages.

Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/vsock/control.c    |  28 +++++++++
 tools/testing/vsock/control.h    |   2 +
 tools/testing/vsock/util.c       |  13 ++++
 tools/testing/vsock/util.h       |   1 +
 tools/testing/vsock/vsock_test.c | 128 ++++++++++++++++++++++++++++++++++-----
 5 files changed, 157 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/vsock/control.c b/tools/testing/vsock/control.c
index 4874872fc5a3..d2deb4b15b94 100644
--- a/tools/testing/vsock/control.c
+++ b/tools/testing/vsock/control.c
@@ -141,6 +141,34 @@ void control_writeln(const char *str)
 	timeout_end();
 }
 
+void control_writeulong(unsigned long value)
+{
+	char str[32];
+
+	if (snprintf(str, sizeof(str), "%lu", value) >= sizeof(str)) {
+		perror("snprintf");
+		exit(EXIT_FAILURE);
+	}
+
+	control_writeln(str);
+}
+
+unsigned long control_readulong(void)
+{
+	unsigned long value;
+	char *str;
+
+	str = control_readln();
+
+	if (!str)
+		exit(EXIT_FAILURE);
+
+	value = strtoul(str, NULL, 10);
+	free(str);
+
+	return value;
+}
+
 /* Return the next line from the control socket (without the trailing newline).
  *
  * The program terminates if a timeout occurs.
diff --git a/tools/testing/vsock/control.h b/tools/testing/vsock/control.h
index 51814b4f9ac1..c1f77fdb2c7a 100644
--- a/tools/testing/vsock/control.h
+++ b/tools/testing/vsock/control.h
@@ -9,7 +9,9 @@ void control_init(const char *control_host, const char *control_port,
 void control_cleanup(void);
 void control_writeln(const char *str);
 char *control_readln(void);
+unsigned long control_readulong(void);
 void control_expectln(const char *str);
 bool control_cmpln(char *line, const char *str, bool fail);
+void control_writeulong(unsigned long value);
 
 #endif /* CONTROL_H */
diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 2acbb7703c6a..01b636d3039a 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -395,3 +395,16 @@ void skip_test(struct test_case *test_cases, size_t test_cases_len,
 
 	test_cases[test_id].skip = true;
 }
+
+unsigned long hash_djb2(const void *data, size_t len)
+{
+	unsigned long hash = 5381;
+	int i = 0;
+
+	while (i < len) {
+		hash = ((hash << 5) + hash) + ((unsigned char *)data)[i];
+		i++;
+	}
+
+	return hash;
+}
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index a3375ad2fb7f..fb99208a95ea 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -49,4 +49,5 @@ void run_tests(const struct test_case *test_cases,
 void list_tests(const struct test_case *test_cases);
 void skip_test(struct test_case *test_cases, size_t test_cases_len,
 	       const char *test_id_str);
+unsigned long hash_djb2(const void *data, size_t len);
 #endif /* UTIL_H */
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index bb6d691cb30d..26c38ad9d07b 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -284,10 +284,14 @@ static void test_stream_msg_peek_server(const struct test_opts *opts)
 	close(fd);
 }
 
-#define MESSAGES_CNT 7
-#define MSG_EOR_IDX (MESSAGES_CNT / 2)
+#define SOCK_BUF_SIZE (2 * 1024 * 1024)
+#define MAX_MSG_SIZE (32 * 1024)
+
 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
 {
+	unsigned long curr_hash;
+	int page_size;
+	int msg_count;
 	int fd;
 
 	fd = vsock_seqpacket_connect(opts->peer_cid, 1234);
@@ -296,18 +300,79 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
 		exit(EXIT_FAILURE);
 	}
 
-	/* Send several messages, one with MSG_EOR flag */
-	for (int i = 0; i < MESSAGES_CNT; i++)
-		send_byte(fd, 1, (i == MSG_EOR_IDX) ? MSG_EOR : 0);
+	/* Wait, until receiver sets buffer size. */
+	control_expectln("SRVREADY");
+
+	curr_hash = 0;
+	page_size = getpagesize();
+	msg_count = SOCK_BUF_SIZE / MAX_MSG_SIZE;
+
+	for (int i = 0; i < msg_count; i++) {
+		ssize_t send_size;
+		size_t buf_size;
+		int flags;
+		void *buf;
+
+		/* Use "small" buffers and "big" buffers. */
+		if (i & 1)
+			buf_size = page_size +
+					(rand() % (MAX_MSG_SIZE - page_size));
+		else
+			buf_size = 1 + (rand() % page_size);
+
+		buf = malloc(buf_size);
+
+		if (!buf) {
+			perror("malloc");
+			exit(EXIT_FAILURE);
+		}
+
+		memset(buf, rand() & 0xff, buf_size);
+		/* Set at least one MSG_EOR + some random. */
+		if (i == (msg_count / 2) || (rand() & 1)) {
+			flags = MSG_EOR;
+			curr_hash++;
+		} else {
+			flags = 0;
+		}
+
+		send_size = send(fd, buf, buf_size, flags);
+
+		if (send_size < 0) {
+			perror("send");
+			exit(EXIT_FAILURE);
+		}
+
+		if (send_size != buf_size) {
+			fprintf(stderr, "Invalid send size\n");
+			exit(EXIT_FAILURE);
+		}
+
+		/*
+		 * Hash sum is computed at both client and server in
+		 * the same way:
+		 * H += hash('message data')
+		 * Such hash "controls" both data integrity and message
+		 * bounds. After data exchange, both sums are compared
+		 * using control socket, and if message bounds wasn't
+		 * broken - two values must be equal.
+		 */
+		curr_hash += hash_djb2(buf, buf_size);
+		free(buf);
+	}
 
 	control_writeln("SENDDONE");
+	control_writeulong(curr_hash);
 	close(fd);
 }
 
 static void test_seqpacket_msg_bounds_server(const struct test_opts *opts)
 {
+	unsigned long sock_buf_size;
+	unsigned long remote_hash;
+	unsigned long curr_hash;
 	int fd;
-	char buf[16];
+	char buf[MAX_MSG_SIZE];
 	struct msghdr msg = {0};
 	struct iovec iov = {0};
 
@@ -317,25 +382,57 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts)
 		exit(EXIT_FAILURE);
 	}
 
+	sock_buf_size = SOCK_BUF_SIZE;
+
+	if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE,
+		       &sock_buf_size, sizeof(sock_buf_size))) {
+		perror("setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)");
+		exit(EXIT_FAILURE);
+	}
+
+	if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+		       &sock_buf_size, sizeof(sock_buf_size))) {
+		perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Ready to receive data. */
+	control_writeln("SRVREADY");
+	/* Wait, until peer sends whole data. */
 	control_expectln("SENDDONE");
 	iov.iov_base = buf;
 	iov.iov_len = sizeof(buf);
 	msg.msg_iov = &iov;
 	msg.msg_iovlen = 1;
 
-	for (int i = 0; i < MESSAGES_CNT; i++) {
-		if (recvmsg(fd, &msg, 0) != 1) {
-			perror("message bound violated");
-			exit(EXIT_FAILURE);
-		}
+	curr_hash = 0;
 
-		if ((i == MSG_EOR_IDX) ^ !!(msg.msg_flags & MSG_EOR)) {
-			perror("MSG_EOR");
+	while (1) {
+		ssize_t recv_size;
+
+		recv_size = recvmsg(fd, &msg, 0);
+
+		if (!recv_size)
+			break;
+
+		if (recv_size < 0) {
+			perror("recvmsg");
 			exit(EXIT_FAILURE);
 		}
+
+		if (msg.msg_flags & MSG_EOR)
+			curr_hash++;
+
+		curr_hash += hash_djb2(msg.msg_iov[0].iov_base, recv_size);
 	}
 
 	close(fd);
+	remote_hash = control_readulong();
+
+	if (curr_hash != remote_hash) {
+		fprintf(stderr, "Message bounds broken\n");
+		exit(EXIT_FAILURE);
+	}
 }
 
 #define MESSAGE_TRUNC_SZ 32
@@ -427,7 +524,7 @@ static void test_seqpacket_timeout_client(const struct test_opts *opts)
 	tv.tv_usec = 0;
 
 	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (void *)&tv, sizeof(tv)) == -1) {
-		perror("setsockopt 'SO_RCVTIMEO'");
+		perror("setsockopt(SO_RCVTIMEO)");
 		exit(EXIT_FAILURE);
 	}
 
@@ -644,7 +741,7 @@ static void test_stream_poll_rcvlowat_client(const struct test_opts *opts)
 
 	if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
 		       &lowat_val, sizeof(lowat_val))) {
-		perror("setsockopt");
+		perror("setsockopt(SO_RCVLOWAT)");
 		exit(EXIT_FAILURE);
 	}
 
@@ -837,6 +934,7 @@ int main(int argc, char **argv)
 		.peer_cid = VMADDR_CID_ANY,
 	};
 
+	srand(time(NULL));
 	init_signals();
 
 	for (;;) {
-- 
cgit 


From 685a21c314a8aec3549f25cb3f9d872e92245210 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
Date: Tue, 10 Jan 2023 10:17:02 +0000
Subject: test/vsock: add big message test

This adds test for sending message, bigger than peer's buffer size.
For SOCK_SEQPACKET socket it must fail, as this type of socket has
message size limit.

Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/vsock/vsock_test.c | 69 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 26c38ad9d07b..67e9f9df3a8c 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -569,6 +569,70 @@ static void test_seqpacket_timeout_server(const struct test_opts *opts)
 	close(fd);
 }
 
+static void test_seqpacket_bigmsg_client(const struct test_opts *opts)
+{
+	unsigned long sock_buf_size;
+	ssize_t send_size;
+	socklen_t len;
+	void *data;
+	int fd;
+
+	len = sizeof(sock_buf_size);
+
+	fd = vsock_seqpacket_connect(opts->peer_cid, 1234);
+	if (fd < 0) {
+		perror("connect");
+		exit(EXIT_FAILURE);
+	}
+
+	if (getsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+		       &sock_buf_size, &len)) {
+		perror("getsockopt");
+		exit(EXIT_FAILURE);
+	}
+
+	sock_buf_size++;
+
+	data = malloc(sock_buf_size);
+	if (!data) {
+		perror("malloc");
+		exit(EXIT_FAILURE);
+	}
+
+	send_size = send(fd, data, sock_buf_size, 0);
+	if (send_size != -1) {
+		fprintf(stderr, "expected 'send(2)' failure, got %zi\n",
+			send_size);
+		exit(EXIT_FAILURE);
+	}
+
+	if (errno != EMSGSIZE) {
+		fprintf(stderr, "expected EMSGSIZE in 'errno', got %i\n",
+			errno);
+		exit(EXIT_FAILURE);
+	}
+
+	control_writeln("CLISENT");
+
+	free(data);
+	close(fd);
+}
+
+static void test_seqpacket_bigmsg_server(const struct test_opts *opts)
+{
+	int fd;
+
+	fd = vsock_seqpacket_accept(VMADDR_CID_ANY, 1234, NULL);
+	if (fd < 0) {
+		perror("accept");
+		exit(EXIT_FAILURE);
+	}
+
+	control_expectln("CLISENT");
+
+	close(fd);
+}
+
 #define BUF_PATTERN_1 'a'
 #define BUF_PATTERN_2 'b'
 
@@ -851,6 +915,11 @@ static struct test_case test_cases[] = {
 		.run_client = test_stream_poll_rcvlowat_client,
 		.run_server = test_stream_poll_rcvlowat_server,
 	},
+	{
+		.name = "SOCK_SEQPACKET big message",
+		.run_client = test_seqpacket_bigmsg_client,
+		.run_server = test_seqpacket_bigmsg_server,
+	},
 	{},
 };
 
-- 
cgit 


From 8abbffd27cedd0f89f69e5ee2ff6841ea01511eb Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
Date: Tue, 10 Jan 2023 10:18:32 +0000
Subject: test/vsock: vsock_perf utility

This adds utility to check vsock rx/tx performance.

Usage as sender:
./vsock_perf --sender <cid> --port <port> --bytes <bytes to send>
Usage as receiver:
./vsock_perf --port <port> --rcvlowat <SO_RCVLOWAT>

Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/vsock/Makefile     |   3 +-
 tools/testing/vsock/README       |  34 ++++
 tools/testing/vsock/vsock_perf.c | 427 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 463 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/vsock/vsock_perf.c

(limited to 'tools')

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index f8293c6910c9..43a254f0e14d 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0-only
-all: test
+all: test vsock_perf
 test: vsock_test vsock_diag_test
 vsock_test: vsock_test.o timeout.o control.o util.o
 vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
+vsock_perf: vsock_perf.o
 
 CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -D_GNU_SOURCE
 .PHONY: all test clean
diff --git a/tools/testing/vsock/README b/tools/testing/vsock/README
index 4d5045e7d2c3..84ee217ba8ee 100644
--- a/tools/testing/vsock/README
+++ b/tools/testing/vsock/README
@@ -35,3 +35,37 @@ Invoke test binaries in both directions as follows:
                        --control-port=$GUEST_IP \
                        --control-port=1234 \
                        --peer-cid=3
+
+vsock_perf utility
+-------------------
+'vsock_perf' is a simple tool to measure vsock performance. It works in
+sender/receiver modes: sender connect to peer at the specified port and
+starts data transmission to the receiver. After data processing is done,
+it prints several metrics(see below).
+
+Usage:
+# run as sender
+# connect to CID 2, port 1234, send 1G of data, tx buf size is 1M
+./vsock_perf --sender 2 --port 1234 --bytes 1G --buf-size 1M
+
+Output:
+tx performance: A Gbits/s
+
+Output explanation:
+A is calculated as "number of bits to send" / "time in tx loop"
+
+# run as receiver
+# listen port 1234, rx buf size is 1M, socket buf size is 1G, SO_RCVLOWAT is 64K
+./vsock_perf --port 1234 --buf-size 1M --vsk-size 1G --rcvlowat 64K
+
+Output:
+rx performance: A Gbits/s
+total in 'read()': B sec
+POLLIN wakeups: C
+average in 'read()': D ns
+
+Output explanation:
+A is calculated as "number of received bits" / "time in rx loop".
+B is time, spent in 'read()' system call(excluding 'poll()')
+C is number of 'poll()' wake ups with POLLIN bit set.
+D is B / C, e.g. average amount of time, spent in single 'read()'.
diff --git a/tools/testing/vsock/vsock_perf.c b/tools/testing/vsock/vsock_perf.c
new file mode 100644
index 000000000000..a72520338f84
--- /dev/null
+++ b/tools/testing/vsock/vsock_perf.c
@@ -0,0 +1,427 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vsock_perf - benchmark utility for vsock.
+ *
+ * Copyright (C) 2022 SberDevices.
+ *
+ * Author: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
+ */
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <stdint.h>
+#include <poll.h>
+#include <sys/socket.h>
+#include <linux/vm_sockets.h>
+
+#define DEFAULT_BUF_SIZE_BYTES	(128 * 1024)
+#define DEFAULT_TO_SEND_BYTES	(64 * 1024)
+#define DEFAULT_VSOCK_BUF_BYTES (256 * 1024)
+#define DEFAULT_RCVLOWAT_BYTES	1
+#define DEFAULT_PORT		1234
+
+#define BYTES_PER_GB		(1024 * 1024 * 1024ULL)
+#define NSEC_PER_SEC		(1000000000ULL)
+
+static unsigned int port = DEFAULT_PORT;
+static unsigned long buf_size_bytes = DEFAULT_BUF_SIZE_BYTES;
+static unsigned long vsock_buf_bytes = DEFAULT_VSOCK_BUF_BYTES;
+
+static void error(const char *s)
+{
+	perror(s);
+	exit(EXIT_FAILURE);
+}
+
+static time_t current_nsec(void)
+{
+	struct timespec ts;
+
+	if (clock_gettime(CLOCK_REALTIME, &ts))
+		error("clock_gettime");
+
+	return (ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
+}
+
+/* From lib/cmdline.c. */
+static unsigned long memparse(const char *ptr)
+{
+	char *endptr;
+
+	unsigned long long ret = strtoull(ptr, &endptr, 0);
+
+	switch (*endptr) {
+	case 'E':
+	case 'e':
+		ret <<= 10;
+	case 'P':
+	case 'p':
+		ret <<= 10;
+	case 'T':
+	case 't':
+		ret <<= 10;
+	case 'G':
+	case 'g':
+		ret <<= 10;
+	case 'M':
+	case 'm':
+		ret <<= 10;
+	case 'K':
+	case 'k':
+		ret <<= 10;
+		endptr++;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static void vsock_increase_buf_size(int fd)
+{
+	if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE,
+		       &vsock_buf_bytes, sizeof(vsock_buf_bytes)))
+		error("setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)");
+
+	if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+		       &vsock_buf_bytes, sizeof(vsock_buf_bytes)))
+		error("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+}
+
+static int vsock_connect(unsigned int cid, unsigned int port)
+{
+	union {
+		struct sockaddr sa;
+		struct sockaddr_vm svm;
+	} addr = {
+		.svm = {
+			.svm_family = AF_VSOCK,
+			.svm_port = port,
+			.svm_cid = cid,
+		},
+	};
+	int fd;
+
+	fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+
+	if (fd < 0) {
+		perror("socket");
+		return -1;
+	}
+
+	if (connect(fd, &addr.sa, sizeof(addr.svm)) < 0) {
+		perror("connect");
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+static float get_gbps(unsigned long bits, time_t ns_delta)
+{
+	return ((float)bits / 1000000000ULL) /
+	       ((float)ns_delta / NSEC_PER_SEC);
+}
+
+static void run_receiver(unsigned long rcvlowat_bytes)
+{
+	unsigned int read_cnt;
+	time_t rx_begin_ns;
+	time_t in_read_ns;
+	size_t total_recv;
+	int client_fd;
+	char *data;
+	int fd;
+	union {
+		struct sockaddr sa;
+		struct sockaddr_vm svm;
+	} addr = {
+		.svm = {
+			.svm_family = AF_VSOCK,
+			.svm_port = port,
+			.svm_cid = VMADDR_CID_ANY,
+		},
+	};
+	union {
+		struct sockaddr sa;
+		struct sockaddr_vm svm;
+	} clientaddr;
+
+	socklen_t clientaddr_len = sizeof(clientaddr.svm);
+
+	printf("Run as receiver\n");
+	printf("Listen port %u\n", port);
+	printf("RX buffer %lu bytes\n", buf_size_bytes);
+	printf("vsock buffer %lu bytes\n", vsock_buf_bytes);
+	printf("SO_RCVLOWAT %lu bytes\n", rcvlowat_bytes);
+
+	fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+
+	if (fd < 0)
+		error("socket");
+
+	if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0)
+		error("bind");
+
+	if (listen(fd, 1) < 0)
+		error("listen");
+
+	client_fd = accept(fd, &clientaddr.sa, &clientaddr_len);
+
+	if (client_fd < 0)
+		error("accept");
+
+	vsock_increase_buf_size(client_fd);
+
+	if (setsockopt(client_fd, SOL_SOCKET, SO_RCVLOWAT,
+		       &rcvlowat_bytes,
+		       sizeof(rcvlowat_bytes)))
+		error("setsockopt(SO_RCVLOWAT)");
+
+	data = malloc(buf_size_bytes);
+
+	if (!data) {
+		fprintf(stderr, "'malloc()' failed\n");
+		exit(EXIT_FAILURE);
+	}
+
+	read_cnt = 0;
+	in_read_ns = 0;
+	total_recv = 0;
+	rx_begin_ns = current_nsec();
+
+	while (1) {
+		struct pollfd fds = { 0 };
+
+		fds.fd = client_fd;
+		fds.events = POLLIN | POLLERR |
+			     POLLHUP | POLLRDHUP;
+
+		if (poll(&fds, 1, -1) < 0)
+			error("poll");
+
+		if (fds.revents & POLLERR) {
+			fprintf(stderr, "'poll()' error\n");
+			exit(EXIT_FAILURE);
+		}
+
+		if (fds.revents & POLLIN) {
+			ssize_t bytes_read;
+			time_t t;
+
+			t = current_nsec();
+			bytes_read = read(fds.fd, data, buf_size_bytes);
+			in_read_ns += (current_nsec() - t);
+			read_cnt++;
+
+			if (!bytes_read)
+				break;
+
+			if (bytes_read < 0) {
+				perror("read");
+				exit(EXIT_FAILURE);
+			}
+
+			total_recv += bytes_read;
+		}
+
+		if (fds.revents & (POLLHUP | POLLRDHUP))
+			break;
+	}
+
+	printf("total bytes received: %zu\n", total_recv);
+	printf("rx performance: %f Gbits/s\n",
+	       get_gbps(total_recv * 8, current_nsec() - rx_begin_ns));
+	printf("total time in 'read()': %f sec\n", (float)in_read_ns / NSEC_PER_SEC);
+	printf("average time in 'read()': %f ns\n", (float)in_read_ns / read_cnt);
+	printf("POLLIN wakeups: %i\n", read_cnt);
+
+	free(data);
+	close(client_fd);
+	close(fd);
+}
+
+static void run_sender(int peer_cid, unsigned long to_send_bytes)
+{
+	time_t tx_begin_ns;
+	time_t tx_total_ns;
+	size_t total_send;
+	void *data;
+	int fd;
+
+	printf("Run as sender\n");
+	printf("Connect to %i:%u\n", peer_cid, port);
+	printf("Send %lu bytes\n", to_send_bytes);
+	printf("TX buffer %lu bytes\n", buf_size_bytes);
+
+	fd = vsock_connect(peer_cid, port);
+
+	if (fd < 0)
+		exit(EXIT_FAILURE);
+
+	data = malloc(buf_size_bytes);
+
+	if (!data) {
+		fprintf(stderr, "'malloc()' failed\n");
+		exit(EXIT_FAILURE);
+	}
+
+	memset(data, 0, buf_size_bytes);
+	total_send = 0;
+	tx_begin_ns = current_nsec();
+
+	while (total_send < to_send_bytes) {
+		ssize_t sent;
+
+		sent = write(fd, data, buf_size_bytes);
+
+		if (sent <= 0)
+			error("write");
+
+		total_send += sent;
+	}
+
+	tx_total_ns = current_nsec() - tx_begin_ns;
+
+	printf("total bytes sent: %zu\n", total_send);
+	printf("tx performance: %f Gbits/s\n",
+	       get_gbps(total_send * 8, tx_total_ns));
+	printf("total time in 'write()': %f sec\n",
+	       (float)tx_total_ns / NSEC_PER_SEC);
+
+	close(fd);
+	free(data);
+}
+
+static const char optstring[] = "";
+static const struct option longopts[] = {
+	{
+		.name = "help",
+		.has_arg = no_argument,
+		.val = 'H',
+	},
+	{
+		.name = "sender",
+		.has_arg = required_argument,
+		.val = 'S',
+	},
+	{
+		.name = "port",
+		.has_arg = required_argument,
+		.val = 'P',
+	},
+	{
+		.name = "bytes",
+		.has_arg = required_argument,
+		.val = 'M',
+	},
+	{
+		.name = "buf-size",
+		.has_arg = required_argument,
+		.val = 'B',
+	},
+	{
+		.name = "vsk-size",
+		.has_arg = required_argument,
+		.val = 'V',
+	},
+	{
+		.name = "rcvlowat",
+		.has_arg = required_argument,
+		.val = 'R',
+	},
+	{},
+};
+
+static void usage(void)
+{
+	printf("Usage: ./vsock_perf [--help] [options]\n"
+	       "\n"
+	       "This is benchmarking utility, to test vsock performance.\n"
+	       "It runs in two modes: sender or receiver. In sender mode, it\n"
+	       "connects to the specified CID and starts data transmission.\n"
+	       "\n"
+	       "Options:\n"
+	       "  --help			This message\n"
+	       "  --sender   <cid>		Sender mode (receiver default)\n"
+	       "                                <cid> of the receiver to connect to\n"
+	       "  --port     <port>		Port (default %d)\n"
+	       "  --bytes    <bytes>KMG		Bytes to send (default %d)\n"
+	       "  --buf-size <bytes>KMG		Data buffer size (default %d). In sender mode\n"
+	       "                                it is the buffer size, passed to 'write()'. In\n"
+	       "                                receiver mode it is the buffer size passed to 'read()'.\n"
+	       "  --vsk-size <bytes>KMG		Socket buffer size (default %d)\n"
+	       "  --rcvlowat <bytes>KMG		SO_RCVLOWAT value (default %d)\n"
+	       "\n", DEFAULT_PORT, DEFAULT_TO_SEND_BYTES,
+	       DEFAULT_BUF_SIZE_BYTES, DEFAULT_VSOCK_BUF_BYTES,
+	       DEFAULT_RCVLOWAT_BYTES);
+	exit(EXIT_FAILURE);
+}
+
+static long strtolx(const char *arg)
+{
+	long value;
+	char *end;
+
+	value = strtol(arg, &end, 10);
+
+	if (end != arg + strlen(arg))
+		usage();
+
+	return value;
+}
+
+int main(int argc, char **argv)
+{
+	unsigned long to_send_bytes = DEFAULT_TO_SEND_BYTES;
+	unsigned long rcvlowat_bytes = DEFAULT_RCVLOWAT_BYTES;
+	int peer_cid = -1;
+	bool sender = false;
+
+	while (1) {
+		int opt = getopt_long(argc, argv, optstring, longopts, NULL);
+
+		if (opt == -1)
+			break;
+
+		switch (opt) {
+		case 'V': /* Peer buffer size. */
+			vsock_buf_bytes = memparse(optarg);
+			break;
+		case 'R': /* SO_RCVLOWAT value. */
+			rcvlowat_bytes = memparse(optarg);
+			break;
+		case 'P': /* Port to connect to. */
+			port = strtolx(optarg);
+			break;
+		case 'M': /* Bytes to send. */
+			to_send_bytes = memparse(optarg);
+			break;
+		case 'B': /* Size of rx/tx buffer. */
+			buf_size_bytes = memparse(optarg);
+			break;
+		case 'S': /* Sender mode. CID to connect to. */
+			peer_cid = strtolx(optarg);
+			sender = true;
+			break;
+		case 'H': /* Help. */
+			usage();
+			break;
+		default:
+			usage();
+		}
+	}
+
+	if (!sender)
+		run_receiver(rcvlowat_bytes);
+	else
+		run_sender(peer_cid, to_send_bytes);
+
+	return 0;
+}
-- 
cgit 


From 660569472dd7ac64571375b6727c3f2c1d70ba40 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Wed, 11 Jan 2023 23:20:28 -0800
Subject: x86/cpufeature: Add the CPU feature bit for LKGS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the CPU feature bit for LKGS (Load "Kernel" GS).

LKGS instruction is introduced with Intel FRED (flexible return and
event delivery) specification. Search for the latest FRED spec in most
search engines with this search pattern:

  site:intel.com FRED (flexible return and event delivery) specification

LKGS behaves like the MOV to GS instruction except that it loads
the base address into the IA32_KERNEL_GS_BASE MSR instead of the
GS segment’s descriptor cache, which is exactly what Linux kernel
does to load a user level GS base.  Thus, with LKGS, there is no
need to SWAPGS away from the kernel GS base.

[ mingo: Minor tweaks to the description. ]

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230112072032.35626-2-xin3.li@intel.com
---
 arch/x86/include/asm/cpufeatures.h       | 1 +
 tools/arch/x86/include/asm/cpufeatures.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'tools')

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 61012476d66e..b70111a75688 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -312,6 +312,7 @@
 #define X86_FEATURE_AVX_VNNI		(12*32+ 4) /* AVX VNNI instructions */
 #define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */
 #define X86_FEATURE_CMPCCXADD           (12*32+ 7) /* "" CMPccXADD instructions */
+#define X86_FEATURE_LKGS		(12*32+18) /* "" Load "kernel" (userspace) GS */
 #define X86_FEATURE_AMX_FP16		(12*32+21) /* "" AMX fp16 Support */
 #define X86_FEATURE_AVX_IFMA            (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
 
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 61012476d66e..b70111a75688 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -312,6 +312,7 @@
 #define X86_FEATURE_AVX_VNNI		(12*32+ 4) /* AVX VNNI instructions */
 #define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */
 #define X86_FEATURE_CMPCCXADD           (12*32+ 7) /* "" CMPccXADD instructions */
+#define X86_FEATURE_LKGS		(12*32+18) /* "" Load "kernel" (userspace) GS */
 #define X86_FEATURE_AMX_FP16		(12*32+21) /* "" AMX fp16 Support */
 #define X86_FEATURE_AVX_IFMA            (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
 
-- 
cgit 


From 5a91f12660fe7249e37b11372bf599e02b6a319c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Wed, 11 Jan 2023 23:20:29 -0800
Subject: x86/opcode: Add the LKGS instruction to x86-opcode-map

Add the instruction opcode used by LKGS to x86-opcode-map.

Opcode number is per public FRED draft spec v3.0.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230112072032.35626-3-xin3.li@intel.com
---
 arch/x86/lib/x86-opcode-map.txt       | 1 +
 tools/arch/x86/lib/x86-opcode-map.txt | 1 +
 2 files changed, 2 insertions(+)

(limited to 'tools')

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index d12d1358f96d..5168ee0360b2 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1047,6 +1047,7 @@ GrpTable: Grp6
 3: LTR Ew
 4: VERR Ew
 5: VERW Ew
+6: LKGS Ew (F2)
 EndTable
 
 GrpTable: Grp7
diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt
index d12d1358f96d..5168ee0360b2 100644
--- a/tools/arch/x86/lib/x86-opcode-map.txt
+++ b/tools/arch/x86/lib/x86-opcode-map.txt
@@ -1047,6 +1047,7 @@ GrpTable: Grp6
 3: LTR Ew
 4: VERR Ew
 5: VERW Ew
+6: LKGS Ew (F2)
 EndTable
 
 GrpTable: Grp7
-- 
cgit 


From 97ec597b26df774a257e3f8e97353fd1b4471615 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 13:06:35 +0000
Subject: kselftest/arm64: Fix syscall-abi for systems without 128 bit SME

SME does not mandate any specific VL so we may not have 128 bit SME but
the algorithm used for enumerating VLs assumes that we will. Add the
required check to ensure that the algorithm terminates.

Fixes: 43e3f85523e4 ("kselftest/arm64: Add SME support to syscall ABI test")
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221223-arm64-syscall-abi-sme-only-v1-1-4fabfbd62087@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/syscall-abi.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index dd7ebe536d05..ffe719b50c21 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -390,6 +390,10 @@ static void test_one_syscall(struct syscall_cfg *cfg)
 
 			sme_vl &= PR_SME_VL_LEN_MASK;
 
+			/* Found lowest VL */
+			if (sve_vq_from_vl(sme_vl) > sme_vq)
+				break;
+
 			if (sme_vq != sve_vq_from_vl(sme_vl))
 				sme_vq = sve_vq_from_vl(sme_vl);
 
@@ -461,6 +465,10 @@ int sme_count_vls(void)
 
 		vl &= PR_SME_VL_LEN_MASK;
 
+		/* Found lowest VL */
+		if (sve_vq_from_vl(vl) > vq)
+			break;
+
 		if (vq != sve_vq_from_vl(vl))
 			vq = sve_vq_from_vl(vl);
 
-- 
cgit 


From fae491e52cc2b48097bd93fceb687ccbacd263c4 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 13:06:36 +0000
Subject: kselftest/arm64: Only enumerate VLs once in syscall-abi

Currently syscall-abi not only enumerates the SVE VLs twice while working
out how many tests are planned, it also repeats the enumeration process
while doing the actual tests. Record the VLs when we enumerate and use that
list when we are performing the tests, removing some duplicated logic.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221223-arm64-syscall-abi-sme-only-v1-2-4fabfbd62087@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/syscall-abi.c | 95 +++++++++++--------------
 1 file changed, 41 insertions(+), 54 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index ffe719b50c21..45fdcbe3e909 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -24,6 +24,11 @@
 
 static int default_sme_vl;
 
+static int sve_vl_count;
+static unsigned int sve_vls[SVE_VQ_MAX];
+static int sme_vl_count;
+static unsigned int sme_vls[SVE_VQ_MAX];
+
 extern void do_syscall(int sve_vl, int sme_vl);
 
 static void fill_random(void *buf, size_t size)
@@ -355,72 +360,55 @@ static bool do_test(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 
 static void test_one_syscall(struct syscall_cfg *cfg)
 {
-	int sve_vq, sve_vl;
-	int sme_vq, sme_vl;
+	int sve, sme;
+	int ret;
 
 	/* FPSIMD only case */
 	ksft_test_result(do_test(cfg, 0, default_sme_vl, 0),
 			 "%s FPSIMD\n", cfg->name);
 
-	if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
-		return;
-
-	for (sve_vq = SVE_VQ_MAX; sve_vq > 0; --sve_vq) {
-		sve_vl = prctl(PR_SVE_SET_VL, sve_vq * 16);
-		if (sve_vl == -1)
+	for (sve = 0; sve < sve_vl_count; sve++) {
+		ret = prctl(PR_SVE_SET_VL, sve_vls[sve]);
+		if (ret == -1)
 			ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
 					   strerror(errno), errno);
 
-		sve_vl &= PR_SVE_VL_LEN_MASK;
-
-		if (sve_vq != sve_vq_from_vl(sve_vl))
-			sve_vq = sve_vq_from_vl(sve_vl);
+		ksft_test_result(do_test(cfg, sve_vls[sve], default_sme_vl, 0),
+				 "%s SVE VL %d\n", cfg->name, sve_vls[sve]);
 
-		ksft_test_result(do_test(cfg, sve_vl, default_sme_vl, 0),
-				 "%s SVE VL %d\n", cfg->name, sve_vl);
-
-		if (!(getauxval(AT_HWCAP2) & HWCAP2_SME))
-			continue;
-
-		for (sme_vq = SVE_VQ_MAX; sme_vq > 0; --sme_vq) {
-			sme_vl = prctl(PR_SME_SET_VL, sme_vq * 16);
-			if (sme_vl == -1)
+		for (sme = 0; sme < sme_vl_count; sme++) {
+			ret = prctl(PR_SME_SET_VL, sme_vls[sme]);
+			if (ret == -1)
 				ksft_exit_fail_msg("PR_SME_SET_VL failed: %s (%d)\n",
 						   strerror(errno), errno);
 
-			sme_vl &= PR_SME_VL_LEN_MASK;
-
-			/* Found lowest VL */
-			if (sve_vq_from_vl(sme_vl) > sme_vq)
-				break;
-
-			if (sme_vq != sve_vq_from_vl(sme_vl))
-				sme_vq = sve_vq_from_vl(sme_vl);
-
-			ksft_test_result(do_test(cfg, sve_vl, sme_vl,
+			ksft_test_result(do_test(cfg, sve_vls[sve],
+						 sme_vls[sme],
 						 SVCR_ZA_MASK | SVCR_SM_MASK),
 					 "%s SVE VL %d/SME VL %d SM+ZA\n",
-					 cfg->name, sve_vl, sme_vl);
-			ksft_test_result(do_test(cfg, sve_vl, sme_vl,
-						 SVCR_SM_MASK),
+					 cfg->name, sve_vls[sve],
+					 sme_vls[sme]);
+			ksft_test_result(do_test(cfg, sve_vls[sve],
+						 sme_vls[sme], SVCR_SM_MASK),
 					 "%s SVE VL %d/SME VL %d SM\n",
-					 cfg->name, sve_vl, sme_vl);
-			ksft_test_result(do_test(cfg, sve_vl, sme_vl,
-						 SVCR_ZA_MASK),
+					 cfg->name, sve_vls[sve],
+					 sme_vls[sme]);
+			ksft_test_result(do_test(cfg, sve_vls[sve],
+						 sme_vls[sme], SVCR_ZA_MASK),
 					 "%s SVE VL %d/SME VL %d ZA\n",
-					 cfg->name, sve_vl, sme_vl);
+					 cfg->name, sve_vls[sve],
+					 sme_vls[sme]);
 		}
 	}
 }
 
-int sve_count_vls(void)
+void sve_count_vls(void)
 {
 	unsigned int vq;
-	int vl_count = 0;
 	int vl;
 
 	if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
-		return 0;
+		return;
 
 	/*
 	 * Enumerate up to SVE_VQ_MAX vector lengths
@@ -436,23 +424,17 @@ int sve_count_vls(void)
 		if (vq != sve_vq_from_vl(vl))
 			vq = sve_vq_from_vl(vl);
 
-		vl_count++;
+		sve_vls[sve_vl_count++] = vl;
 	}
-
-	return vl_count;
 }
 
-int sme_count_vls(void)
+void sme_count_vls(void)
 {
 	unsigned int vq;
-	int vl_count = 0;
 	int vl;
 
 	if (!(getauxval(AT_HWCAP2) & HWCAP2_SME))
-		return 0;
-
-	/* Ensure we configure a SME VL, used to flag if SVCR is set */
-	default_sme_vl = 16;
+		return;
 
 	/*
 	 * Enumerate up to SVE_VQ_MAX vector lengths
@@ -472,10 +454,11 @@ int sme_count_vls(void)
 		if (vq != sve_vq_from_vl(vl))
 			vq = sve_vq_from_vl(vl);
 
-		vl_count++;
+		sme_vls[sme_vl_count++] = vl;
 	}
 
-	return vl_count;
+	/* Ensure we configure a SME VL, used to flag if SVCR is set */
+	default_sme_vl = sme_vls[0];
 }
 
 int main(void)
@@ -486,8 +469,12 @@ int main(void)
 	srandom(getpid());
 
 	ksft_print_header();
-	tests += sve_count_vls();
-	tests += (sve_count_vls() * sme_count_vls()) * 3;
+
+	sve_count_vls();
+	sme_count_vls();
+
+	tests += sve_vl_count;
+	tests += (sve_vl_count * sme_vl_count) * 3;
 	ksft_set_plan(ARRAY_SIZE(syscalls) * tests);
 
 	if (getauxval(AT_HWCAP2) & HWCAP2_SME_FA64)
-- 
cgit 


From 024e4a155874dcd3c3473fb26ff4f491c746b4d3 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 13:06:37 +0000
Subject: kselftest/arm64: Verify SME only ABI in syscall-abi

Currently syscall-abi only covers SME in the case where the system supports
SVE however it is architecturally valid to support SME without SVE. Update
the program to cover this case, this requires adjustments in the code to
check for SVCR.SM being set when deciding if we're handling the FPSIMD or
SVE registers and the addition of new test cases for the SME only case.

Note that in the SME only case we should not save the SVE registers after a
syscall since even if we were in streaming mode and therefore set them the
syscall should have exited streaming mode, we check that we have done so by
looking at SVCR.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221223-arm64-syscall-abi-sme-only-v1-3-4fabfbd62087@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../testing/selftests/arm64/abi/syscall-abi-asm.S  | 14 +++++----
 tools/testing/selftests/arm64/abi/syscall-abi.c    | 34 +++++++++++++++++++++-
 2 files changed, 42 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
index acd5e9f3bc0b..cdfafc939a9e 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
+++ b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
@@ -92,8 +92,11 @@ do_syscall:
 	str	x29, [x2], #8		// FP
 	str	x30, [x2], #8		// LR
 
-	// Load FPRs if we're not doing SVE
+	// Load FPRs if we're not doing neither SVE nor streaming SVE
 	cbnz	x0, 1f
+	ldr	x2, =svcr_in
+	tbnz	x2, #SVCR_SM_SHIFT, 1f
+
 	ldr	x2, =fpr_in
 	ldp	q0, q1, [x2]
 	ldp	q2, q3, [x2, #16 * 2]
@@ -111,10 +114,11 @@ do_syscall:
 	ldp	q26, q27, [x2, #16 * 26]
 	ldp	q28, q29, [x2, #16 * 28]
 	ldp	q30, q31, [x2, #16 * 30]
+
+	b	2f
 1:
 
 	// Load the SVE registers if we're doing SVE/SME
-	cbz	x0, 1f
 
 	ldr	x2, =z_in
 	ldr	z0, [x2, #0, MUL VL]
@@ -155,9 +159,9 @@ do_syscall:
 	ldr	x2, =ffr_in
 	ldr	p0, [x2]
 	ldr	x2, [x2, #0]
-	cbz	x2, 2f
+	cbz	x2, 1f
 	wrffr	p0.b
-2:
+1:
 
 	ldr	x2, =p_in
 	ldr	p0, [x2, #0, MUL VL]
@@ -176,7 +180,7 @@ do_syscall:
 	ldr	p13, [x2, #13, MUL VL]
 	ldr	p14, [x2, #14, MUL VL]
 	ldr	p15, [x2, #15, MUL VL]
-1:
+2:
 
 	// Do the syscall
 	svc	#0
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index 45fdcbe3e909..7c9b6e947040 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -88,6 +88,7 @@ static int check_gpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl, uint64_t s
 #define NUM_FPR 32
 uint64_t fpr_in[NUM_FPR * 2];
 uint64_t fpr_out[NUM_FPR * 2];
+uint64_t fpr_zero[NUM_FPR * 2];
 
 static void setup_fpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 		      uint64_t svcr)
@@ -102,7 +103,7 @@ static int check_fpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 	int errors = 0;
 	int i;
 
-	if (!sve_vl) {
+	if (!sve_vl && !(svcr & SVCR_SM_MASK)) {
 		for (i = 0; i < ARRAY_SIZE(fpr_in); i++) {
 			if (fpr_in[i] != fpr_out[i]) {
 				ksft_print_msg("%s Q%d/%d mismatch %llx != %llx\n",
@@ -114,6 +115,18 @@ static int check_fpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 		}
 	}
 
+	/*
+	 * In streaming mode the whole register set should be cleared
+	 * by the transition out of streaming mode.
+	 */
+	if (svcr & SVCR_SM_MASK) {
+		if (memcmp(fpr_zero, fpr_out, sizeof(fpr_out)) != 0) {
+			ksft_print_msg("%s FPSIMD registers non-zero exiting SM\n",
+				       cfg->name);
+			errors++;
+		}
+	}
+
 	return errors;
 }
 
@@ -400,6 +413,24 @@ static void test_one_syscall(struct syscall_cfg *cfg)
 					 sme_vls[sme]);
 		}
 	}
+
+	for (sme = 0; sme < sme_vl_count; sme++) {
+		ret = prctl(PR_SME_SET_VL, sme_vls[sme]);
+		if (ret == -1)
+			ksft_exit_fail_msg("PR_SME_SET_VL failed: %s (%d)\n",
+						   strerror(errno), errno);
+
+		ksft_test_result(do_test(cfg, 0, sme_vls[sme],
+					 SVCR_ZA_MASK | SVCR_SM_MASK),
+				 "%s SME VL %d SM+ZA\n",
+				 cfg->name, sme_vls[sme]);
+		ksft_test_result(do_test(cfg, 0, sme_vls[sme], SVCR_SM_MASK),
+				 "%s SME VL %d SM\n",
+				 cfg->name, sme_vls[sme]);
+		ksft_test_result(do_test(cfg, 0, sme_vls[sme], SVCR_ZA_MASK),
+				 "%s SME VL %d ZA\n",
+				 cfg->name, sme_vls[sme]);
+	}
 }
 
 void sve_count_vls(void)
@@ -474,6 +505,7 @@ int main(void)
 	sme_count_vls();
 
 	tests += sve_vl_count;
+	tests += sme_vl_count * 3;
 	tests += (sve_vl_count * sme_vl_count) * 3;
 	ksft_set_plan(ARRAY_SIZE(syscalls) * tests);
 
-- 
cgit 


From 10f326fbb4584f3b9fbf1102c1a71a9ecac0e97f Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 13:06:38 +0000
Subject: kselftest/arm64: Only enumerate power of two VLs in syscall-abi

As documented in issue C215 in the known issues list for DDI0487I.a [1] Arm
will be making a retroactive change to SVE to remove the possibility of
selecting non power of two vector lengths. This has no impact on existing
physical implementations but most virtual implementations have implemented
the full range of permissible vector lengths.

Since virtual implementations are noticeably slow in general and the larger
vector lengths amplify the issue there's a useful improvement in runtime
from only covering the vector lengths that will exist in practical systems,
adjust our enumeration accordingly. We have other tests that aim to cover
the enumeration interfaces.

For symmetry we apply the same change to the eumeration for SME vector
lengths, though the power of two restriction was already present for SME
so there is no impact on the set of vector lengths tested.

[1] https://developer.arm.com/documentation/102105/ia-00/

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221223-arm64-syscall-abi-sme-only-v1-4-4fabfbd62087@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/syscall-abi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index 7c9b6e947040..8afcbf6861fd 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -444,7 +444,7 @@ void sve_count_vls(void)
 	/*
 	 * Enumerate up to SVE_VQ_MAX vector lengths
 	 */
-	for (vq = SVE_VQ_MAX; vq > 0; --vq) {
+	for (vq = SVE_VQ_MAX; vq > 0; vq /= 2) {
 		vl = prctl(PR_SVE_SET_VL, vq * 16);
 		if (vl == -1)
 			ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
@@ -470,7 +470,7 @@ void sme_count_vls(void)
 	/*
 	 * Enumerate up to SVE_VQ_MAX vector lengths
 	 */
-	for (vq = SVE_VQ_MAX; vq > 0; --vq) {
+	for (vq = SVE_VQ_MAX; vq > 0; vq /= 2) {
 		vl = prctl(PR_SME_SET_VL, vq * 16);
 		if (vl == -1)
 			ksft_exit_fail_msg("PR_SME_SET_VL failed: %s (%d)\n",
-- 
cgit 


From 67f49869106f78882a8a09b736d4884be85aba18 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 13:07:45 +0000
Subject: kselftest/arm64: Skip non-power of 2 SVE vector lengths in fp-stress

As documented in issue C215 in the known issues list for DDI0487I.a [1] Arm
will be making a retroactive change to SVE to remove the possibility of
selecting non power of two vector lengths. This has no impact on existing
physical implementations but most virtual implementations have implemented
the full range of permissible vector lengths. Given how demanding fp-stress
is for these implementations update to only attempt to enumerate the power
of two vector lengths, reducing the load created on existing virtual
implementations and only exercising the functionality that will be seen in
physical implementations.

[1] https://developer.arm.com/documentation/102105/ia-00/

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221220-arm64-fp-stress-pow2-v1-1-d0ce756b57af@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-stress.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index f8b2f41aac36..2b95f9451b1b 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -377,7 +377,7 @@ static void probe_vls(int vls[], int *vl_count, int set_vl)
 
 	*vl_count = 0;
 
-	for (vq = SVE_VQ_MAX; vq > 0; --vq) {
+	for (vq = SVE_VQ_MAX; vq > 0; vq /= 2) {
 		vl = prctl(set_vl, vq * 16);
 		if (vl == -1)
 			ksft_exit_fail_msg("SET_VL failed: %s (%d)\n",
@@ -385,6 +385,9 @@ static void probe_vls(int vls[], int *vl_count, int set_vl)
 
 		vl &= PR_SVE_VL_LEN_MASK;
 
+		if (*vl_count && (vl == vls[*vl_count - 1]))
+			break;
+
 		vq = sve_vq_from_vl(vl);
 
 		vls[*vl_count] = vl;
-- 
cgit 


From 30792e7c18b659599d9b67922d60d76eee1a0e5d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 10 Jan 2023 20:49:58 +0000
Subject: kselftest/arm64: Fix test numbering when skipping tests

Currently when skipping tests in the BTI testsuite we assign the same
number to every test since we forget to increment the current test number
as we skip, causing warnings about not running the expected test count and
potentially otherwise confusing result parsers. Fix this by adding an
appropriate increment.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230110-arm64-bti-selftest-skip-v1-1-143ecdc84567@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/bti/test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/bti/test.c b/tools/testing/selftests/arm64/bti/test.c
index 67b77ab83c20..4b6dda987c58 100644
--- a/tools/testing/selftests/arm64/bti/test.c
+++ b/tools/testing/selftests/arm64/bti/test.c
@@ -112,7 +112,7 @@ static void __do_test(void (*trampoline)(void (*)(void)),
 	if (skip_all) {
 		test_skipped++;
 		putstr("ok ");
-		putnum(test_num);
+		putnum(test_num++);
 		putstr(" ");
 		puttestname(name, trampoline_name);
 		putstr(" # SKIP\n");
-- 
cgit 


From 1c3b614548b50227c5950258831b130d832703b5 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 10 Jan 2023 20:49:59 +0000
Subject: kselftest/arm64: Run BTI selftests on systems without BTI

The BTI selftests are built both with and without BTI support, validating
both the generation of BTI signals as expected for binaries without BTI
support. Both versions of the binary currently skip all their tests when
the system does not support BTI, however this is excessive since we do have
a defined ABI for how the programs should function in this case (especially
for the non-BTI binary). Update the test program to run all the tests
unconditionally, adding a runtime adjustment of the expected results on
systems that don't support BTI where we currently handle the build time
case.

The tests all use HINT space instructions, BTI itself is a HINT as is
are the PAC instructions that function as landing pads, so nothing in the
tests depends on support for BTI in the kernel or hardware.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230110-arm64-bti-selftest-skip-v1-2-143ecdc84567@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/bti/test.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/bti/test.c b/tools/testing/selftests/arm64/bti/test.c
index 4b6dda987c58..2cd8dcee5aec 100644
--- a/tools/testing/selftests/arm64/bti/test.c
+++ b/tools/testing/selftests/arm64/bti/test.c
@@ -6,6 +6,7 @@
 
 #include "system.h"
 
+#include <stdbool.h>
 #include <stddef.h>
 #include <linux/errno.h>
 #include <linux/auxvec.h>
@@ -101,7 +102,8 @@ static void handler(int n, siginfo_t *si __always_unused,
 	uc->uc_mcontext.pstate &= ~PSR_BTYPE_MASK;
 }
 
-static int skip_all;
+/* Does the system have BTI? */
+static bool have_bti;
 
 static void __do_test(void (*trampoline)(void (*)(void)),
 		      void (*fn)(void),
@@ -109,19 +111,11 @@ static void __do_test(void (*trampoline)(void (*)(void)),
 		      const char *name,
 		      int expect_sigill)
 {
-	if (skip_all) {
-		test_skipped++;
-		putstr("ok ");
-		putnum(test_num++);
-		putstr(" ");
-		puttestname(name, trampoline_name);
-		putstr(" # SKIP\n");
-
-		return;
-	}
-
-	/* Branch Target exceptions should only happen in BTI binaries: */
-	if (!BTI)
+	/*
+	 * Branch Target exceptions should only happen for BTI
+	 * binaries running on a system with BTI:
+	 */
+	if (!BTI || !have_bti)
 		expect_sigill = 0;
 
 	sigill_expected = expect_sigill;
@@ -199,9 +193,10 @@ void start(int *argcp)
 		putstr("# HWCAP2_BTI present\n");
 		if (!(hwcap & HWCAP_PACA))
 			putstr("# Bad hardware?  Expect problems.\n");
+		have_bti = true;
 	} else {
 		putstr("# HWCAP2_BTI not present\n");
-		skip_all = 1;
+		have_bti = false;
 	}
 
 	putstr("# Test binary");
-- 
cgit 


From 2b5a0e425e6e319b1978db1e9564f6af4228a567 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 12 Jan 2023 20:43:31 +0100
Subject: objtool/idle: Validate __cpuidle code as noinstr

Idle code is very like entry code in that RCU isn't available. As
such, add a little validation.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Tony Lindgren <tony@atomide.com>
Tested-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20230112195540.373461409@infradead.org
---
 arch/alpha/kernel/vmlinux.lds.S      |  1 -
 arch/arc/kernel/vmlinux.lds.S        |  1 -
 arch/arm/include/asm/vmlinux.lds.h   |  1 -
 arch/arm64/kernel/vmlinux.lds.S      |  1 -
 arch/csky/kernel/vmlinux.lds.S       |  1 -
 arch/hexagon/kernel/vmlinux.lds.S    |  1 -
 arch/ia64/kernel/vmlinux.lds.S       |  1 -
 arch/loongarch/kernel/vmlinux.lds.S  |  1 -
 arch/m68k/kernel/vmlinux-nommu.lds   |  1 -
 arch/m68k/kernel/vmlinux-std.lds     |  1 -
 arch/m68k/kernel/vmlinux-sun3.lds    |  1 -
 arch/microblaze/kernel/vmlinux.lds.S |  1 -
 arch/mips/kernel/vmlinux.lds.S       |  1 -
 arch/nios2/kernel/vmlinux.lds.S      |  1 -
 arch/openrisc/kernel/vmlinux.lds.S   |  1 -
 arch/parisc/kernel/vmlinux.lds.S     |  1 -
 arch/powerpc/kernel/vmlinux.lds.S    |  1 -
 arch/riscv/kernel/vmlinux-xip.lds.S  |  1 -
 arch/riscv/kernel/vmlinux.lds.S      |  1 -
 arch/s390/kernel/vmlinux.lds.S       |  1 -
 arch/sh/kernel/vmlinux.lds.S         |  1 -
 arch/sparc/kernel/vmlinux.lds.S      |  1 -
 arch/um/kernel/dyn.lds.S             |  1 -
 arch/um/kernel/uml.lds.S             |  1 -
 arch/x86/include/asm/irqflags.h      | 11 ++++-------
 arch/x86/include/asm/mwait.h         |  2 +-
 arch/x86/kernel/vmlinux.lds.S        |  1 -
 arch/xtensa/kernel/vmlinux.lds.S     |  1 -
 include/asm-generic/vmlinux.lds.h    |  9 +++------
 include/linux/compiler_types.h       |  8 ++++++--
 include/linux/cpu.h                  |  3 ---
 tools/objtool/check.c                | 13 +++++++++++++
 32 files changed, 27 insertions(+), 45 deletions(-)

(limited to 'tools')

diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index 5b78d640725d..2efa7dfc798a 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -27,7 +27,6 @@ SECTIONS
 		HEAD_TEXT
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/arch/arc/kernel/vmlinux.lds.S b/arch/arc/kernel/vmlinux.lds.S
index 529ae50f9fe2..549c3f407918 100644
--- a/arch/arc/kernel/vmlinux.lds.S
+++ b/arch/arc/kernel/vmlinux.lds.S
@@ -85,7 +85,6 @@ SECTIONS
 		_stext = .;
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h
index fad45c884e98..4c8632d5c432 100644
--- a/arch/arm/include/asm/vmlinux.lds.h
+++ b/arch/arm/include/asm/vmlinux.lds.h
@@ -96,7 +96,6 @@
 		SOFTIRQENTRY_TEXT					\
 		TEXT_TEXT						\
 		SCHED_TEXT						\
-		CPUIDLE_TEXT						\
 		LOCK_TEXT						\
 		KPROBES_TEXT						\
 		ARM_STUBS_TEXT						\
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 4c13dafc98b8..2777214cbf1a 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -175,7 +175,6 @@ SECTIONS
 			ENTRY_TEXT
 			TEXT_TEXT
 			SCHED_TEXT
-			CPUIDLE_TEXT
 			LOCK_TEXT
 			KPROBES_TEXT
 			HYPERVISOR_TEXT
diff --git a/arch/csky/kernel/vmlinux.lds.S b/arch/csky/kernel/vmlinux.lds.S
index 68c980d08482..d718961786d2 100644
--- a/arch/csky/kernel/vmlinux.lds.S
+++ b/arch/csky/kernel/vmlinux.lds.S
@@ -34,7 +34,6 @@ SECTIONS
 		SOFTIRQENTRY_TEXT
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		*(.fixup)
diff --git a/arch/hexagon/kernel/vmlinux.lds.S b/arch/hexagon/kernel/vmlinux.lds.S
index 57465bff1fe4..1140051a0c45 100644
--- a/arch/hexagon/kernel/vmlinux.lds.S
+++ b/arch/hexagon/kernel/vmlinux.lds.S
@@ -41,7 +41,6 @@ SECTIONS
 		IRQENTRY_TEXT
 		SOFTIRQENTRY_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		*(.fixup)
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 9b265783be6a..53dfde161c8a 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -51,7 +51,6 @@ SECTIONS {
 		__end_ivt_text = .;
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S
index 733b16e8d55d..78506b31ba61 100644
--- a/arch/loongarch/kernel/vmlinux.lds.S
+++ b/arch/loongarch/kernel/vmlinux.lds.S
@@ -43,7 +43,6 @@ SECTIONS
 	.text : {
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
diff --git a/arch/m68k/kernel/vmlinux-nommu.lds b/arch/m68k/kernel/vmlinux-nommu.lds
index 387f334e87d3..2624fc18c131 100644
--- a/arch/m68k/kernel/vmlinux-nommu.lds
+++ b/arch/m68k/kernel/vmlinux-nommu.lds
@@ -48,7 +48,6 @@ SECTIONS {
 		IRQENTRY_TEXT
 		SOFTIRQENTRY_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		*(.fixup)
 		. = ALIGN(16);
diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds
index ed1d9eda3190..1ccdd04ae462 100644
--- a/arch/m68k/kernel/vmlinux-std.lds
+++ b/arch/m68k/kernel/vmlinux-std.lds
@@ -19,7 +19,6 @@ SECTIONS
 	IRQENTRY_TEXT
 	SOFTIRQENTRY_TEXT
 	SCHED_TEXT
-	CPUIDLE_TEXT
 	LOCK_TEXT
 	*(.fixup)
 	*(.gnu.warning)
diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds
index 4a52f44f2ef0..f13ddcc2af5c 100644
--- a/arch/m68k/kernel/vmlinux-sun3.lds
+++ b/arch/m68k/kernel/vmlinux-sun3.lds
@@ -19,7 +19,6 @@ SECTIONS
 	IRQENTRY_TEXT
 	SOFTIRQENTRY_TEXT
 	SCHED_TEXT
-	CPUIDLE_TEXT
 	LOCK_TEXT
 	*(.fixup)
 	*(.gnu.warning)
diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S
index fb31747ec092..ae50d3d04a7d 100644
--- a/arch/microblaze/kernel/vmlinux.lds.S
+++ b/arch/microblaze/kernel/vmlinux.lds.S
@@ -36,7 +36,6 @@ SECTIONS {
 		EXIT_TEXT
 		EXIT_CALL
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index 1f98947fe715..52cbde60edf5 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -61,7 +61,6 @@ SECTIONS
 	.text : {
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S
index 126e114744cb..37b958055064 100644
--- a/arch/nios2/kernel/vmlinux.lds.S
+++ b/arch/nios2/kernel/vmlinux.lds.S
@@ -24,7 +24,6 @@ SECTIONS
 	.text : {
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		IRQENTRY_TEXT
 		SOFTIRQENTRY_TEXT
diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S
index d5c7bb0fae57..bc1306047837 100644
--- a/arch/openrisc/kernel/vmlinux.lds.S
+++ b/arch/openrisc/kernel/vmlinux.lds.S
@@ -52,7 +52,6 @@ SECTIONS
           _stext = .;
 	  TEXT_TEXT
 	  SCHED_TEXT
-	  CPUIDLE_TEXT
 	  LOCK_TEXT
 	  KPROBES_TEXT
 	  IRQENTRY_TEXT
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 2769eb991f58..1aaa2ca09800 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -86,7 +86,6 @@ SECTIONS
 		TEXT_TEXT
 		LOCK_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
 		SOFTIRQENTRY_TEXT
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 8c3862b4c259..86d5ca19a540 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -111,7 +111,6 @@ SECTIONS
 #endif
 		NOINSTR_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S
index 75e0fa8a700a..eab9edc3b631 100644
--- a/arch/riscv/kernel/vmlinux-xip.lds.S
+++ b/arch/riscv/kernel/vmlinux-xip.lds.S
@@ -39,7 +39,6 @@ SECTIONS
 		_stext = .;
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		ENTRY_TEXT
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 4e6c88aa4d87..643ab60e9efb 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -42,7 +42,6 @@ SECTIONS
 		_stext = .;
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		ENTRY_TEXT
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 5ea3830af0cc..d1f1ab297995 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -42,7 +42,6 @@ SECTIONS
 		HEAD_TEXT
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 3161b9ccd2a5..947e2e213ff9 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -29,7 +29,6 @@ SECTIONS
 		HEAD_TEXT
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index d55ae65a07ad..d317a843f7ea 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -50,7 +50,6 @@ SECTIONS
 		HEAD_TEXT
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		IRQENTRY_TEXT
diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S
index 2b7fc5b54164..3385d653ebd0 100644
--- a/arch/um/kernel/dyn.lds.S
+++ b/arch/um/kernel/dyn.lds.S
@@ -74,7 +74,6 @@ SECTIONS
     _stext = .;
     TEXT_TEXT
     SCHED_TEXT
-    CPUIDLE_TEXT
     LOCK_TEXT
     IRQENTRY_TEXT
     SOFTIRQENTRY_TEXT
diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S
index 71a59b8adbdc..5c92d58a78e8 100644
--- a/arch/um/kernel/uml.lds.S
+++ b/arch/um/kernel/uml.lds.S
@@ -35,7 +35,6 @@ SECTIONS
     _stext = .;
     TEXT_TEXT
     SCHED_TEXT
-    CPUIDLE_TEXT
     LOCK_TEXT
     IRQENTRY_TEXT
     SOFTIRQENTRY_TEXT
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 7793e52d6237..8c5ae649d2df 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -8,9 +8,6 @@
 
 #include <asm/nospec-branch.h>
 
-/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
-#define __cpuidle __section(".cpuidle.text")
-
 /*
  * Interrupt control:
  */
@@ -45,13 +42,13 @@ static __always_inline void native_irq_enable(void)
 	asm volatile("sti": : :"memory");
 }
 
-static inline __cpuidle void native_safe_halt(void)
+static __always_inline void native_safe_halt(void)
 {
 	mds_idle_clear_cpu_buffers();
 	asm volatile("sti; hlt": : :"memory");
 }
 
-static inline __cpuidle void native_halt(void)
+static __always_inline void native_halt(void)
 {
 	mds_idle_clear_cpu_buffers();
 	asm volatile("hlt": : :"memory");
@@ -84,7 +81,7 @@ static __always_inline void arch_local_irq_enable(void)
  * Used in the idle loop; sti takes one instruction cycle
  * to complete:
  */
-static inline __cpuidle void arch_safe_halt(void)
+static __always_inline void arch_safe_halt(void)
 {
 	native_safe_halt();
 }
@@ -93,7 +90,7 @@ static inline __cpuidle void arch_safe_halt(void)
  * Used when interrupts are already enabled or to
  * shutdown the processor:
  */
-static inline __cpuidle void halt(void)
+static __always_inline void halt(void)
 {
 	native_halt();
 }
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 3a8fdf881313..f2242167efe3 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -105,7 +105,7 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
  * New with Core Duo processors, MWAIT can take some hints based on CPU
  * capability.
  */
-static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 {
 	if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
 		if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 2e0ee14229bf..25f155205770 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -129,7 +129,6 @@ SECTIONS
 		HEAD_TEXT
 		TEXT_TEXT
 		SCHED_TEXT
-		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		SOFTIRQENTRY_TEXT
diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S
index 965a3952c47b..c14fd96f459d 100644
--- a/arch/xtensa/kernel/vmlinux.lds.S
+++ b/arch/xtensa/kernel/vmlinux.lds.S
@@ -125,7 +125,6 @@ SECTIONS
     ENTRY_TEXT
     TEXT_TEXT
     SCHED_TEXT
-    CPUIDLE_TEXT
     LOCK_TEXT
     *(.fixup)
   }
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index a94219e9916f..ad0d39403cf2 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -558,6 +558,9 @@
 		ALIGN_FUNCTION();					\
 		__noinstr_text_start = .;				\
 		*(.noinstr.text)					\
+		__cpuidle_text_start = .;				\
+		*(.cpuidle.text)					\
+		__cpuidle_text_end = .;					\
 		__noinstr_text_end = .;
 
 /*
@@ -598,12 +601,6 @@
 		*(.spinlock.text)					\
 		__lock_text_end = .;
 
-#define CPUIDLE_TEXT							\
-		ALIGN_FUNCTION();					\
-		__cpuidle_text_start = .;				\
-		*(.cpuidle.text)					\
-		__cpuidle_text_end = .;
-
 #define KPROBES_TEXT							\
 		ALIGN_FUNCTION();					\
 		__kprobes_text_start = .;				\
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 7c1afe0f4129..d7858901f035 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -232,11 +232,15 @@ struct ftrace_likely_data {
 #endif
 
 /* Section for code which can't be instrumented at all */
-#define noinstr								\
-	noinline notrace __attribute((__section__(".noinstr.text")))	\
+#define __noinstr_section(section)					\
+	noinline notrace __attribute((__section__(section)))		\
 	__no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \
 	__no_sanitize_memory
 
+#define noinstr __noinstr_section(".noinstr.text")
+
+#define __cpuidle __noinstr_section(".cpuidle.text")
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 314802f98b9d..f83e4519c5f0 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -176,9 +176,6 @@ void __noreturn cpu_startup_entry(enum cpuhp_state state);
 
 void cpu_idle_poll_ctrl(bool enable);
 
-/* Attach to any functions which should be considered cpuidle. */
-#define __cpuidle	__section(".cpuidle.text")
-
 bool cpu_in_idle(unsigned long pc);
 
 void arch_cpu_idle(void);
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 4350be739f4f..64954aa83522 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -376,6 +376,7 @@ static int decode_instructions(struct objtool_file *file)
 
 		if (!strcmp(sec->name, ".noinstr.text") ||
 		    !strcmp(sec->name, ".entry.text") ||
+		    !strcmp(sec->name, ".cpuidle.text") ||
 		    !strncmp(sec->name, ".text.__x86.", 12))
 			sec->noinstr = true;
 
@@ -3365,6 +3366,12 @@ static inline bool noinstr_call_dest(struct objtool_file *file,
 	if (func->sec->noinstr)
 		return true;
 
+	/*
+	 * If the symbol is a static_call trampoline, we can't tell.
+	 */
+	if (func->static_call_tramp)
+		return true;
+
 	/*
 	 * The __ubsan_handle_*() calls are like WARN(), they only happen when
 	 * something 'BAD' happened. At the risk of taking the machine down,
@@ -4162,6 +4169,12 @@ static int validate_noinstr_sections(struct objtool_file *file)
 		warnings += validate_unwind_hints(file, sec);
 	}
 
+	sec = find_section_by_name(file->elf, ".cpuidle.text");
+	if (sec) {
+		warnings += validate_section(file, sec);
+		warnings += validate_unwind_hints(file, sec);
+	}
+
 	return warnings;
 }
 
-- 
cgit 


From f18b0d7ee815abb33eb1cc540fdde64fbb922d1a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 12 Jan 2023 20:43:56 +0100
Subject: ubsan: Fix objtool UACCESS warns

clang-14 allyesconfig gives:

  vmlinux.o: warning: objtool: emulator_cmpxchg_emulated+0x705: call to __ubsan_handle_load_invalid_value() with UACCESS enabled
  vmlinux.o: warning: objtool: paging64_update_accessed_dirty_bits+0x39e: call to __ubsan_handle_load_invalid_value() with UACCESS enabled
  vmlinux.o: warning: objtool: paging32_update_accessed_dirty_bits+0x390: call to __ubsan_handle_load_invalid_value() with UACCESS enabled
  vmlinux.o: warning: objtool: ept_update_accessed_dirty_bits+0x43f: call to __ubsan_handle_load_invalid_value() with UACCESS enabled

Add the required eflags save/restore and whitelist the thing.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Tony Lindgren <tony@atomide.com>
Tested-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20230112195541.906007455@infradead.org
---
 lib/ubsan.c           | 5 ++++-
 tools/objtool/check.c | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/lib/ubsan.c b/lib/ubsan.c
index 60c7099857a0..4d39e0babb98 100644
--- a/lib/ubsan.c
+++ b/lib/ubsan.c
@@ -339,9 +339,10 @@ void __ubsan_handle_load_invalid_value(void *_data, void *val)
 {
 	struct invalid_value_data *data = _data;
 	char val_str[VALUE_LENGTH];
+	unsigned long ua_flags = user_access_save();
 
 	if (suppress_report(&data->location))
-		return;
+		goto out;
 
 	ubsan_prologue(&data->location, "invalid-load");
 
@@ -351,6 +352,8 @@ void __ubsan_handle_load_invalid_value(void *_data, void *val)
 		val_str, data->type->type_name);
 
 	ubsan_epilogue();
+out:
+	user_access_restore(ua_flags);
 }
 EXPORT_SYMBOL(__ubsan_handle_load_invalid_value);
 
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 64954aa83522..9767babfd9ed 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1216,6 +1216,7 @@ static const char *uaccess_safe_builtin[] = {
 	"__ubsan_handle_type_mismatch",
 	"__ubsan_handle_type_mismatch_v1",
 	"__ubsan_handle_shift_out_of_bounds",
+	"__ubsan_handle_load_invalid_value",
 	/* misc */
 	"csum_partial_copy_generic",
 	"copy_mc_fragile",
-- 
cgit 


From 69d4c0d3218692ffa56b0e1b9c76c50c699d7044 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 12 Jan 2023 20:43:58 +0100
Subject: entry, kasan, x86: Disallow overriding mem*() functions

KASAN cannot just hijack the mem*() functions, it needs to emit
__asan_mem*() variants if it wants instrumentation (other sanitizers
already do this).

  vmlinux.o: warning: objtool: sync_regs+0x24: call to memcpy() leaves .noinstr.text section
  vmlinux.o: warning: objtool: vc_switch_off_ist+0xbe: call to memcpy() leaves .noinstr.text section
  vmlinux.o: warning: objtool: fixup_bad_iret+0x36: call to memset() leaves .noinstr.text section
  vmlinux.o: warning: objtool: __sev_get_ghcb+0xa0: call to memcpy() leaves .noinstr.text section
  vmlinux.o: warning: objtool: __sev_put_ghcb+0x35: call to memcpy() leaves .noinstr.text section

Remove the weak aliases to ensure nobody hijacks these functions and
add them to the noinstr section.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Tony Lindgren <tony@atomide.com>
Tested-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20230112195542.028523143@infradead.org
---
 arch/x86/lib/memcpy_64.S  |  5 ++---
 arch/x86/lib/memmove_64.S |  4 +++-
 arch/x86/lib/memset_64.S  |  4 +++-
 mm/kasan/kasan.h          |  4 ++++
 mm/kasan/shadow.c         | 38 ++++++++++++++++++++++++++++++++++++++
 tools/objtool/check.c     |  3 +++
 6 files changed, 53 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index dd8cd8831251..a64017602010 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -8,7 +8,7 @@
 #include <asm/alternative.h>
 #include <asm/export.h>
 
-.pushsection .noinstr.text, "ax"
+.section .noinstr.text, "ax"
 
 /*
  * We build a jump to memcpy_orig by default which gets NOPped out on
@@ -43,7 +43,7 @@ SYM_TYPED_FUNC_START(__memcpy)
 SYM_FUNC_END(__memcpy)
 EXPORT_SYMBOL(__memcpy)
 
-SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+SYM_FUNC_ALIAS(memcpy, __memcpy)
 EXPORT_SYMBOL(memcpy)
 
 /*
@@ -184,4 +184,3 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
 	RET
 SYM_FUNC_END(memcpy_orig)
 
-.popsection
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 724bbf83eb5b..02661861e5dd 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -13,6 +13,8 @@
 
 #undef memmove
 
+.section .noinstr.text, "ax"
+
 /*
  * Implement memmove(). This can handle overlap between src and dst.
  *
@@ -213,5 +215,5 @@ SYM_FUNC_START(__memmove)
 SYM_FUNC_END(__memmove)
 EXPORT_SYMBOL(__memmove)
 
-SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
+SYM_FUNC_ALIAS(memmove, __memmove)
 EXPORT_SYMBOL(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index fc9ffd3ff3b2..6143b1a6fa2c 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -6,6 +6,8 @@
 #include <asm/alternative.h>
 #include <asm/export.h>
 
+.section .noinstr.text, "ax"
+
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
@@ -43,7 +45,7 @@ SYM_FUNC_START(__memset)
 SYM_FUNC_END(__memset)
 EXPORT_SYMBOL(__memset)
 
-SYM_FUNC_ALIAS_WEAK(memset, __memset)
+SYM_FUNC_ALIAS(memset, __memset)
 EXPORT_SYMBOL(memset)
 
 /*
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ea8cf1310b1e..71c15438afcf 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -618,6 +618,10 @@ void __asan_set_shadow_f3(const void *addr, size_t size);
 void __asan_set_shadow_f5(const void *addr, size_t size);
 void __asan_set_shadow_f8(const void *addr, size_t size);
 
+void *__asan_memset(void *addr, int c, size_t len);
+void *__asan_memmove(void *dest, const void *src, size_t len);
+void *__asan_memcpy(void *dest, const void *src, size_t len);
+
 void __hwasan_load1_noabort(unsigned long addr);
 void __hwasan_store1_noabort(unsigned long addr);
 void __hwasan_load2_noabort(unsigned long addr);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 2fba1f51f042..98269936a5e4 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -38,6 +38,12 @@ bool __kasan_check_write(const volatile void *p, unsigned int size)
 }
 EXPORT_SYMBOL(__kasan_check_write);
 
+#ifndef CONFIG_GENERIC_ENTRY
+/*
+ * CONFIG_GENERIC_ENTRY relies on compiler emitted mem*() calls to not be
+ * instrumented. KASAN enabled toolchains should emit __asan_mem*() functions
+ * for the sites they want to instrument.
+ */
 #undef memset
 void *memset(void *addr, int c, size_t len)
 {
@@ -68,6 +74,38 @@ void *memcpy(void *dest, const void *src, size_t len)
 
 	return __memcpy(dest, src, len);
 }
+#endif
+
+void *__asan_memset(void *addr, int c, size_t len)
+{
+	if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+		return NULL;
+
+	return __memset(addr, c, len);
+}
+EXPORT_SYMBOL(__asan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__asan_memmove(void *dest, const void *src, size_t len)
+{
+	if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+	    !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+		return NULL;
+
+	return __memmove(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memmove);
+#endif
+
+void *__asan_memcpy(void *dest, const void *src, size_t len)
+{
+	if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+	    !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+		return NULL;
+
+	return __memcpy(dest, src, len);
+}
+EXPORT_SYMBOL(__asan_memcpy);
 
 void kasan_poison(const void *addr, size_t size, u8 value, bool init)
 {
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 9767babfd9ed..92554c5c80aa 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1074,6 +1074,9 @@ static const char *uaccess_safe_builtin[] = {
 	"__asan_store16_noabort",
 	"__kasan_check_read",
 	"__kasan_check_write",
+	"__asan_memset",
+	"__asan_memmove",
+	"__asan_memcpy",
 	/* KASAN in-line */
 	"__asan_report_load_n_noabort",
 	"__asan_report_load1_noabort",
-- 
cgit 


From 878625e1c7a10dfbb1fdaaaae2c4d2a58fbce627 Mon Sep 17 00:00:00 2001
From: Holger Hoffstätte <holger@applied-asynchrony.com>
Date: Fri, 13 Jan 2023 16:40:23 +0100
Subject: bpftool: Always disable stack protection for BPF objects
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the clang toolchain has stack protection enabled in order to be
consistent with gcc - which just happens to be the case on Gentoo -
the bpftool build fails:

  [...]
  clang \
	-I. \
	-I/tmp/portage/dev-util/bpftool-6.0.12/work/linux-6.0/tools/include/uapi/ \
	-I/tmp/portage/dev-util/bpftool-6.0.12/work/linux-6.0/tools/bpf/bpftool/bootstrap/libbpf/include \
	-g -O2 -Wall -target bpf -c skeleton/pid_iter.bpf.c -o pid_iter.bpf.o
  clang \
	-I. \
	-I/tmp/portage/dev-util/bpftool-6.0.12/work/linux-6.0/tools/include/uapi/ \
	-I/tmp/portage/dev-util/bpftool-6.0.12/work/linux-6.0/tools/bpf/bpftool/bootstrap/libbpf/include \
	-g -O2 -Wall -target bpf -c skeleton/profiler.bpf.c -o profiler.bpf.o
  skeleton/profiler.bpf.c:40:14: error: A call to built-in function '__stack_chk_fail' is not supported.
  int BPF_PROG(fentry_XXX)
                ^
  skeleton/profiler.bpf.c:94:14: error: A call to built-in function '__stack_chk_fail' is not supported.
  int BPF_PROG(fexit_XXX)
                ^
  2 errors generated.
  [...]

Since stack-protector makes no sense for the BPF bits just unconditionally
disable it.

Bug: https://bugs.gentoo.org/890638
Signed-off-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/74cd9d2e-6052-312a-241e-2b514a75c92c@applied-asynchrony.com
---
 tools/bpf/bpftool/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index d40e31bc4c9d..681fbcc5ed50 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -215,7 +215,8 @@ $(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF_BOOTSTRAP)
 		-I$(or $(OUTPUT),.) \
 		-I$(srctree)/tools/include/uapi/ \
 		-I$(LIBBPF_BOOTSTRAP_INCLUDE) \
-		-g -O2 -Wall -target bpf -c $< -o $@
+		-g -O2 -Wall -fno-stack-protector \
+		-target bpf -c $< -o $@
 	$(Q)$(LLVM_STRIP) -g $@
 
 $(OUTPUT)%.skel.h: $(OUTPUT)%.bpf.o $(BPFTOOL_BOOTSTRAP)
-- 
cgit 


From 366617a69e60610912836570546f118006ebc7cb Mon Sep 17 00:00:00 2001
From: Jeff Xu <jeffxu@google.com>
Date: Fri, 13 Jan 2023 05:32:29 +0000
Subject: selftests/landlock: Skip overlayfs tests when not supported
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

overlayfs may be disabled in the kernel configuration, causing related
tests to fail.  Check that overlayfs is supported at runtime, so we can
skip layout2_overlay.* accordingly.

Signed-off-by: Jeff Xu <jeffxu@google.com>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230113053229.1281774-2-jeffxu@google.com
[mic: Reword comments and constify variables]
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/fs_test.c | 47 ++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index d5dab986f612..b6c4be3faf7a 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -11,6 +11,7 @@
 #include <fcntl.h>
 #include <linux/landlock.h>
 #include <sched.h>
+#include <stdio.h>
 #include <string.h>
 #include <sys/capability.h>
 #include <sys/mount.h>
@@ -89,6 +90,40 @@ static const char dir_s3d3[] = TMP_DIR "/s3d1/s3d2/s3d3";
  *         └── s3d3
  */
 
+static bool fgrep(FILE *const inf, const char *const str)
+{
+	char line[32];
+	const int slen = strlen(str);
+
+	while (!feof(inf)) {
+		if (!fgets(line, sizeof(line), inf))
+			break;
+		if (strncmp(line, str, slen))
+			continue;
+
+		return true;
+	}
+
+	return false;
+}
+
+static bool supports_overlayfs(void)
+{
+	bool res;
+	FILE *const inf = fopen("/proc/filesystems", "r");
+
+	/*
+	 * Consider that the filesystem is supported if we cannot get the
+	 * supported ones.
+	 */
+	if (!inf)
+		return true;
+
+	res = fgrep(inf, "nodev\toverlay\n");
+	fclose(inf);
+	return res;
+}
+
 static void mkdir_parents(struct __test_metadata *const _metadata,
 			  const char *const path)
 {
@@ -4001,6 +4036,9 @@ FIXTURE(layout2_overlay) {};
 
 FIXTURE_SETUP(layout2_overlay)
 {
+	if (!supports_overlayfs())
+		SKIP(return, "overlayfs is not supported");
+
 	prepare_layout(_metadata);
 
 	create_directory(_metadata, LOWER_BASE);
@@ -4037,6 +4075,9 @@ FIXTURE_SETUP(layout2_overlay)
 
 FIXTURE_TEARDOWN(layout2_overlay)
 {
+	if (!supports_overlayfs())
+		SKIP(return, "overlayfs is not supported");
+
 	EXPECT_EQ(0, remove_path(lower_do1_fl3));
 	EXPECT_EQ(0, remove_path(lower_dl1_fl2));
 	EXPECT_EQ(0, remove_path(lower_fl1));
@@ -4068,6 +4109,9 @@ FIXTURE_TEARDOWN(layout2_overlay)
 
 TEST_F_FORK(layout2_overlay, no_restriction)
 {
+	if (!supports_overlayfs())
+		SKIP(return, "overlayfs is not supported");
+
 	ASSERT_EQ(0, test_open(lower_fl1, O_RDONLY));
 	ASSERT_EQ(0, test_open(lower_dl1, O_RDONLY));
 	ASSERT_EQ(0, test_open(lower_dl1_fl2, O_RDONLY));
@@ -4231,6 +4275,9 @@ TEST_F_FORK(layout2_overlay, same_content_different_file)
 	size_t i;
 	const char *path_entry;
 
+	if (!supports_overlayfs())
+		SKIP(return, "overlayfs is not supported");
+
 	/* Sets rules on base directories (i.e. outside overlay scope). */
 	ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer1_base);
 	ASSERT_LE(0, ruleset_fd);
-- 
cgit 


From 2fa074536590a82ee6c670d5f119f3dfb4ca6015 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Fri, 13 Jan 2023 17:34:27 +0800
Subject: libbpf: Replace '.' with '_' in legacy kprobe event name

'.' is not allowed in the event name of kprobe. Therefore, we will get a
EINVAL if the kernel function name has a '.' in legacy kprobe attach
case, such as 'icmp_reply.constprop.0'.

In order to adapt this case, we need to replace the '.' with other char
in gen_kprobe_legacy_event_name(). And I use '_' for this propose.

Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/bpf/20230113093427.1666466-1-imagedong@tencent.com
---
 tools/lib/bpf/libbpf.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index f8dfee32c2bc..27d9faa80471 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -9994,9 +9994,16 @@ static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz,
 					 const char *kfunc_name, size_t offset)
 {
 	static int index = 0;
+	int i;
 
 	snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx_%d", getpid(), kfunc_name, offset,
 		 __sync_fetch_and_add(&index, 1));
+
+	/* sanitize binary_path in the probe name */
+	for (i = 0; buf[i]; i++) {
+		if (!isalnum(buf[i]))
+			buf[i] = '_';
+	}
 }
 
 static int add_kprobe_event_legacy(const char *probe_name, bool retprobe,
-- 
cgit 


From 1c48391bc6739f5e3306919d4b887b92c35d5490 Mon Sep 17 00:00:00 2001
From: Roberto Valenzuela <valenzuelarober@gmail.com>
Date: Fri, 13 Jan 2023 13:02:57 -0500
Subject: selftests/bpf: Fix missing space error

Add the missing space after 'dest' variable assignment.
This change will resolve the following checkpatch.pl
script error:

ERROR: spaces required around that '+=' (ctx:VxW)

Signed-off-by: Roberto Valenzuela <valenzuelarober@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230113180257.39769-1-valenzuelarober@gmail.com
---
 tools/testing/selftests/bpf/progs/test_xdp_vlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
index 134768f6b788..cdf3c48d6cbb 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
@@ -195,7 +195,7 @@ int  xdp_prognum2(struct xdp_md *ctx)
 
 	/* Moving Ethernet header, dest overlap with src, memmove handle this */
 	dest = data;
-	dest+= VLAN_HDR_SZ;
+	dest += VLAN_HDR_SZ;
 	/*
 	 * Notice: Taking over vlan_hdr->h_vlan_encapsulated_proto, by
 	 * only moving two MAC addrs (12 bytes), not overwriting last 2 bytes
-- 
cgit 


From d219df60a70ed0739aa5dd34b477763311fc5a7b Mon Sep 17 00:00:00 2001
From: Ziyang Xuan <william.xuanziyang@huawei.com>
Date: Fri, 13 Jan 2023 17:24:51 +0800
Subject: bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()

Add ipip6 and ip6ip decap support for bpf_skb_adjust_room().
Main use case is for using cls_bpf on ingress hook to decapsulate
IPv4 over IPv6 and IPv6 over IPv4 tunnel packets.

Add two new flags BPF_F_ADJ_ROOM_DECAP_L3_IPV{4,6} to indicate the
new IP header version after decapsulating the outer IP header.

Suggested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/b268ec7f0ff9431f4f43b1b40ab856ebb28cb4e1.1673574419.git.william.xuanziyang@huawei.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/uapi/linux/bpf.h       |  7 +++++++
 net/core/filter.c              | 31 ++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h |  7 +++++++
 3 files changed, 44 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bc1a3d232ae4..adae5b168f9d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2647,6 +2647,11 @@ union bpf_attr {
  *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
  *		  L2 type as Ethernet.
  *
+ *		* **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**,
+ *		  **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**:
+ *		  Indicate the new IP header version after decapsulating the outer
+ *		  IP header. Used when the inner and outer IP versions are different.
+ *
  * 		A call to this helper is susceptible to change the underlying
  * 		packet buffer. Therefore, at load time, all checks on pointers
  * 		previously done by the verifier are invalidated and must be
@@ -5807,6 +5812,8 @@ enum {
 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
 	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
+	BPF_F_ADJ_ROOM_DECAP_L3_IPV4	= (1ULL << 7),
+	BPF_F_ADJ_ROOM_DECAP_L3_IPV6	= (1ULL << 8),
 };
 
 enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index d9befa6ba04e..b4547a2c02f4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3381,13 +3381,17 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
 #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK	(BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
 					 BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
 
+#define BPF_F_ADJ_ROOM_DECAP_L3_MASK	(BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
+					 BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
+
 #define BPF_F_ADJ_ROOM_MASK		(BPF_F_ADJ_ROOM_FIXED_GSO | \
 					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
 					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
 					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
 					 BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
 					 BPF_F_ADJ_ROOM_ENCAP_L2( \
-					  BPF_ADJ_ROOM_ENCAP_L2_MASK))
+					  BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
+					 BPF_F_ADJ_ROOM_DECAP_L3_MASK)
 
 static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 			    u64 flags)
@@ -3501,6 +3505,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
 	int ret;
 
 	if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
+			       BPF_F_ADJ_ROOM_DECAP_L3_MASK |
 			       BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
 		return -EINVAL;
 
@@ -3519,6 +3524,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
 	if (unlikely(ret < 0))
 		return ret;
 
+	/* Match skb->protocol to new outer l3 protocol */
+	if (skb->protocol == htons(ETH_P_IP) &&
+	    flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
+		skb->protocol = htons(ETH_P_IPV6);
+	else if (skb->protocol == htons(ETH_P_IPV6) &&
+		 flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
+		skb->protocol = htons(ETH_P_IP);
+
 	if (skb_is_gso(skb)) {
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
@@ -3608,6 +3621,22 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
 		return -ENOTSUPP;
 	}
 
+	if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
+		if (!shrink)
+			return -EINVAL;
+
+		switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
+		case BPF_F_ADJ_ROOM_DECAP_L3_IPV4:
+			len_min = sizeof(struct iphdr);
+			break;
+		case BPF_F_ADJ_ROOM_DECAP_L3_IPV6:
+			len_min = sizeof(struct ipv6hdr);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
 	len_cur = skb->len - skb_network_offset(skb);
 	if ((shrink && (len_diff_abs >= len_cur ||
 			len_cur - len_diff_abs < len_min)) ||
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index bc1a3d232ae4..142b81bcbb2e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2647,6 +2647,11 @@ union bpf_attr {
  *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
  *		  L2 type as Ethernet.
  *
+ *		* **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**,
+ *		  **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**:
+ *		  Indicate the new IP header version after decapsulating the outer
+ *		  IP header. Used when the inner and outer IP versions are different.
+ *
  * 		A call to this helper is susceptible to change the underlying
  * 		packet buffer. Therefore, at load time, all checks on pointers
  * 		previously done by the verifier are invalidated and must be
@@ -5807,6 +5812,8 @@ enum {
 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
 	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
+	BPF_F_ADJ_ROOM_DECAP_L3_IPV4    = (1ULL << 7),
+	BPF_F_ADJ_ROOM_DECAP_L3_IPV6    = (1ULL << 8),
 };
 
 enum {
-- 
cgit 


From 7105f76fb56f5ed66a59bc048bc71e9f100e1d39 Mon Sep 17 00:00:00 2001
From: Ziyang Xuan <william.xuanziyang@huawei.com>
Date: Fri, 13 Jan 2023 17:25:10 +0800
Subject: selftests/bpf: add ipip6 and ip6ip decap to test_tc_tunnel

Add ipip6 and ip6ip decap testcases. Verify that bpf_skb_adjust_room()
correctly decapsulate ipip6 and ip6ip tunnel packets.

Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/dfd2d8cfdf9111bd129170d4345296f53bee6a67.1673574419.git.william.xuanziyang@huawei.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 91 +++++++++++++++++++++-
 tools/testing/selftests/bpf/test_tc_tunnel.sh      | 15 ++--
 2 files changed, 98 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
index a0e7762b1e5a..e6e678aa9874 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
@@ -38,6 +38,10 @@ static const int cfg_udp_src = 20000;
 #define	VXLAN_FLAGS     0x8
 #define	VXLAN_VNI       1
 
+#ifndef NEXTHDR_DEST
+#define NEXTHDR_DEST	60
+#endif
+
 /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
 static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
 						     MPLS_LS_S_MASK | 0xff);
@@ -363,6 +367,61 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
 	return TC_ACT_OK;
 }
 
+static int encap_ipv6_ipip6(struct __sk_buff *skb)
+{
+	struct iphdr iph_inner;
+	struct v6hdr h_outer;
+	struct tcphdr tcph;
+	struct ethhdr eth;
+	__u64 flags;
+	int olen;
+
+	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
+			       sizeof(iph_inner)) < 0)
+		return TC_ACT_OK;
+
+	/* filter only packets we want */
+	if (bpf_skb_load_bytes(skb, ETH_HLEN + (iph_inner.ihl << 2),
+			       &tcph, sizeof(tcph)) < 0)
+		return TC_ACT_OK;
+
+	if (tcph.dest != __bpf_constant_htons(cfg_port))
+		return TC_ACT_OK;
+
+	olen = sizeof(h_outer.ip);
+
+	flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6;
+
+	/* add room between mac and network header */
+	if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
+		return TC_ACT_SHOT;
+
+	/* prepare new outer network header */
+	memset(&h_outer.ip, 0, sizeof(h_outer.ip));
+	h_outer.ip.version = 6;
+	h_outer.ip.hop_limit = iph_inner.ttl;
+	h_outer.ip.saddr.s6_addr[1] = 0xfd;
+	h_outer.ip.saddr.s6_addr[15] = 1;
+	h_outer.ip.daddr.s6_addr[1] = 0xfd;
+	h_outer.ip.daddr.s6_addr[15] = 2;
+	h_outer.ip.payload_len = iph_inner.tot_len;
+	h_outer.ip.nexthdr = IPPROTO_IPIP;
+
+	/* store new outer network header */
+	if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
+				BPF_F_INVALIDATE_HASH) < 0)
+		return TC_ACT_SHOT;
+
+	/* update eth->h_proto */
+	if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth)) < 0)
+		return TC_ACT_SHOT;
+	eth.h_proto = bpf_htons(ETH_P_IPV6);
+	if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0) < 0)
+		return TC_ACT_SHOT;
+
+	return TC_ACT_OK;
+}
+
 static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
 				      __u16 l2_proto)
 {
@@ -461,6 +520,15 @@ int __encap_ip6tnl_none(struct __sk_buff *skb)
 		return TC_ACT_OK;
 }
 
+SEC("encap_ipip6_none")
+int __encap_ipip6_none(struct __sk_buff *skb)
+{
+	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
+		return encap_ipv6_ipip6(skb);
+	else
+		return TC_ACT_OK;
+}
+
 SEC("encap_ip6gre_none")
 int __encap_ip6gre_none(struct __sk_buff *skb)
 {
@@ -528,13 +596,33 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb)
 
 static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
 {
+	__u64 flags = BPF_F_ADJ_ROOM_FIXED_GSO;
+	struct ipv6_opt_hdr ip6_opt_hdr;
 	struct gre_hdr greh;
 	struct udphdr udph;
 	int olen = len;
 
 	switch (proto) {
 	case IPPROTO_IPIP:
+		flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4;
+		break;
 	case IPPROTO_IPV6:
+		flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6;
+		break;
+	case NEXTHDR_DEST:
+		if (bpf_skb_load_bytes(skb, off + len, &ip6_opt_hdr,
+				       sizeof(ip6_opt_hdr)) < 0)
+			return TC_ACT_OK;
+		switch (ip6_opt_hdr.nexthdr) {
+		case IPPROTO_IPIP:
+			flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4;
+			break;
+		case IPPROTO_IPV6:
+			flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6;
+			break;
+		default:
+			return TC_ACT_OK;
+		}
 		break;
 	case IPPROTO_GRE:
 		olen += sizeof(struct gre_hdr);
@@ -569,8 +657,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
 		return TC_ACT_OK;
 	}
 
-	if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC,
-				BPF_F_ADJ_ROOM_FIXED_GSO))
+	if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, flags))
 		return TC_ACT_SHOT;
 
 	return TC_ACT_OK;
diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
index 334bdfeab940..910044f08908 100755
--- a/tools/testing/selftests/bpf/test_tc_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh
@@ -100,6 +100,9 @@ if [[ "$#" -eq "0" ]]; then
 	echo "ipip"
 	$0 ipv4 ipip none 100
 
+	echo "ipip6"
+	$0 ipv4 ipip6 none 100
+
 	echo "ip6ip6"
 	$0 ipv6 ip6tnl none 100
 
@@ -224,6 +227,9 @@ elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
 elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then
 	ttype="vxlan"
 	targs="id 1 dstport 8472 udp6zerocsumrx"
+elif [[ "$tuntype" == "ipip6" ]]; then
+	ttype="ip6tnl"
+	targs=""
 else
 	ttype=$tuntype
 	targs=""
@@ -233,6 +239,9 @@ fi
 if [[ "${tuntype}" == "sit" ]]; then
 	link_addr1="${ns1_v4}"
 	link_addr2="${ns2_v4}"
+elif [[ "${tuntype}" == "ipip6" ]]; then
+	link_addr1="${ns1_v6}"
+	link_addr2="${ns2_v6}"
 else
 	link_addr1="${addr1}"
 	link_addr2="${addr2}"
@@ -287,12 +296,6 @@ else
 	server_listen
 fi
 
-# bpf_skb_net_shrink does not take tunnel flags yet, cannot update L3.
-if [[ "${tuntype}" == "sit" ]]; then
-	echo OK
-	exit 0
-fi
-
 # serverside, use BPF for decap
 ip netns exec "${ns2}" ip link del dev testtun0
 ip netns exec "${ns2}" tc qdisc add dev veth2 clsact
-- 
cgit 


From c0f264e4edb60db410462d513e29d2de3ef7de56 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 17 Jan 2023 23:37:05 +0100
Subject: bpf/selftests: Add verifier tests for loading sleepable programs

Adding verifier tests for loading all types od allowed
sleepable programs plus reject for tp_btf type.

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20230117223705.440975-2-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/verifier/sleepable.c | 91 ++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/verifier/sleepable.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c
new file mode 100644
index 000000000000..bea0daef908a
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/sleepable.c
@@ -0,0 +1,91 @@
+{
+	"sleepable fentry accept",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_TRACING,
+	.expected_attach_type = BPF_TRACE_FENTRY,
+	.kfunc = "bpf_fentry_test1",
+	.result = ACCEPT,
+	.flags = BPF_F_SLEEPABLE,
+	.runs = -1,
+},
+{
+	"sleepable fexit accept",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_TRACING,
+	.expected_attach_type = BPF_TRACE_FENTRY,
+	.kfunc = "bpf_fentry_test1",
+	.result = ACCEPT,
+	.flags = BPF_F_SLEEPABLE,
+	.runs = -1,
+},
+{
+	"sleepable fmod_ret accept",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_TRACING,
+	.expected_attach_type = BPF_MODIFY_RETURN,
+	.kfunc = "bpf_fentry_test1",
+	.result = ACCEPT,
+	.flags = BPF_F_SLEEPABLE,
+	.runs = -1,
+},
+{
+	"sleepable iter accept",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_TRACING,
+	.expected_attach_type = BPF_TRACE_ITER,
+	.kfunc = "task",
+	.result = ACCEPT,
+	.flags = BPF_F_SLEEPABLE,
+	.runs = -1,
+},
+{
+	"sleepable lsm accept",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_LSM,
+	.kfunc = "bpf",
+	.expected_attach_type = BPF_LSM_MAC,
+	.result = ACCEPT,
+	.flags = BPF_F_SLEEPABLE,
+	.runs = -1,
+},
+{
+	"sleepable uprobe accept",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_KPROBE,
+	.kfunc = "bpf_fentry_test1",
+	.result = ACCEPT,
+	.flags = BPF_F_SLEEPABLE,
+	.runs = -1,
+},
+{
+	"sleepable raw tracepoint reject",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_TRACING,
+	.expected_attach_type = BPF_TRACE_RAW_TP,
+	.kfunc = "sched_switch",
+	.result = REJECT,
+	.errstr = "Only fentry/fexit/fmod_ret, lsm, iter and uprobe programs can be sleepable",
+	.flags = BPF_F_SLEEPABLE,
+	.runs = -1,
+},
-- 
cgit 


From abf08576afe31506b812c8c1be9714f78613f300 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 13 Jan 2023 12:49:10 +0100
Subject: fs: port vfs_*() helpers to struct mnt_idmap

Convert to struct mnt_idmap.

Last cycle we merged the necessary infrastructure in
256c8aed2b42 ("fs: introduce dedicated idmap type for mounts").
This is just the conversion to struct mnt_idmap.

Currently we still pass around the plain namespace that was attached to a
mount. This is in general pretty convenient but it makes it easy to
conflate namespaces that are relevant on the filesystem with namespaces
that are relevent on the mount level. Especially for non-vfs developers
without detailed knowledge in this area this can be a potential source for
bugs.

Once the conversion to struct mnt_idmap is done all helpers down to the
really low-level helpers will take a struct mnt_idmap argument instead of
two namespace arguments. This way it becomes impossible to conflate the two
eliminating the possibility of any bugs. All of the vfs and all filesystems
only operate on struct mnt_idmap.

Acked-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 drivers/base/devtmpfs.c                          |  12 +-
 fs/attr.c                                        |  13 +-
 fs/cachefiles/interface.c                        |   4 +-
 fs/cachefiles/namei.c                            |  12 +-
 fs/coredump.c                                    |   6 +-
 fs/ecryptfs/inode.c                              |  24 +--
 fs/init.c                                        |  12 +-
 fs/inode.c                                       |   6 +-
 fs/ksmbd/smb2pdu.c                               |   6 +-
 fs/ksmbd/smbacl.c                                |   5 +-
 fs/ksmbd/vfs.c                                   |  47 +++---
 fs/ksmbd/vfs.h                                   |   4 +-
 fs/ksmbd/vfs_cache.c                             |   2 +-
 fs/namei.c                                       | 193 ++++++++++++-----------
 fs/nfsd/nfs3proc.c                               |   2 +-
 fs/nfsd/nfs4recover.c                            |   6 +-
 fs/nfsd/vfs.c                                    |  23 +--
 fs/open.c                                        |  23 +--
 fs/overlayfs/overlayfs.h                         |  25 +--
 fs/overlayfs/ovl_entry.h                         |   5 +
 fs/utimes.c                                      |   2 +-
 include/linux/fs.h                               |  32 ++--
 ipc/mqueue.c                                     |   2 +-
 net/unix/af_unix.c                               |   8 +-
 tools/testing/selftests/bpf/progs/profiler.inc.h |   2 +-
 25 files changed, 255 insertions(+), 221 deletions(-)

(limited to 'tools')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index e4bffeabf344..03e8a95f1f35 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -173,7 +173,7 @@ static int dev_mkdir(const char *name, umode_t mode)
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	err = vfs_mkdir(&init_user_ns, d_inode(path.dentry), dentry, mode);
+	err = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode);
 	if (!err)
 		/* mark as kernel-created inode */
 		d_inode(dentry)->i_private = &thread;
@@ -223,7 +223,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	err = vfs_mknod(&init_user_ns, d_inode(path.dentry), dentry, mode,
+	err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode,
 			dev->devt);
 	if (!err) {
 		struct iattr newattrs;
@@ -233,7 +233,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
 		newattrs.ia_gid = gid;
 		newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID;
 		inode_lock(d_inode(dentry));
-		notify_change(&init_user_ns, dentry, &newattrs, NULL);
+		notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL);
 		inode_unlock(d_inode(dentry));
 
 		/* mark as kernel-created inode */
@@ -254,7 +254,7 @@ static int dev_rmdir(const char *name)
 		return PTR_ERR(dentry);
 	if (d_really_is_positive(dentry)) {
 		if (d_inode(dentry)->i_private == &thread)
-			err = vfs_rmdir(&init_user_ns, d_inode(parent.dentry),
+			err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry),
 					dentry);
 		else
 			err = -EPERM;
@@ -341,9 +341,9 @@ static int handle_remove(const char *nodename, struct device *dev)
 			newattrs.ia_valid =
 				ATTR_UID|ATTR_GID|ATTR_MODE;
 			inode_lock(d_inode(dentry));
-			notify_change(&init_user_ns, dentry, &newattrs, NULL);
+			notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL);
 			inode_unlock(d_inode(dentry));
-			err = vfs_unlink(&init_user_ns, d_inode(parent.dentry),
+			err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry),
 					 dentry, NULL);
 			if (!err || err == -ENOENT)
 				deleted = 1;
diff --git a/fs/attr.c b/fs/attr.c
index b45f30e516fa..023a3860568a 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -352,7 +352,7 @@ EXPORT_SYMBOL(may_setattr);
 
 /**
  * notify_change - modify attributes of a filesytem object
- * @mnt_userns:	user namespace of the mount the inode was found from
+ * @idmap:	idmap of the mount the inode was found from
  * @dentry:	object affected
  * @attr:	new attributes
  * @delegated_inode: returns inode, if the inode is delegated
@@ -371,15 +371,16 @@ EXPORT_SYMBOL(may_setattr);
  * the file open for write, as there can be no conflicting delegation in
  * that case.
  *
- * If the inode has been found through an idmapped mount the user namespace of
- * the vfsmount must be passed through @mnt_userns. This function will then
- * take care to map the inode according to @mnt_userns before checking
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then
+ * take care to map the inode according to @idmap before checking
  * permissions. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply passs init_user_ns.
+ * performed on the raw inode simply pass @nop_mnt_idmap.
  */
-int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
+int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
 		  struct iattr *attr, struct inode **delegated_inode)
 {
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
 	struct inode *inode = dentry->d_inode;
 	umode_t mode = inode->i_mode;
 	int error;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index a69073a1d3f0..40052bdb3365 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -138,7 +138,7 @@ static int cachefiles_adjust_size(struct cachefiles_object *object)
 		newattrs.ia_size = oi_size & PAGE_MASK;
 		ret = cachefiles_inject_remove_error();
 		if (ret == 0)
-			ret = notify_change(&init_user_ns, file->f_path.dentry,
+			ret = notify_change(&nop_mnt_idmap, file->f_path.dentry,
 					    &newattrs, NULL);
 		if (ret < 0)
 			goto truncate_failed;
@@ -148,7 +148,7 @@ static int cachefiles_adjust_size(struct cachefiles_object *object)
 	newattrs.ia_size = ni_size;
 	ret = cachefiles_inject_write_error();
 	if (ret == 0)
-		ret = notify_change(&init_user_ns, file->f_path.dentry,
+		ret = notify_change(&nop_mnt_idmap, file->f_path.dentry,
 				    &newattrs, NULL);
 
 truncate_failed:
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 03ca8f2f657a..82219a8f6084 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -130,7 +130,7 @@ retry:
 			goto mkdir_error;
 		ret = cachefiles_inject_write_error();
 		if (ret == 0)
-			ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700);
+			ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
 		if (ret < 0) {
 			trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
 						   cachefiles_trace_mkdir_error);
@@ -245,7 +245,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache,
 
 	ret = cachefiles_inject_remove_error();
 	if (ret == 0) {
-		ret = vfs_unlink(&init_user_ns, d_backing_inode(dir), dentry, NULL);
+		ret = vfs_unlink(&nop_mnt_idmap, d_backing_inode(dir), dentry, NULL);
 		if (ret == -EIO)
 			cachefiles_io_error(cache, "Unlink failed");
 	}
@@ -382,10 +382,10 @@ try_again:
 		cachefiles_io_error(cache, "Rename security error %d", ret);
 	} else {
 		struct renamedata rd = {
-			.old_mnt_userns	= &init_user_ns,
+			.old_mnt_idmap	= &nop_mnt_idmap,
 			.old_dir	= d_inode(dir),
 			.old_dentry	= rep,
-			.new_mnt_userns	= &init_user_ns,
+			.new_mnt_idmap	= &nop_mnt_idmap,
 			.new_dir	= d_inode(cache->graveyard),
 			.new_dentry	= grave,
 		};
@@ -451,7 +451,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
 
 	ret = cachefiles_inject_write_error();
 	if (ret == 0) {
-		file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG,
+		file = vfs_tmpfile_open(&nop_mnt_idmap, &parentpath, S_IFREG,
 					O_RDWR | O_LARGEFILE | O_DIRECT,
 					cache->cache_cred);
 		ret = PTR_ERR_OR_ZERO(file);
@@ -714,7 +714,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
 
 	ret = cachefiles_inject_read_error();
 	if (ret == 0)
-		ret = vfs_link(object->file->f_path.dentry, &init_user_ns,
+		ret = vfs_link(object->file->f_path.dentry, &nop_mnt_idmap,
 			       d_inode(fan), dentry, NULL);
 	if (ret < 0) {
 		trace_cachefiles_vfs_error(object, d_inode(fan), ret,
diff --git a/fs/coredump.c b/fs/coredump.c
index de78bde2991b..27847d16d2b8 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -644,6 +644,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 			goto close_fail;
 		}
 	} else {
+		struct mnt_idmap *idmap;
 		struct user_namespace *mnt_userns;
 		struct inode *inode;
 		int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
@@ -722,7 +723,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		 * a process dumps core while its cwd is e.g. on a vfat
 		 * filesystem.
 		 */
-		mnt_userns = file_mnt_user_ns(cprm.file);
+		idmap = file_mnt_idmap(cprm.file);
+		mnt_userns = mnt_idmap_owner(idmap);
 		if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode),
 				    current_fsuid())) {
 			pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n",
@@ -736,7 +738,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		}
 		if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
 			goto close_fail;
-		if (do_truncate(mnt_userns, cprm.file->f_path.dentry,
+		if (do_truncate(idmap, cprm.file->f_path.dentry,
 				0, 0, cprm.file))
 			goto close_fail;
 	}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index f3cd00fac9c3..d2c5a8c55322 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -139,7 +139,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
 		if (d_unhashed(lower_dentry))
 			rc = -EINVAL;
 		else
-			rc = vfs_unlink(&init_user_ns, lower_dir, lower_dentry,
+			rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry,
 					NULL);
 	}
 	if (rc) {
@@ -180,7 +180,7 @@ ecryptfs_do_create(struct inode *directory_inode,
 
 	rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir);
 	if (!rc)
-		rc = vfs_create(&init_user_ns, lower_dir,
+		rc = vfs_create(&nop_mnt_idmap, lower_dir,
 				lower_dentry, mode, true);
 	if (rc) {
 		printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
@@ -191,7 +191,7 @@ ecryptfs_do_create(struct inode *directory_inode,
 	inode = __ecryptfs_get_inode(d_inode(lower_dentry),
 				     directory_inode->i_sb);
 	if (IS_ERR(inode)) {
-		vfs_unlink(&init_user_ns, lower_dir, lower_dentry, NULL);
+		vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL);
 		goto out_lock;
 	}
 	fsstack_copy_attr_times(directory_inode, lower_dir);
@@ -434,7 +434,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
 	rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir);
 	if (!rc)
-		rc = vfs_link(lower_old_dentry, &init_user_ns, lower_dir,
+		rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir,
 			      lower_new_dentry, NULL);
 	if (rc || d_really_is_negative(lower_new_dentry))
 		goto out_lock;
@@ -478,7 +478,7 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns,
 						  strlen(symname));
 	if (rc)
 		goto out_lock;
-	rc = vfs_symlink(&init_user_ns, lower_dir, lower_dentry,
+	rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry,
 			 encoded_symname);
 	kfree(encoded_symname);
 	if (rc || d_really_is_negative(lower_dentry))
@@ -504,7 +504,7 @@ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 
 	rc = lock_parent(dentry, &lower_dentry, &lower_dir);
 	if (!rc)
-		rc = vfs_mkdir(&init_user_ns, lower_dir,
+		rc = vfs_mkdir(&nop_mnt_idmap, lower_dir,
 			       lower_dentry, mode);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
@@ -533,7 +533,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 		if (d_unhashed(lower_dentry))
 			rc = -EINVAL;
 		else
-			rc = vfs_rmdir(&init_user_ns, lower_dir, lower_dentry);
+			rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry);
 	}
 	if (!rc) {
 		clear_nlink(d_inode(dentry));
@@ -557,7 +557,7 @@ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
 
 	rc = lock_parent(dentry, &lower_dentry, &lower_dir);
 	if (!rc)
-		rc = vfs_mknod(&init_user_ns, lower_dir,
+		rc = vfs_mknod(&nop_mnt_idmap, lower_dir,
 			       lower_dentry, mode, dev);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
@@ -616,10 +616,10 @@ ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 		goto out_lock;
 	}
 
-	rd.old_mnt_userns	= &init_user_ns;
+	rd.old_mnt_idmap	= &nop_mnt_idmap;
 	rd.old_dir		= d_inode(lower_old_dir_dentry);
 	rd.old_dentry		= lower_old_dentry;
-	rd.new_mnt_userns	= &init_user_ns;
+	rd.new_mnt_idmap	= &nop_mnt_idmap;
 	rd.new_dir		= d_inode(lower_new_dir_dentry);
 	rd.new_dentry		= lower_new_dentry;
 	rc = vfs_rename(&rd);
@@ -856,7 +856,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 		struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 
 		inode_lock(d_inode(lower_dentry));
-		rc = notify_change(&init_user_ns, lower_dentry,
+		rc = notify_change(&nop_mnt_idmap, lower_dentry,
 				   &lower_ia, NULL);
 		inode_unlock(d_inode(lower_dentry));
 	}
@@ -965,7 +965,7 @@ static int ecryptfs_setattr(struct user_namespace *mnt_userns,
 		lower_ia.ia_valid &= ~ATTR_MODE;
 
 	inode_lock(d_inode(lower_dentry));
-	rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL);
+	rc = notify_change(&nop_mnt_idmap, lower_dentry, &lower_ia, NULL);
 	inode_unlock(d_inode(lower_dentry));
 out:
 	fsstack_copy_attr_all(inode, lower_inode);
diff --git a/fs/init.c b/fs/init.c
index 5c36adaa9b44..f43f1e78bf7a 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -157,7 +157,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
 		mode &= ~current_umask();
 	error = security_path_mknod(&path, dentry, mode, dev);
 	if (!error)
-		error = vfs_mknod(mnt_user_ns(path.mnt), path.dentry->d_inode,
+		error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
 				  dentry, mode, new_decode_dev(dev));
 	done_path_create(&path, dentry);
 	return error;
@@ -167,6 +167,7 @@ int __init init_link(const char *oldname, const char *newname)
 {
 	struct dentry *new_dentry;
 	struct path old_path, new_path;
+	struct mnt_idmap *idmap;
 	struct user_namespace *mnt_userns;
 	int error;
 
@@ -182,14 +183,15 @@ int __init init_link(const char *oldname, const char *newname)
 	error = -EXDEV;
 	if (old_path.mnt != new_path.mnt)
 		goto out_dput;
-	mnt_userns = mnt_user_ns(new_path.mnt);
+	idmap = mnt_idmap(new_path.mnt);
+	mnt_userns = mnt_idmap_owner(idmap);
 	error = may_linkat(mnt_userns, &old_path);
 	if (unlikely(error))
 		goto out_dput;
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_dput;
-	error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
+	error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
 			 new_dentry, NULL);
 out_dput:
 	done_path_create(&new_path, new_dentry);
@@ -209,7 +211,7 @@ int __init init_symlink(const char *oldname, const char *newname)
 		return PTR_ERR(dentry);
 	error = security_path_symlink(&path, dentry, oldname);
 	if (!error)
-		error = vfs_symlink(mnt_user_ns(path.mnt), path.dentry->d_inode,
+		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
 				    dentry, oldname);
 	done_path_create(&path, dentry);
 	return error;
@@ -233,7 +235,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
 		mode &= ~current_umask();
 	error = security_path_mkdir(&path, dentry, mode);
 	if (!error)
-		error = vfs_mkdir(mnt_user_ns(path.mnt), path.dentry->d_inode,
+		error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
 				  dentry, mode);
 	done_path_create(&path, dentry);
 	return error;
diff --git a/fs/inode.c b/fs/inode.c
index f453eb58fd03..84b5da325ee8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1972,7 +1972,7 @@ int dentry_needs_remove_privs(struct user_namespace *mnt_userns,
 	return mask;
 }
 
-static int __remove_privs(struct user_namespace *mnt_userns,
+static int __remove_privs(struct mnt_idmap *idmap,
 			  struct dentry *dentry, int kill)
 {
 	struct iattr newattrs;
@@ -1982,7 +1982,7 @@ static int __remove_privs(struct user_namespace *mnt_userns,
 	 * Note we call this on write, so notify_change will not
 	 * encounter any conflicting delegations:
 	 */
-	return notify_change(mnt_userns, dentry, &newattrs, NULL);
+	return notify_change(idmap, dentry, &newattrs, NULL);
 }
 
 static int __file_remove_privs(struct file *file, unsigned int flags)
@@ -2003,7 +2003,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
 		if (flags & IOCB_NOWAIT)
 			return -EAGAIN;
 
-		error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
+		error = __remove_privs(file_mnt_idmap(file), dentry, kill);
 	}
 
 	if (!error)
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 14d7f3599c63..f787f66b329c 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -5615,6 +5615,7 @@ static int set_file_basic_info(struct ksmbd_file *fp,
 	struct iattr attrs;
 	struct file *filp;
 	struct inode *inode;
+	struct mnt_idmap *idmap;
 	struct user_namespace *user_ns;
 	int rc = 0;
 
@@ -5624,7 +5625,8 @@ static int set_file_basic_info(struct ksmbd_file *fp,
 	attrs.ia_valid = 0;
 	filp = fp->filp;
 	inode = file_inode(filp);
-	user_ns = file_mnt_user_ns(filp);
+	idmap = file_mnt_idmap(filp);
+	user_ns = mnt_idmap_owner(idmap);
 
 	if (file_info->CreationTime)
 		fp->create_time = le64_to_cpu(file_info->CreationTime);
@@ -5686,7 +5688,7 @@ static int set_file_basic_info(struct ksmbd_file *fp,
 		inode_lock(inode);
 		inode->i_ctime = attrs.ia_ctime;
 		attrs.ia_valid &= ~ATTR_CTIME;
-		rc = notify_change(user_ns, dentry, &attrs, NULL);
+		rc = notify_change(idmap, dentry, &attrs, NULL);
 		inode_unlock(inode);
 	}
 	return rc;
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index ab5c68cc0e13..6490342bbb38 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -1360,7 +1360,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
 	int rc;
 	struct smb_fattr fattr = {{0}};
 	struct inode *inode = d_inode(path->dentry);
-	struct user_namespace *user_ns = mnt_user_ns(path->mnt);
+	struct mnt_idmap *idmap = mnt_idmap(path->mnt);
+	struct user_namespace *user_ns = mnt_idmap_owner(idmap);
 	struct iattr newattrs;
 
 	fattr.cf_uid = INVALID_UID;
@@ -1403,7 +1404,7 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
 	}
 
 	inode_lock(inode);
-	rc = notify_change(user_ns, path->dentry, &newattrs, NULL);
+	rc = notify_change(idmap, path->dentry, &newattrs, NULL);
 	inode_unlock(inode);
 	if (rc)
 		goto out;
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index ff0e7a4fcd4d..5b284dd61056 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -177,7 +177,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
 	}
 
 	mode |= S_IFREG;
-	err = vfs_create(mnt_user_ns(path.mnt), d_inode(path.dentry),
+	err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry),
 			 dentry, mode, true);
 	if (!err) {
 		ksmbd_vfs_inherit_owner(work, d_inode(path.dentry),
@@ -199,6 +199,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
  */
 int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
 {
+	struct mnt_idmap *idmap;
 	struct user_namespace *user_ns;
 	struct path path;
 	struct dentry *dentry;
@@ -215,9 +216,10 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
 		return err;
 	}
 
-	user_ns = mnt_user_ns(path.mnt);
+	idmap = mnt_idmap(path.mnt);
+	user_ns = mnt_idmap_owner(idmap);
 	mode |= S_IFDIR;
-	err = vfs_mkdir(user_ns, d_inode(path.dentry), dentry, mode);
+	err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
 	if (err) {
 		goto out;
 	} else if (d_unhashed(dentry)) {
@@ -583,6 +585,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
  */
 int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
 {
+	struct mnt_idmap *idmap;
 	struct user_namespace *user_ns;
 	struct path path;
 	struct dentry *parent;
@@ -598,7 +601,8 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
 		return err;
 	}
 
-	user_ns = mnt_user_ns(path.mnt);
+	idmap = mnt_idmap(path.mnt);
+	user_ns = mnt_idmap_owner(idmap);
 	parent = dget_parent(path.dentry);
 	err = ksmbd_vfs_lock_parent(user_ns, parent, path.dentry);
 	if (err) {
@@ -614,12 +618,12 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
 	}
 
 	if (S_ISDIR(d_inode(path.dentry)->i_mode)) {
-		err = vfs_rmdir(user_ns, d_inode(parent), path.dentry);
+		err = vfs_rmdir(idmap, d_inode(parent), path.dentry);
 		if (err && err != -ENOTEMPTY)
 			ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name,
 				    err);
 	} else {
-		err = vfs_unlink(user_ns, d_inode(parent), path.dentry, NULL);
+		err = vfs_unlink(idmap, d_inode(parent), path.dentry, NULL);
 		if (err)
 			ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name,
 				    err);
@@ -672,7 +676,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname,
 		goto out3;
 	}
 
-	err = vfs_link(oldpath.dentry, mnt_user_ns(newpath.mnt),
+	err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt),
 		       d_inode(newpath.dentry),
 		       dentry, NULL);
 	if (err)
@@ -711,10 +715,10 @@ static int ksmbd_validate_entry_in_use(struct dentry *src_dent)
 }
 
 static int __ksmbd_vfs_rename(struct ksmbd_work *work,
-			      struct user_namespace *src_user_ns,
+			      struct mnt_idmap *src_idmap,
 			      struct dentry *src_dent_parent,
 			      struct dentry *src_dent,
-			      struct user_namespace *dst_user_ns,
+			      struct mnt_idmap *dst_idmap,
 			      struct dentry *dst_dent_parent,
 			      struct dentry *trap_dent,
 			      char *dst_name)
@@ -740,8 +744,8 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work,
 	if (ksmbd_override_fsids(work))
 		return -ENOMEM;
 
-	dst_dent = lookup_one(dst_user_ns, dst_name, dst_dent_parent,
-			      strlen(dst_name));
+	dst_dent = lookup_one(mnt_idmap_owner(dst_idmap), dst_name,
+			      dst_dent_parent, strlen(dst_name));
 	err = PTR_ERR(dst_dent);
 	if (IS_ERR(dst_dent)) {
 		pr_err("lookup failed %s [%d]\n", dst_name, err);
@@ -751,10 +755,10 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work,
 	err = -ENOTEMPTY;
 	if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) {
 		struct renamedata rd = {
-			.old_mnt_userns	= src_user_ns,
+			.old_mnt_idmap	= src_idmap,
 			.old_dir	= d_inode(src_dent_parent),
 			.old_dentry	= src_dent,
-			.new_mnt_userns	= dst_user_ns,
+			.new_mnt_idmap	= dst_idmap,
 			.new_dir	= d_inode(dst_dent_parent),
 			.new_dentry	= dst_dent,
 		};
@@ -772,6 +776,7 @@ out:
 int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
 			char *newname)
 {
+	struct mnt_idmap *idmap;
 	struct user_namespace *user_ns;
 	struct path dst_path;
 	struct dentry *src_dent_parent, *dst_dent_parent;
@@ -800,7 +805,8 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
 	trap_dent = lock_rename(src_dent_parent, dst_dent_parent);
 	dget(src_dent);
 	dget(dst_dent_parent);
-	user_ns = file_mnt_user_ns(fp->filp);
+	idmap = file_mnt_idmap(fp->filp);
+	user_ns = mnt_idmap_owner(idmap);
 	src_child = lookup_one(user_ns, src_dent->d_name.name, src_dent_parent,
 			       src_dent->d_name.len);
 	if (IS_ERR(src_child)) {
@@ -816,10 +822,10 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
 	dput(src_child);
 
 	err = __ksmbd_vfs_rename(work,
-				 user_ns,
+				 idmap,
 				 src_dent_parent,
 				 src_dent,
-				 mnt_user_ns(dst_path.mnt),
+				 mnt_idmap(dst_path.mnt),
 				 dst_dent_parent,
 				 trap_dent,
 				 dst_name);
@@ -1080,20 +1086,21 @@ int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns,
 	return vfs_removexattr(user_ns, dentry, attr_name);
 }
 
-int ksmbd_vfs_unlink(struct user_namespace *user_ns,
+int ksmbd_vfs_unlink(struct mnt_idmap *idmap,
 		     struct dentry *dir, struct dentry *dentry)
 {
 	int err = 0;
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
 
-	err = ksmbd_vfs_lock_parent(user_ns, dir, dentry);
+	err = ksmbd_vfs_lock_parent(mnt_userns, dir, dentry);
 	if (err)
 		return err;
 	dget(dentry);
 
 	if (S_ISDIR(d_inode(dentry)->i_mode))
-		err = vfs_rmdir(user_ns, d_inode(dir), dentry);
+		err = vfs_rmdir(idmap, d_inode(dir), dentry);
 	else
-		err = vfs_unlink(user_ns, d_inode(dir), dentry, NULL);
+		err = vfs_unlink(idmap, d_inode(dir), dentry, NULL);
 
 	dput(dentry);
 	inode_unlock(d_inode(dir));
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 0d73d735cc39..0c9b04ae5fbf 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -131,8 +131,8 @@ struct file_allocated_range_buffer;
 int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
 			 struct file_allocated_range_buffer *ranges,
 			 unsigned int in_count, unsigned int *out_count);
-int ksmbd_vfs_unlink(struct user_namespace *user_ns,
-		     struct dentry *dir, struct dentry *dentry);
+int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir,
+		     struct dentry *dentry);
 void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat);
 int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
 				struct user_namespace *user_ns,
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index da9163b00350..8489ff4d601a 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -266,7 +266,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
 			dir = dentry->d_parent;
 			ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING);
 			write_unlock(&ci->m_lock);
-			ksmbd_vfs_unlink(file_mnt_user_ns(filp), dir, dentry);
+			ksmbd_vfs_unlink(file_mnt_idmap(filp), dir, dentry);
 			write_lock(&ci->m_lock);
 		}
 		write_unlock(&ci->m_lock);
diff --git a/fs/namei.c b/fs/namei.c
index 309ae6fc8c99..3e727efed860 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3084,7 +3084,7 @@ static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns,
 
 /**
  * vfs_create - create new file
- * @mnt_userns:	user namespace of the mount the inode was found from
+ * @idmap:	idmap of the mount the inode was found from
  * @dir:	inode of @dentry
  * @dentry:	pointer to dentry of the base directory
  * @mode:	mode of the new file
@@ -3092,16 +3092,19 @@ static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns,
  *
  * Create a new file.
  *
- * If the inode has been found through an idmapped mount the user namespace of
- * the vfsmount must be passed through @mnt_userns. This function will then take
- * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then take
+ * care to map the inode according to @idmap before checking permissions.
  * On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs init_user_ns.
+ * raw inode simply passs @nop_mnt_idmap.
  */
-int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
 	       struct dentry *dentry, umode_t mode, bool want_excl)
 {
-	int error = may_create(mnt_userns, dir, dentry);
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
+	int error;
+
+	error = may_create(mnt_userns, dir, dentry);
 	if (error)
 		return error;
 
@@ -3203,7 +3206,7 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path,
 	return 0;
 }
 
-static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
+static int handle_truncate(struct mnt_idmap *idmap, struct file *filp)
 {
 	const struct path *path = &filp->f_path;
 	struct inode *inode = path->dentry->d_inode;
@@ -3213,7 +3216,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
 
 	error = security_file_truncate(filp);
 	if (!error) {
-		error = do_truncate(mnt_userns, path->dentry, 0,
+		error = do_truncate(idmap, path->dentry, 0,
 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
 				    filp);
 	}
@@ -3513,6 +3516,7 @@ finish_lookup:
 static int do_open(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
+	struct mnt_idmap *idmap;
 	struct user_namespace *mnt_userns;
 	int open_flag = op->open_flag;
 	bool do_truncate;
@@ -3526,7 +3530,8 @@ static int do_open(struct nameidata *nd,
 	}
 	if (!(file->f_mode & FMODE_CREATED))
 		audit_inode(nd->name, nd->path.dentry, 0);
-	mnt_userns = mnt_user_ns(nd->path.mnt);
+	idmap = mnt_idmap(nd->path.mnt);
+	mnt_userns = mnt_idmap_owner(idmap);
 	if (open_flag & O_CREAT) {
 		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
 			return -EEXIST;
@@ -3558,7 +3563,7 @@ static int do_open(struct nameidata *nd,
 	if (!error)
 		error = ima_file_check(file, op->acc_mode);
 	if (!error && do_truncate)
-		error = handle_truncate(mnt_userns, file);
+		error = handle_truncate(idmap, file);
 	if (unlikely(error > 0)) {
 		WARN_ON(1);
 		error = -EINVAL;
@@ -3570,23 +3575,24 @@ static int do_open(struct nameidata *nd,
 
 /**
  * vfs_tmpfile - create tmpfile
- * @mnt_userns:	user namespace of the mount the inode was found from
+ * @idmap:	idmap of the mount the inode was found from
  * @dentry:	pointer to dentry of the base directory
  * @mode:	mode of the new tmpfile
  * @open_flag:	flags
  *
  * Create a temporary file.
  *
- * If the inode has been found through an idmapped mount the user namespace of
- * the vfsmount must be passed through @mnt_userns. This function will then take
- * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then take
+ * care to map the inode according to @idmap before checking permissions.
  * On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs init_user_ns.
+ * raw inode simply passs @nop_mnt_idmap.
  */
-static int vfs_tmpfile(struct user_namespace *mnt_userns,
+static int vfs_tmpfile(struct mnt_idmap *idmap,
 		       const struct path *parentpath,
 		       struct file *file, umode_t mode)
 {
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
 	struct dentry *child;
 	struct inode *dir = d_inode(parentpath->dentry);
 	struct inode *inode;
@@ -3625,7 +3631,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns,
 
 /**
  * vfs_tmpfile_open - open a tmpfile for kernel internal use
- * @mnt_userns:	user namespace of the mount the inode was found from
+ * @idmap:	idmap of the mount the inode was found from
  * @parentpath:	path of the base directory
  * @mode:	mode of the new tmpfile
  * @open_flag:	flags
@@ -3635,7 +3641,7 @@ static int vfs_tmpfile(struct user_namespace *mnt_userns,
  * hence this is only for kernel internal use, and must not be installed into
  * file tables or such.
  */
-struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns,
+struct file *vfs_tmpfile_open(struct mnt_idmap *idmap,
 			  const struct path *parentpath,
 			  umode_t mode, int open_flag, const struct cred *cred)
 {
@@ -3644,7 +3650,7 @@ struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns,
 
 	file = alloc_empty_file_noaccount(open_flag, cred);
 	if (!IS_ERR(file)) {
-		error = vfs_tmpfile(mnt_userns, parentpath, file, mode);
+		error = vfs_tmpfile(idmap, parentpath, file, mode);
 		if (error) {
 			fput(file);
 			file = ERR_PTR(error);
@@ -3658,7 +3664,6 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 		const struct open_flags *op,
 		struct file *file)
 {
-	struct user_namespace *mnt_userns;
 	struct path path;
 	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
 
@@ -3667,8 +3672,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	error = mnt_want_write(path.mnt);
 	if (unlikely(error))
 		goto out;
-	mnt_userns = mnt_user_ns(path.mnt);
-	error = vfs_tmpfile(mnt_userns, &path, file, op->mode);
+	error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode);
 	if (error)
 		goto out2;
 	audit_inode(nd->name, file->f_path.dentry, 0);
@@ -3873,7 +3877,7 @@ EXPORT_SYMBOL(user_path_create);
 
 /**
  * vfs_mknod - create device node or file
- * @mnt_userns:	user namespace of the mount the inode was found from
+ * @idmap:	idmap of the mount the inode was found from
  * @dir:	inode of @dentry
  * @dentry:	pointer to dentry of the base directory
  * @mode:	mode of the new device node or file
@@ -3881,15 +3885,16 @@ EXPORT_SYMBOL(user_path_create);
  *
  * Create a device node or file.
  *
- * If the inode has been found through an idmapped mount the user namespace of
- * the vfsmount must be passed through @mnt_userns. This function will then take
- * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then take
+ * care to map the inode according to @idmap before checking permissions.
  * On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs init_user_ns.
+ * raw inode simply passs @nop_mnt_idmap.
  */
-int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	      struct dentry *dentry, umode_t mode, dev_t dev)
 {
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
 	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
 	int error = may_create(mnt_userns, dir, dentry);
 
@@ -3939,6 +3944,7 @@ static int may_mknod(umode_t mode)
 static int do_mknodat(int dfd, struct filename *name, umode_t mode,
 		unsigned int dev)
 {
+	struct mnt_idmap *idmap;
 	struct user_namespace *mnt_userns;
 	struct dentry *dentry;
 	struct path path;
@@ -3959,20 +3965,21 @@ retry:
 	if (error)
 		goto out2;
 
-	mnt_userns = mnt_user_ns(path.mnt);
+	idmap = mnt_idmap(path.mnt);
+	mnt_userns = mnt_idmap_owner(idmap);
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
-			error = vfs_create(mnt_userns, path.dentry->d_inode,
+			error = vfs_create(idmap, path.dentry->d_inode,
 					   dentry, mode, true);
 			if (!error)
 				ima_post_path_mknod(mnt_userns, dentry);
 			break;
 		case S_IFCHR: case S_IFBLK:
-			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+			error = vfs_mknod(idmap, path.dentry->d_inode,
 					  dentry, mode, new_decode_dev(dev));
 			break;
 		case S_IFIFO: case S_IFSOCK:
-			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+			error = vfs_mknod(idmap, path.dentry->d_inode,
 					  dentry, mode, 0);
 			break;
 	}
@@ -4000,25 +4007,27 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
 
 /**
  * vfs_mkdir - create directory
- * @mnt_userns:	user namespace of the mount the inode was found from
+ * @idmap:	idmap of the mount the inode was found from
  * @dir:	inode of @dentry
  * @dentry:	pointer to dentry of the base directory
  * @mode:	mode of the new directory
  *
  * Create a directory.
  *
- * If the inode has been found through an idmapped mount the user namespace of
- * the vfsmount must be passed through @mnt_userns. This function will then take
- * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then take
+ * care to map the inode according to @idmap before checking permissions.
  * On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs init_user_ns.
+ * raw inode simply passs @nop_mnt_idmap.
  */
-int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	      struct dentry *dentry, umode_t mode)
 {
-	int error = may_create(mnt_userns, dir, dentry);
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
+	int error;
 	unsigned max_links = dir->i_sb->s_max_links;
 
+	error = may_create(mnt_userns, dir, dentry);
 	if (error)
 		return error;
 
@@ -4056,10 +4065,8 @@ retry:
 	error = security_path_mkdir(&path, dentry,
 			mode_strip_umask(path.dentry->d_inode, mode));
 	if (!error) {
-		struct user_namespace *mnt_userns;
-		mnt_userns = mnt_user_ns(path.mnt);
-		error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
-				  mode);
+		error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+				  dentry, mode);
 	}
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
@@ -4083,21 +4090,22 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
 
 /**
  * vfs_rmdir - remove directory
- * @mnt_userns:	user namespace of the mount the inode was found from
+ * @idmap:	idmap of the mount the inode was found from
  * @dir:	inode of @dentry
  * @dentry:	pointer to dentry of the base directory
  *
  * Remove a directory.
  *
- * If the inode has been found through an idmapped mount the user namespace of
- * the vfsmount must be passed through @mnt_userns. This function will then take
- * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then take
+ * care to map the inode according to @idmap before checking permissions.
  * On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs init_user_ns.
+ * raw inode simply passs @nop_mnt_idmap.
  */
-int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
+int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
 		     struct dentry *dentry)
 {
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
 	int error = may_delete(mnt_userns, dir, dentry, 1);
 
 	if (error)
@@ -4138,7 +4146,6 @@ EXPORT_SYMBOL(vfs_rmdir);
 
 int do_rmdir(int dfd, struct filename *name)
 {
-	struct user_namespace *mnt_userns;
 	int error;
 	struct dentry *dentry;
 	struct path path;
@@ -4178,8 +4185,7 @@ retry:
 	error = security_path_rmdir(&path, dentry);
 	if (error)
 		goto exit4;
-	mnt_userns = mnt_user_ns(path.mnt);
-	error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
+	error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
 exit4:
 	dput(dentry);
 exit3:
@@ -4203,7 +4209,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 
 /**
  * vfs_unlink - unlink a filesystem object
- * @mnt_userns:	user namespace of the mount the inode was found from
+ * @idmap:	idmap of the mount the inode was found from
  * @dir:	parent directory
  * @dentry:	victim
  * @delegated_inode: returns victim inode, if the inode is delegated.
@@ -4220,15 +4226,16 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.
  *
- * If the inode has been found through an idmapped mount the user namespace of
- * the vfsmount must be passed through @mnt_userns. This function will then take
- * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then take
+ * care to map the inode according to @idmap before checking permissions.
  * On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs init_user_ns.
+ * raw inode simply passs @nop_mnt_idmap.
  */
-int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
+int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
 	       struct dentry *dentry, struct inode **delegated_inode)
 {
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
 	struct inode *target = dentry->d_inode;
 	int error = may_delete(mnt_userns, dir, dentry, 0);
 
@@ -4304,7 +4311,6 @@ retry_deleg:
 	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
-		struct user_namespace *mnt_userns;
 
 		/* Why not before? Because we want correct error value */
 		if (last.name[last.len])
@@ -4316,9 +4322,8 @@ retry_deleg:
 		error = security_path_unlink(&path, dentry);
 		if (error)
 			goto exit3;
-		mnt_userns = mnt_user_ns(path.mnt);
-		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
-				   &delegated_inode);
+		error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
+				   dentry, &delegated_inode);
 exit3:
 		dput(dentry);
 	}
@@ -4370,22 +4375,23 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 
 /**
  * vfs_symlink - create symlink
- * @mnt_userns:	user namespace of the mount the inode was found from
+ * @idmap:	idmap of the mount the inode was found from
  * @dir:	inode of @dentry
  * @dentry:	pointer to dentry of the base directory
  * @oldname:	name of the file to link to
  *
  * Create a symlink.
  *
- * If the inode has been found through an idmapped mount the user namespace of
- * the vfsmount must be passed through @mnt_userns. This function will then take
- * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then take
+ * care to map the inode according to @idmap before checking permissions.
  * On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs init_user_ns.
+ * raw inode simply passs @nop_mnt_idmap.
  */
-int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		struct dentry *dentry, const char *oldname)
 {
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
 	int error = may_create(mnt_userns, dir, dentry);
 
 	if (error)
@@ -4423,13 +4429,9 @@ retry:
 		goto out_putnames;
 
 	error = security_path_symlink(&path, dentry, from->name);
-	if (!error) {
-		struct user_namespace *mnt_userns;
-
-		mnt_userns = mnt_user_ns(path.mnt);
-		error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
-				    from->name);
-	}
+	if (!error)
+		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
+				    dentry, from->name);
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -4455,7 +4457,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
 /**
  * vfs_link - create a new link
  * @old_dentry:	object to be linked
- * @mnt_userns:	the user namespace of the mount
+ * @idmap:	idmap of the mount
  * @dir:	new parent
  * @new_dentry:	where to create the new link
  * @delegated_inode: returns inode needing a delegation break
@@ -4472,16 +4474,17 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.
  *
- * If the inode has been found through an idmapped mount the user namespace of
- * the vfsmount must be passed through @mnt_userns. This function will then take
- * care to map the inode according to @mnt_userns before checking permissions.
+ * If the inode has been found through an idmapped mount the idmap of
+ * the vfsmount must be passed through @idmap. This function will then take
+ * care to map the inode according to @idmap before checking permissions.
  * On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs init_user_ns.
+ * raw inode simply passs @nop_mnt_idmap.
  */
-int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
+int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
 	     struct inode *dir, struct dentry *new_dentry,
 	     struct inode **delegated_inode)
 {
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
 	struct inode *inode = old_dentry->d_inode;
 	unsigned max_links = dir->i_sb->s_max_links;
 	int error;
@@ -4553,6 +4556,7 @@ EXPORT_SYMBOL(vfs_link);
 int do_linkat(int olddfd, struct filename *old, int newdfd,
 	      struct filename *new, int flags)
 {
+	struct mnt_idmap *idmap;
 	struct user_namespace *mnt_userns;
 	struct dentry *new_dentry;
 	struct path old_path, new_path;
@@ -4590,14 +4594,15 @@ retry:
 	error = -EXDEV;
 	if (old_path.mnt != new_path.mnt)
 		goto out_dput;
-	mnt_userns = mnt_user_ns(new_path.mnt);
+	idmap = mnt_idmap(new_path.mnt);
+	mnt_userns = mnt_idmap_owner(idmap);
 	error = may_linkat(mnt_userns, &old_path);
 	if (unlikely(error))
 		goto out_dput;
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_dput;
-	error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
+	error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
 			 new_dentry, &delegated_inode);
 out_dput:
 	done_path_create(&new_path, new_dentry);
@@ -4693,24 +4698,26 @@ int vfs_rename(struct renamedata *rd)
 	bool new_is_dir = false;
 	unsigned max_links = new_dir->i_sb->s_max_links;
 	struct name_snapshot old_name;
+	struct user_namespace *old_mnt_userns = mnt_idmap_owner(rd->old_mnt_idmap),
+			      *new_mnt_userns = mnt_idmap_owner(rd->new_mnt_idmap);
 
 	if (source == target)
 		return 0;
 
-	error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
+	error = may_delete(old_mnt_userns, old_dir, old_dentry, is_dir);
 	if (error)
 		return error;
 
 	if (!target) {
-		error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
+		error = may_create(new_mnt_userns, new_dir, new_dentry);
 	} else {
 		new_is_dir = d_is_dir(new_dentry);
 
 		if (!(flags & RENAME_EXCHANGE))
-			error = may_delete(rd->new_mnt_userns, new_dir,
+			error = may_delete(new_mnt_userns, new_dir,
 					   new_dentry, is_dir);
 		else
-			error = may_delete(rd->new_mnt_userns, new_dir,
+			error = may_delete(new_mnt_userns, new_dir,
 					   new_dentry, new_is_dir);
 	}
 	if (error)
@@ -4725,13 +4732,13 @@ int vfs_rename(struct renamedata *rd)
 	 */
 	if (new_dir != old_dir) {
 		if (is_dir) {
-			error = inode_permission(rd->old_mnt_userns, source,
+			error = inode_permission(old_mnt_userns, source,
 						 MAY_WRITE);
 			if (error)
 				return error;
 		}
 		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
-			error = inode_permission(rd->new_mnt_userns, target,
+			error = inode_permission(new_mnt_userns, target,
 						 MAY_WRITE);
 			if (error)
 				return error;
@@ -4776,7 +4783,7 @@ int vfs_rename(struct renamedata *rd)
 		if (error)
 			goto out;
 	}
-	error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
+	error = old_dir->i_op->rename(new_mnt_userns, old_dir, old_dentry,
 				      new_dir, new_dentry, flags);
 	if (error)
 		goto out;
@@ -4921,10 +4928,10 @@ retry_deleg:
 
 	rd.old_dir	   = old_path.dentry->d_inode;
 	rd.old_dentry	   = old_dentry;
-	rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
+	rd.old_mnt_idmap   = mnt_idmap(old_path.mnt);
 	rd.new_dir	   = new_path.dentry->d_inode;
 	rd.new_dentry	   = new_dentry;
-	rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
+	rd.new_mnt_idmap   = mnt_idmap(new_path.mnt);
 	rd.delegated_inode = &delegated_inode;
 	rd.flags	   = flags;
 	error = vfs_rename(&rd);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index d01b29aba662..f41992ecd0d7 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -320,7 +320,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		iap->ia_mode &= ~current_umask();
 
 	fh_fill_pre_attrs(fhp);
-	host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true);
+	host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true);
 	if (host_err < 0) {
 		status = nfserrno(host_err);
 		goto out;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 78b8cd9651d5..3509e73abe1f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -233,7 +233,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		 * as well be forgiving and just succeed silently.
 		 */
 		goto out_put;
-	status = vfs_mkdir(&init_user_ns, d_inode(dir), dentry, S_IRWXU);
+	status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
 out_put:
 	dput(dentry);
 out_unlock:
@@ -353,7 +353,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
 	status = -ENOENT;
 	if (d_really_is_negative(dentry))
 		goto out;
-	status = vfs_rmdir(&init_user_ns, d_inode(dir), dentry);
+	status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry);
 out:
 	dput(dentry);
 out_unlock:
@@ -443,7 +443,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 	if (nfs4_has_reclaimed_state(name, nn))
 		goto out_free;
 
-	status = vfs_rmdir(&init_user_ns, d_inode(parent), child);
+	status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child);
 	if (status)
 		printk("failed to remove client recovery directory %pd\n",
 				child);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 4c3a0d84043c..371d7f03fe2d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -426,7 +426,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
 		if (iap->ia_size < 0)
 			return -EFBIG;
 
-		host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
+		host_err = notify_change(&nop_mnt_idmap, dentry, &size_attr, NULL);
 		if (host_err)
 			return host_err;
 		iap->ia_valid &= ~ATTR_SIZE;
@@ -444,7 +444,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
 		return 0;
 
 	iap->ia_valid |= ATTR_CTIME;
-	return notify_change(&init_user_ns, dentry, iap, NULL);
+	return notify_change(&nop_mnt_idmap, dentry, iap, NULL);
 }
 
 /**
@@ -1363,12 +1363,13 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = 0;
 	switch (type) {
 	case S_IFREG:
-		host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
+		host_err = vfs_create(&nop_mnt_idmap, dirp, dchild,
+				      iap->ia_mode, true);
 		if (!host_err)
 			nfsd_check_ignore_resizing(iap);
 		break;
 	case S_IFDIR:
-		host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode);
+		host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode);
 		if (!host_err && unlikely(d_unhashed(dchild))) {
 			struct dentry *d;
 			d = lookup_one_len(dchild->d_name.name,
@@ -1396,7 +1397,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	case S_IFBLK:
 	case S_IFIFO:
 	case S_IFSOCK:
-		host_err = vfs_mknod(&init_user_ns, dirp, dchild,
+		host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild,
 				     iap->ia_mode, rdev);
 		break;
 	default:
@@ -1557,7 +1558,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_drop_write;
 	}
 	fh_fill_pre_attrs(fhp);
-	host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path);
+	host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path);
 	err = nfserrno(host_err);
 	cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
 	if (!err)
@@ -1625,7 +1626,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	if (d_really_is_negative(dold))
 		goto out_dput;
 	fh_fill_pre_attrs(ffhp);
-	host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL);
+	host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL);
 	fh_fill_post_attrs(ffhp);
 	inode_unlock(dirp);
 	if (!host_err) {
@@ -1745,10 +1746,10 @@ retry:
 		goto out_dput_old;
 	} else {
 		struct renamedata rd = {
-			.old_mnt_userns	= &init_user_ns,
+			.old_mnt_idmap	= &nop_mnt_idmap,
 			.old_dir	= fdir,
 			.old_dentry	= odentry,
-			.new_mnt_userns	= &init_user_ns,
+			.new_mnt_idmap	= &nop_mnt_idmap,
 			.new_dir	= tdir,
 			.new_dentry	= ndentry,
 		};
@@ -1850,14 +1851,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 			nfsd_close_cached_files(rdentry);
 
 		for (retries = 1;;) {
-			host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
+			host_err = vfs_unlink(&nop_mnt_idmap, dirp, rdentry, NULL);
 			if (host_err != -EAGAIN || !retries--)
 				break;
 			if (!nfsd_wait_for_delegreturn(rqstp, rinode))
 				break;
 		}
 	} else {
-		host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
+		host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry);
 	}
 	fh_fill_post_attrs(fhp);
 
diff --git a/fs/open.c b/fs/open.c
index 82c1a28b3308..60a81db586ef 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -36,9 +36,10 @@
 
 #include "internal.h"
 
-int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
+int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
 		loff_t length, unsigned int time_attrs, struct file *filp)
 {
+	struct user_namespace *mnt_userns = mnt_idmap_owner(idmap);
 	int ret;
 	struct iattr newattrs;
 
@@ -62,13 +63,14 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
 
 	inode_lock(dentry->d_inode);
 	/* Note any delegations or leases have already been broken: */
-	ret = notify_change(mnt_userns, dentry, &newattrs, NULL);
+	ret = notify_change(idmap, dentry, &newattrs, NULL);
 	inode_unlock(dentry->d_inode);
 	return ret;
 }
 
 long vfs_truncate(const struct path *path, loff_t length)
 {
+	struct mnt_idmap *idmap;
 	struct user_namespace *mnt_userns;
 	struct inode *inode;
 	long error;
@@ -85,7 +87,8 @@ long vfs_truncate(const struct path *path, loff_t length)
 	if (error)
 		goto out;
 
-	mnt_userns = mnt_user_ns(path->mnt);
+	idmap = mnt_idmap(path->mnt);
+	mnt_userns = mnt_idmap_owner(idmap);
 	error = inode_permission(mnt_userns, inode, MAY_WRITE);
 	if (error)
 		goto mnt_drop_write_and_out;
@@ -108,7 +111,7 @@ long vfs_truncate(const struct path *path, loff_t length)
 
 	error = security_path_truncate(path);
 	if (!error)
-		error = do_truncate(mnt_userns, path->dentry, length, 0, NULL);
+		error = do_truncate(idmap, path->dentry, length, 0, NULL);
 
 put_write_and_out:
 	put_write_access(inode);
@@ -190,7 +193,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 	sb_start_write(inode->i_sb);
 	error = security_file_truncate(f.file);
 	if (!error)
-		error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
+		error = do_truncate(file_mnt_idmap(f.file), dentry, length,
 				    ATTR_MTIME | ATTR_CTIME, f.file);
 	sb_end_write(inode->i_sb);
 out_putf:
@@ -603,7 +606,7 @@ retry_deleg:
 		goto out_unlock;
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-	error = notify_change(mnt_user_ns(path->mnt), path->dentry,
+	error = notify_change(mnt_idmap(path->mnt), path->dentry,
 			      &newattrs, &delegated_inode);
 out_unlock:
 	inode_unlock(inode);
@@ -701,6 +704,7 @@ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
 
 int chown_common(const struct path *path, uid_t user, gid_t group)
 {
+	struct mnt_idmap *idmap;
 	struct user_namespace *mnt_userns, *fs_userns;
 	struct inode *inode = path->dentry->d_inode;
 	struct inode *delegated_inode = NULL;
@@ -712,7 +716,8 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
 	uid = make_kuid(current_user_ns(), user);
 	gid = make_kgid(current_user_ns(), group);
 
-	mnt_userns = mnt_user_ns(path->mnt);
+	idmap = mnt_idmap(path->mnt);
+	mnt_userns = mnt_idmap_owner(idmap);
 	fs_userns = i_user_ns(inode);
 
 retry_deleg:
@@ -733,7 +738,7 @@ retry_deleg:
 		from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid),
 		from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid));
 	if (!error)
-		error = notify_change(mnt_userns, path->dentry, &newattrs,
+		error = notify_change(idmap, path->dentry, &newattrs,
 				      &delegated_inode);
 	inode_unlock(inode);
 	if (delegated_inode) {
@@ -1064,7 +1069,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 	if (IS_ERR(f))
 		return f;
 
-	error = vfs_create(mnt_user_ns(path->mnt),
+	error = vfs_create(mnt_idmap(path->mnt),
 			   d_inode(path->dentry->d_parent),
 			   path->dentry, mode, true);
 	if (!error)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 1df7f850ff3b..ff89454b07fc 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -141,13 +141,13 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs,
 				       struct dentry *upperdentry,
 				       struct iattr *attr)
 {
-	return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL);
+	return notify_change(ovl_upper_mnt_idmap(ofs), upperdentry, attr, NULL);
 }
 
 static inline int ovl_do_rmdir(struct ovl_fs *ofs,
 			       struct inode *dir, struct dentry *dentry)
 {
-	int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, dentry);
+	int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry);
 
 	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
 	return err;
@@ -156,7 +156,7 @@ static inline int ovl_do_rmdir(struct ovl_fs *ofs,
 static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir,
 				struct dentry *dentry)
 {
-	int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL);
+	int err = vfs_unlink(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL);
 
 	pr_debug("unlink(%pd2) = %i\n", dentry, err);
 	return err;
@@ -165,7 +165,8 @@ static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir,
 static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry,
 			      struct inode *dir, struct dentry *new_dentry)
 {
-	int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL);
+	int err = vfs_link(old_dentry, ovl_upper_mnt_idmap(ofs), dir,
+			   new_dentry, NULL);
 
 	pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
 	return err;
@@ -175,7 +176,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs,
 				struct inode *dir, struct dentry *dentry,
 				umode_t mode)
 {
-	int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true);
+	int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true);
 
 	pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
 	return err;
@@ -185,7 +186,7 @@ static inline int ovl_do_mkdir(struct ovl_fs *ofs,
 			       struct inode *dir, struct dentry *dentry,
 			       umode_t mode)
 {
-	int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode);
+	int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
 	pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
 	return err;
 }
@@ -194,7 +195,7 @@ static inline int ovl_do_mknod(struct ovl_fs *ofs,
 			       struct inode *dir, struct dentry *dentry,
 			       umode_t mode, dev_t dev)
 {
-	int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev);
+	int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev);
 
 	pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
 	return err;
@@ -204,7 +205,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs,
 				 struct inode *dir, struct dentry *dentry,
 				 const char *oldname)
 {
-	int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname);
+	int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname);
 
 	pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
 	return err;
@@ -298,10 +299,10 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
 {
 	int err;
 	struct renamedata rd = {
-		.old_mnt_userns	= ovl_upper_mnt_userns(ofs),
+		.old_mnt_idmap	= ovl_upper_mnt_idmap(ofs),
 		.old_dir 	= olddir,
 		.old_dentry 	= olddentry,
-		.new_mnt_userns	= ovl_upper_mnt_userns(ofs),
+		.new_mnt_idmap	= ovl_upper_mnt_idmap(ofs),
 		.new_dir 	= newdir,
 		.new_dentry 	= newdentry,
 		.flags 		= flags,
@@ -319,7 +320,7 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
 static inline int ovl_do_whiteout(struct ovl_fs *ofs,
 				  struct inode *dir, struct dentry *dentry)
 {
-	int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry);
+	int err = vfs_whiteout(ovl_upper_mnt_idmap(ofs), dir, dentry);
 	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
 	return err;
 }
@@ -328,7 +329,7 @@ static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs,
 					  struct dentry *dentry, umode_t mode)
 {
 	struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry };
-	struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode,
+	struct file *file = vfs_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode,
 					O_LARGEFILE | O_WRONLY, current_cred());
 	int err = PTR_ERR_OR_ZERO(file);
 
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index e1af8f660698..a6a9235c6168 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -95,6 +95,11 @@ static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs)
 	return mnt_user_ns(ovl_upper_mnt(ofs));
 }
 
+static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs)
+{
+	return mnt_idmap(ovl_upper_mnt(ofs));
+}
+
 static inline struct ovl_fs *OVL_FS(struct super_block *sb)
 {
 	return (struct ovl_fs *)sb->s_fs_info;
diff --git a/fs/utimes.c b/fs/utimes.c
index 39f356017635..5f71e2209e6f 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -62,7 +62,7 @@ int vfs_utimes(const struct path *path, struct timespec64 *times)
 	}
 retry_deleg:
 	inode_lock(inode);
-	error = notify_change(mnt_user_ns(path->mnt), path->dentry, &newattrs,
+	error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs,
 			      &delegated_inode);
 	inode_unlock(inode);
 	if (delegated_inode) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 066555ad1bf8..7aa302d2ce39 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1944,36 +1944,36 @@ bool inode_owner_or_capable(struct user_namespace *mnt_userns,
 /*
  * VFS helper functions..
  */
-int vfs_create(struct user_namespace *, struct inode *,
+int vfs_create(struct mnt_idmap *, struct inode *,
 	       struct dentry *, umode_t, bool);
-int vfs_mkdir(struct user_namespace *, struct inode *,
+int vfs_mkdir(struct mnt_idmap *, struct inode *,
 	      struct dentry *, umode_t);
-int vfs_mknod(struct user_namespace *, struct inode *, struct dentry *,
+int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
               umode_t, dev_t);
-int vfs_symlink(struct user_namespace *, struct inode *,
+int vfs_symlink(struct mnt_idmap *, struct inode *,
 		struct dentry *, const char *);
-int vfs_link(struct dentry *, struct user_namespace *, struct inode *,
+int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
 	     struct dentry *, struct inode **);
-int vfs_rmdir(struct user_namespace *, struct inode *, struct dentry *);
-int vfs_unlink(struct user_namespace *, struct inode *, struct dentry *,
+int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *);
+int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
 	       struct inode **);
 
 /**
  * struct renamedata - contains all information required for renaming
- * @old_mnt_userns:    old user namespace of the mount the inode was found from
+ * @old_mnt_idmap:     idmap of the old mount the inode was found from
  * @old_dir:           parent of source
  * @old_dentry:                source
- * @new_mnt_userns:    new user namespace of the mount the inode was found from
+ * @new_mnt_idmap:     idmap of the new mount the inode was found from
  * @new_dir:           parent of destination
  * @new_dentry:                destination
  * @delegated_inode:   returns an inode needing a delegation break
  * @flags:             rename flags
  */
 struct renamedata {
-	struct user_namespace *old_mnt_userns;
+	struct mnt_idmap *old_mnt_idmap;
 	struct inode *old_dir;
 	struct dentry *old_dentry;
-	struct user_namespace *new_mnt_userns;
+	struct mnt_idmap *new_mnt_idmap;
 	struct inode *new_dir;
 	struct dentry *new_dentry;
 	struct inode **delegated_inode;
@@ -1982,14 +1982,14 @@ struct renamedata {
 
 int vfs_rename(struct renamedata *);
 
-static inline int vfs_whiteout(struct user_namespace *mnt_userns,
+static inline int vfs_whiteout(struct mnt_idmap *idmap,
 			       struct inode *dir, struct dentry *dentry)
 {
-	return vfs_mknod(mnt_userns, dir, dentry, S_IFCHR | WHITEOUT_MODE,
+	return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE,
 			 WHITEOUT_DEV);
 }
 
-struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns,
+struct file *vfs_tmpfile_open(struct mnt_idmap *idmap,
 			const struct path *parentpath,
 			umode_t mode, int open_flag, const struct cred *cred);
 
@@ -2746,7 +2746,7 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
 }
 
 extern long vfs_truncate(const struct path *, loff_t);
-int do_truncate(struct user_namespace *, struct dentry *, loff_t start,
+int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start,
 		unsigned int time_attrs, struct file *filp);
 extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
 			loff_t len);
@@ -2901,7 +2901,7 @@ static inline int bmap(struct inode *inode,  sector_t *block)
 }
 #endif
 
-int notify_change(struct user_namespace *, struct dentry *,
+int notify_change(struct mnt_idmap *, struct dentry *,
 		  struct iattr *, struct inode **);
 int inode_permission(struct user_namespace *, struct inode *, int);
 int generic_permission(struct user_namespace *, struct inode *, int);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index d09aa1c1e3e6..4b78095fe779 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -979,7 +979,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 		err = -ENOENT;
 	} else {
 		ihold(inode);
-		err = vfs_unlink(&init_user_ns, d_inode(dentry->d_parent),
+		err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent),
 				 dentry, NULL);
 	}
 	dput(dentry);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f0c2293f1d3b..81ff98298996 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1190,7 +1190,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 	unsigned int new_hash, old_hash = sk->sk_hash;
 	struct unix_sock *u = unix_sk(sk);
 	struct net *net = sock_net(sk);
-	struct user_namespace *ns; // barf...
+	struct mnt_idmap *idmap;
 	struct unix_address *addr;
 	struct dentry *dentry;
 	struct path parent;
@@ -1217,10 +1217,10 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 	/*
 	 * All right, let's create it.
 	 */
-	ns = mnt_user_ns(parent.mnt);
+	idmap = mnt_idmap(parent.mnt);
 	err = security_path_mknod(&parent, dentry, mode, 0);
 	if (!err)
-		err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
+		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
 	if (err)
 		goto out_path;
 	err = mutex_lock_interruptible(&u->bindlock);
@@ -1245,7 +1245,7 @@ out_unlock:
 	err = -EINVAL;
 out_unlink:
 	/* failed after successful mknod?  unlink what we'd created... */
-	vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
+	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
 out_path:
 	done_path_create(&parent, dentry);
 out:
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h
index 92331053dba3..7bd76b9e0f98 100644
--- a/tools/testing/selftests/bpf/progs/profiler.inc.h
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -826,7 +826,7 @@ out:
 
 SEC("kprobe/vfs_link")
 int BPF_KPROBE(kprobe__vfs_link,
-	       struct dentry* old_dentry, struct user_namespace *mnt_userns,
+	       struct dentry* old_dentry, struct mnt_idmap *idmap,
 	       struct inode* dir, struct dentry* new_dentry,
 	       struct inode** delegated_inode)
 {
-- 
cgit 


From 507806e9fdf09774d390c5f22893ba4d87ce40d5 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Fri, 13 Jan 2023 10:09:27 +0100
Subject: selftests: hid: add vmtest.sh

Similar-ish in many points from the script in selftests/bpf, with a few
differences:
- relies on boot2container instead of a plain qemu image (meaning that
  we can take any container in a registry as a base)
- runs in the hid selftest dir, and such uses the test program from there
- the working directory to store the config is in
  tools/selftests/hid/results

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/.gitignore    |   1 +
 tools/testing/selftests/hid/config.common | 241 +++++++++++++++++++++++++
 tools/testing/selftests/hid/config.x86_64 |   4 +
 tools/testing/selftests/hid/vmtest.sh     | 284 ++++++++++++++++++++++++++++++
 4 files changed, 530 insertions(+)
 create mode 100644 tools/testing/selftests/hid/config.common
 create mode 100644 tools/testing/selftests/hid/config.x86_64
 create mode 100755 tools/testing/selftests/hid/vmtest.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/.gitignore b/tools/testing/selftests/hid/.gitignore
index a462ca6ab2c0..995af0670f69 100644
--- a/tools/testing/selftests/hid/.gitignore
+++ b/tools/testing/selftests/hid/.gitignore
@@ -2,3 +2,4 @@ bpftool
 *.skel.h
 /tools
 hid_bpf
+results
diff --git a/tools/testing/selftests/hid/config.common b/tools/testing/selftests/hid/config.common
new file mode 100644
index 000000000000..0617275d93cc
--- /dev/null
+++ b/tools/testing/selftests/hid/config.common
@@ -0,0 +1,241 @@
+CONFIG_9P_FS_POSIX_ACL=y
+CONFIG_9P_FS_SECURITY=y
+CONFIG_9P_FS=y
+CONFIG_AUDIT=y
+CONFIG_BINFMT_MISC=y
+CONFIG_BLK_CGROUP_IOLATENCY=y
+CONFIG_BLK_CGROUP=y
+CONFIG_BLK_DEV_BSGLIB=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_BLK_DEV_RAM_SIZE=16384
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_THROTTLING=y
+CONFIG_BONDING=y
+CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
+CONFIG_BOOTTIME_TRACING=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_CFS_BANDWIDTH=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_HUGETLB=y
+CONFIG_CGROUP_NET_CLASSID=y
+CONFIG_CGROUP_NET_PRIO=y
+CONFIG_CGROUP_PERF=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_RDMA=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_WRITEBACK=y
+CONFIG_CMA_AREAS=7
+CONFIG_CMA=y
+CONFIG_COMPAT_32BIT_TIME=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_STAT=y
+CONFIG_CPU_IDLE_GOV_LADDER=y
+CONFIG_CPUSETS=y
+CONFIG_CRC_T10DIF=y
+CONFIG_CRYPTO_BLAKE2B=y
+CONFIG_CRYPTO_DEV_VIRTIO=y
+CONFIG_CRYPTO_SEQIV=y
+CONFIG_CRYPTO_XXHASH=y
+CONFIG_DCB=y
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+CONFIG_DEBUG_CREDENTIALS=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEFAULT_FQ_CODEL=y
+CONFIG_DEFAULT_RENO=y
+CONFIG_DEFAULT_SECURITY_DAC=y
+CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_DEVTMPFS=y
+CONFIG_DMA_CMA=y
+CONFIG_DNS_RESOLVER=y
+CONFIG_EFI_STUB=y
+CONFIG_EFI=y
+CONFIG_EXPERT=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_EXT4_FS=y
+CONFIG_FAIL_FUNCTION=y
+CONFIG_FAULT_INJECTION_DEBUG_FS=y
+CONFIG_FAULT_INJECTION=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_VESA=y
+CONFIG_FB=y
+CONFIG_FONT_8x16=y
+CONFIG_FONT_MINI_4x6=y
+CONFIG_FONTS=y
+CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
+CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_FUSE_FS=y
+CONFIG_FW_LOADER_USER_HELPER=y
+CONFIG_GART_IOMMU=y
+CONFIG_GENERIC_PHY=y
+CONFIG_HARDLOCKUP_DETECTOR=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_HPET=y
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_HWPOISON_INJECT=y
+CONFIG_HZ_1000=y
+CONFIG_INET=y
+CONFIG_INTEL_POWERCLAMP=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_NAT=y
+CONFIG_IP6_NF_TARGET_MASQUERADE=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_SEG6_LWTUNNEL=y
+CONFIG_IPV6_SUBTREES=y
+CONFIG_IRQ_POLL=y
+CONFIG_JUMP_LABEL=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_KEXEC=y
+CONFIG_KPROBES=y
+CONFIG_KSM=y
+CONFIG_LEGACY_VSYSCALL_NONE=y
+CONFIG_LOG_BUF_SHIFT=21
+CONFIG_LOG_CPU_MAX_BUF_SHIFT=0
+CONFIG_LOGO=y
+CONFIG_LSM="selinux,bpf,integrity"
+CONFIG_MAC_PARTITION=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_MCORE2=y
+CONFIG_MEMCG=y
+CONFIG_MEMORY_FAILURE=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_MODULES=y
+CONFIG_NAMESPACES=y
+CONFIG_NET_9P_VIRTIO=y
+CONFIG_NET_9P=y
+CONFIG_NET_ACT_BPF=y
+CONFIG_NET_CLS_CGROUP=y
+CONFIG_NETDEVICES=y
+CONFIG_NET_EMATCH=y
+CONFIG_NETFILTER_NETLINK_LOG=y
+CONFIG_NETFILTER_NETLINK_QUEUE=y
+CONFIG_NETFILTER_XTABLES=y
+CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=y
+CONFIG_NETFILTER_XT_MATCH_BPF=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_NAT=y
+CONFIG_NETFILTER_XT_TARGET_MASQUERADE=y
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_NET_L3_MASTER_DEV=y
+CONFIG_NETLABEL=y
+CONFIG_NET_SCH_DEFAULT=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_FQ_CODEL=y
+CONFIG_NET_TC_SKB_EXT=y
+CONFIG_NET_VRF=y
+CONFIG_NET=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_NAT_MASQUERADE=y
+CONFIG_NF_NAT=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NO_HZ=y
+CONFIG_NR_CPUS=128
+CONFIG_NUMA_BALANCING=y
+CONFIG_NUMA=y
+CONFIG_NVMEM=y
+CONFIG_OSF_PARTITION=y
+CONFIG_OVERLAY_FS_INDEX=y
+CONFIG_OVERLAY_FS_METACOPY=y
+CONFIG_OVERLAY_FS_XINO_AUTO=y
+CONFIG_OVERLAY_FS=y
+CONFIG_PACKET=y
+CONFIG_PANIC_ON_OOPS=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_PCIEPORTBUS=y
+CONFIG_PCI_IOV=y
+CONFIG_PCI_MSI=y
+CONFIG_PCI=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
+CONFIG_POSIX_MQUEUE=y
+CONFIG_POWER_SUPPLY=y
+CONFIG_PREEMPT=y
+CONFIG_PRINTK_TIME=y
+CONFIG_PROC_KCORE=y
+CONFIG_PROFILING=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_PTP_1588_CLOCK=y
+CONFIG_RC_DEVICES=y
+CONFIG_RC_LOOPBACK=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
+CONFIG_SCHED_STACK_END_CHECK=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_DETECT_IRQ=y
+CONFIG_SERIAL_8250_EXTENDED=y
+CONFIG_SERIAL_8250_MANY_PORTS=y
+CONFIG_SERIAL_8250_NR_UARTS=32
+CONFIG_SERIAL_8250_RSA=y
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_SERIO_LIBPS2=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SMP=y
+CONFIG_SOCK_CGROUP_DATA=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_SYNC_FILE=y
+CONFIG_SYSVIPC=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_XACCT=y
+CONFIG_TCP_CONG_ADVANCED=y
+CONFIG_TCP_MD5SIG=y
+CONFIG_TLS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_TMPFS=y
+CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y
+CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_TUN=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_UNIX=y
+CONFIG_USER_NS=y
+CONFIG_VALIDATE_FS_PARSER=y
+CONFIG_VETH=y
+CONFIG_VIRT_DRIVERS=y
+CONFIG_VIRTIO_BALLOON=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_FS=y
+CONFIG_VIRTIO_NET=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VLAN_8021Q=y
+CONFIG_XFRM_SUB_POLICY=y
+CONFIG_XFRM_USER=y
+CONFIG_ZEROPLUS_FF=y
diff --git a/tools/testing/selftests/hid/config.x86_64 b/tools/testing/selftests/hid/config.x86_64
new file mode 100644
index 000000000000..a8721f403c21
--- /dev/null
+++ b/tools/testing/selftests/hid/config.x86_64
@@ -0,0 +1,4 @@
+CONFIG_X86_ACPI_CPUFREQ=y
+CONFIG_X86_CPUID=y
+CONFIG_X86_MSR=y
+CONFIG_X86_POWERNOW_K8=y
diff --git a/tools/testing/selftests/hid/vmtest.sh b/tools/testing/selftests/hid/vmtest.sh
new file mode 100755
index 000000000000..90f34150f257
--- /dev/null
+++ b/tools/testing/selftests/hid/vmtest.sh
@@ -0,0 +1,284 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -u
+set -e
+
+# This script currently only works for x86_64
+ARCH="$(uname -m)"
+case "${ARCH}" in
+x86_64)
+	QEMU_BINARY=qemu-system-x86_64
+	BZIMAGE="arch/x86/boot/bzImage"
+	;;
+*)
+	echo "Unsupported architecture"
+	exit 1
+	;;
+esac
+DEFAULT_COMMAND="./hid_bpf"
+SCRIPT_DIR="$(dirname $(realpath $0))"
+OUTPUT_DIR="$SCRIPT_DIR/results"
+KCONFIG_REL_PATHS=("${SCRIPT_DIR}/config" "${SCRIPT_DIR}/config.common" "${SCRIPT_DIR}/config.${ARCH}")
+B2C_URL="https://gitlab.freedesktop.org/mupuf/boot2container/-/raw/master/vm2c.py"
+NUM_COMPILE_JOBS="$(nproc)"
+LOG_FILE_BASE="$(date +"hid_selftests.%Y-%m-%d_%H-%M-%S")"
+LOG_FILE="${LOG_FILE_BASE}.log"
+EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status"
+CONTAINER_IMAGE="registry.fedoraproject.org/fedora:36"
+
+usage()
+{
+	cat <<EOF
+Usage: $0 [-i] [-s] [-d <output_dir>] -- [<command>]
+
+<command> is the command you would normally run when you are in
+tools/testing/selftests/bpf. e.g:
+
+	$0 -- ./hid_bpf
+
+If no command is specified and a debug shell (-s) is not requested,
+"${DEFAULT_COMMAND}" will be run by default.
+
+If you build your kernel using KBUILD_OUTPUT= or O= options, these
+can be passed as environment variables to the script:
+
+  O=<kernel_build_path> $0 -- ./hid_bpf
+
+or
+
+  KBUILD_OUTPUT=<kernel_build_path> $0 -- ./hid_bpf
+
+Options:
+
+	-u)		Update the boot2container script to a newer version.
+	-d)		Update the output directory (default: ${OUTPUT_DIR})
+	-j)		Number of jobs for compilation, similar to -j in make
+			(default: ${NUM_COMPILE_JOBS})
+	-s)		Instead of powering off the VM, start an interactive
+			shell. If <command> is specified, the shell runs after
+			the command finishes executing
+EOF
+}
+
+download()
+{
+	local file="$1"
+
+	echo "Downloading $file..." >&2
+	curl -Lsf "$file" -o "${@:2}"
+}
+
+recompile_kernel()
+{
+	local kernel_checkout="$1"
+	local make_command="$2"
+
+	cd "${kernel_checkout}"
+
+	${make_command} olddefconfig
+	${make_command}
+}
+
+update_selftests()
+{
+	local kernel_checkout="$1"
+	local selftests_dir="${kernel_checkout}/tools/testing/selftests/hid"
+
+	cd "${selftests_dir}"
+	${make_command}
+}
+
+run_vm()
+{
+	local b2c="$1"
+	local kernel_bzimage="$2"
+	local command="$3"
+	local post_command=""
+
+	if ! which "${QEMU_BINARY}" &> /dev/null; then
+		cat <<EOF
+Could not find ${QEMU_BINARY}
+Please install qemu or set the QEMU_BINARY environment variable.
+EOF
+		exit 1
+	fi
+
+	# alpine (used in post-container requires the PATH to have /bin
+	export PATH=$PATH:/bin
+
+	if [[ "${debug_shell}" != "yes" ]]
+	then
+		touch ${OUTPUT_DIR}/${LOG_FILE}
+		command="mount bpffs -t bpf /sys/fs/bpf/; set -o pipefail ; ${command} 2>&1 | tee ${OUTPUT_DIR}/${LOG_FILE}"
+		post_command="cat ${OUTPUT_DIR}/${LOG_FILE}"
+	else
+		command="mount bpffs -t bpf /sys/fs/bpf/; ${command}"
+	fi
+
+	set +e
+	$b2c --command "${command}" \
+	     --kernel ${kernel_bzimage} \
+	     --workdir ${OUTPUT_DIR} \
+	     --image ${CONTAINER_IMAGE}
+
+	echo $? > ${OUTPUT_DIR}/${EXIT_STATUS_FILE}
+
+	set -e
+
+	${post_command}
+}
+
+is_rel_path()
+{
+	local path="$1"
+
+	[[ ${path:0:1} != "/" ]]
+}
+
+do_update_kconfig()
+{
+	local kernel_checkout="$1"
+	local kconfig_file="$2"
+
+	rm -f "$kconfig_file" 2> /dev/null
+
+	for config in "${KCONFIG_REL_PATHS[@]}"; do
+		local kconfig_src="${config}"
+		cat "$kconfig_src" >> "$kconfig_file"
+	done
+}
+
+update_kconfig()
+{
+	local kernel_checkout="$1"
+	local kconfig_file="$2"
+
+	if [[ -f "${kconfig_file}" ]]; then
+		local local_modified="$(stat -c %Y "${kconfig_file}")"
+
+		for config in "${KCONFIG_REL_PATHS[@]}"; do
+			local kconfig_src="${config}"
+			local src_modified="$(stat -c %Y "${kconfig_src}")"
+			# Only update the config if it has been updated after the
+			# previously cached config was created. This avoids
+			# unnecessarily compiling the kernel and selftests.
+			if [[ "${src_modified}" -gt "${local_modified}" ]]; then
+				do_update_kconfig "$kernel_checkout" "$kconfig_file"
+				# Once we have found one outdated configuration
+				# there is no need to check other ones.
+				break
+			fi
+		done
+	else
+		do_update_kconfig "$kernel_checkout" "$kconfig_file"
+	fi
+}
+
+main()
+{
+	local script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)"
+	local kernel_checkout=$(realpath "${script_dir}"/../../../../)
+	# By default the script searches for the kernel in the checkout directory but
+	# it also obeys environment variables O= and KBUILD_OUTPUT=
+	local kernel_bzimage="${kernel_checkout}/${BZIMAGE}"
+	local command="${DEFAULT_COMMAND}"
+	local update_b2c="no"
+	local debug_shell="no"
+
+	while getopts ':hsud:j:' opt; do
+		case ${opt} in
+		u)
+			update_b2c="yes"
+			;;
+		d)
+			OUTPUT_DIR="$OPTARG"
+			;;
+		j)
+			NUM_COMPILE_JOBS="$OPTARG"
+			;;
+		s)
+			command="/bin/sh"
+			debug_shell="yes"
+			;;
+		h)
+			usage
+			exit 0
+			;;
+		\? )
+			echo "Invalid Option: -$OPTARG"
+			usage
+			exit 1
+			;;
+		: )
+			echo "Invalid Option: -$OPTARG requires an argument"
+			usage
+			exit 1
+			;;
+		esac
+	done
+	shift $((OPTIND -1))
+
+	# trap 'catch "$?"' EXIT
+
+	if [[ "${debug_shell}" == "no" ]]; then
+		if [[ $# -eq 0 ]]; then
+			echo "No command specified, will run ${DEFAULT_COMMAND} in the vm"
+		else
+			command="$@"
+
+			if [[ "${command}" == "/bin/bash" || "${command}" == "bash" ]]
+			then
+				debug_shell="yes"
+			fi
+		fi
+	fi
+
+	local kconfig_file="${OUTPUT_DIR}/latest.config"
+	local make_command="make -j ${NUM_COMPILE_JOBS} KCONFIG_CONFIG=${kconfig_file}"
+
+	# Figure out where the kernel is being built.
+	# O takes precedence over KBUILD_OUTPUT.
+	if [[ "${O:=""}" != "" ]]; then
+		if is_rel_path "${O}"; then
+			O="$(realpath "${PWD}/${O}")"
+		fi
+		kernel_bzimage="${O}/${BZIMAGE}"
+		make_command="${make_command} O=${O}"
+	elif [[ "${KBUILD_OUTPUT:=""}" != "" ]]; then
+		if is_rel_path "${KBUILD_OUTPUT}"; then
+			KBUILD_OUTPUT="$(realpath "${PWD}/${KBUILD_OUTPUT}")"
+		fi
+		kernel_bzimage="${KBUILD_OUTPUT}/${BZIMAGE}"
+		make_command="${make_command} KBUILD_OUTPUT=${KBUILD_OUTPUT}"
+	fi
+
+	local b2c="${OUTPUT_DIR}/vm2c.py"
+
+	echo "Output directory: ${OUTPUT_DIR}"
+
+	mkdir -p "${OUTPUT_DIR}"
+	update_kconfig "${kernel_checkout}" "${kconfig_file}"
+
+	recompile_kernel "${kernel_checkout}" "${make_command}"
+
+	if [[ "${update_b2c}" == "no" && ! -f "${b2c}" ]]; then
+		echo "vm2c script not found in ${b2c}"
+		update_b2c="yes"
+	fi
+
+	if [[ "${update_b2c}" == "yes" ]]; then
+		download $B2C_URL $b2c
+		chmod +x $b2c
+	fi
+
+	update_selftests "${kernel_checkout}" "${make_command}"
+	run_vm $b2c "${kernel_bzimage}" "${command}"
+	if [[ "${debug_shell}" != "yes" ]]; then
+		echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}"
+	fi
+
+	exit $(cat ${OUTPUT_DIR}/${EXIT_STATUS_FILE})
+}
+
+main "$@"
-- 
cgit 


From 633ba3be76426056a047b2a355082d94afa9230d Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Fri, 13 Jan 2023 10:09:28 +0100
Subject: selftests: hid: allow to compile hid_bpf with LLVM

clang doesn't like to compile a source to the final binary directly:

clang-14: error: cannot specify -o when generating multiple output files

So split the final rule in 2, and ensure we compile all dependencies
before.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/Makefile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/Makefile b/tools/testing/selftests/hid/Makefile
index f6fc5cfff770..83e8f87d643a 100644
--- a/tools/testing/selftests/hid/Makefile
+++ b/tools/testing/selftests/hid/Makefile
@@ -91,10 +91,6 @@ $(MAKE_DIRS):
 	$(call msg,MKDIR,,$@)
 	$(Q)mkdir -p $@
 
-$(OUTPUT)/%.o: %.c
-	$(call msg,CC,,$@)
-	$(Q)$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
-
 # LLVM's ld.lld doesn't support all the architectures, so use it only on x86
 ifeq ($(SRCARCH),x86)
 LLD := lld
@@ -223,7 +219,11 @@ $(BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(OUTPUT)
 	$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
 	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked1.o) name $(notdir $(<:.bpf.o=)) > $@
 
-$(OUTPUT)/%:%.c $(BPF_SKELS) $(KHDR_INCLUDES)/linux/hid.h
+$(OUTPUT)/%.o: %.c $(BPF_SKELS) $(KHDR_INCLUDES)/linux/hid.h
+	$(call msg,CC,,$@)
+	$(Q)$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
+
+$(OUTPUT)/%: $(OUTPUT)/%.o
 	$(call msg,BINARY,,$@)
 	$(Q)$(LINK.c) $^ $(LDLIBS) -o $@
 
-- 
cgit 


From cea6c4d969bb104d2847e1f4ed708ae0126d6f42 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Fri, 13 Jan 2023 10:09:29 +0100
Subject: selftests: hid: attach/detach 2 bpf programs, not just one

Add a second BPF program to attach to the device, as the development of
this feature showed that we also need to ensure we can detach multiple
programs to a device (hid_bpf_link->hid_table_index was actually not set
initially, and this lead to any BPF program not being released except for
the first one).

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/hid_bpf.c   |  8 +++++++-
 tools/testing/selftests/hid/progs/hid.c | 13 +++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index 6615c26fb5dd..d215bb492eb4 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -616,6 +616,7 @@ TEST_F(hid_bpf, test_attach_detach)
 {
 	const struct test_program progs[] = {
 		{ .name = "hid_first_event" },
+		{ .name = "hid_second_event" },
 	};
 	__u8 buf[10] = {0};
 	int err;
@@ -634,7 +635,10 @@ TEST_F(hid_bpf, test_attach_detach)
 	ASSERT_EQ(buf[0], 1);
 	ASSERT_EQ(buf[2], 47);
 
-	/* pin the program and immediately unpin it */
+	/* make sure both programs are run */
+	ASSERT_EQ(buf[3], 52);
+
+	/* pin the first program and immediately unpin it */
 #define PIN_PATH "/sys/fs/bpf/hid_first_event"
 	bpf_program__pin(self->skel->progs.hid_first_event, PIN_PATH);
 	remove(PIN_PATH);
@@ -660,6 +664,7 @@ TEST_F(hid_bpf, test_attach_detach)
 	ASSERT_EQ(buf[0], 1);
 	ASSERT_EQ(buf[1], 47);
 	ASSERT_EQ(buf[2], 0);
+	ASSERT_EQ(buf[3], 0);
 
 	/* re-attach our program */
 
@@ -677,6 +682,7 @@ TEST_F(hid_bpf, test_attach_detach)
 	ASSERT_EQ(err, 6) TH_LOG("read_hidraw");
 	ASSERT_EQ(buf[0], 1);
 	ASSERT_EQ(buf[2], 47);
+	ASSERT_EQ(buf[3], 52);
 }
 
 /*
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
index 6a86af0aa545..88c593f753b5 100644
--- a/tools/testing/selftests/hid/progs/hid.c
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -32,6 +32,19 @@ int BPF_PROG(hid_first_event, struct hid_bpf_ctx *hid_ctx)
 	return hid_ctx->size;
 }
 
+SEC("?fmod_ret/hid_bpf_device_event")
+int BPF_PROG(hid_second_event, struct hid_bpf_ctx *hid_ctx)
+{
+	__u8 *rw_data = hid_bpf_get_data(hid_ctx, 0 /* offset */, 4 /* size */);
+
+	if (!rw_data)
+		return 0; /* EPERM check */
+
+	rw_data[3] = rw_data[2] + 5;
+
+	return hid_ctx->size;
+}
+
 SEC("?fmod_ret/hid_bpf_device_event")
 int BPF_PROG(hid_change_report_id, struct hid_bpf_ctx *hid_ctx)
 {
-- 
cgit 


From d9db1bb55f1032020328ede7a88b2490574646d3 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Fri, 13 Jan 2023 10:09:30 +0100
Subject: selftests: hid: ensure the program is correctly pinned

Turns out that if bpffs was not mounted, the test was silently passing.

So ensure it passes by checking the mount command result.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/hid_bpf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index d215bb492eb4..3b204a15e44b 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -640,7 +640,8 @@ TEST_F(hid_bpf, test_attach_detach)
 
 	/* pin the first program and immediately unpin it */
 #define PIN_PATH "/sys/fs/bpf/hid_first_event"
-	bpf_program__pin(self->skel->progs.hid_first_event, PIN_PATH);
+	err = bpf_program__pin(self->skel->progs.hid_first_event, PIN_PATH);
+	ASSERT_OK(err) TH_LOG("error while calling bpf_program__pin");
 	remove(PIN_PATH);
 #undef PIN_PATH
 	usleep(100000);
-- 
cgit 


From 2574917a2b48f87bd45f1c3e6d80c976f4fe3c3d Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Fri, 13 Jan 2023 10:09:31 +0100
Subject: selftests: hid: prepare tests for HID_BPF API change

We plan on changing the return value of hid_bpf_attach_prog().
Instead of returning an error code, it will return an fd to a bpf_link.
This bpf_link is responsible for the binding between the bpf program and
the hid device.

Add a fallback mechanism to not break bisections by pinning the program
when we run this test against the non changed kernel.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/hid_bpf.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index 3b204a15e44b..9a262fe99b32 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -444,13 +444,21 @@ FIXTURE(hid_bpf) {
 	int hid_id;
 	pthread_t tid;
 	struct hid *skel;
+	int hid_links[3]; /* max number of programs loaded in a single test */
 };
 static void detach_bpf(FIXTURE_DATA(hid_bpf) * self)
 {
+	int i;
+
 	if (self->hidraw_fd)
 		close(self->hidraw_fd);
 	self->hidraw_fd = 0;
 
+	for (i = 0; i < ARRAY_SIZE(self->hid_links); i++) {
+		if (self->hid_links[i])
+			close(self->hid_links[i]);
+	}
+
 	hid__destroy(self->skel);
 	self->skel = NULL;
 }
@@ -512,6 +520,9 @@ static void load_programs(const struct test_program programs[],
 			    .ctx_size_in = sizeof(args),
 	);
 
+	ASSERT_LE(progs_count, ARRAY_SIZE(self->hid_links))
+		TH_LOG("too many programs are to be loaded");
+
 	/* open the bpf file */
 	self->skel = hid__open();
 	ASSERT_OK_PTR(self->skel) TEARDOWN_LOG("Error while calling hid__open");
@@ -543,7 +554,10 @@ static void load_programs(const struct test_program programs[],
 		args.hid = self->hid_id;
 		args.insert_head = programs[i].insert_head;
 		err = bpf_prog_test_run_opts(attach_fd, &tattr);
-		ASSERT_OK(args.retval) TH_LOG("attach_hid(%s): %d", programs[i].name, args.retval);
+		ASSERT_GE(args.retval, 0)
+			TH_LOG("attach_hid(%s): %d", programs[i].name, args.retval);
+
+		self->hid_links[i] = args.retval;
 	}
 
 	self->hidraw_fd = open_hidraw(self->dev_id);
@@ -619,10 +633,17 @@ TEST_F(hid_bpf, test_attach_detach)
 		{ .name = "hid_second_event" },
 	};
 	__u8 buf[10] = {0};
-	int err;
+	int err, link;
 
 	LOAD_PROGRAMS(progs);
 
+	link = self->hid_links[0];
+	/* we might not be using the new code path where hid_bpf_attach_prog()
+	 * returns a link.
+	 */
+	if (!link)
+		link = bpf_program__fd(self->skel->progs.hid_first_event);
+
 	/* inject one event */
 	buf[0] = 1;
 	buf[1] = 42;
@@ -640,8 +661,8 @@ TEST_F(hid_bpf, test_attach_detach)
 
 	/* pin the first program and immediately unpin it */
 #define PIN_PATH "/sys/fs/bpf/hid_first_event"
-	err = bpf_program__pin(self->skel->progs.hid_first_event, PIN_PATH);
-	ASSERT_OK(err) TH_LOG("error while calling bpf_program__pin");
+	err = bpf_obj_pin(link, PIN_PATH);
+	ASSERT_OK(err) TH_LOG("error while calling bpf_obj_pin");
 	remove(PIN_PATH);
 #undef PIN_PATH
 	usleep(100000);
-- 
cgit 


From 6e021d64e4897759e19ce431ee6a366338c00be8 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Fri, 13 Jan 2023 10:09:33 +0100
Subject: selftests: hid: enforce new attach API

Now that the new API for hid_bpf_attach_prog() is in place, ensure we
get an fd when calling this function. And remove the fallback code.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/hid_bpf.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index 9a262fe99b32..2cf96f818f25 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -638,11 +638,7 @@ TEST_F(hid_bpf, test_attach_detach)
 	LOAD_PROGRAMS(progs);
 
 	link = self->hid_links[0];
-	/* we might not be using the new code path where hid_bpf_attach_prog()
-	 * returns a link.
-	 */
-	if (!link)
-		link = bpf_program__fd(self->skel->progs.hid_first_event);
+	ASSERT_GT(link, 0) TH_LOG("HID-BPF link not created");
 
 	/* inject one event */
 	buf[0] = 1;
-- 
cgit 


From 92afc5329a5b23d876b215b783d200352d5aaea6 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Wed, 18 Jan 2023 15:56:44 +0800
Subject: selftests/bpf: Fix build errors if CONFIG_NF_CONNTRACK=m

If CONFIG_NF_CONNTRACK=m, there are no definitions of NF_NAT_MANIP_SRC
and NF_NAT_MANIP_DST in vmlinux.h, build test_bpf_nf.c failed.

$ make -C tools/testing/selftests/bpf/

  CLNG-BPF [test_maps] test_bpf_nf.bpf.o
progs/test_bpf_nf.c:160:42: error: use of undeclared identifier 'NF_NAT_MANIP_SRC'
                bpf_ct_set_nat_info(ct, &saddr, sport, NF_NAT_MANIP_SRC);
                                                       ^
progs/test_bpf_nf.c:163:42: error: use of undeclared identifier 'NF_NAT_MANIP_DST'
                bpf_ct_set_nat_info(ct, &daddr, dport, NF_NAT_MANIP_DST);
                                                       ^
2 errors generated.

Copy the definitions in include/net/netfilter/nf_nat.h to test_bpf_nf.c,
in order to avoid redefinitions if CONFIG_NF_CONNTRACK=y, rename them with
___local suffix. This is similar with commit 1058b6a78db2 ("selftests/bpf:
Do not fail build if CONFIG_NF_CONNTRACK=m/n").

Fixes: b06b45e82b59 ("selftests/bpf: add tests for bpf_ct_set_nat_info kfunc")
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/1674028604-7113-1-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/progs/test_bpf_nf.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
index 227e85e85dda..9fc603c9d673 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
@@ -34,6 +34,11 @@ __be16 dport = 0;
 int test_exist_lookup = -ENOENT;
 u32 test_exist_lookup_mark = 0;
 
+enum nf_nat_manip_type___local {
+	NF_NAT_MANIP_SRC___local,
+	NF_NAT_MANIP_DST___local
+};
+
 struct nf_conn;
 
 struct bpf_ct_opts___local {
@@ -58,7 +63,7 @@ int bpf_ct_change_timeout(struct nf_conn *, u32) __ksym;
 int bpf_ct_set_status(struct nf_conn *, u32) __ksym;
 int bpf_ct_change_status(struct nf_conn *, u32) __ksym;
 int bpf_ct_set_nat_info(struct nf_conn *, union nf_inet_addr *,
-			int port, enum nf_nat_manip_type) __ksym;
+			int port, enum nf_nat_manip_type___local) __ksym;
 
 static __always_inline void
 nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
@@ -157,10 +162,10 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 
 		/* snat */
 		saddr.ip = bpf_get_prandom_u32();
-		bpf_ct_set_nat_info(ct, &saddr, sport, NF_NAT_MANIP_SRC);
+		bpf_ct_set_nat_info(ct, &saddr, sport, NF_NAT_MANIP_SRC___local);
 		/* dnat */
 		daddr.ip = bpf_get_prandom_u32();
-		bpf_ct_set_nat_info(ct, &daddr, dport, NF_NAT_MANIP_DST);
+		bpf_ct_set_nat_info(ct, &daddr, dport, NF_NAT_MANIP_DST___local);
 
 		ct_ins = bpf_ct_insert_entry(ct);
 		if (ct_ins) {
-- 
cgit 


From 32d118ad50a5afecb74358bcefc5cb6ea6ccfc2b Mon Sep 17 00:00:00 2001
From: Daniel Verkamp <dverkamp@chromium.org>
Date: Thu, 15 Dec 2022 00:12:02 +0000
Subject: selftests/memfd: add tests for F_SEAL_EXEC

Basic tests to ensure that user/group/other execute bits cannot be changed
after applying F_SEAL_EXEC to a memfd.

Link: https://lkml.kernel.org/r/20221215001205.51969-3-jeffxu@google.com
Signed-off-by: Daniel Verkamp <dverkamp@chromium.org>
Co-developed-by: Jeff Xu <jeffxu@google.com>
Signed-off-by: Jeff Xu <jeffxu@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: David Herrmann <dh.herrmann@gmail.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jorge Lucangeli Obes <jorgelo@chromium.org>
Cc: kernel test robot <lkp@intel.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/memfd/memfd_test.c | 123 ++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 94df2692e6e4..f18a15a1f275 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -28,12 +28,38 @@
 #define MFD_DEF_SIZE 8192
 #define STACK_SIZE 65536
 
+#define F_SEAL_EXEC	0x0020
+
 /*
  * Default is not to test hugetlbfs
  */
 static size_t mfd_def_size = MFD_DEF_SIZE;
 static const char *memfd_str = MEMFD_STR;
 
+static ssize_t fd2name(int fd, char *buf, size_t bufsize)
+{
+	char buf1[PATH_MAX];
+	int size;
+	ssize_t nbytes;
+
+	size = snprintf(buf1, PATH_MAX, "/proc/self/fd/%d", fd);
+	if (size < 0) {
+		printf("snprintf(%d) failed on %m\n", fd);
+		abort();
+	}
+
+	/*
+	 * reserver one byte for string termination.
+	 */
+	nbytes = readlink(buf1, buf, bufsize-1);
+	if (nbytes == -1) {
+		printf("readlink(%s) failed %m\n", buf1);
+		abort();
+	}
+	buf[nbytes] = '\0';
+	return nbytes;
+}
+
 static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
 {
 	int r, fd;
@@ -98,11 +124,14 @@ static unsigned int mfd_assert_get_seals(int fd)
 
 static void mfd_assert_has_seals(int fd, unsigned int seals)
 {
+	char buf[PATH_MAX];
+	int nbytes;
 	unsigned int s;
+	fd2name(fd, buf, PATH_MAX);
 
 	s = mfd_assert_get_seals(fd);
 	if (s != seals) {
-		printf("%u != %u = GET_SEALS(%d)\n", seals, s, fd);
+		printf("%u != %u = GET_SEALS(%s)\n", seals, s, buf);
 		abort();
 	}
 }
@@ -594,6 +623,64 @@ static void mfd_fail_grow_write(int fd)
 	}
 }
 
+static void mfd_assert_mode(int fd, int mode)
+{
+	struct stat st;
+	char buf[PATH_MAX];
+	int nbytes;
+
+	fd2name(fd, buf, PATH_MAX);
+
+	if (fstat(fd, &st) < 0) {
+		printf("fstat(%s) failed: %m\n", buf);
+		abort();
+	}
+
+	if ((st.st_mode & 07777) != mode) {
+		printf("fstat(%s) wrong file mode 0%04o, but expected 0%04o\n",
+		       buf, (int)st.st_mode & 07777, mode);
+		abort();
+	}
+}
+
+static void mfd_assert_chmod(int fd, int mode)
+{
+	char buf[PATH_MAX];
+	int nbytes;
+
+	fd2name(fd, buf, PATH_MAX);
+
+	if (fchmod(fd, mode) < 0) {
+		printf("fchmod(%s, 0%04o) failed: %m\n", buf, mode);
+		abort();
+	}
+
+	mfd_assert_mode(fd, mode);
+}
+
+static void mfd_fail_chmod(int fd, int mode)
+{
+	struct stat st;
+	char buf[PATH_MAX];
+	int nbytes;
+
+	fd2name(fd, buf, PATH_MAX);
+
+	if (fstat(fd, &st) < 0) {
+		printf("fstat(%s) failed: %m\n", buf);
+		abort();
+	}
+
+	if (fchmod(fd, mode) == 0) {
+		printf("fchmod(%s, 0%04o) didn't fail as expected\n",
+		       buf, mode);
+		abort();
+	}
+
+	/* verify that file mode bits did not change */
+	mfd_assert_mode(fd, st.st_mode & 07777);
+}
+
 static int idle_thread_fn(void *arg)
 {
 	sigset_t set;
@@ -880,6 +967,39 @@ static void test_seal_resize(void)
 	close(fd);
 }
 
+/*
+ * Test SEAL_EXEC
+ * Test that chmod() cannot change x bits after sealing
+ */
+static void test_seal_exec(void)
+{
+	int fd;
+
+	printf("%s SEAL-EXEC\n", memfd_str);
+
+	fd = mfd_assert_new("kern_memfd_seal_exec",
+			    mfd_def_size,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+	mfd_assert_mode(fd, 0777);
+
+	mfd_assert_chmod(fd, 0644);
+
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_EXEC);
+	mfd_assert_has_seals(fd, F_SEAL_EXEC);
+
+	mfd_assert_chmod(fd, 0600);
+	mfd_fail_chmod(fd, 0777);
+	mfd_fail_chmod(fd, 0670);
+	mfd_fail_chmod(fd, 0605);
+	mfd_fail_chmod(fd, 0700);
+	mfd_fail_chmod(fd, 0100);
+	mfd_assert_chmod(fd, 0666);
+
+	close(fd);
+}
+
 /*
  * Test sharing via dup()
  * Test that seals are shared between dupped FDs and they're all equal.
@@ -1059,6 +1179,7 @@ int main(int argc, char **argv)
 	test_seal_shrink();
 	test_seal_grow();
 	test_seal_resize();
+	test_seal_exec();
 
 	test_share_dup("SHARE-DUP", "");
 	test_share_mmap("SHARE-MMAP", "");
-- 
cgit 


From 11f75a01448f1b7a739e75dbd8f17b844fcfc510 Mon Sep 17 00:00:00 2001
From: Jeff Xu <jeffxu@google.com>
Date: Thu, 15 Dec 2022 00:12:05 +0000
Subject: selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC

Tests to verify MFD_NOEXEC, MFD_EXEC and vm.memfd_noexec sysctl.

Link: https://lkml.kernel.org/r/20221215001205.51969-6-jeffxu@google.com
Signed-off-by: Jeff Xu <jeffxu@google.com>
Co-developed-by: Daniel Verkamp <dverkamp@chromium.org>
Signed-off-by: Daniel Verkamp <dverkamp@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: David Herrmann <dh.herrmann@gmail.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jorge Lucangeli Obes <jorgelo@chromium.org>
Cc: kernel test robot <lkp@intel.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/memfd/fuse_test.c  |   1 +
 tools/testing/selftests/memfd/memfd_test.c | 228 ++++++++++++++++++++++++++++-
 2 files changed, 224 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c
index be675002f918..93798c8c5d54 100644
--- a/tools/testing/selftests/memfd/fuse_test.c
+++ b/tools/testing/selftests/memfd/fuse_test.c
@@ -22,6 +22,7 @@
 #include <linux/falloc.h>
 #include <fcntl.h>
 #include <linux/memfd.h>
+#include <linux/types.h>
 #include <sched.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index f18a15a1f275..ae71f15f790d 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -30,6 +30,14 @@
 
 #define F_SEAL_EXEC	0x0020
 
+#define F_WX_SEALS (F_SEAL_SHRINK | \
+		    F_SEAL_GROW | \
+		    F_SEAL_WRITE | \
+		    F_SEAL_FUTURE_WRITE | \
+		    F_SEAL_EXEC)
+
+#define MFD_NOEXEC_SEAL	0x0008U
+
 /*
  * Default is not to test hugetlbfs
  */
@@ -80,6 +88,37 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
 	return fd;
 }
 
+static void sysctl_assert_write(const char *val)
+{
+	int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC);
+
+	if (fd < 0) {
+		printf("open sysctl failed\n");
+		abort();
+	}
+
+	if (write(fd, val, strlen(val)) < 0) {
+		printf("write sysctl failed\n");
+		abort();
+	}
+}
+
+static void sysctl_fail_write(const char *val)
+{
+	int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC);
+
+	if (fd < 0) {
+		printf("open sysctl failed\n");
+		abort();
+	}
+
+	if (write(fd, val, strlen(val)) >= 0) {
+		printf("write sysctl %s succeeded, but failure expected\n",
+				val);
+		abort();
+	}
+}
+
 static int mfd_assert_reopen_fd(int fd_in)
 {
 	int fd;
@@ -758,6 +797,9 @@ static void test_create(void)
 	mfd_fail_new("", ~0);
 	mfd_fail_new("", 0x80000000U);
 
+	/* verify EXEC and NOEXEC_SEAL can't both be set */
+	mfd_fail_new("", MFD_EXEC | MFD_NOEXEC_SEAL);
+
 	/* verify MFD_CLOEXEC is allowed */
 	fd = mfd_assert_new("", 0, MFD_CLOEXEC);
 	close(fd);
@@ -969,20 +1011,21 @@ static void test_seal_resize(void)
 
 /*
  * Test SEAL_EXEC
- * Test that chmod() cannot change x bits after sealing
+ * Test fd is created with exec and allow sealing.
+ * chmod() cannot change x bits after sealing.
  */
-static void test_seal_exec(void)
+static void test_exec_seal(void)
 {
 	int fd;
 
 	printf("%s SEAL-EXEC\n", memfd_str);
 
+	printf("%s	Apply SEAL_EXEC\n", memfd_str);
 	fd = mfd_assert_new("kern_memfd_seal_exec",
 			    mfd_def_size,
-			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC);
 
 	mfd_assert_mode(fd, 0777);
-
 	mfd_assert_chmod(fd, 0644);
 
 	mfd_assert_has_seals(fd, 0);
@@ -996,10 +1039,181 @@ static void test_seal_exec(void)
 	mfd_fail_chmod(fd, 0700);
 	mfd_fail_chmod(fd, 0100);
 	mfd_assert_chmod(fd, 0666);
+	mfd_assert_write(fd);
+	close(fd);
+
+	printf("%s	Apply ALL_SEALS\n", memfd_str);
+	fd = mfd_assert_new("kern_memfd_seal_exec",
+			    mfd_def_size,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC);
+
+	mfd_assert_mode(fd, 0777);
+	mfd_assert_chmod(fd, 0700);
+
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_EXEC);
+	mfd_assert_has_seals(fd, F_WX_SEALS);
 
+	mfd_fail_chmod(fd, 0711);
+	mfd_fail_chmod(fd, 0600);
+	mfd_fail_write(fd);
+	close(fd);
+}
+
+/*
+ * Test EXEC_NO_SEAL
+ * Test fd is created with exec and not allow sealing.
+ */
+static void test_exec_no_seal(void)
+{
+	int fd;
+
+	printf("%s EXEC_NO_SEAL\n", memfd_str);
+
+	/* Create with EXEC but without ALLOW_SEALING */
+	fd = mfd_assert_new("kern_memfd_exec_no_sealing",
+			    mfd_def_size,
+			    MFD_CLOEXEC | MFD_EXEC);
+	mfd_assert_mode(fd, 0777);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+	mfd_assert_chmod(fd, 0666);
 	close(fd);
 }
 
+/*
+ * Test memfd_create with MFD_NOEXEC flag
+ */
+static void test_noexec_seal(void)
+{
+	int fd;
+
+	printf("%s NOEXEC_SEAL\n", memfd_str);
+
+	/* Create with NOEXEC and ALLOW_SEALING */
+	fd = mfd_assert_new("kern_memfd_noexec",
+			    mfd_def_size,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_NOEXEC_SEAL);
+	mfd_assert_mode(fd, 0666);
+	mfd_assert_has_seals(fd, F_SEAL_EXEC);
+	mfd_fail_chmod(fd, 0777);
+	close(fd);
+
+	/* Create with NOEXEC but without ALLOW_SEALING */
+	fd = mfd_assert_new("kern_memfd_noexec",
+			    mfd_def_size,
+			    MFD_CLOEXEC | MFD_NOEXEC_SEAL);
+	mfd_assert_mode(fd, 0666);
+	mfd_assert_has_seals(fd, F_SEAL_EXEC);
+	mfd_fail_chmod(fd, 0777);
+	close(fd);
+}
+
+static void test_sysctl_child(void)
+{
+	int fd;
+
+	printf("%s sysctl 0\n", memfd_str);
+	sysctl_assert_write("0");
+	fd = mfd_assert_new("kern_memfd_sysctl_0",
+			    mfd_def_size,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+	mfd_assert_mode(fd, 0777);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_chmod(fd, 0644);
+	close(fd);
+
+	printf("%s sysctl 1\n", memfd_str);
+	sysctl_assert_write("1");
+	fd = mfd_assert_new("kern_memfd_sysctl_1",
+			    mfd_def_size,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+	mfd_assert_mode(fd, 0666);
+	mfd_assert_has_seals(fd, F_SEAL_EXEC);
+	mfd_fail_chmod(fd, 0777);
+	sysctl_fail_write("0");
+	close(fd);
+
+	printf("%s sysctl 2\n", memfd_str);
+	sysctl_assert_write("2");
+	mfd_fail_new("kern_memfd_sysctl_2",
+		MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	sysctl_fail_write("0");
+	sysctl_fail_write("1");
+}
+
+static int newpid_thread_fn(void *arg)
+{
+	test_sysctl_child();
+	return 0;
+}
+
+static void test_sysctl_child2(void)
+{
+	int fd;
+
+	sysctl_fail_write("0");
+	fd = mfd_assert_new("kern_memfd_sysctl_1",
+			    mfd_def_size,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+	mfd_assert_mode(fd, 0666);
+	mfd_assert_has_seals(fd, F_SEAL_EXEC);
+	mfd_fail_chmod(fd, 0777);
+	close(fd);
+}
+
+static int newpid_thread_fn2(void *arg)
+{
+	test_sysctl_child2();
+	return 0;
+}
+static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *))
+{
+	uint8_t *stack;
+	pid_t pid;
+
+	stack = malloc(STACK_SIZE);
+	if (!stack) {
+		printf("malloc(STACK_SIZE) failed: %m\n");
+		abort();
+	}
+
+	pid = clone(fn,
+		    stack + STACK_SIZE,
+		    SIGCHLD | flags,
+		    NULL);
+	if (pid < 0) {
+		printf("clone() failed: %m\n");
+		abort();
+	}
+
+	return pid;
+}
+
+static void join_newpid_thread(pid_t pid)
+{
+	waitpid(pid, NULL, 0);
+}
+
+/*
+ * Test sysctl
+ * A very basic sealing test to see whether setting/retrieving seals works.
+ */
+static void test_sysctl(void)
+{
+	int pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn);
+
+	join_newpid_thread(pid);
+
+	printf("%s child ns\n", memfd_str);
+	sysctl_assert_write("1");
+
+	pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2);
+	join_newpid_thread(pid);
+}
+
 /*
  * Test sharing via dup()
  * Test that seals are shared between dupped FDs and they're all equal.
@@ -1173,13 +1387,15 @@ int main(int argc, char **argv)
 
 	test_create();
 	test_basic();
+	test_exec_seal();
+	test_exec_no_seal();
+	test_noexec_seal();
 
 	test_seal_write();
 	test_seal_future_write();
 	test_seal_shrink();
 	test_seal_grow();
 	test_seal_resize();
-	test_seal_exec();
 
 	test_share_dup("SHARE-DUP", "");
 	test_share_mmap("SHARE-MMAP", "");
@@ -1195,6 +1411,8 @@ int main(int argc, char **argv)
 	test_share_fork("SHARE-FORK", SHARED_FT_STR);
 	join_idle_thread(pid);
 
+	test_sysctl();
+
 	printf("memfd: DONE\n");
 
 	return 0;
-- 
cgit 


From 553b014244298d9f807286d6a71d722bc1f50f84 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 5 Dec 2022 23:08:28 +0000
Subject: selftests/damon/sysfs: test filters directory

Add simple test cases for scheme filters of DAMON sysfs interface.  The
test cases check if the files are populated as expected, receives some
valid inputs, and refuses some invalid inputs.

Link: https://lkml.kernel.org/r/20221205230830.144349-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/sysfs.sh | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh
index db4942383a50..a00336ffdcad 100644
--- a/tools/testing/selftests/damon/sysfs.sh
+++ b/tools/testing/selftests/damon/sysfs.sh
@@ -96,6 +96,34 @@ test_stats()
 	done
 }
 
+test_filter()
+{
+	filter_dir=$1
+	ensure_file "$filter_dir/type" "exist" "600"
+	ensure_write_succ "$filter_dir/type" "anon" "valid input"
+	ensure_write_succ "$filter_dir/type" "memcg" "valid input"
+	ensure_write_fail "$filter_dir/type" "foo" "invalid input"
+	ensure_file "$filter_dir/matching" "exist" "600"
+	ensure_file "$filter_dir/memcg_path" "exist" "600"
+}
+
+test_filters()
+{
+	filters_dir=$1
+	ensure_dir "$filters_dir" "exist"
+	ensure_file "$filters_dir/nr_filters" "exist" "600"
+	ensure_write_succ  "$filters_dir/nr_filters" "1" "valid input"
+	test_filter "$filters_dir/0"
+
+	ensure_write_succ  "$filters_dir/nr_filters" "2" "valid input"
+	test_filter "$filters_dir/0"
+	test_filter "$filters_dir/1"
+
+	ensure_write_succ "$filters_dir/nr_filters" "0" "valid input"
+	ensure_dir "$filters_dir/0" "not_exist"
+	ensure_dir "$filters_dir/1" "not_exist"
+}
+
 test_watermarks()
 {
 	watermarks_dir=$1
@@ -143,6 +171,7 @@ test_scheme()
 	test_access_pattern "$scheme_dir/access_pattern"
 	test_quotas "$scheme_dir/quotas"
 	test_watermarks "$scheme_dir/watermarks"
+	test_filters "$scheme_dir/filters"
 	test_stats "$scheme_dir/stats"
 	test_tried_regions "$scheme_dir/tried_regions"
 }
-- 
cgit 


From ef1faf0e370a8e33fe625088ddc5fde02cf8c4c4 Mon Sep 17 00:00:00 2001
From: Jianlin Lv <iecedge@gmail.com>
Date: Mon, 19 Dec 2022 16:49:17 +0000
Subject: tools/vm/page_owner_sort: free memory before exit

Although when a process terminates, the kernel will removes memory
associated with that process, It's neither good style nor proper design to
leave it to kernel.  This patch free allocated memory before process exit.

Link: https://lkml.kernel.org/r/20221219164917.14132-1-iecedge@gmail.com
Signed-off-by: Jianlin Lv <iecedge@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/vm/page_owner_sort.c | 65 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index ce860ab94162..7c2ac124cdc8 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -246,15 +246,16 @@ static int search_pattern(regex_t *pattern, char *pattern_str, char *buf)
 	return 0;
 }
 
-static void check_regcomp(regex_t *pattern, const char *regex)
+static bool check_regcomp(regex_t *pattern, const char *regex)
 {
 	int err;
 
 	err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE);
 	if (err != 0 || pattern->re_nsub != 1) {
 		fprintf(stderr, "Invalid pattern %s code %d\n", regex, err);
-		exit(1);
+		return false;
 	}
+	return true;
 }
 
 static char **explode(char sep, const char *str, int *size)
@@ -494,28 +495,28 @@ static bool is_need(char *buf)
 	return true;
 }
 
-static void add_list(char *buf, int len, char *ext_buf)
+static bool add_list(char *buf, int len, char *ext_buf)
 {
 	if (list_size != 0 &&
 		len == list[list_size-1].len &&
 		memcmp(buf, list[list_size-1].txt, len) == 0) {
 		list[list_size-1].num++;
 		list[list_size-1].page_num += get_page_num(buf);
-		return;
+		return true;
 	}
 	if (list_size == max_size) {
 		fprintf(stderr, "max_size too small??\n");
-		exit(1);
+		return false;
 	}
 	if (!is_need(buf))
-		return;
+		return true;
 	list[list_size].pid = get_pid(buf);
 	list[list_size].tgid = get_tgid(buf);
 	list[list_size].comm = get_comm(buf);
 	list[list_size].txt = malloc(len+1);
 	if (!list[list_size].txt) {
 		fprintf(stderr, "Out of memory\n");
-		exit(1);
+		return false;
 	}
 	memcpy(list[list_size].txt, buf, len);
 	list[list_size].txt[len] = 0;
@@ -534,6 +535,7 @@ static void add_list(char *buf, int len, char *ext_buf)
 		printf("loaded %d\r", list_size);
 		fflush(stdout);
 	}
+	return true;
 }
 
 static bool parse_cull_args(const char *arg_str)
@@ -790,12 +792,19 @@ int main(int argc, char **argv)
 		exit(1);
 	}
 
-	check_regcomp(&order_pattern, "order\\s*([0-9]*),");
-	check_regcomp(&pid_pattern, "pid\\s*([0-9]*),");
-	check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) ");
-	check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts");
-	check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,");
-	check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns");
+	if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),"))
+		goto out_order;
+	if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"))
+		goto out_pid;
+	if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "))
+		goto out_tgid;
+	if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"))
+		goto out_comm;
+	if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"))
+		goto out_ts;
+	if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"))
+		goto out_free_ts;
+
 	fstat(fileno(fin), &st);
 	max_size = st.st_size / 100; /* hack ... */
 
@@ -804,7 +813,7 @@ int main(int argc, char **argv)
 	ext_buf = malloc(BUF_SIZE);
 	if (!list || !buf || !ext_buf) {
 		fprintf(stderr, "Out of memory\n");
-		exit(1);
+		goto out_free;
 	}
 
 	for ( ; ; ) {
@@ -812,7 +821,8 @@ int main(int argc, char **argv)
 
 		if (buf_len < 0)
 			break;
-		add_list(buf, buf_len, ext_buf);
+		if (!add_list(buf, buf_len, ext_buf))
+			goto out_free;
 	}
 
 	printf("loaded %d\n", list_size);
@@ -862,11 +872,26 @@ int main(int argc, char **argv)
 			fprintf(fout, "\n");
 		}
 	}
-	regfree(&order_pattern);
-	regfree(&pid_pattern);
-	regfree(&tgid_pattern);
-	regfree(&comm_pattern);
-	regfree(&ts_nsec_pattern);
+
+out_free:
+	if (ext_buf)
+		free(ext_buf);
+	if (buf)
+		free(buf);
+	if (list)
+		free(list);
+out_free_ts:
 	regfree(&free_ts_nsec_pattern);
+out_ts:
+	regfree(&ts_nsec_pattern);
+out_comm:
+	regfree(&comm_pattern);
+out_tgid:
+	regfree(&tgid_pattern);
+out_pid:
+	regfree(&pid_pattern);
+out_order:
+	regfree(&order_pattern);
+
 	return 0;
 }
-- 
cgit 


From a9af8e6bb3e5de8ea9d29c1d318bcfbc5667c939 Mon Sep 17 00:00:00 2001
From: Xu Panda <xu.panda@zte.com.cn>
Date: Fri, 23 Dec 2022 10:50:24 +0800
Subject: selftests/vm: ksm_functional_tests: fix a typo in comment

Fix a typo of "comaring" which should be "comparing".

Link: https://lkml.kernel.org/r/202212231050245952617@zte.com.cn
Signed-off-by: Xu Panda <xu.panda@zte.com.cn>
Signed-off-by: xu xin <xu.xin16@zte.com.cn>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/vm/ksm_functional_tests.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c
index b11b7e5115dc..d8b5b4930412 100644
--- a/tools/testing/selftests/vm/ksm_functional_tests.c
+++ b/tools/testing/selftests/vm/ksm_functional_tests.c
@@ -37,7 +37,7 @@ static bool range_maps_duplicates(char *addr, unsigned long size)
 	/*
 	 * There is no easy way to check if there are KSM pages mapped into
 	 * this range. We only check that the range does not map the same PFN
-	 * twice by comaring each pair of mapped pages.
+	 * twice by comparing each pair of mapped pages.
 	 */
 	for (offs_a = 0; offs_a < size; offs_a += pagesize) {
 		pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
-- 
cgit 


From 541e06b772c1aaffb3b6a245ccface36d7107af2 Mon Sep 17 00:00:00 2001
From: Liam Howlett <liam.howlett@oracle.com>
Date: Thu, 5 Jan 2023 16:05:34 +0000
Subject: maple_tree: remove GFP_ZERO from kmem_cache_alloc() and
 kmem_cache_alloc_bulk()

Preallocations are common in the VMA code to avoid allocating under
certain locking conditions.  The preallocations must also cover the
worst-case scenario.  Removing the GFP_ZERO flag from the
kmem_cache_alloc() (and bulk variant) calls will reduce the amount of time
spent zeroing memory that may not be used.  Only zero out the necessary
area to keep track of the allocations in the maple state.  Zero the entire
node prior to using it in the tree.

This required internal changes to node counting on allocation, so the test
code is also updated.

This restores some micro-benchmark performance: up to +9% in mmtests mmap1
by my testing +10% to +20% in mmap, mmapaddr, mmapmany tests reported by
Red Hat

Link: https://bugzilla.redhat.com/show_bug.cgi?id=2149636
Link: https://lkml.kernel.org/r/20230105160427.2988454-1-Liam.Howlett@oracle.com
Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
Reported-by: Jirka Hladky <jhladky@redhat.com>
Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/maple_tree.c                 | 80 +++++++++++++++++++++-------------------
 tools/testing/radix-tree/maple.c | 18 ++++-----
 2 files changed, 52 insertions(+), 46 deletions(-)

(limited to 'tools')

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 94f0053ec3e0..8db3c336d19f 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -149,13 +149,12 @@ struct maple_subtree_state {
 /* Functions */
 static inline struct maple_node *mt_alloc_one(gfp_t gfp)
 {
-	return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO);
+	return kmem_cache_alloc(maple_node_cache, gfp);
 }
 
 static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes)
 {
-	return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size,
-				     nodes);
+	return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes);
 }
 
 static inline void mt_free_bulk(size_t size, void __rcu **nodes)
@@ -1125,9 +1124,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas)
 {
 	struct maple_alloc *ret, *node = mas->alloc;
 	unsigned long total = mas_allocated(mas);
+	unsigned int req = mas_alloc_req(mas);
 
 	/* nothing or a request pending. */
-	if (unlikely(!total))
+	if (WARN_ON(!total))
 		return NULL;
 
 	if (total == 1) {
@@ -1137,27 +1137,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas)
 		goto single_node;
 	}
 
-	if (!node->node_count) {
+	if (node->node_count == 1) {
 		/* Single allocation in this node. */
 		mas->alloc = node->slot[0];
-		node->slot[0] = NULL;
 		mas->alloc->total = node->total - 1;
 		ret = node;
 		goto new_head;
 	}
-
 	node->total--;
-	ret = node->slot[node->node_count];
-	node->slot[node->node_count--] = NULL;
+	ret = node->slot[--node->node_count];
+	node->slot[node->node_count] = NULL;
 
 single_node:
 new_head:
-	ret->total = 0;
-	ret->node_count = 0;
-	if (ret->request_count) {
-		mas_set_alloc_req(mas, ret->request_count + 1);
-		ret->request_count = 0;
+	if (req) {
+		req++;
+		mas_set_alloc_req(mas, req);
 	}
+
+	memset(ret, 0, sizeof(*ret));
 	return (struct maple_node *)ret;
 }
 
@@ -1176,21 +1174,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used)
 	unsigned long count;
 	unsigned int requested = mas_alloc_req(mas);
 
-	memset(reuse, 0, sizeof(*reuse));
 	count = mas_allocated(mas);
 
-	if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) {
-		if (head->slot[0])
-			head->node_count++;
-		head->slot[head->node_count] = reuse;
+	reuse->request_count = 0;
+	reuse->node_count = 0;
+	if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) {
+		head->slot[head->node_count++] = reuse;
 		head->total++;
 		goto done;
 	}
 
 	reuse->total = 1;
 	if ((head) && !((unsigned long)head & 0x1)) {
-		head->request_count = 0;
 		reuse->slot[0] = head;
+		reuse->node_count = 1;
 		reuse->total += head->total;
 	}
 
@@ -1209,7 +1206,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
 {
 	struct maple_alloc *node;
 	unsigned long allocated = mas_allocated(mas);
-	unsigned long success = allocated;
 	unsigned int requested = mas_alloc_req(mas);
 	unsigned int count;
 	void **slots = NULL;
@@ -1225,24 +1221,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
 		WARN_ON(!allocated);
 	}
 
-	if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) {
+	if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) {
 		node = (struct maple_alloc *)mt_alloc_one(gfp);
 		if (!node)
 			goto nomem_one;
 
-		if (allocated)
+		if (allocated) {
 			node->slot[0] = mas->alloc;
+			node->node_count = 1;
+		} else {
+			node->node_count = 0;
+		}
 
-		success++;
 		mas->alloc = node;
+		node->total = ++allocated;
 		requested--;
 	}
 
 	node = mas->alloc;
+	node->request_count = 0;
 	while (requested) {
 		max_req = MAPLE_ALLOC_SLOTS;
-		if (node->slot[0]) {
-			unsigned int offset = node->node_count + 1;
+		if (node->node_count) {
+			unsigned int offset = node->node_count;
 
 			slots = (void **)&node->slot[offset];
 			max_req -= offset;
@@ -1256,15 +1257,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
 			goto nomem_bulk;
 
 		node->node_count += count;
-		/* zero indexed. */
-		if (slots == (void **)&node->slot)
-			node->node_count--;
-
-		success += count;
+		allocated += count;
 		node = node->slot[0];
+		node->node_count = 0;
+		node->request_count = 0;
 		requested -= count;
 	}
-	mas->alloc->total = success;
+	mas->alloc->total = allocated;
 	return;
 
 nomem_bulk:
@@ -1273,7 +1272,7 @@ nomem_bulk:
 nomem_one:
 	mas_set_alloc_req(mas, requested);
 	if (mas->alloc && !(((unsigned long)mas->alloc & 0x1)))
-		mas->alloc->total = success;
+		mas->alloc->total = allocated;
 	mas_set_err(mas, -ENOMEM);
 }
 
@@ -5734,6 +5733,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
 void mas_destroy(struct ma_state *mas)
 {
 	struct maple_alloc *node;
+	unsigned long total;
 
 	/*
 	 * When using mas_for_each() to insert an expected number of elements,
@@ -5756,14 +5756,20 @@ void mas_destroy(struct ma_state *mas)
 	}
 	mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC);
 
-	while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) {
+	total = mas_allocated(mas);
+	while (total) {
 		node = mas->alloc;
 		mas->alloc = node->slot[0];
-		if (node->node_count > 0)
-			mt_free_bulk(node->node_count,
-				     (void __rcu **)&node->slot[1]);
+		if (node->node_count > 1) {
+			size_t count = node->node_count - 1;
+
+			mt_free_bulk(count, (void __rcu **)&node->slot[1]);
+			total -= count;
+		}
 		kmem_cache_free(maple_node_cache, node);
+		total--;
 	}
+
 	mas->alloc = NULL;
 }
 EXPORT_SYMBOL_GPL(mas_destroy);
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 81fa7ec2e66a..1f36bc1c5d36 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -173,11 +173,11 @@ static noinline void check_new_node(struct maple_tree *mt)
 
 		if (!MAPLE_32BIT) {
 			if (i >= 35)
-				e = i - 35;
+				e = i - 34;
 			else if (i >= 5)
-				e = i - 5;
+				e = i - 4;
 			else if (i >= 2)
-				e = i - 2;
+				e = i - 1;
 		} else {
 			if (i >= 4)
 				e = i - 4;
@@ -305,17 +305,17 @@ static noinline void check_new_node(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM));
 	MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL));
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
-	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
+	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
 
 	mn = mas_pop_node(&mas); /* get the next node. */
 	MT_BUG_ON(mt, mn == NULL);
 	MT_BUG_ON(mt, not_empty(mn));
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS);
-	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2);
+	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
 
 	mas_push_node(&mas, mn);
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
-	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
+	MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
 
 	/* Check the limit of pop/push/pop */
 	mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */
@@ -323,14 +323,14 @@ static noinline void check_new_node(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM));
 	MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL));
 	MT_BUG_ON(mt, mas_alloc_req(&mas));
-	MT_BUG_ON(mt, mas.alloc->node_count);
+	MT_BUG_ON(mt, mas.alloc->node_count != 1);
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
 	mn = mas_pop_node(&mas);
 	MT_BUG_ON(mt, not_empty(mn));
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
-	MT_BUG_ON(mt, mas.alloc->node_count  != MAPLE_ALLOC_SLOTS - 1);
+	MT_BUG_ON(mt, mas.alloc->node_count  != MAPLE_ALLOC_SLOTS);
 	mas_push_node(&mas, mn);
-	MT_BUG_ON(mt, mas.alloc->node_count);
+	MT_BUG_ON(mt, mas.alloc->node_count != 1);
 	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
 	mn = mas_pop_node(&mas);
 	MT_BUG_ON(mt, not_empty(mn));
-- 
cgit 


From dee2ad120571f38433211098cd6b95a59bdfc8c7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 4 Jan 2023 15:49:05 +0100
Subject: selftests/vm: cow: add COW tests for collapsing of PTE-mapped anon
 THP

Currently, anonymous PTE-mapped THPs cannot be collapsed in-place:
collapsing (e.g., via MADV_COLLAPSE) implies allocating a fresh THP and
mapping that new THP via a PMD: as it's a fresh anon THP, it will get the
exclusive flag set on the head page and everybody is happy.

However, if the kernel would ever support in-place collapse of anonymous
THPs (replacing a page table mapping each sub-page of a THP via PTEs with
a single PMD mapping the complete THP), exclusivity information stored for
each sub-page would have to be collapsed accordingly:

(1) All PTEs map !exclusive anon sub-pages: the in-place collapsed THP
    must not not have the exclusive flag set on the head page mapped by
    the PMD. This is the easiest case to handle ("simply don't set any
    exclusive flags").

(2) All PTEs map exclusive anon sub-pages: when collapsing, we have to
    clear the exclusive flag from all tail pages and only leave the
    exclusive flag set for the head page. Otherwise, fork() after
    collapse would not clear the exclusive flags from the tail pages
    and we'd be in trouble once PTE-mapping the shared THP when writing
    to shared tail pages that still have the exclusive flag set. This
    would effectively revert what the PTE-mapping code does when
    propagating the exclusive flag to all sub-pages.

(3) PTEs map a mixture of exclusive and !exclusive anon sub-pages (can
    happen e.g., due to MADV_DONTFORK before fork()). We must not
    collapse the THP in-place, otherwise bad things may happen:
    the exclusive flags of sub-pages would get ignored and the
    exclusive flag of the head page would get used instead.

Now that we have MADV_COLLAPSE in place to trigger collapsing a THP, let's
add some test cases that would bail out early, if we'd
voluntarily/accidantially unlock in-place collapse for anon THPs and
forget about taking proper care of exclusive flags.

Running the test on a kernel with MADV_COLLAPSE support:
  # [INFO] Anonymous THP tests
  # [RUN] Basic COW after fork() when collapsing before fork()
  ok 169 No leak from parent into child
  # [RUN] Basic COW after fork() when collapsing after fork() (fully shared)
  ok 170 # SKIP MADV_COLLAPSE failed: Invalid argument
  # [RUN] Basic COW after fork() when collapsing after fork() (lower shared)
  ok 171 No leak from parent into child
  # [RUN] Basic COW after fork() when collapsing after fork() (upper shared)
  ok 172 No leak from parent into child

For now, MADV_COLLAPSE always seems to fail if all PTEs map shared
sub-pages.

Link: https://lkml.kernel.org/r/20230104144905.460075-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/vm/cow.c | 228 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 228 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c
index 26f6ea3079e2..16216d893d96 100644
--- a/tools/testing/selftests/vm/cow.c
+++ b/tools/testing/selftests/vm/cow.c
@@ -30,6 +30,10 @@
 #include "../kselftest.h"
 #include "vm_util.h"
 
+#ifndef MADV_COLLAPSE
+#define MADV_COLLAPSE 25
+#endif
+
 static size_t pagesize;
 static int pagemap_fd;
 static size_t thpsize;
@@ -1178,6 +1182,228 @@ static int tests_per_anon_test_case(void)
 	return tests;
 }
 
+enum anon_thp_collapse_test {
+	ANON_THP_COLLAPSE_UNSHARED,
+	ANON_THP_COLLAPSE_FULLY_SHARED,
+	ANON_THP_COLLAPSE_LOWER_SHARED,
+	ANON_THP_COLLAPSE_UPPER_SHARED,
+};
+
+static void do_test_anon_thp_collapse(char *mem, size_t size,
+				      enum anon_thp_collapse_test test)
+{
+	struct comm_pipes comm_pipes;
+	char buf;
+	int ret;
+
+	ret = setup_comm_pipes(&comm_pipes);
+	if (ret) {
+		ksft_test_result_fail("pipe() failed\n");
+		return;
+	}
+
+	/*
+	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
+	 * R/O, such that we can try collapsing it later.
+	 */
+	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+	if (ret) {
+		ksft_test_result_fail("mprotect() failed\n");
+		goto close_comm_pipes;
+	}
+	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+	if (ret) {
+		ksft_test_result_fail("mprotect() failed\n");
+		goto close_comm_pipes;
+	}
+
+	switch (test) {
+	case ANON_THP_COLLAPSE_UNSHARED:
+		/* Collapse before actually COW-sharing the page. */
+		ret = madvise(mem, size, MADV_COLLAPSE);
+		if (ret) {
+			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
+					      strerror(errno));
+			goto close_comm_pipes;
+		}
+		break;
+	case ANON_THP_COLLAPSE_FULLY_SHARED:
+		/* COW-share the full PTE-mapped THP. */
+		break;
+	case ANON_THP_COLLAPSE_LOWER_SHARED:
+		/* Don't COW-share the upper part of the THP. */
+		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
+		if (ret) {
+			ksft_test_result_fail("MADV_DONTFORK failed\n");
+			goto close_comm_pipes;
+		}
+		break;
+	case ANON_THP_COLLAPSE_UPPER_SHARED:
+		/* Don't COW-share the lower part of the THP. */
+		ret = madvise(mem, size / 2, MADV_DONTFORK);
+		if (ret) {
+			ksft_test_result_fail("MADV_DONTFORK failed\n");
+			goto close_comm_pipes;
+		}
+		break;
+	default:
+		assert(false);
+	}
+
+	ret = fork();
+	if (ret < 0) {
+		ksft_test_result_fail("fork() failed\n");
+		goto close_comm_pipes;
+	} else if (!ret) {
+		switch (test) {
+		case ANON_THP_COLLAPSE_UNSHARED:
+		case ANON_THP_COLLAPSE_FULLY_SHARED:
+			exit(child_memcmp_fn(mem, size, &comm_pipes));
+			break;
+		case ANON_THP_COLLAPSE_LOWER_SHARED:
+			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
+			break;
+		case ANON_THP_COLLAPSE_UPPER_SHARED:
+			exit(child_memcmp_fn(mem + size / 2, size / 2,
+					     &comm_pipes));
+			break;
+		default:
+			assert(false);
+		}
+	}
+
+	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+		;
+
+	switch (test) {
+	case ANON_THP_COLLAPSE_UNSHARED:
+		break;
+	case ANON_THP_COLLAPSE_UPPER_SHARED:
+	case ANON_THP_COLLAPSE_LOWER_SHARED:
+		/*
+		 * Revert MADV_DONTFORK such that we merge the VMAs and are
+		 * able to actually collapse.
+		 */
+		ret = madvise(mem, size, MADV_DOFORK);
+		if (ret) {
+			ksft_test_result_fail("MADV_DOFORK failed\n");
+			write(comm_pipes.parent_ready[1], "0", 1);
+			wait(&ret);
+			goto close_comm_pipes;
+		}
+		/* FALLTHROUGH */
+	case ANON_THP_COLLAPSE_FULLY_SHARED:
+		/* Collapse before anyone modified the COW-shared page. */
+		ret = madvise(mem, size, MADV_COLLAPSE);
+		if (ret) {
+			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
+					      strerror(errno));
+			write(comm_pipes.parent_ready[1], "0", 1);
+			wait(&ret);
+			goto close_comm_pipes;
+		}
+		break;
+	default:
+		assert(false);
+	}
+
+	/* Modify the page. */
+	memset(mem, 0xff, size);
+	write(comm_pipes.parent_ready[1], "0", 1);
+
+	wait(&ret);
+	if (WIFEXITED(ret))
+		ret = WEXITSTATUS(ret);
+	else
+		ret = -EINVAL;
+
+	ksft_test_result(!ret, "No leak from parent into child\n");
+close_comm_pipes:
+	close_comm_pipes(&comm_pipes);
+}
+
+static void test_anon_thp_collapse_unshared(char *mem, size_t size)
+{
+	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
+}
+
+static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
+{
+	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
+}
+
+static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
+{
+	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
+}
+
+static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
+{
+	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
+}
+
+/*
+ * Test cases that are specific to anonymous THP: pages in private mappings
+ * that may get shared via COW during fork().
+ */
+static const struct test_case anon_thp_test_cases[] = {
+	/*
+	 * Basic COW test for fork() without any GUP when collapsing a THP
+	 * before fork().
+	 *
+	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
+	 * collapse") might easily get COW handling wrong when not collapsing
+	 * exclusivity information properly.
+	 */
+	{
+		"Basic COW after fork() when collapsing before fork()",
+		test_anon_thp_collapse_unshared,
+	},
+	/* Basic COW test, but collapse after COW-sharing a full THP. */
+	{
+		"Basic COW after fork() when collapsing after fork() (fully shared)",
+		test_anon_thp_collapse_fully_shared,
+	},
+	/*
+	 * Basic COW test, but collapse after COW-sharing the lower half of a
+	 * THP.
+	 */
+	{
+		"Basic COW after fork() when collapsing after fork() (lower shared)",
+		test_anon_thp_collapse_lower_shared,
+	},
+	/*
+	 * Basic COW test, but collapse after COW-sharing the upper half of a
+	 * THP.
+	 */
+	{
+		"Basic COW after fork() when collapsing after fork() (upper shared)",
+		test_anon_thp_collapse_upper_shared,
+	},
+};
+
+static void run_anon_thp_test_cases(void)
+{
+	int i;
+
+	if (!thpsize)
+		return;
+
+	ksft_print_msg("[INFO] Anonymous THP tests\n");
+
+	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
+		struct test_case const *test_case = &anon_thp_test_cases[i];
+
+		ksft_print_msg("[RUN] %s\n", test_case->desc);
+		do_run_with_thp(test_case->fn, THP_RUN_PMD);
+	}
+}
+
+static int tests_per_anon_thp_test_case(void)
+{
+	return thpsize ? 1 : 0;
+}
+
 typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
 
 static void test_cow(char *mem, const char *smem, size_t size)
@@ -1518,6 +1744,7 @@ int main(int argc, char **argv)
 
 	ksft_print_header();
 	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
+		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
 		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
 
 	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
@@ -1526,6 +1753,7 @@ int main(int argc, char **argv)
 		ksft_exit_fail_msg("opening pagemap failed\n");
 
 	run_anon_test_cases();
+	run_anon_thp_test_cases();
 	run_non_anon_test_cases();
 
 	err = ksft_get_fail_cnt();
-- 
cgit 


From 799fb82aa132fa3a3886b7872997a5a84e820062 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 3 Jan 2023 18:07:52 +0000
Subject: tools/vm: rename tools/vm to tools/mm

Rename tools/vm to tools/mm for being more consistent with the code and
documentation directories, and won't be confused with virtual machines.

Link: https://lkml.kernel.org/r/20230103180754.129637-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../admin-guide/mm/idle_page_tracking.rst          |    2 +-
 Documentation/admin-guide/mm/pagemap.rst           |    4 +-
 Documentation/mm/page_owner.rst                    |    2 +-
 Documentation/mm/slub.rst                          |    2 +-
 Documentation/translations/zh_CN/mm/page_owner.rst |    2 +-
 MAINTAINERS                                        |    2 +-
 mm/Kconfig.debug                                   |    2 +-
 mm/memory-failure.c                                |    2 +-
 tools/mm/.gitignore                                |    4 +
 tools/mm/Makefile                                  |   32 +
 tools/mm/page-types.c                              | 1396 ++++++++++++++++++
 tools/mm/page_owner_sort.c                         |  897 ++++++++++++
 tools/mm/slabinfo-gnuplot.sh                       |  268 ++++
 tools/mm/slabinfo.c                                | 1544 ++++++++++++++++++++
 tools/vm/.gitignore                                |    4 -
 tools/vm/Makefile                                  |   32 -
 tools/vm/page-types.c                              | 1396 ------------------
 tools/vm/page_owner_sort.c                         |  897 ------------
 tools/vm/slabinfo-gnuplot.sh                       |  268 ----
 tools/vm/slabinfo.c                                | 1544 --------------------
 20 files changed, 4150 insertions(+), 4150 deletions(-)
 create mode 100644 tools/mm/.gitignore
 create mode 100644 tools/mm/Makefile
 create mode 100644 tools/mm/page-types.c
 create mode 100644 tools/mm/page_owner_sort.c
 create mode 100644 tools/mm/slabinfo-gnuplot.sh
 create mode 100644 tools/mm/slabinfo.c
 delete mode 100644 tools/vm/.gitignore
 delete mode 100644 tools/vm/Makefile
 delete mode 100644 tools/vm/page-types.c
 delete mode 100644 tools/vm/page_owner_sort.c
 delete mode 100644 tools/vm/slabinfo-gnuplot.sh
 delete mode 100644 tools/vm/slabinfo.c

(limited to 'tools')

diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst
index df9394fb39c2..19492064278c 100644
--- a/Documentation/admin-guide/mm/idle_page_tracking.rst
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -65,7 +65,7 @@ workload one should:
     are not reclaimable, he or she can filter them out using
     ``/proc/kpageflags``.
 
-The page-types tool in the tools/vm directory can be used to assist in this.
+The page-types tool in the tools/mm directory can be used to assist in this.
 If the tool is run initially with the appropriate option, it will mark all the
 queried pages as idle.  Subsequent runs of the tool can then show which pages have
 their idle flag cleared in the interim.
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 6e2e416af783..ceb5da3172ba 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -46,7 +46,7 @@ There are four components to pagemap:
  * ``/proc/kpagecount``.  This file contains a 64-bit count of the number of
    times each page is mapped, indexed by PFN.
 
-The page-types tool in the tools/vm directory can be used to query the
+The page-types tool in the tools/mm directory can be used to query the
 number of times a page is mapped.
 
  * ``/proc/kpageflags``.  This file contains a 64-bit set of flags for each
@@ -173,7 +173,7 @@ LRU related page flags
 14 - SWAPBACKED
    The page is backed by swap/RAM.
 
-The page-types tool in the tools/vm directory can be used to query the
+The page-types tool in the tools/mm directory can be used to query the
 above flags.
 
 Using pagemap to do something useful
diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
index 127514955a5e..5df26c0a0c1f 100644
--- a/Documentation/mm/page_owner.rst
+++ b/Documentation/mm/page_owner.rst
@@ -61,7 +61,7 @@ Usage
 
 1) Build user-space helper::
 
-	cd tools/vm
+	cd tools/mm
 	make page_owner_sort
 
 2) Enable page owner: add "page_owner=on" to boot cmdline.
diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst
index 7f652216dabe..3ffa7eded251 100644
--- a/Documentation/mm/slub.rst
+++ b/Documentation/mm/slub.rst
@@ -21,7 +21,7 @@ slabs that have data in them. See "slabinfo -h" for more options when
 running the command. ``slabinfo`` can be compiled with
 ::
 
-	gcc -o slabinfo tools/vm/slabinfo.c
+	gcc -o slabinfo tools/mm/slabinfo.c
 
 Some of the modes of operation of ``slabinfo`` require that slub debugging
 be enabled on the command line. F.e. no tracking information will be
diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst
index 21a6a0837d42..4d3b2c33e4ef 100644
--- a/Documentation/translations/zh_CN/mm/page_owner.rst
+++ b/Documentation/translations/zh_CN/mm/page_owner.rst
@@ -62,7 +62,7 @@ page owner在默认情况下是禁用的。所以，如果你想使用它，你
 
 1) 构建用户空间的帮助::
 
-	cd tools/vm
+	cd tools/mm
 	make page_owner_sort
 
 2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline.
diff --git a/MAINTAINERS b/MAINTAINERS
index c05f95aa7af1..c726adfd1f0d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13483,8 +13483,8 @@ F:	include/linux/mm.h
 F:	include/linux/mmzone.h
 F:	include/linux/pagewalk.h
 F:	mm/
+F:	tools/mm/
 F:	tools/testing/selftests/vm/
-F:	tools/vm/
 
 VMALLOC
 M:	Andrew Morton <akpm@linux-foundation.org>
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index fca699ad1fb0..d62f48131952 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -90,7 +90,7 @@ config PAGE_OWNER
 	  help to find bare alloc_page(s) leaks. Even if you include this
 	  feature on your build, it is disabled in default. You should pass
 	  "page_owner=on" to boot parameter in order to enable it. Eats
-	  a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+	  a fair amount of memory if enabled. See tools/mm/page_owner_sort.c
 	  for user-space helper.
 
 	  If unsure, say N.
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c77a9e37e27e..6bf07345ea2c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -24,7 +24,7 @@
  * - You have a test that can be added to mce-test
  *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
  * - The case actually shows up as a frequent (top 10) page state in
- *   tools/vm/page-types when running a real workload.
+ *   tools/mm/page-types when running a real workload.
  * 
  * There are several operations here with exponential complexity because
  * of unsuitable VM data structures. For example the operation to map back 
diff --git a/tools/mm/.gitignore b/tools/mm/.gitignore
new file mode 100644
index 000000000000..922879f93fc8
--- /dev/null
+++ b/tools/mm/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+slabinfo
+page-types
+page_owner_sort
diff --git a/tools/mm/Makefile b/tools/mm/Makefile
new file mode 100644
index 000000000000..9860622cbb15
--- /dev/null
+++ b/tools/mm/Makefile
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for vm tools
+#
+include ../scripts/Makefile.include
+
+TARGETS=page-types slabinfo page_owner_sort
+
+LIB_DIR = ../lib/api
+LIBS = $(LIB_DIR)/libapi.a
+
+CFLAGS = -Wall -Wextra -I../lib/
+LDFLAGS = $(LIBS)
+
+all: $(TARGETS)
+
+$(TARGETS): $(LIBS)
+
+$(LIBS):
+	make -C $(LIB_DIR)
+
+%: %.c
+	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+clean:
+	$(RM) page-types slabinfo page_owner_sort
+	make -C $(LIB_DIR) clean
+
+sbindir ?= /usr/sbin
+
+install: all
+	install -d $(DESTDIR)$(sbindir)
+	install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir)
diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c
new file mode 100644
index 000000000000..381dcc00cb62
--- /dev/null
+++ b/tools/mm/page-types.c
@@ -0,0 +1,1396 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * page-types: Tool for querying page flags
+ *
+ * Copyright (C) 2009 Intel corporation
+ *
+ * Authors: Wu Fengguang <fengguang.wu@intel.com>
+ */
+
+#define _FILE_OFFSET_BITS 64
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <string.h>
+#include <getopt.h>
+#include <limits.h>
+#include <assert.h>
+#include <ftw.h>
+#include <time.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/mount.h>
+#include <sys/statfs.h>
+#include <sys/mman.h>
+#include "../../include/uapi/linux/magic.h"
+#include "../../include/uapi/linux/kernel-page-flags.h"
+#include <api/fs/fs.h>
+
+#ifndef MAX_PATH
+# define MAX_PATH 256
+#endif
+
+#ifndef STR
+# define _STR(x) #x
+# define STR(x) _STR(x)
+#endif
+
+/*
+ * pagemap kernel ABI bits
+ */
+
+#define PM_ENTRY_BYTES		8
+#define PM_PFRAME_BITS		55
+#define PM_PFRAME_MASK		((1LL << PM_PFRAME_BITS) - 1)
+#define PM_PFRAME(x)		((x) & PM_PFRAME_MASK)
+#define MAX_SWAPFILES_SHIFT	5
+#define PM_SWAP_OFFSET(x)	(((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT)
+#define PM_SOFT_DIRTY		(1ULL << 55)
+#define PM_MMAP_EXCLUSIVE	(1ULL << 56)
+#define PM_FILE			(1ULL << 61)
+#define PM_SWAP			(1ULL << 62)
+#define PM_PRESENT		(1ULL << 63)
+
+/*
+ * kernel page flags
+ */
+
+#define KPF_BYTES		8
+#define PROC_KPAGEFLAGS		"/proc/kpageflags"
+#define PROC_KPAGECOUNT		"/proc/kpagecount"
+#define PROC_KPAGECGROUP	"/proc/kpagecgroup"
+
+#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap"
+
+/* [32-] kernel hacking assistances */
+#define KPF_RESERVED		32
+#define KPF_MLOCKED		33
+#define KPF_MAPPEDTODISK	34
+#define KPF_PRIVATE		35
+#define KPF_PRIVATE_2		36
+#define KPF_OWNER_PRIVATE	37
+#define KPF_ARCH		38
+#define KPF_UNCACHED		39
+#define KPF_SOFTDIRTY		40
+#define KPF_ARCH_2		41
+
+/* [47-] take some arbitrary free slots for expanding overloaded flags
+ * not part of kernel API
+ */
+#define KPF_ANON_EXCLUSIVE	47
+#define KPF_READAHEAD		48
+#define KPF_SLOB_FREE		49
+#define KPF_SLUB_FROZEN		50
+#define KPF_SLUB_DEBUG		51
+#define KPF_FILE		61
+#define KPF_SWAP		62
+#define KPF_MMAP_EXCLUSIVE	63
+
+#define KPF_ALL_BITS		((uint64_t)~0ULL)
+#define KPF_HACKERS_BITS	(0xffffULL << 32)
+#define KPF_OVERLOADED_BITS	(0xffffULL << 48)
+#define BIT(name)		(1ULL << KPF_##name)
+#define BITS_COMPOUND		(BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
+
+static const char * const page_flag_names[] = {
+	[KPF_LOCKED]		= "L:locked",
+	[KPF_ERROR]		= "E:error",
+	[KPF_REFERENCED]	= "R:referenced",
+	[KPF_UPTODATE]		= "U:uptodate",
+	[KPF_DIRTY]		= "D:dirty",
+	[KPF_LRU]		= "l:lru",
+	[KPF_ACTIVE]		= "A:active",
+	[KPF_SLAB]		= "S:slab",
+	[KPF_WRITEBACK]		= "W:writeback",
+	[KPF_RECLAIM]		= "I:reclaim",
+	[KPF_BUDDY]		= "B:buddy",
+
+	[KPF_MMAP]		= "M:mmap",
+	[KPF_ANON]		= "a:anonymous",
+	[KPF_SWAPCACHE]		= "s:swapcache",
+	[KPF_SWAPBACKED]	= "b:swapbacked",
+	[KPF_COMPOUND_HEAD]	= "H:compound_head",
+	[KPF_COMPOUND_TAIL]	= "T:compound_tail",
+	[KPF_HUGE]		= "G:huge",
+	[KPF_UNEVICTABLE]	= "u:unevictable",
+	[KPF_HWPOISON]		= "X:hwpoison",
+	[KPF_NOPAGE]		= "n:nopage",
+	[KPF_KSM]		= "x:ksm",
+	[KPF_THP]		= "t:thp",
+	[KPF_OFFLINE]		= "o:offline",
+	[KPF_PGTABLE]		= "g:pgtable",
+	[KPF_ZERO_PAGE]		= "z:zero_page",
+	[KPF_IDLE]              = "i:idle_page",
+
+	[KPF_RESERVED]		= "r:reserved",
+	[KPF_MLOCKED]		= "m:mlocked",
+	[KPF_MAPPEDTODISK]	= "d:mappedtodisk",
+	[KPF_PRIVATE]		= "P:private",
+	[KPF_PRIVATE_2]		= "p:private_2",
+	[KPF_OWNER_PRIVATE]	= "O:owner_private",
+	[KPF_ARCH]		= "h:arch",
+	[KPF_UNCACHED]		= "c:uncached",
+	[KPF_SOFTDIRTY]		= "f:softdirty",
+	[KPF_ARCH_2]		= "H:arch_2",
+
+	[KPF_ANON_EXCLUSIVE]	= "d:anon_exclusive",
+	[KPF_READAHEAD]		= "I:readahead",
+	[KPF_SLOB_FREE]		= "P:slob_free",
+	[KPF_SLUB_FROZEN]	= "A:slub_frozen",
+	[KPF_SLUB_DEBUG]	= "E:slub_debug",
+
+	[KPF_FILE]		= "F:file",
+	[KPF_SWAP]		= "w:swap",
+	[KPF_MMAP_EXCLUSIVE]	= "1:mmap_exclusive",
+};
+
+
+/*
+ * data structures
+ */
+
+static int		opt_raw;	/* for kernel developers */
+static int		opt_list;	/* list pages (in ranges) */
+static int		opt_mark_idle;	/* set accessed bit */
+static int		opt_no_summary;	/* don't show summary */
+static pid_t		opt_pid;	/* process to walk */
+const char		*opt_file;	/* file or directory path */
+static uint64_t		opt_cgroup;	/* cgroup inode */
+static int		opt_list_cgroup;/* list page cgroup */
+static int		opt_list_mapcnt;/* list page map count */
+static const char	*opt_kpageflags;/* kpageflags file to parse */
+
+#define MAX_ADDR_RANGES	1024
+static int		nr_addr_ranges;
+static unsigned long	opt_offset[MAX_ADDR_RANGES];
+static unsigned long	opt_size[MAX_ADDR_RANGES];
+
+#define MAX_VMAS	10240
+static int		nr_vmas;
+static unsigned long	pg_start[MAX_VMAS];
+static unsigned long	pg_end[MAX_VMAS];
+
+#define MAX_BIT_FILTERS	64
+static int		nr_bit_filters;
+static uint64_t		opt_mask[MAX_BIT_FILTERS];
+static uint64_t		opt_bits[MAX_BIT_FILTERS];
+
+static int		page_size;
+
+static int		pagemap_fd;
+static int		kpageflags_fd;
+static int		kpagecount_fd = -1;
+static int		kpagecgroup_fd = -1;
+static int		page_idle_fd = -1;
+
+static int		opt_hwpoison;
+static int		opt_unpoison;
+
+static const char	*hwpoison_debug_fs;
+static int		hwpoison_inject_fd;
+static int		hwpoison_forget_fd;
+
+#define HASH_SHIFT	13
+#define HASH_SIZE	(1 << HASH_SHIFT)
+#define HASH_MASK	(HASH_SIZE - 1)
+#define HASH_KEY(flags)	(flags & HASH_MASK)
+
+static unsigned long	total_pages;
+static unsigned long	nr_pages[HASH_SIZE];
+static uint64_t		page_flags[HASH_SIZE];
+
+
+/*
+ * helper functions
+ */
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define min_t(type, x, y) ({			\
+	type __min1 = (x);			\
+	type __min2 = (y);			\
+	__min1 < __min2 ? __min1 : __min2; })
+
+#define max_t(type, x, y) ({			\
+	type __max1 = (x);			\
+	type __max2 = (y);			\
+	__max1 > __max2 ? __max1 : __max2; })
+
+static unsigned long pages2mb(unsigned long pages)
+{
+	return (pages * page_size) >> 20;
+}
+
+static void fatal(const char *x, ...)
+{
+	va_list ap;
+
+	va_start(ap, x);
+	vfprintf(stderr, x, ap);
+	va_end(ap);
+	exit(EXIT_FAILURE);
+}
+
+static int checked_open(const char *pathname, int flags)
+{
+	int fd = open(pathname, flags);
+
+	if (fd < 0) {
+		perror(pathname);
+		exit(EXIT_FAILURE);
+	}
+
+	return fd;
+}
+
+/*
+ * pagemap/kpageflags routines
+ */
+
+static unsigned long do_u64_read(int fd, const char *name,
+				 uint64_t *buf,
+				 unsigned long index,
+				 unsigned long count)
+{
+	long bytes;
+
+	if (index > ULONG_MAX / 8)
+		fatal("index overflow: %lu\n", index);
+
+	bytes = pread(fd, buf, count * 8, (off_t)index * 8);
+	if (bytes < 0) {
+		perror(name);
+		exit(EXIT_FAILURE);
+	}
+	if (bytes % 8)
+		fatal("partial read: %lu bytes\n", bytes);
+
+	return bytes / 8;
+}
+
+static unsigned long kpageflags_read(uint64_t *buf,
+				     unsigned long index,
+				     unsigned long pages)
+{
+	return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages);
+}
+
+static unsigned long kpagecgroup_read(uint64_t *buf,
+				      unsigned long index,
+				      unsigned long pages)
+{
+	if (kpagecgroup_fd < 0)
+		return pages;
+
+	return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages);
+}
+
+static unsigned long kpagecount_read(uint64_t *buf,
+				     unsigned long index,
+				     unsigned long pages)
+{
+	return kpagecount_fd < 0 ? pages :
+		do_u64_read(kpagecount_fd, PROC_KPAGECOUNT,
+			    buf, index, pages);
+}
+
+static unsigned long pagemap_read(uint64_t *buf,
+				  unsigned long index,
+				  unsigned long pages)
+{
+	return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages);
+}
+
+static unsigned long pagemap_pfn(uint64_t val)
+{
+	unsigned long pfn;
+
+	if (val & PM_PRESENT)
+		pfn = PM_PFRAME(val);
+	else
+		pfn = 0;
+
+	return pfn;
+}
+
+static unsigned long pagemap_swap_offset(uint64_t val)
+{
+	return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0;
+}
+
+/*
+ * page flag names
+ */
+
+static char *page_flag_name(uint64_t flags)
+{
+	static char buf[65];
+	int present;
+	size_t i, j;
+
+	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
+		present = (flags >> i) & 1;
+		if (!page_flag_names[i]) {
+			if (present)
+				fatal("unknown flag bit %d\n", i);
+			continue;
+		}
+		buf[j++] = present ? page_flag_names[i][0] : '_';
+	}
+
+	return buf;
+}
+
+static char *page_flag_longname(uint64_t flags)
+{
+	static char buf[1024];
+	size_t i, n;
+
+	for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
+		if (!page_flag_names[i])
+			continue;
+		if ((flags >> i) & 1)
+			n += snprintf(buf + n, sizeof(buf) - n, "%s,",
+					page_flag_names[i] + 2);
+	}
+	if (n)
+		n--;
+	buf[n] = '\0';
+
+	return buf;
+}
+
+
+/*
+ * page list and summary
+ */
+
+static void show_page_range(unsigned long voffset, unsigned long offset,
+			    unsigned long size, uint64_t flags,
+			    uint64_t cgroup, uint64_t mapcnt)
+{
+	static uint64_t      flags0;
+	static uint64_t	     cgroup0;
+	static uint64_t      mapcnt0;
+	static unsigned long voff;
+	static unsigned long index;
+	static unsigned long count;
+
+	if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 &&
+	    offset == index + count && size && voffset == voff + count) {
+		count += size;
+		return;
+	}
+
+	if (count) {
+		if (opt_pid)
+			printf("%lx\t", voff);
+		if (opt_file)
+			printf("%lx\t", voff);
+		if (opt_list_cgroup)
+			printf("@%llu\t", (unsigned long long)cgroup0);
+		if (opt_list_mapcnt)
+			printf("%lu\t", mapcnt0);
+		printf("%lx\t%lx\t%s\n",
+				index, count, page_flag_name(flags0));
+	}
+
+	flags0 = flags;
+	cgroup0 = cgroup;
+	mapcnt0 = mapcnt;
+	index  = offset;
+	voff   = voffset;
+	count  = size;
+}
+
+static void flush_page_range(void)
+{
+	show_page_range(0, 0, 0, 0, 0, 0);
+}
+
+static void show_page(unsigned long voffset, unsigned long offset,
+		      uint64_t flags, uint64_t cgroup, uint64_t mapcnt)
+{
+	if (opt_pid)
+		printf("%lx\t", voffset);
+	if (opt_file)
+		printf("%lx\t", voffset);
+	if (opt_list_cgroup)
+		printf("@%llu\t", (unsigned long long)cgroup);
+	if (opt_list_mapcnt)
+		printf("%lu\t", mapcnt);
+
+	printf("%lx\t%s\n", offset, page_flag_name(flags));
+}
+
+static void show_summary(void)
+{
+	size_t i;
+
+	printf("             flags\tpage-count       MB"
+		"  symbolic-flags\t\t\tlong-symbolic-flags\n");
+
+	for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
+		if (nr_pages[i])
+			printf("0x%016llx\t%10lu %8lu  %s\t%s\n",
+				(unsigned long long)page_flags[i],
+				nr_pages[i],
+				pages2mb(nr_pages[i]),
+				page_flag_name(page_flags[i]),
+				page_flag_longname(page_flags[i]));
+	}
+
+	printf("             total\t%10lu %8lu\n",
+			total_pages, pages2mb(total_pages));
+}
+
+
+/*
+ * page flag filters
+ */
+
+static int bit_mask_ok(uint64_t flags)
+{
+	int i;
+
+	for (i = 0; i < nr_bit_filters; i++) {
+		if (opt_bits[i] == KPF_ALL_BITS) {
+			if ((flags & opt_mask[i]) == 0)
+				return 0;
+		} else {
+			if ((flags & opt_mask[i]) != opt_bits[i])
+				return 0;
+		}
+	}
+
+	return 1;
+}
+
+static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
+{
+	/* Anonymous pages overload PG_mappedtodisk */
+	if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK)))
+		flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE);
+
+	/* SLOB/SLUB overload several page flags */
+	if (flags & BIT(SLAB)) {
+		if (flags & BIT(PRIVATE))
+			flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
+		if (flags & BIT(ACTIVE))
+			flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
+		if (flags & BIT(ERROR))
+			flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
+	}
+
+	/* PG_reclaim is overloaded as PG_readahead in the read path */
+	if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
+		flags ^= BIT(RECLAIM) | BIT(READAHEAD);
+
+	if (pme & PM_SOFT_DIRTY)
+		flags |= BIT(SOFTDIRTY);
+	if (pme & PM_FILE)
+		flags |= BIT(FILE);
+	if (pme & PM_SWAP)
+		flags |= BIT(SWAP);
+	if (pme & PM_MMAP_EXCLUSIVE)
+		flags |= BIT(MMAP_EXCLUSIVE);
+
+	return flags;
+}
+
+static uint64_t well_known_flags(uint64_t flags)
+{
+	/* hide flags intended only for kernel hacker */
+	flags &= ~KPF_HACKERS_BITS;
+
+	/* hide non-hugeTLB compound pages */
+	if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
+		flags &= ~BITS_COMPOUND;
+
+	return flags;
+}
+
+static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme)
+{
+	if (opt_raw)
+		flags = expand_overloaded_flags(flags, pme);
+	else
+		flags = well_known_flags(flags);
+
+	return flags;
+}
+
+/*
+ * page actions
+ */
+
+static void prepare_hwpoison_fd(void)
+{
+	char buf[MAX_PATH + 1];
+
+	hwpoison_debug_fs = debugfs__mount();
+	if (!hwpoison_debug_fs) {
+		perror("mount debugfs");
+		exit(EXIT_FAILURE);
+	}
+
+	if (opt_hwpoison && !hwpoison_inject_fd) {
+		snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn",
+			hwpoison_debug_fs);
+		hwpoison_inject_fd = checked_open(buf, O_WRONLY);
+	}
+
+	if (opt_unpoison && !hwpoison_forget_fd) {
+		snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn",
+			hwpoison_debug_fs);
+		hwpoison_forget_fd = checked_open(buf, O_WRONLY);
+	}
+}
+
+static int hwpoison_page(unsigned long offset)
+{
+	char buf[100];
+	int len;
+
+	len = sprintf(buf, "0x%lx\n", offset);
+	len = write(hwpoison_inject_fd, buf, len);
+	if (len < 0) {
+		perror("hwpoison inject");
+		return len;
+	}
+	return 0;
+}
+
+static int unpoison_page(unsigned long offset)
+{
+	char buf[100];
+	int len;
+
+	len = sprintf(buf, "0x%lx\n", offset);
+	len = write(hwpoison_forget_fd, buf, len);
+	if (len < 0) {
+		perror("hwpoison forget");
+		return len;
+	}
+	return 0;
+}
+
+static int mark_page_idle(unsigned long offset)
+{
+	static unsigned long off;
+	static uint64_t buf;
+	int len;
+
+	if ((offset / 64 == off / 64) || buf == 0) {
+		buf |= 1UL << (offset % 64);
+		off = offset;
+		return 0;
+	}
+
+	len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64));
+	if (len < 0) {
+		perror("mark page idle");
+		return len;
+	}
+
+	buf = 1UL << (offset % 64);
+	off = offset;
+
+	return 0;
+}
+
+/*
+ * page frame walker
+ */
+
+static size_t hash_slot(uint64_t flags)
+{
+	size_t k = HASH_KEY(flags);
+	size_t i;
+
+	/* Explicitly reserve slot 0 for flags 0: the following logic
+	 * cannot distinguish an unoccupied slot from slot (flags==0).
+	 */
+	if (flags == 0)
+		return 0;
+
+	/* search through the remaining (HASH_SIZE-1) slots */
+	for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
+		if (!k || k >= ARRAY_SIZE(page_flags))
+			k = 1;
+		if (page_flags[k] == 0) {
+			page_flags[k] = flags;
+			return k;
+		}
+		if (page_flags[k] == flags)
+			return k;
+	}
+
+	fatal("hash table full: bump up HASH_SHIFT?\n");
+	exit(EXIT_FAILURE);
+}
+
+static void add_page(unsigned long voffset, unsigned long offset,
+		     uint64_t flags, uint64_t cgroup, uint64_t mapcnt,
+		     uint64_t pme)
+{
+	flags = kpageflags_flags(flags, pme);
+
+	if (!bit_mask_ok(flags))
+		return;
+
+	if (opt_cgroup && cgroup != (uint64_t)opt_cgroup)
+		return;
+
+	if (opt_hwpoison)
+		hwpoison_page(offset);
+	if (opt_unpoison)
+		unpoison_page(offset);
+
+	if (opt_mark_idle)
+		mark_page_idle(offset);
+
+	if (opt_list == 1)
+		show_page_range(voffset, offset, 1, flags, cgroup, mapcnt);
+	else if (opt_list == 2)
+		show_page(voffset, offset, flags, cgroup, mapcnt);
+
+	nr_pages[hash_slot(flags)]++;
+	total_pages++;
+}
+
+#define KPAGEFLAGS_BATCH	(64 << 10)	/* 64k pages */
+static void walk_pfn(unsigned long voffset,
+		     unsigned long index,
+		     unsigned long count,
+		     uint64_t pme)
+{
+	uint64_t buf[KPAGEFLAGS_BATCH];
+	uint64_t cgi[KPAGEFLAGS_BATCH];
+	uint64_t cnt[KPAGEFLAGS_BATCH];
+	unsigned long batch;
+	unsigned long pages;
+	unsigned long i;
+
+	/*
+	 * kpagecgroup_read() reads only if kpagecgroup were opened, but
+	 * /proc/kpagecgroup might even not exist, so it's better to fill
+	 * them with zeros here.
+	 */
+	if (count == 1)
+		cgi[0] = 0;
+	else
+		memset(cgi, 0, sizeof cgi);
+
+	while (count) {
+		batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
+		pages = kpageflags_read(buf, index, batch);
+		if (pages == 0)
+			break;
+
+		if (kpagecgroup_read(cgi, index, pages) != pages)
+			fatal("kpagecgroup returned fewer pages than expected");
+
+		if (kpagecount_read(cnt, index, pages) != pages)
+			fatal("kpagecount returned fewer pages than expected");
+
+		for (i = 0; i < pages; i++)
+			add_page(voffset + i, index + i,
+				 buf[i], cgi[i], cnt[i], pme);
+
+		index += pages;
+		count -= pages;
+	}
+}
+
+static void walk_swap(unsigned long voffset, uint64_t pme)
+{
+	uint64_t flags = kpageflags_flags(0, pme);
+
+	if (!bit_mask_ok(flags))
+		return;
+
+	if (opt_cgroup)
+		return;
+
+	if (opt_list == 1)
+		show_page_range(voffset, pagemap_swap_offset(pme),
+				1, flags, 0, 0);
+	else if (opt_list == 2)
+		show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0);
+
+	nr_pages[hash_slot(flags)]++;
+	total_pages++;
+}
+
+#define PAGEMAP_BATCH	(64 << 10)
+static void walk_vma(unsigned long index, unsigned long count)
+{
+	uint64_t buf[PAGEMAP_BATCH];
+	unsigned long batch;
+	unsigned long pages;
+	unsigned long pfn;
+	unsigned long i;
+
+	while (count) {
+		batch = min_t(unsigned long, count, PAGEMAP_BATCH);
+		pages = pagemap_read(buf, index, batch);
+		if (pages == 0)
+			break;
+
+		for (i = 0; i < pages; i++) {
+			pfn = pagemap_pfn(buf[i]);
+			if (pfn)
+				walk_pfn(index + i, pfn, 1, buf[i]);
+			if (buf[i] & PM_SWAP)
+				walk_swap(index + i, buf[i]);
+		}
+
+		index += pages;
+		count -= pages;
+	}
+}
+
+static void walk_task(unsigned long index, unsigned long count)
+{
+	const unsigned long end = index + count;
+	unsigned long start;
+	int i = 0;
+
+	while (index < end) {
+
+		while (pg_end[i] <= index)
+			if (++i >= nr_vmas)
+				return;
+		if (pg_start[i] >= end)
+			return;
+
+		start = max_t(unsigned long, pg_start[i], index);
+		index = min_t(unsigned long, pg_end[i], end);
+
+		assert(start < index);
+		walk_vma(start, index - start);
+	}
+}
+
+static void add_addr_range(unsigned long offset, unsigned long size)
+{
+	if (nr_addr_ranges >= MAX_ADDR_RANGES)
+		fatal("too many addr ranges\n");
+
+	opt_offset[nr_addr_ranges] = offset;
+	opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
+	nr_addr_ranges++;
+}
+
+static void walk_addr_ranges(void)
+{
+	int i;
+
+	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
+
+	if (!nr_addr_ranges)
+		add_addr_range(0, ULONG_MAX);
+
+	for (i = 0; i < nr_addr_ranges; i++)
+		if (!opt_pid)
+			walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0);
+		else
+			walk_task(opt_offset[i], opt_size[i]);
+
+	if (opt_mark_idle)
+		mark_page_idle(0);
+
+	close(kpageflags_fd);
+}
+
+
+/*
+ * user interface
+ */
+
+static const char *page_flag_type(uint64_t flag)
+{
+	if (flag & KPF_HACKERS_BITS)
+		return "(r)";
+	if (flag & KPF_OVERLOADED_BITS)
+		return "(o)";
+	return "   ";
+}
+
+static void usage(void)
+{
+	size_t i, j;
+
+	printf(
+"page-types [options]\n"
+"            -r|--raw                   Raw mode, for kernel developers\n"
+"            -d|--describe flags        Describe flags\n"
+"            -a|--addr    addr-spec     Walk a range of pages\n"
+"            -b|--bits    bits-spec     Walk pages with specified bits\n"
+"            -c|--cgroup  path|@inode   Walk pages within memory cgroup\n"
+"            -p|--pid     pid           Walk process address space\n"
+"            -f|--file    filename      Walk file address space\n"
+"            -i|--mark-idle             Mark pages idle\n"
+"            -l|--list                  Show page details in ranges\n"
+"            -L|--list-each             Show page details one by one\n"
+"            -C|--list-cgroup           Show cgroup inode for pages\n"
+"            -M|--list-mapcnt           Show page map count\n"
+"            -N|--no-summary            Don't show summary info\n"
+"            -X|--hwpoison              hwpoison pages\n"
+"            -x|--unpoison              unpoison pages\n"
+"            -F|--kpageflags filename   kpageflags file to parse\n"
+"            -h|--help                  Show this usage message\n"
+"flags:\n"
+"            0x10                       bitfield format, e.g.\n"
+"            anon                       bit-name, e.g.\n"
+"            0x10,anon                  comma-separated list, e.g.\n"
+"addr-spec:\n"
+"            N                          one page at offset N (unit: pages)\n"
+"            N+M                        pages range from N to N+M-1\n"
+"            N,M                        pages range from N to M-1\n"
+"            N,                         pages range from N to end\n"
+"            ,M                         pages range from 0 to M-1\n"
+"bits-spec:\n"
+"            bit1,bit2                  (flags & (bit1|bit2)) != 0\n"
+"            bit1,bit2=bit1             (flags & (bit1|bit2)) == bit1\n"
+"            bit1,~bit2                 (flags & (bit1|bit2)) == bit1\n"
+"            =bit1,bit2                 flags == (bit1|bit2)\n"
+"bit-names:\n"
+	);
+
+	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
+		if (!page_flag_names[i])
+			continue;
+		printf("%16s%s", page_flag_names[i] + 2,
+				 page_flag_type(1ULL << i));
+		if (++j > 3) {
+			j = 0;
+			putchar('\n');
+		}
+	}
+	printf("\n                                   "
+		"(r) raw mode bits  (o) overloaded bits\n");
+}
+
+static unsigned long long parse_number(const char *str)
+{
+	unsigned long long n;
+
+	n = strtoll(str, NULL, 0);
+
+	if (n == 0 && str[0] != '0')
+		fatal("invalid name or number: %s\n", str);
+
+	return n;
+}
+
+static void parse_pid(const char *str)
+{
+	FILE *file;
+	char buf[5000];
+
+	opt_pid = parse_number(str);
+
+	sprintf(buf, "/proc/%d/pagemap", opt_pid);
+	pagemap_fd = checked_open(buf, O_RDONLY);
+
+	sprintf(buf, "/proc/%d/maps", opt_pid);
+	file = fopen(buf, "r");
+	if (!file) {
+		perror(buf);
+		exit(EXIT_FAILURE);
+	}
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		unsigned long vm_start;
+		unsigned long vm_end;
+		unsigned long long pgoff;
+		int major, minor;
+		char r, w, x, s;
+		unsigned long ino;
+		int n;
+
+		n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
+			   &vm_start,
+			   &vm_end,
+			   &r, &w, &x, &s,
+			   &pgoff,
+			   &major, &minor,
+			   &ino);
+		if (n < 10) {
+			fprintf(stderr, "unexpected line: %s\n", buf);
+			continue;
+		}
+		pg_start[nr_vmas] = vm_start / page_size;
+		pg_end[nr_vmas] = vm_end / page_size;
+		if (++nr_vmas >= MAX_VMAS) {
+			fprintf(stderr, "too many VMAs\n");
+			break;
+		}
+	}
+	fclose(file);
+}
+
+static void show_file(const char *name, const struct stat *st)
+{
+	unsigned long long size = st->st_size;
+	char atime[64], mtime[64];
+	long now = time(NULL);
+
+	printf("%s\tInode: %u\tSize: %llu (%llu pages)\n",
+			name, (unsigned)st->st_ino,
+			size, (size + page_size - 1) / page_size);
+
+	strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
+	strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
+
+	printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
+			mtime, now - st->st_mtime,
+			atime, now - st->st_atime);
+}
+
+static sigjmp_buf sigbus_jmp;
+
+static void * volatile sigbus_addr;
+
+static void sigbus_handler(int sig, siginfo_t *info, void *ucontex)
+{
+	(void)sig;
+	(void)ucontex;
+	sigbus_addr = info ? info->si_addr : NULL;
+	siglongjmp(sigbus_jmp, 1);
+}
+
+static struct sigaction sigbus_action = {
+	.sa_sigaction = sigbus_handler,
+	.sa_flags = SA_SIGINFO,
+};
+
+static void walk_file_range(const char *name, int fd,
+			    unsigned long off, unsigned long end)
+{
+	uint8_t vec[PAGEMAP_BATCH];
+	uint64_t buf[PAGEMAP_BATCH], flags;
+	uint64_t cgroup = 0;
+	uint64_t mapcnt = 0;
+	unsigned long nr_pages, pfn, i;
+	ssize_t len;
+	void *ptr;
+	int first = 1;
+
+	for (; off < end; off += len) {
+		nr_pages = (end - off + page_size - 1) / page_size;
+		if (nr_pages > PAGEMAP_BATCH)
+			nr_pages = PAGEMAP_BATCH;
+		len = nr_pages * page_size;
+
+		ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
+		if (ptr == MAP_FAILED)
+			fatal("mmap failed: %s", name);
+
+		/* determine cached pages */
+		if (mincore(ptr, len, vec))
+			fatal("mincore failed: %s", name);
+
+		/* turn off readahead */
+		if (madvise(ptr, len, MADV_RANDOM))
+			fatal("madvice failed: %s", name);
+
+		if (sigsetjmp(sigbus_jmp, 1)) {
+			end = off + sigbus_addr ? sigbus_addr - ptr : 0;
+			fprintf(stderr, "got sigbus at offset %lld: %s\n",
+					(long long)end, name);
+			goto got_sigbus;
+		}
+
+		/* populate ptes */
+		for (i = 0; i < nr_pages ; i++) {
+			if (vec[i] & 1)
+				(void)*(volatile int *)(ptr + i * page_size);
+		}
+got_sigbus:
+
+		/* turn off harvesting reference bits */
+		if (madvise(ptr, len, MADV_SEQUENTIAL))
+			fatal("madvice failed: %s", name);
+
+		if (pagemap_read(buf, (unsigned long)ptr / page_size,
+					nr_pages) != nr_pages)
+			fatal("cannot read pagemap");
+
+		munmap(ptr, len);
+
+		for (i = 0; i < nr_pages; i++) {
+			pfn = pagemap_pfn(buf[i]);
+			if (!pfn)
+				continue;
+			if (!kpageflags_read(&flags, pfn, 1))
+				continue;
+			if (!kpagecgroup_read(&cgroup, pfn, 1))
+				fatal("kpagecgroup_read failed");
+			if (!kpagecount_read(&mapcnt, pfn, 1))
+				fatal("kpagecount_read failed");
+			if (first && opt_list) {
+				first = 0;
+				flush_page_range();
+			}
+			add_page(off / page_size + i, pfn,
+				 flags, cgroup, mapcnt, buf[i]);
+		}
+	}
+}
+
+static void walk_file(const char *name, const struct stat *st)
+{
+	int i;
+	int fd;
+
+	fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
+
+	if (!nr_addr_ranges)
+		add_addr_range(0, st->st_size / page_size);
+
+	for (i = 0; i < nr_addr_ranges; i++)
+		walk_file_range(name, fd, opt_offset[i] * page_size,
+				(opt_offset[i] + opt_size[i]) * page_size);
+
+	close(fd);
+}
+
+int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
+{
+	(void)f;
+	switch (type) {
+	case FTW_F:
+		if (S_ISREG(st->st_mode))
+			walk_file(name, st);
+		break;
+	case FTW_DNR:
+		fprintf(stderr, "cannot read dir: %s\n", name);
+		break;
+	}
+	return 0;
+}
+
+struct stat st;
+
+static void walk_page_cache(void)
+{
+	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
+	pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
+	sigaction(SIGBUS, &sigbus_action, NULL);
+
+	if (stat(opt_file, &st))
+		fatal("stat failed: %s\n", opt_file);
+
+	if (S_ISREG(st.st_mode)) {
+		walk_file(opt_file, &st);
+	} else if (S_ISDIR(st.st_mode)) {
+		/* do not follow symlinks and mountpoints */
+		if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
+			fatal("nftw failed: %s\n", opt_file);
+	} else
+		fatal("unhandled file type: %s\n", opt_file);
+
+	close(kpageflags_fd);
+	close(pagemap_fd);
+	signal(SIGBUS, SIG_DFL);
+}
+
+static void parse_file(const char *name)
+{
+	opt_file = name;
+}
+
+static void parse_cgroup(const char *path)
+{
+	if (path[0] == '@') {
+		opt_cgroup = parse_number(path + 1);
+		return;
+	}
+
+	struct stat st;
+
+	if (stat(path, &st))
+		fatal("stat failed: %s: %m\n", path);
+
+	if (!S_ISDIR(st.st_mode))
+		fatal("cgroup supposed to be a directory: %s\n", path);
+
+	opt_cgroup = st.st_ino;
+}
+
+static void parse_addr_range(const char *optarg)
+{
+	unsigned long offset;
+	unsigned long size;
+	char *p;
+
+	p = strchr(optarg, ',');
+	if (!p)
+		p = strchr(optarg, '+');
+
+	if (p == optarg) {
+		offset = 0;
+		size   = parse_number(p + 1);
+	} else if (p) {
+		offset = parse_number(optarg);
+		if (p[1] == '\0')
+			size = ULONG_MAX;
+		else {
+			size = parse_number(p + 1);
+			if (*p == ',') {
+				if (size < offset)
+					fatal("invalid range: %lu,%lu\n",
+							offset, size);
+				size -= offset;
+			}
+		}
+	} else {
+		offset = parse_number(optarg);
+		size   = 1;
+	}
+
+	add_addr_range(offset, size);
+}
+
+static void add_bits_filter(uint64_t mask, uint64_t bits)
+{
+	if (nr_bit_filters >= MAX_BIT_FILTERS)
+		fatal("too much bit filters\n");
+
+	opt_mask[nr_bit_filters] = mask;
+	opt_bits[nr_bit_filters] = bits;
+	nr_bit_filters++;
+}
+
+static uint64_t parse_flag_name(const char *str, int len)
+{
+	size_t i;
+
+	if (!*str || !len)
+		return 0;
+
+	if (len <= 8 && !strncmp(str, "compound", len))
+		return BITS_COMPOUND;
+
+	for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
+		if (!page_flag_names[i])
+			continue;
+		if (!strncmp(str, page_flag_names[i] + 2, len))
+			return 1ULL << i;
+	}
+
+	return parse_number(str);
+}
+
+static uint64_t parse_flag_names(const char *str, int all)
+{
+	const char *p    = str;
+	uint64_t   flags = 0;
+
+	while (1) {
+		if (*p == ',' || *p == '=' || *p == '\0') {
+			if ((*str != '~') || (*str == '~' && all && *++str))
+				flags |= parse_flag_name(str, p - str);
+			if (*p != ',')
+				break;
+			str = p + 1;
+		}
+		p++;
+	}
+
+	return flags;
+}
+
+static void parse_bits_mask(const char *optarg)
+{
+	uint64_t mask;
+	uint64_t bits;
+	const char *p;
+
+	p = strchr(optarg, '=');
+	if (p == optarg) {
+		mask = KPF_ALL_BITS;
+		bits = parse_flag_names(p + 1, 0);
+	} else if (p) {
+		mask = parse_flag_names(optarg, 0);
+		bits = parse_flag_names(p + 1, 0);
+	} else if (strchr(optarg, '~')) {
+		mask = parse_flag_names(optarg, 1);
+		bits = parse_flag_names(optarg, 0);
+	} else {
+		mask = parse_flag_names(optarg, 0);
+		bits = KPF_ALL_BITS;
+	}
+
+	add_bits_filter(mask, bits);
+}
+
+static void parse_kpageflags(const char *name)
+{
+	opt_kpageflags = name;
+}
+
+static void describe_flags(const char *optarg)
+{
+	uint64_t flags = parse_flag_names(optarg, 0);
+
+	printf("0x%016llx\t%s\t%s\n",
+		(unsigned long long)flags,
+		page_flag_name(flags),
+		page_flag_longname(flags));
+}
+
+static const struct option opts[] = {
+	{ "raw"       , 0, NULL, 'r' },
+	{ "pid"       , 1, NULL, 'p' },
+	{ "file"      , 1, NULL, 'f' },
+	{ "addr"      , 1, NULL, 'a' },
+	{ "bits"      , 1, NULL, 'b' },
+	{ "cgroup"    , 1, NULL, 'c' },
+	{ "describe"  , 1, NULL, 'd' },
+	{ "mark-idle" , 0, NULL, 'i' },
+	{ "list"      , 0, NULL, 'l' },
+	{ "list-each" , 0, NULL, 'L' },
+	{ "list-cgroup", 0, NULL, 'C' },
+	{ "list-mapcnt", 0, NULL, 'M' },
+	{ "no-summary", 0, NULL, 'N' },
+	{ "hwpoison"  , 0, NULL, 'X' },
+	{ "unpoison"  , 0, NULL, 'x' },
+	{ "kpageflags", 0, NULL, 'F' },
+	{ "help"      , 0, NULL, 'h' },
+	{ NULL        , 0, NULL, 0 }
+};
+
+int main(int argc, char *argv[])
+{
+	int c;
+
+	page_size = getpagesize();
+
+	while ((c = getopt_long(argc, argv,
+				"rp:f:a:b:d:c:CilLMNXxF:h",
+				opts, NULL)) != -1) {
+		switch (c) {
+		case 'r':
+			opt_raw = 1;
+			break;
+		case 'p':
+			parse_pid(optarg);
+			break;
+		case 'f':
+			parse_file(optarg);
+			break;
+		case 'a':
+			parse_addr_range(optarg);
+			break;
+		case 'b':
+			parse_bits_mask(optarg);
+			break;
+		case 'c':
+			parse_cgroup(optarg);
+			break;
+		case 'C':
+			opt_list_cgroup = 1;
+			break;
+		case 'd':
+			describe_flags(optarg);
+			exit(0);
+		case 'i':
+			opt_mark_idle = 1;
+			break;
+		case 'l':
+			opt_list = 1;
+			break;
+		case 'L':
+			opt_list = 2;
+			break;
+		case 'M':
+			opt_list_mapcnt = 1;
+			break;
+		case 'N':
+			opt_no_summary = 1;
+			break;
+		case 'X':
+			opt_hwpoison = 1;
+			prepare_hwpoison_fd();
+			break;
+		case 'x':
+			opt_unpoison = 1;
+			prepare_hwpoison_fd();
+			break;
+		case 'F':
+			parse_kpageflags(optarg);
+			break;
+		case 'h':
+			usage();
+			exit(0);
+		default:
+			usage();
+			exit(1);
+		}
+	}
+
+	if (!opt_kpageflags)
+		opt_kpageflags = PROC_KPAGEFLAGS;
+
+	if (opt_cgroup || opt_list_cgroup)
+		kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
+
+	if (opt_list && opt_list_mapcnt)
+		kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY);
+
+	if (opt_mark_idle)
+		page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR);
+
+	if (opt_list && opt_pid)
+		printf("voffset\t");
+	if (opt_list && opt_file)
+		printf("foffset\t");
+	if (opt_list && opt_list_cgroup)
+		printf("cgroup\t");
+	if (opt_list && opt_list_mapcnt)
+		printf("map-cnt\t");
+
+	if (opt_list == 1)
+		printf("offset\tlen\tflags\n");
+	if (opt_list == 2)
+		printf("offset\tflags\n");
+
+	if (opt_file)
+		walk_page_cache();
+	else
+		walk_addr_ranges();
+
+	if (opt_list == 1)
+		flush_page_range();
+
+	if (opt_no_summary)
+		return 0;
+
+	if (opt_list)
+		printf("\n\n");
+
+	if (opt_file) {
+		show_file(opt_file, &st);
+		printf("\n");
+	}
+
+	show_summary();
+
+	if (opt_list_mapcnt)
+		close(kpagecount_fd);
+
+	if (page_idle_fd >= 0)
+		close(page_idle_fd);
+
+	return 0;
+}
diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c
new file mode 100644
index 000000000000..7c2ac124cdc8
--- /dev/null
+++ b/tools/mm/page_owner_sort.c
@@ -0,0 +1,897 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * User-space helper to sort the output of /sys/kernel/debug/page_owner
+ *
+ * Example use:
+ * cat /sys/kernel/debug/page_owner > page_owner_full.txt
+ * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+ * Or sort by total memory:
+ * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt
+ *
+ * See Documentation/mm/page_owner.rst
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <regex.h>
+#include <errno.h>
+#include <linux/types.h>
+#include <getopt.h>
+
+#define bool int
+#define true 1
+#define false 0
+#define TASK_COMM_LEN 16
+
+struct block_list {
+	char *txt;
+	char *comm; // task command name
+	char *stacktrace;
+	__u64 ts_nsec;
+	__u64 free_ts_nsec;
+	int len;
+	int num;
+	int page_num;
+	pid_t pid;
+	pid_t tgid;
+	int allocator;
+};
+enum FILTER_BIT {
+	FILTER_UNRELEASE = 1<<1,
+	FILTER_PID = 1<<2,
+	FILTER_TGID = 1<<3,
+	FILTER_COMM = 1<<4
+};
+enum CULL_BIT {
+	CULL_UNRELEASE = 1<<1,
+	CULL_PID = 1<<2,
+	CULL_TGID = 1<<3,
+	CULL_COMM = 1<<4,
+	CULL_STACKTRACE = 1<<5,
+	CULL_ALLOCATOR = 1<<6
+};
+enum ALLOCATOR_BIT {
+	ALLOCATOR_CMA = 1<<1,
+	ALLOCATOR_SLAB = 1<<2,
+	ALLOCATOR_VMALLOC = 1<<3,
+	ALLOCATOR_OTHERS = 1<<4
+};
+enum ARG_TYPE {
+	ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS,
+	ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE,
+	ARG_ALLOCATOR
+};
+enum SORT_ORDER {
+	SORT_ASC = 1,
+	SORT_DESC = -1,
+};
+struct filter_condition {
+	pid_t *pids;
+	pid_t *tgids;
+	char **comms;
+	int pids_size;
+	int tgids_size;
+	int comms_size;
+};
+struct sort_condition {
+	int (**cmps)(const void *, const void *);
+	int *signs;
+	int size;
+};
+static struct filter_condition fc;
+static struct sort_condition sc;
+static regex_t order_pattern;
+static regex_t pid_pattern;
+static regex_t tgid_pattern;
+static regex_t comm_pattern;
+static regex_t ts_nsec_pattern;
+static regex_t free_ts_nsec_pattern;
+static struct block_list *list;
+static int list_size;
+static int max_size;
+static int cull;
+static int filter;
+static bool debug_on;
+
+static void set_single_cmp(int (*cmp)(const void *, const void *), int sign);
+
+int read_block(char *buf, char *ext_buf, int buf_size, FILE *fin)
+{
+	char *curr = buf, *const buf_end = buf + buf_size;
+
+	while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) {
+		if (*curr == '\n') { /* empty line */
+			return curr - buf;
+		}
+		if (!strncmp(curr, "PFN", 3)) {
+			strcpy(ext_buf, curr);
+			continue;
+		}
+		curr += strlen(curr);
+	}
+
+	return -1; /* EOF or no space left in buf. */
+}
+
+static int compare_txt(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return strcmp(l1->txt, l2->txt);
+}
+
+static int compare_stacktrace(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return strcmp(l1->stacktrace, l2->stacktrace);
+}
+
+static int compare_num(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return l1->num - l2->num;
+}
+
+static int compare_page_num(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return l1->page_num - l2->page_num;
+}
+
+static int compare_pid(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return l1->pid - l2->pid;
+}
+
+static int compare_tgid(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return l1->tgid - l2->tgid;
+}
+
+static int compare_allocator(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return l1->allocator - l2->allocator;
+}
+
+static int compare_comm(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return strcmp(l1->comm, l2->comm);
+}
+
+static int compare_ts(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return l1->ts_nsec < l2->ts_nsec ? -1 : 1;
+}
+
+static int compare_free_ts(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1;
+}
+
+static int compare_release(const void *p1, const void *p2)
+{
+	const struct block_list *l1 = p1, *l2 = p2;
+
+	if (!l1->free_ts_nsec && !l2->free_ts_nsec)
+		return 0;
+	if (l1->free_ts_nsec && l2->free_ts_nsec)
+		return 0;
+	return l1->free_ts_nsec ? 1 : -1;
+}
+
+static int compare_cull_condition(const void *p1, const void *p2)
+{
+	if (cull == 0)
+		return compare_txt(p1, p2);
+	if ((cull & CULL_STACKTRACE) && compare_stacktrace(p1, p2))
+		return compare_stacktrace(p1, p2);
+	if ((cull & CULL_PID) && compare_pid(p1, p2))
+		return compare_pid(p1, p2);
+	if ((cull & CULL_TGID) && compare_tgid(p1, p2))
+		return compare_tgid(p1, p2);
+	if ((cull & CULL_COMM) && compare_comm(p1, p2))
+		return compare_comm(p1, p2);
+	if ((cull & CULL_UNRELEASE) && compare_release(p1, p2))
+		return compare_release(p1, p2);
+	if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2))
+		return compare_allocator(p1, p2);
+	return 0;
+}
+
+static int compare_sort_condition(const void *p1, const void *p2)
+{
+	int cmp = 0;
+
+	for (int i = 0; i < sc.size; ++i)
+		if (cmp == 0)
+			cmp = sc.signs[i] * sc.cmps[i](p1, p2);
+	return cmp;
+}
+
+static int search_pattern(regex_t *pattern, char *pattern_str, char *buf)
+{
+	int err, val_len;
+	regmatch_t pmatch[2];
+
+	err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL);
+	if (err != 0 || pmatch[1].rm_so == -1) {
+		if (debug_on)
+			fprintf(stderr, "no matching pattern in %s\n", buf);
+		return -1;
+	}
+	val_len = pmatch[1].rm_eo - pmatch[1].rm_so;
+
+	memcpy(pattern_str, buf + pmatch[1].rm_so, val_len);
+
+	return 0;
+}
+
+static bool check_regcomp(regex_t *pattern, const char *regex)
+{
+	int err;
+
+	err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE);
+	if (err != 0 || pattern->re_nsub != 1) {
+		fprintf(stderr, "Invalid pattern %s code %d\n", regex, err);
+		return false;
+	}
+	return true;
+}
+
+static char **explode(char sep, const char *str, int *size)
+{
+	int count = 0, len = strlen(str);
+	int lastindex = -1, j = 0;
+
+	for (int i = 0; i < len; i++)
+		if (str[i] == sep)
+			count++;
+	char **ret = calloc(++count, sizeof(char *));
+
+	for (int i = 0; i < len; i++) {
+		if (str[i] == sep) {
+			ret[j] = calloc(i - lastindex, sizeof(char));
+			memcpy(ret[j++], str + lastindex + 1, i - lastindex - 1);
+			lastindex = i;
+		}
+	}
+	if (lastindex <= len - 1) {
+		ret[j] = calloc(len - lastindex, sizeof(char));
+		memcpy(ret[j++], str + lastindex + 1, strlen(str) - 1 - lastindex);
+	}
+	*size = j;
+	return ret;
+}
+
+static void free_explode(char **arr, int size)
+{
+	for (int i = 0; i < size; i++)
+		free(arr[i]);
+	free(arr);
+}
+
+# define FIELD_BUFF 25
+
+static int get_page_num(char *buf)
+{
+	int order_val;
+	char order_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&order_pattern, order_str, buf);
+	errno = 0;
+	order_val = strtol(order_str, &endptr, 10);
+	if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong order in follow buf:\n%s\n", buf);
+		return 0;
+	}
+
+	return 1 << order_val;
+}
+
+static pid_t get_pid(char *buf)
+{
+	pid_t pid;
+	char pid_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&pid_pattern, pid_str, buf);
+	errno = 0;
+	pid = strtol(pid_str, &endptr, 10);
+	if (errno != 0 || endptr == pid_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf);
+		return -1;
+	}
+
+	return pid;
+
+}
+
+static pid_t get_tgid(char *buf)
+{
+	pid_t tgid;
+	char tgid_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&tgid_pattern, tgid_str, buf);
+	errno = 0;
+	tgid = strtol(tgid_str, &endptr, 10);
+	if (errno != 0 || endptr == tgid_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf);
+		return -1;
+	}
+
+	return tgid;
+
+}
+
+static __u64 get_ts_nsec(char *buf)
+{
+	__u64 ts_nsec;
+	char ts_nsec_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&ts_nsec_pattern, ts_nsec_str, buf);
+	errno = 0;
+	ts_nsec = strtoull(ts_nsec_str, &endptr, 10);
+	if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf);
+		return -1;
+	}
+
+	return ts_nsec;
+}
+
+static __u64 get_free_ts_nsec(char *buf)
+{
+	__u64 free_ts_nsec;
+	char free_ts_nsec_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf);
+	errno = 0;
+	free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10);
+	if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf);
+		return -1;
+	}
+
+	return free_ts_nsec;
+}
+
+static char *get_comm(char *buf)
+{
+	char *comm_str = malloc(TASK_COMM_LEN);
+
+	memset(comm_str, 0, TASK_COMM_LEN);
+
+	search_pattern(&comm_pattern, comm_str, buf);
+	errno = 0;
+	if (errno != 0) {
+		if (debug_on)
+			fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf);
+		return NULL;
+	}
+
+	return comm_str;
+}
+
+static int get_arg_type(const char *arg)
+{
+	if (!strcmp(arg, "pid") || !strcmp(arg, "p"))
+		return ARG_PID;
+	else if (!strcmp(arg, "tgid") || !strcmp(arg, "tg"))
+		return ARG_TGID;
+	else if (!strcmp(arg, "name") || !strcmp(arg, "n"))
+		return  ARG_COMM;
+	else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st"))
+		return ARG_STACKTRACE;
+	else if (!strcmp(arg, "free") || !strcmp(arg, "f"))
+		return ARG_FREE;
+	else if (!strcmp(arg, "txt") || !strcmp(arg, "T"))
+		return ARG_TXT;
+	else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft"))
+		return ARG_FREE_TS;
+	else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at"))
+		return ARG_ALLOC_TS;
+	else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator"))
+		return ARG_ALLOCATOR;
+	else {
+		return ARG_UNKNOWN;
+	}
+}
+
+static int get_allocator(const char *buf, const char *migrate_info)
+{
+	char *tmp, *first_line, *second_line;
+	int allocator = 0;
+
+	if (strstr(migrate_info, "CMA"))
+		allocator |= ALLOCATOR_CMA;
+	if (strstr(migrate_info, "slab"))
+		allocator |= ALLOCATOR_SLAB;
+	tmp = strstr(buf, "__vmalloc_node_range");
+	if (tmp) {
+		second_line = tmp;
+		while (*tmp != '\n')
+			tmp--;
+		tmp--;
+		while (*tmp != '\n')
+			tmp--;
+		first_line = ++tmp;
+		tmp = strstr(tmp, "alloc_pages");
+		if (tmp && first_line <= tmp && tmp < second_line)
+			allocator |= ALLOCATOR_VMALLOC;
+	}
+	if (allocator == 0)
+		allocator = ALLOCATOR_OTHERS;
+	return allocator;
+}
+
+static bool match_num_list(int num, int *list, int list_size)
+{
+	for (int i = 0; i < list_size; ++i)
+		if (list[i] == num)
+			return true;
+	return false;
+}
+
+static bool match_str_list(const char *str, char **list, int list_size)
+{
+	for (int i = 0; i < list_size; ++i)
+		if (!strcmp(list[i], str))
+			return true;
+	return false;
+}
+
+static bool is_need(char *buf)
+{
+	__u64 ts_nsec, free_ts_nsec;
+
+	ts_nsec = get_ts_nsec(buf);
+	free_ts_nsec = get_free_ts_nsec(buf);
+
+	if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec)
+		return false;
+	if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size))
+		return false;
+	if ((filter & FILTER_TGID) &&
+		!match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size))
+		return false;
+
+	char *comm = get_comm(buf);
+
+	if ((filter & FILTER_COMM) &&
+	!match_str_list(comm, fc.comms, fc.comms_size)) {
+		free(comm);
+		return false;
+	}
+	free(comm);
+	return true;
+}
+
+static bool add_list(char *buf, int len, char *ext_buf)
+{
+	if (list_size != 0 &&
+		len == list[list_size-1].len &&
+		memcmp(buf, list[list_size-1].txt, len) == 0) {
+		list[list_size-1].num++;
+		list[list_size-1].page_num += get_page_num(buf);
+		return true;
+	}
+	if (list_size == max_size) {
+		fprintf(stderr, "max_size too small??\n");
+		return false;
+	}
+	if (!is_need(buf))
+		return true;
+	list[list_size].pid = get_pid(buf);
+	list[list_size].tgid = get_tgid(buf);
+	list[list_size].comm = get_comm(buf);
+	list[list_size].txt = malloc(len+1);
+	if (!list[list_size].txt) {
+		fprintf(stderr, "Out of memory\n");
+		return false;
+	}
+	memcpy(list[list_size].txt, buf, len);
+	list[list_size].txt[len] = 0;
+	list[list_size].len = len;
+	list[list_size].num = 1;
+	list[list_size].page_num = get_page_num(buf);
+
+	list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: "";
+	if (*list[list_size].stacktrace == '\n')
+		list[list_size].stacktrace++;
+	list[list_size].ts_nsec = get_ts_nsec(buf);
+	list[list_size].free_ts_nsec = get_free_ts_nsec(buf);
+	list[list_size].allocator = get_allocator(buf, ext_buf);
+	list_size++;
+	if (list_size % 1000 == 0) {
+		printf("loaded %d\r", list_size);
+		fflush(stdout);
+	}
+	return true;
+}
+
+static bool parse_cull_args(const char *arg_str)
+{
+	int size = 0;
+	char **args = explode(',', arg_str, &size);
+
+	for (int i = 0; i < size; ++i) {
+		int arg_type = get_arg_type(args[i]);
+
+		if (arg_type == ARG_PID)
+			cull |= CULL_PID;
+		else if (arg_type == ARG_TGID)
+			cull |= CULL_TGID;
+		else if (arg_type == ARG_COMM)
+			cull |= CULL_COMM;
+		else if (arg_type == ARG_STACKTRACE)
+			cull |= CULL_STACKTRACE;
+		else if (arg_type == ARG_FREE)
+			cull |= CULL_UNRELEASE;
+		else if (arg_type == ARG_ALLOCATOR)
+			cull |= CULL_ALLOCATOR;
+		else {
+			free_explode(args, size);
+			return false;
+		}
+	}
+	free_explode(args, size);
+	if (sc.size == 0)
+		set_single_cmp(compare_num, SORT_DESC);
+	return true;
+}
+
+static void set_single_cmp(int (*cmp)(const void *, const void *), int sign)
+{
+	if (sc.signs == NULL || sc.size < 1)
+		sc.signs = calloc(1, sizeof(int));
+	sc.signs[0] = sign;
+	if (sc.cmps == NULL || sc.size < 1)
+		sc.cmps = calloc(1, sizeof(int *));
+	sc.cmps[0] = cmp;
+	sc.size = 1;
+}
+
+static bool parse_sort_args(const char *arg_str)
+{
+	int size = 0;
+
+	if (sc.size != 0) { /* reset sort_condition */
+		free(sc.signs);
+		free(sc.cmps);
+		size = 0;
+	}
+
+	char **args = explode(',', arg_str, &size);
+
+	sc.signs = calloc(size, sizeof(int));
+	sc.cmps = calloc(size, sizeof(int *));
+	for (int i = 0; i < size; ++i) {
+		int offset = 0;
+
+		sc.signs[i] = SORT_ASC;
+		if (args[i][0] == '-' || args[i][0] == '+') {
+			if (args[i][0] == '-')
+				sc.signs[i] = SORT_DESC;
+			offset = 1;
+		}
+
+		int arg_type = get_arg_type(args[i]+offset);
+
+		if (arg_type == ARG_PID)
+			sc.cmps[i] = compare_pid;
+		else if (arg_type == ARG_TGID)
+			sc.cmps[i] = compare_tgid;
+		else if (arg_type == ARG_COMM)
+			sc.cmps[i] = compare_comm;
+		else if (arg_type == ARG_STACKTRACE)
+			sc.cmps[i] = compare_stacktrace;
+		else if (arg_type == ARG_ALLOC_TS)
+			sc.cmps[i] = compare_ts;
+		else if (arg_type == ARG_FREE_TS)
+			sc.cmps[i] = compare_free_ts;
+		else if (arg_type == ARG_TXT)
+			sc.cmps[i] = compare_txt;
+		else if (arg_type == ARG_ALLOCATOR)
+			sc.cmps[i] = compare_allocator;
+		else {
+			free_explode(args, size);
+			sc.size = 0;
+			return false;
+		}
+	}
+	sc.size = size;
+	free_explode(args, size);
+	return true;
+}
+
+static int *parse_nums_list(char *arg_str, int *list_size)
+{
+	int size = 0;
+	char **args = explode(',', arg_str, &size);
+	int *list = calloc(size, sizeof(int));
+
+	errno = 0;
+	for (int i = 0; i < size; ++i) {
+		char *endptr = NULL;
+
+		list[i] = strtol(args[i], &endptr, 10);
+		if (errno != 0 || endptr == args[i] || *endptr != '\0') {
+			free(list);
+			return NULL;
+		}
+	}
+	*list_size = size;
+	free_explode(args, size);
+	return list;
+}
+
+static void print_allocator(FILE *out, int allocator)
+{
+	fprintf(out, "allocated by ");
+	if (allocator & ALLOCATOR_CMA)
+		fprintf(out, "CMA ");
+	if (allocator & ALLOCATOR_SLAB)
+		fprintf(out, "SLAB ");
+	if (allocator & ALLOCATOR_VMALLOC)
+		fprintf(out, "VMALLOC ");
+	if (allocator & ALLOCATOR_OTHERS)
+		fprintf(out, "OTHERS ");
+}
+
+#define BUF_SIZE	(128 * 1024)
+
+static void usage(void)
+{
+	printf("Usage: ./page_owner_sort [OPTIONS] <input> <output>\n"
+		"-m\t\tSort by total memory.\n"
+		"-s\t\tSort by the stack trace.\n"
+		"-t\t\tSort by times (default).\n"
+		"-p\t\tSort by pid.\n"
+		"-P\t\tSort by tgid.\n"
+		"-n\t\tSort by task command name.\n"
+		"-a\t\tSort by memory allocate time.\n"
+		"-r\t\tSort by memory release time.\n"
+		"-f\t\tFilter out the information of blocks whose memory has been released.\n"
+		"-d\t\tPrint debug information.\n"
+		"--pid <pidlist>\tSelect by pid. This selects the information of blocks whose process ID numbers appear in <pidlist>.\n"
+		"--tgid <tgidlist>\tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in <tgidlist>.\n"
+		"--name <cmdlist>\n\t\tSelect by command name. This selects the information of blocks whose command name appears in <cmdlist>.\n"
+		"--cull <rules>\tCull by user-defined rules.<rules> is a single argument in the form of a comma-separated list with some common fields predefined\n"
+		"--sort <order>\tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n"
+	);
+}
+
+int main(int argc, char **argv)
+{
+	FILE *fin, *fout;
+	char *buf, *ext_buf;
+	int i, count;
+	struct stat st;
+	int opt;
+	struct option longopts[] = {
+		{ "pid", required_argument, NULL, 1 },
+		{ "tgid", required_argument, NULL, 2 },
+		{ "name", required_argument, NULL, 3 },
+		{ "cull",  required_argument, NULL, 4 },
+		{ "sort",  required_argument, NULL, 5 },
+		{ 0, 0, 0, 0},
+	};
+
+	while ((opt = getopt_long(argc, argv, "adfmnprstP", longopts, NULL)) != -1)
+		switch (opt) {
+		case 'a':
+			set_single_cmp(compare_ts, SORT_ASC);
+			break;
+		case 'd':
+			debug_on = true;
+			break;
+		case 'f':
+			filter = filter | FILTER_UNRELEASE;
+			break;
+		case 'm':
+			set_single_cmp(compare_page_num, SORT_DESC);
+			break;
+		case 'p':
+			set_single_cmp(compare_pid, SORT_ASC);
+			break;
+		case 'r':
+			set_single_cmp(compare_free_ts, SORT_ASC);
+			break;
+		case 's':
+			set_single_cmp(compare_stacktrace, SORT_ASC);
+			break;
+		case 't':
+			set_single_cmp(compare_num, SORT_DESC);
+			break;
+		case 'P':
+			set_single_cmp(compare_tgid, SORT_ASC);
+			break;
+		case 'n':
+			set_single_cmp(compare_comm, SORT_ASC);
+			break;
+		case 1:
+			filter = filter | FILTER_PID;
+			fc.pids = parse_nums_list(optarg, &fc.pids_size);
+			if (fc.pids == NULL) {
+				fprintf(stderr, "wrong/invalid pid in from the command line:%s\n",
+						optarg);
+				exit(1);
+			}
+			break;
+		case 2:
+			filter = filter | FILTER_TGID;
+			fc.tgids = parse_nums_list(optarg, &fc.tgids_size);
+			if (fc.tgids == NULL) {
+				fprintf(stderr, "wrong/invalid tgid in from the command line:%s\n",
+						optarg);
+				exit(1);
+			}
+			break;
+		case 3:
+			filter = filter | FILTER_COMM;
+			fc.comms = explode(',', optarg, &fc.comms_size);
+			break;
+		case 4:
+			if (!parse_cull_args(optarg)) {
+				fprintf(stderr, "wrong argument after --cull option:%s\n",
+						optarg);
+				exit(1);
+			}
+			break;
+		case 5:
+			if (!parse_sort_args(optarg)) {
+				fprintf(stderr, "wrong argument after --sort option:%s\n",
+						optarg);
+				exit(1);
+			}
+			break;
+		default:
+			usage();
+			exit(1);
+		}
+
+	if (optind >= (argc - 1)) {
+		usage();
+		exit(1);
+	}
+
+	fin = fopen(argv[optind], "r");
+	fout = fopen(argv[optind + 1], "w");
+	if (!fin || !fout) {
+		usage();
+		perror("open: ");
+		exit(1);
+	}
+
+	if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),"))
+		goto out_order;
+	if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"))
+		goto out_pid;
+	if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "))
+		goto out_tgid;
+	if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"))
+		goto out_comm;
+	if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"))
+		goto out_ts;
+	if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"))
+		goto out_free_ts;
+
+	fstat(fileno(fin), &st);
+	max_size = st.st_size / 100; /* hack ... */
+
+	list = malloc(max_size * sizeof(*list));
+	buf = malloc(BUF_SIZE);
+	ext_buf = malloc(BUF_SIZE);
+	if (!list || !buf || !ext_buf) {
+		fprintf(stderr, "Out of memory\n");
+		goto out_free;
+	}
+
+	for ( ; ; ) {
+		int buf_len = read_block(buf, ext_buf, BUF_SIZE, fin);
+
+		if (buf_len < 0)
+			break;
+		if (!add_list(buf, buf_len, ext_buf))
+			goto out_free;
+	}
+
+	printf("loaded %d\n", list_size);
+
+	printf("sorting ....\n");
+
+	qsort(list, list_size, sizeof(list[0]), compare_cull_condition);
+
+	printf("culling\n");
+
+	for (i = count = 0; i < list_size; i++) {
+		if (count == 0 ||
+		    compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) {
+			list[count++] = list[i];
+		} else {
+			list[count-1].num += list[i].num;
+			list[count-1].page_num += list[i].page_num;
+		}
+	}
+
+	qsort(list, count, sizeof(list[0]), compare_sort_condition);
+
+	for (i = 0; i < count; i++) {
+		if (cull == 0) {
+			fprintf(fout, "%d times, %d pages, ", list[i].num, list[i].page_num);
+			print_allocator(fout, list[i].allocator);
+			fprintf(fout, ":\n%s\n", list[i].txt);
+		}
+		else {
+			fprintf(fout, "%d times, %d pages",
+					list[i].num, list[i].page_num);
+			if (cull & CULL_PID || filter & FILTER_PID)
+				fprintf(fout, ", PID %d", list[i].pid);
+			if (cull & CULL_TGID || filter & FILTER_TGID)
+				fprintf(fout, ", TGID %d", list[i].pid);
+			if (cull & CULL_COMM || filter & FILTER_COMM)
+				fprintf(fout, ", task_comm_name: %s", list[i].comm);
+			if (cull & CULL_ALLOCATOR) {
+				fprintf(fout, ", ");
+				print_allocator(fout, list[i].allocator);
+			}
+			if (cull & CULL_UNRELEASE)
+				fprintf(fout, " (%s)",
+						list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED");
+			if (cull & CULL_STACKTRACE)
+				fprintf(fout, ":\n%s", list[i].stacktrace);
+			fprintf(fout, "\n");
+		}
+	}
+
+out_free:
+	if (ext_buf)
+		free(ext_buf);
+	if (buf)
+		free(buf);
+	if (list)
+		free(list);
+out_free_ts:
+	regfree(&free_ts_nsec_pattern);
+out_ts:
+	regfree(&ts_nsec_pattern);
+out_comm:
+	regfree(&comm_pattern);
+out_tgid:
+	regfree(&tgid_pattern);
+out_pid:
+	regfree(&pid_pattern);
+out_order:
+	regfree(&order_pattern);
+
+	return 0;
+}
diff --git a/tools/mm/slabinfo-gnuplot.sh b/tools/mm/slabinfo-gnuplot.sh
new file mode 100644
index 000000000000..873a892147e5
--- /dev/null
+++ b/tools/mm/slabinfo-gnuplot.sh
@@ -0,0 +1,268 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Sergey Senozhatsky, 2015
+# sergey.senozhatsky.work@gmail.com
+#
+
+
+# This program is intended to plot a `slabinfo -X' stats, collected,
+# for example, using the following command:
+#   while [ 1 ]; do slabinfo -X >> stats; sleep 1; done
+#
+# Use `slabinfo-gnuplot.sh stats' to pre-process collected records
+# and generate graphs (totals, slabs sorted by size, slabs sorted
+# by size).
+#
+# Graphs can be [individually] regenerate with different ranges and
+# size (-r %d,%d and -s %d,%d options).
+#
+# To visually compare N `totals' graphs, do
+# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals
+#
+
+min_slab_name_size=11
+xmin=0
+xmax=0
+width=1500
+height=700
+mode=preprocess
+
+usage()
+{
+	echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]"
+	echo "FILEs must contain 'slabinfo -X' samples"
+	echo "-t 			- plot totals for FILE(s)"
+	echo "-l 			- plot slabs stats for FILE(s)"
+	echo "-s %d,%d		- set image width and height"
+	echo "-r %d,%d		- use data samples from a given range"
+}
+
+check_file_exist()
+{
+	if [ ! -f "$1" ]; then
+		echo "File '$1' does not exist"
+		exit 1
+	fi
+}
+
+do_slabs_plotting()
+{
+	local file=$1
+	local out_file
+	local range="every ::$xmin"
+	local xtic=""
+	local xtic_rotate="norotate"
+	local lines=2000000
+	local wc_lines
+
+	check_file_exist "$file"
+
+	out_file=`basename "$file"`
+	if [ $xmax -ne 0 ]; then
+		range="$range::$xmax"
+		lines=$((xmax-xmin))
+	fi
+
+	wc_lines=`cat "$file" | wc -l`
+	if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then
+		wc_lines=$lines
+	fi
+
+	if [ "$wc_lines" -lt "$lines" ]; then
+		lines=$wc_lines
+	fi
+
+	if [ $((width / lines)) -gt $min_slab_name_size ]; then
+		xtic=":xtic(1)"
+		xtic_rotate=90
+	fi
+
+gnuplot -p << EOF
+#!/usr/bin/env gnuplot
+
+set terminal png enhanced size $width,$height large
+set output '$out_file.png'
+set autoscale xy
+set xlabel 'samples'
+set ylabel 'bytes'
+set style histogram columnstacked title textcolor lt -1
+set style fill solid 0.15
+set xtics rotate $xtic_rotate
+set key left above Left title reverse
+
+plot "$file" $range u 2$xtic title 'SIZE' with boxes,\
+	'' $range u 3 title 'LOSS' with boxes
+EOF
+
+	if [ $? -eq 0 ]; then
+		echo "$out_file.png"
+	fi
+}
+
+do_totals_plotting()
+{
+	local gnuplot_cmd=""
+	local range="every ::$xmin"
+	local file=""
+
+	if [ $xmax -ne 0 ]; then
+		range="$range::$xmax"
+	fi
+
+	for i in "${t_files[@]}"; do
+		check_file_exist "$i"
+
+		file="$file"`basename "$i"`
+		gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\
+			'$i Memory usage' with lines,"
+		gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \
+			'$i Loss' with lines,"
+	done
+
+gnuplot -p << EOF
+#!/usr/bin/env gnuplot
+
+set terminal png enhanced size $width,$height large
+set autoscale xy
+set output '$file.png'
+set xlabel 'samples'
+set ylabel 'bytes'
+set key left above Left title reverse
+
+plot $gnuplot_cmd
+EOF
+
+	if [ $? -eq 0 ]; then
+		echo "$file.png"
+	fi
+}
+
+do_preprocess()
+{
+	local out
+	local lines
+	local in=$1
+
+	check_file_exist "$in"
+
+	# use only 'TOP' slab (biggest memory usage or loss)
+	let lines=3
+	out=`basename "$in"`"-slabs-by-loss"
+	`cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\
+		grep -E -iv '\-\-|Name|Slabs'\
+		| awk '{print $1" "$4+$2*$3" "$4}' > "$out"`
+	if [ $? -eq 0 ]; then
+		do_slabs_plotting "$out"
+	fi
+
+	let lines=3
+	out=`basename "$in"`"-slabs-by-size"
+	`cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\
+		grep -E -iv '\-\-|Name|Slabs'\
+		| awk '{print $1" "$4" "$4-$2*$3}' > "$out"`
+	if [ $? -eq 0 ]; then
+		do_slabs_plotting "$out"
+	fi
+
+	out=`basename "$in"`"-totals"
+	`cat "$in" | grep "Memory used" |\
+		awk '{print $3" "$7}' > "$out"`
+	if [ $? -eq 0 ]; then
+		t_files[0]=$out
+		do_totals_plotting
+	fi
+}
+
+parse_opts()
+{
+	local opt
+
+	while getopts "tlr::s::h" opt; do
+		case $opt in
+			t)
+				mode=totals
+				;;
+			l)
+				mode=slabs
+				;;
+			s)
+				array=(${OPTARG//,/ })
+				width=${array[0]}
+				height=${array[1]}
+				;;
+			r)
+				array=(${OPTARG//,/ })
+				xmin=${array[0]}
+				xmax=${array[1]}
+				;;
+			h)
+				usage
+				exit 0
+				;;
+			\?)
+				echo "Invalid option: -$OPTARG" >&2
+				exit 1
+				;;
+			:)
+				echo "-$OPTARG requires an argument." >&2
+				exit 1
+				;;
+		esac
+	done
+
+	return $OPTIND
+}
+
+parse_args()
+{
+	local idx=0
+	local p
+
+	for p in "$@"; do
+		case $mode in
+			preprocess)
+				files[$idx]=$p
+				idx=$idx+1
+				;;
+			totals)
+				t_files[$idx]=$p
+				idx=$idx+1
+				;;
+			slabs)
+				files[$idx]=$p
+				idx=$idx+1
+				;;
+		esac
+	done
+}
+
+parse_opts "$@"
+argstart=$?
+parse_args "${@:$argstart}"
+
+if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then
+	usage
+	exit 1
+fi
+
+case $mode in
+	preprocess)
+		for i in "${files[@]}"; do
+			do_preprocess "$i"
+		done
+		;;
+	totals)
+		do_totals_plotting
+		;;
+	slabs)
+		for i in "${files[@]}"; do
+			do_slabs_plotting "$i"
+		done
+		;;
+	*)
+		echo "Unknown mode $mode" >&2
+		usage
+		exit 1
+	;;
+esac
diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c
new file mode 100644
index 000000000000..cfaeaea71042
--- /dev/null
+++ b/tools/mm/slabinfo.c
@@ -0,0 +1,1544 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Slabinfo: Tool to get reports about slabs
+ *
+ * (C) 2007 sgi, Christoph Lameter
+ * (C) 2011 Linux Foundation, Christoph Lameter
+ *
+ * Compile with:
+ *
+ * gcc -o slabinfo slabinfo.c
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <strings.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <getopt.h>
+#include <regex.h>
+#include <errno.h>
+
+#define MAX_SLABS 500
+#define MAX_ALIASES 500
+#define MAX_NODES 1024
+
+struct slabinfo {
+	char *name;
+	int alias;
+	int refs;
+	int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
+	unsigned int hwcache_align, object_size, objs_per_slab;
+	unsigned int sanity_checks, slab_size, store_user, trace;
+	int order, poison, reclaim_account, red_zone;
+	unsigned long partial, objects, slabs, objects_partial, objects_total;
+	unsigned long alloc_fastpath, alloc_slowpath;
+	unsigned long free_fastpath, free_slowpath;
+	unsigned long free_frozen, free_add_partial, free_remove_partial;
+	unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill;
+	unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
+	unsigned long deactivate_to_head, deactivate_to_tail;
+	unsigned long deactivate_remote_frees, order_fallback;
+	unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail;
+	unsigned long alloc_node_mismatch, deactivate_bypass;
+	unsigned long cpu_partial_alloc, cpu_partial_free;
+	int numa[MAX_NODES];
+	int numa_partial[MAX_NODES];
+} slabinfo[MAX_SLABS];
+
+struct aliasinfo {
+	char *name;
+	char *ref;
+	struct slabinfo *slab;
+} aliasinfo[MAX_ALIASES];
+
+int slabs;
+int actual_slabs;
+int aliases;
+int alias_targets;
+int highest_node;
+
+char buffer[4096];
+
+int show_empty;
+int show_report;
+int show_alias;
+int show_slab;
+int skip_zero = 1;
+int show_numa;
+int show_track;
+int show_first_alias;
+int validate;
+int shrink;
+int show_inverted;
+int show_single_ref;
+int show_totals;
+int sort_size;
+int sort_active;
+int set_debug;
+int show_ops;
+int sort_partial;
+int show_activity;
+int output_lines = -1;
+int sort_loss;
+int extended_totals;
+int show_bytes;
+int unreclaim_only;
+
+/* Debug options */
+int sanity;
+int redzone;
+int poison;
+int tracking;
+int tracing;
+
+int page_size;
+
+regex_t pattern;
+
+static void fatal(const char *x, ...)
+{
+	va_list ap;
+
+	va_start(ap, x);
+	vfprintf(stderr, x, ap);
+	va_end(ap);
+	exit(EXIT_FAILURE);
+}
+
+static void usage(void)
+{
+	printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n"
+		"slabinfo [-aABDefhilLnoPrsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n"
+		"-a|--aliases           Show aliases\n"
+		"-A|--activity          Most active slabs first\n"
+		"-B|--Bytes             Show size in bytes\n"
+		"-D|--display-active    Switch line format to activity\n"
+		"-e|--empty             Show empty slabs\n"
+		"-f|--first-alias       Show first alias\n"
+		"-h|--help              Show usage information\n"
+		"-i|--inverted          Inverted list\n"
+		"-l|--slabs             Show slabs\n"
+		"-L|--Loss              Sort by loss\n"
+		"-n|--numa              Show NUMA information\n"
+		"-N|--lines=K           Show the first K slabs\n"
+		"-o|--ops               Show kmem_cache_ops\n"
+		"-P|--partial           Sort by number of partial slabs\n"
+		"-r|--report            Detailed report on single slabs\n"
+		"-s|--shrink            Shrink slabs\n"
+		"-S|--Size              Sort by size\n"
+		"-t|--tracking          Show alloc/free information\n"
+		"-T|--Totals            Show summary information\n"
+		"-U|--Unreclaim         Show unreclaimable slabs only\n"
+		"-v|--validate          Validate slabs\n"
+		"-X|--Xtotals           Show extended summary information\n"
+		"-z|--zero              Include empty slabs\n"
+		"-1|--1ref              Single reference\n"
+
+		"\n"
+		"-d  | --debug          Switch off all debug options\n"
+		"-da | --debug=a        Switch on all debug options (--debug=FZPU)\n"
+
+		"\n"
+		"-d[afzput] | --debug=[afzput]\n"
+		"    f | F              Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n"
+		"    z | Z              Redzoning\n"
+		"    p | P              Poisoning\n"
+		"    u | U              Tracking\n"
+		"    t | T              Tracing\n"
+
+		"\nSorting options (--Loss, --Size, --Partial) are mutually exclusive\n"
+	);
+}
+
+static unsigned long read_obj(const char *name)
+{
+	FILE *f = fopen(name, "r");
+
+	if (!f) {
+		buffer[0] = 0;
+		if (errno == EACCES)
+			fatal("%s, Try using superuser\n", strerror(errno));
+	} else {
+		if (!fgets(buffer, sizeof(buffer), f))
+			buffer[0] = 0;
+		fclose(f);
+		if (buffer[strlen(buffer)] == '\n')
+			buffer[strlen(buffer)] = 0;
+	}
+	return strlen(buffer);
+}
+
+
+/*
+ * Get the contents of an attribute
+ */
+static unsigned long get_obj(const char *name)
+{
+	if (!read_obj(name))
+		return 0;
+
+	return atol(buffer);
+}
+
+static unsigned long get_obj_and_str(const char *name, char **x)
+{
+	unsigned long result = 0;
+	char *p;
+
+	*x = NULL;
+
+	if (!read_obj(name)) {
+		x = NULL;
+		return 0;
+	}
+	result = strtoul(buffer, &p, 10);
+	while (*p == ' ')
+		p++;
+	if (*p)
+		*x = strdup(p);
+	return result;
+}
+
+static void set_obj(struct slabinfo *s, const char *name, int n)
+{
+	char x[100];
+	FILE *f;
+
+	snprintf(x, 100, "%s/%s", s->name, name);
+	f = fopen(x, "w");
+	if (!f)
+		fatal("Cannot write to %s\n", x);
+
+	fprintf(f, "%d\n", n);
+	fclose(f);
+}
+
+static unsigned long read_slab_obj(struct slabinfo *s, const char *name)
+{
+	char x[100];
+	FILE *f;
+	size_t l;
+
+	snprintf(x, 100, "%s/%s", s->name, name);
+	f = fopen(x, "r");
+	if (!f) {
+		buffer[0] = 0;
+		l = 0;
+	} else {
+		l = fread(buffer, 1, sizeof(buffer), f);
+		buffer[l] = 0;
+		fclose(f);
+	}
+	return l;
+}
+
+static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name)
+{
+	char x[128];
+	FILE *f;
+	size_t l;
+
+	snprintf(x, 128, "/sys/kernel/debug/slab/%s/%s", s->name, name);
+	f = fopen(x, "r");
+	if (!f) {
+		buffer[0] = 0;
+		l = 0;
+	} else {
+		l = fread(buffer, 1, sizeof(buffer), f);
+		buffer[l] = 0;
+		fclose(f);
+	}
+	return l;
+}
+
+/*
+ * Put a size string together
+ */
+static int store_size(char *buffer, unsigned long value)
+{
+	unsigned long divisor = 1;
+	char trailer = 0;
+	int n;
+
+	if (!show_bytes) {
+		if (value > 1000000000UL) {
+			divisor = 100000000UL;
+			trailer = 'G';
+		} else if (value > 1000000UL) {
+			divisor = 100000UL;
+			trailer = 'M';
+		} else if (value > 1000UL) {
+			divisor = 100;
+			trailer = 'K';
+		}
+	}
+
+	value /= divisor;
+	n = sprintf(buffer, "%ld",value);
+	if (trailer) {
+		buffer[n] = trailer;
+		n++;
+		buffer[n] = 0;
+	}
+	if (divisor != 1) {
+		memmove(buffer + n - 2, buffer + n - 3, 4);
+		buffer[n-2] = '.';
+		n++;
+	}
+	return n;
+}
+
+static void decode_numa_list(int *numa, char *t)
+{
+	int node;
+	int nr;
+
+	memset(numa, 0, MAX_NODES * sizeof(int));
+
+	if (!t)
+		return;
+
+	while (*t == 'N') {
+		t++;
+		node = strtoul(t, &t, 10);
+		if (*t == '=') {
+			t++;
+			nr = strtoul(t, &t, 10);
+			numa[node] = nr;
+			if (node > highest_node)
+				highest_node = node;
+		}
+		while (*t == ' ')
+			t++;
+	}
+}
+
+static void slab_validate(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	set_obj(s, "validate", 1);
+}
+
+static void slab_shrink(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	set_obj(s, "shrink", 1);
+}
+
+int line = 0;
+
+static void first_line(void)
+{
+	if (show_activity)
+		printf("Name                   Objects      Alloc       Free"
+			"   %%Fast Fallb O CmpX   UL\n");
+	else
+		printf("Name                   Objects Objsize           %s "
+			"Slabs/Part/Cpu  O/S O %%Fr %%Ef Flg\n",
+			sort_loss ? " Loss" : "Space");
+}
+
+/*
+ * Find the shortest alias of a slab
+ */
+static struct aliasinfo *find_one_alias(struct slabinfo *find)
+{
+	struct aliasinfo *a;
+	struct aliasinfo *best = NULL;
+
+	for(a = aliasinfo;a < aliasinfo + aliases; a++) {
+		if (a->slab == find &&
+			(!best || strlen(best->name) < strlen(a->name))) {
+				best = a;
+				if (strncmp(a->name,"kmall", 5) == 0)
+					return best;
+			}
+	}
+	return best;
+}
+
+static unsigned long slab_size(struct slabinfo *s)
+{
+	return 	s->slabs * (page_size << s->order);
+}
+
+static unsigned long slab_activity(struct slabinfo *s)
+{
+	return 	s->alloc_fastpath + s->free_fastpath +
+		s->alloc_slowpath + s->free_slowpath;
+}
+
+static unsigned long slab_waste(struct slabinfo *s)
+{
+	return	slab_size(s) - s->objects * s->object_size;
+}
+
+static void slab_numa(struct slabinfo *s, int mode)
+{
+	int node;
+
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (!highest_node) {
+		printf("\n%s: No NUMA information available.\n", s->name);
+		return;
+	}
+
+	if (skip_zero && !s->slabs)
+		return;
+
+	if (!line) {
+		printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
+		for(node = 0; node <= highest_node; node++)
+			printf(" %4d", node);
+		printf("\n----------------------");
+		for(node = 0; node <= highest_node; node++)
+			printf("-----");
+		printf("\n");
+	}
+	printf("%-21s ", mode ? "All slabs" : s->name);
+	for(node = 0; node <= highest_node; node++) {
+		char b[20];
+
+		store_size(b, s->numa[node]);
+		printf(" %4s", b);
+	}
+	printf("\n");
+	if (mode) {
+		printf("%-21s ", "Partial slabs");
+		for(node = 0; node <= highest_node; node++) {
+			char b[20];
+
+			store_size(b, s->numa_partial[node]);
+			printf(" %4s", b);
+		}
+		printf("\n");
+	}
+	line++;
+}
+
+static void show_tracking(struct slabinfo *s)
+{
+	printf("\n%s: Kernel object allocation\n", s->name);
+	printf("-----------------------------------------------------------------------\n");
+	if (read_debug_slab_obj(s, "alloc_traces"))
+		printf("%s", buffer);
+	else if (read_slab_obj(s, "alloc_calls"))
+		printf("%s", buffer);
+	else
+		printf("No Data\n");
+
+	printf("\n%s: Kernel object freeing\n", s->name);
+	printf("------------------------------------------------------------------------\n");
+	if (read_debug_slab_obj(s, "free_traces"))
+		printf("%s", buffer);
+	else if (read_slab_obj(s, "free_calls"))
+		printf("%s", buffer);
+	else
+		printf("No Data\n");
+
+}
+
+static void ops(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (read_slab_obj(s, "ops")) {
+		printf("\n%s: kmem_cache operations\n", s->name);
+		printf("--------------------------------------------\n");
+		printf("%s", buffer);
+	} else
+		printf("\n%s has no kmem_cache operations\n", s->name);
+}
+
+static const char *onoff(int x)
+{
+	if (x)
+		return "On ";
+	return "Off";
+}
+
+static void slab_stats(struct slabinfo *s)
+{
+	unsigned long total_alloc;
+	unsigned long total_free;
+	unsigned long total;
+
+	if (!s->alloc_slab)
+		return;
+
+	total_alloc = s->alloc_fastpath + s->alloc_slowpath;
+	total_free = s->free_fastpath + s->free_slowpath;
+
+	if (!total_alloc)
+		return;
+
+	printf("\n");
+	printf("Slab Perf Counter       Alloc     Free %%Al %%Fr\n");
+	printf("--------------------------------------------------\n");
+	printf("Fastpath             %8lu %8lu %3lu %3lu\n",
+		s->alloc_fastpath, s->free_fastpath,
+		s->alloc_fastpath * 100 / total_alloc,
+		total_free ? s->free_fastpath * 100 / total_free : 0);
+	printf("Slowpath             %8lu %8lu %3lu %3lu\n",
+		total_alloc - s->alloc_fastpath, s->free_slowpath,
+		(total_alloc - s->alloc_fastpath) * 100 / total_alloc,
+		total_free ? s->free_slowpath * 100 / total_free : 0);
+	printf("Page Alloc           %8lu %8lu %3lu %3lu\n",
+		s->alloc_slab, s->free_slab,
+		s->alloc_slab * 100 / total_alloc,
+		total_free ? s->free_slab * 100 / total_free : 0);
+	printf("Add partial          %8lu %8lu %3lu %3lu\n",
+		s->deactivate_to_head + s->deactivate_to_tail,
+		s->free_add_partial,
+		(s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc,
+		total_free ? s->free_add_partial * 100 / total_free : 0);
+	printf("Remove partial       %8lu %8lu %3lu %3lu\n",
+		s->alloc_from_partial, s->free_remove_partial,
+		s->alloc_from_partial * 100 / total_alloc,
+		total_free ? s->free_remove_partial * 100 / total_free : 0);
+
+	printf("Cpu partial list     %8lu %8lu %3lu %3lu\n",
+		s->cpu_partial_alloc, s->cpu_partial_free,
+		s->cpu_partial_alloc * 100 / total_alloc,
+		total_free ? s->cpu_partial_free * 100 / total_free : 0);
+
+	printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n",
+		s->deactivate_remote_frees, s->free_frozen,
+		s->deactivate_remote_frees * 100 / total_alloc,
+		total_free ? s->free_frozen * 100 / total_free : 0);
+
+	printf("Total                %8lu %8lu\n\n", total_alloc, total_free);
+
+	if (s->cpuslab_flush)
+		printf("Flushes %8lu\n", s->cpuslab_flush);
+
+	total = s->deactivate_full + s->deactivate_empty +
+			s->deactivate_to_head + s->deactivate_to_tail + s->deactivate_bypass;
+
+	if (total) {
+		printf("\nSlab Deactivation             Occurrences %%\n");
+		printf("-------------------------------------------------\n");
+		printf("Slab full                     %7lu  %3lu%%\n",
+			s->deactivate_full, (s->deactivate_full * 100) / total);
+		printf("Slab empty                    %7lu  %3lu%%\n",
+			s->deactivate_empty, (s->deactivate_empty * 100) / total);
+		printf("Moved to head of partial list %7lu  %3lu%%\n",
+			s->deactivate_to_head, (s->deactivate_to_head * 100) / total);
+		printf("Moved to tail of partial list %7lu  %3lu%%\n",
+			s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
+		printf("Deactivation bypass           %7lu  %3lu%%\n",
+			s->deactivate_bypass, (s->deactivate_bypass * 100) / total);
+		printf("Refilled from foreign frees   %7lu  %3lu%%\n",
+			s->alloc_refill, (s->alloc_refill * 100) / total);
+		printf("Node mismatch                 %7lu  %3lu%%\n",
+			s->alloc_node_mismatch, (s->alloc_node_mismatch * 100) / total);
+	}
+
+	if (s->cmpxchg_double_fail || s->cmpxchg_double_cpu_fail) {
+		printf("\nCmpxchg_double Looping\n------------------------\n");
+		printf("Locked Cmpxchg Double redos   %lu\nUnlocked Cmpxchg Double redos %lu\n",
+			s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail);
+	}
+}
+
+static void report(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	printf("\nSlabcache: %-15s  Aliases: %2d Order : %2d Objects: %lu\n",
+		s->name, s->aliases, s->order, s->objects);
+	if (s->hwcache_align)
+		printf("** Hardware cacheline aligned\n");
+	if (s->cache_dma)
+		printf("** Memory is allocated in a special DMA zone\n");
+	if (s->destroy_by_rcu)
+		printf("** Slabs are destroyed via RCU\n");
+	if (s->reclaim_account)
+		printf("** Reclaim accounting active\n");
+
+	printf("\nSizes (bytes)     Slabs              Debug                Memory\n");
+	printf("------------------------------------------------------------------------\n");
+	printf("Object : %7d  Total  : %7ld   Sanity Checks : %s  Total: %7ld\n",
+			s->object_size, s->slabs, onoff(s->sanity_checks),
+			s->slabs * (page_size << s->order));
+	printf("SlabObj: %7d  Full   : %7ld   Redzoning     : %s  Used : %7ld\n",
+			s->slab_size, s->slabs - s->partial - s->cpu_slabs,
+			onoff(s->red_zone), s->objects * s->object_size);
+	printf("SlabSiz: %7d  Partial: %7ld   Poisoning     : %s  Loss : %7ld\n",
+			page_size << s->order, s->partial, onoff(s->poison),
+			s->slabs * (page_size << s->order) - s->objects * s->object_size);
+	printf("Loss   : %7d  CpuSlab: %7d   Tracking      : %s  Lalig: %7ld\n",
+			s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user),
+			(s->slab_size - s->object_size) * s->objects);
+	printf("Align  : %7d  Objects: %7d   Tracing       : %s  Lpadd: %7ld\n",
+			s->align, s->objs_per_slab, onoff(s->trace),
+			((page_size << s->order) - s->objs_per_slab * s->slab_size) *
+			s->slabs);
+
+	ops(s);
+	show_tracking(s);
+	slab_numa(s, 1);
+	slab_stats(s);
+}
+
+static void slabcache(struct slabinfo *s)
+{
+	char size_str[20];
+	char dist_str[40];
+	char flags[20];
+	char *p = flags;
+
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (unreclaim_only && s->reclaim_account)
+		return;
+
+	if (actual_slabs == 1) {
+		report(s);
+		return;
+	}
+
+	if (skip_zero && !show_empty && !s->slabs)
+		return;
+
+	if (show_empty && s->slabs)
+		return;
+
+	if (sort_loss == 0)
+		store_size(size_str, slab_size(s));
+	else
+		store_size(size_str, slab_waste(s));
+	snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
+						s->partial, s->cpu_slabs);
+
+	if (!line++)
+		first_line();
+
+	if (s->aliases)
+		*p++ = '*';
+	if (s->cache_dma)
+		*p++ = 'd';
+	if (s->hwcache_align)
+		*p++ = 'A';
+	if (s->poison)
+		*p++ = 'P';
+	if (s->reclaim_account)
+		*p++ = 'a';
+	if (s->red_zone)
+		*p++ = 'Z';
+	if (s->sanity_checks)
+		*p++ = 'F';
+	if (s->store_user)
+		*p++ = 'U';
+	if (s->trace)
+		*p++ = 'T';
+
+	*p = 0;
+	if (show_activity) {
+		unsigned long total_alloc;
+		unsigned long total_free;
+
+		total_alloc = s->alloc_fastpath + s->alloc_slowpath;
+		total_free = s->free_fastpath + s->free_slowpath;
+
+		printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d %4ld %4ld\n",
+			s->name, s->objects,
+			total_alloc, total_free,
+			total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
+			total_free ? (s->free_fastpath * 100 / total_free) : 0,
+			s->order_fallback, s->order, s->cmpxchg_double_fail,
+			s->cmpxchg_double_cpu_fail);
+	} else {
+		printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n",
+			s->name, s->objects, s->object_size, size_str, dist_str,
+			s->objs_per_slab, s->order,
+			s->slabs ? (s->partial * 100) / s->slabs : 100,
+			s->slabs ? (s->objects * s->object_size * 100) /
+				(s->slabs * (page_size << s->order)) : 100,
+			flags);
+	}
+}
+
+/*
+ * Analyze debug options. Return false if something is amiss.
+ */
+static int debug_opt_scan(char *opt)
+{
+	if (!opt || !opt[0] || strcmp(opt, "-") == 0)
+		return 1;
+
+	if (strcasecmp(opt, "a") == 0) {
+		sanity = 1;
+		poison = 1;
+		redzone = 1;
+		tracking = 1;
+		return 1;
+	}
+
+	for ( ; *opt; opt++)
+		switch (*opt) {
+		case 'F' : case 'f':
+			if (sanity)
+				return 0;
+			sanity = 1;
+			break;
+		case 'P' : case 'p':
+			if (poison)
+				return 0;
+			poison = 1;
+			break;
+
+		case 'Z' : case 'z':
+			if (redzone)
+				return 0;
+			redzone = 1;
+			break;
+
+		case 'U' : case 'u':
+			if (tracking)
+				return 0;
+			tracking = 1;
+			break;
+
+		case 'T' : case 't':
+			if (tracing)
+				return 0;
+			tracing = 1;
+			break;
+		default:
+			return 0;
+		}
+	return 1;
+}
+
+static int slab_empty(struct slabinfo *s)
+{
+	if (s->objects > 0)
+		return 0;
+
+	/*
+	 * We may still have slabs even if there are no objects. Shrinking will
+	 * remove them.
+	 */
+	if (s->slabs != 0)
+		set_obj(s, "shrink", 1);
+
+	return 1;
+}
+
+static void slab_debug(struct slabinfo *s)
+{
+	if (strcmp(s->name, "*") == 0)
+		return;
+
+	if (sanity && !s->sanity_checks) {
+		set_obj(s, "sanity_checks", 1);
+	}
+	if (!sanity && s->sanity_checks) {
+		if (slab_empty(s))
+			set_obj(s, "sanity_checks", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name);
+	}
+	if (redzone && !s->red_zone) {
+		if (slab_empty(s))
+			set_obj(s, "red_zone", 1);
+		else
+			fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name);
+	}
+	if (!redzone && s->red_zone) {
+		if (slab_empty(s))
+			set_obj(s, "red_zone", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name);
+	}
+	if (poison && !s->poison) {
+		if (slab_empty(s))
+			set_obj(s, "poison", 1);
+		else
+			fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name);
+	}
+	if (!poison && s->poison) {
+		if (slab_empty(s))
+			set_obj(s, "poison", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name);
+	}
+	if (tracking && !s->store_user) {
+		if (slab_empty(s))
+			set_obj(s, "store_user", 1);
+		else
+			fprintf(stderr, "%s not empty cannot enable tracking\n", s->name);
+	}
+	if (!tracking && s->store_user) {
+		if (slab_empty(s))
+			set_obj(s, "store_user", 0);
+		else
+			fprintf(stderr, "%s not empty cannot disable tracking\n", s->name);
+	}
+	if (tracing && !s->trace) {
+		if (slabs == 1)
+			set_obj(s, "trace", 1);
+		else
+			fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name);
+	}
+	if (!tracing && s->trace)
+		set_obj(s, "trace", 1);
+}
+
+static void totals(void)
+{
+	struct slabinfo *s;
+
+	int used_slabs = 0;
+	char b1[20], b2[20], b3[20], b4[20];
+	unsigned long long max = 1ULL << 63;
+
+	/* Object size */
+	unsigned long long min_objsize = max, max_objsize = 0, avg_objsize;
+
+	/* Number of partial slabs in a slabcache */
+	unsigned long long min_partial = max, max_partial = 0,
+				avg_partial, total_partial = 0;
+
+	/* Number of slabs in a slab cache */
+	unsigned long long min_slabs = max, max_slabs = 0,
+				avg_slabs, total_slabs = 0;
+
+	/* Size of the whole slab */
+	unsigned long long min_size = max, max_size = 0,
+				avg_size, total_size = 0;
+
+	/* Bytes used for object storage in a slab */
+	unsigned long long min_used = max, max_used = 0,
+				avg_used, total_used = 0;
+
+	/* Waste: Bytes used for alignment and padding */
+	unsigned long long min_waste = max, max_waste = 0,
+				avg_waste, total_waste = 0;
+	/* Number of objects in a slab */
+	unsigned long long min_objects = max, max_objects = 0,
+				avg_objects, total_objects = 0;
+	/* Waste per object */
+	unsigned long long min_objwaste = max,
+				max_objwaste = 0, avg_objwaste,
+				total_objwaste = 0;
+
+	/* Memory per object */
+	unsigned long long min_memobj = max,
+				max_memobj = 0, avg_memobj,
+				total_objsize = 0;
+
+	/* Percentage of partial slabs per slab */
+	unsigned long min_ppart = 100, max_ppart = 0,
+				avg_ppart, total_ppart = 0;
+
+	/* Number of objects in partial slabs */
+	unsigned long min_partobj = max, max_partobj = 0,
+				avg_partobj, total_partobj = 0;
+
+	/* Percentage of partial objects of all objects in a slab */
+	unsigned long min_ppartobj = 100, max_ppartobj = 0,
+				avg_ppartobj, total_ppartobj = 0;
+
+
+	for (s = slabinfo; s < slabinfo + slabs; s++) {
+		unsigned long long size;
+		unsigned long used;
+		unsigned long long wasted;
+		unsigned long long objwaste;
+		unsigned long percentage_partial_slabs;
+		unsigned long percentage_partial_objs;
+
+		if (!s->slabs || !s->objects)
+			continue;
+
+		used_slabs++;
+
+		size = slab_size(s);
+		used = s->objects * s->object_size;
+		wasted = size - used;
+		objwaste = s->slab_size - s->object_size;
+
+		percentage_partial_slabs = s->partial * 100 / s->slabs;
+		if (percentage_partial_slabs > 100)
+			percentage_partial_slabs = 100;
+
+		percentage_partial_objs = s->objects_partial * 100
+							/ s->objects;
+
+		if (percentage_partial_objs > 100)
+			percentage_partial_objs = 100;
+
+		if (s->object_size < min_objsize)
+			min_objsize = s->object_size;
+		if (s->partial < min_partial)
+			min_partial = s->partial;
+		if (s->slabs < min_slabs)
+			min_slabs = s->slabs;
+		if (size < min_size)
+			min_size = size;
+		if (wasted < min_waste)
+			min_waste = wasted;
+		if (objwaste < min_objwaste)
+			min_objwaste = objwaste;
+		if (s->objects < min_objects)
+			min_objects = s->objects;
+		if (used < min_used)
+			min_used = used;
+		if (s->objects_partial < min_partobj)
+			min_partobj = s->objects_partial;
+		if (percentage_partial_slabs < min_ppart)
+			min_ppart = percentage_partial_slabs;
+		if (percentage_partial_objs < min_ppartobj)
+			min_ppartobj = percentage_partial_objs;
+		if (s->slab_size < min_memobj)
+			min_memobj = s->slab_size;
+
+		if (s->object_size > max_objsize)
+			max_objsize = s->object_size;
+		if (s->partial > max_partial)
+			max_partial = s->partial;
+		if (s->slabs > max_slabs)
+			max_slabs = s->slabs;
+		if (size > max_size)
+			max_size = size;
+		if (wasted > max_waste)
+			max_waste = wasted;
+		if (objwaste > max_objwaste)
+			max_objwaste = objwaste;
+		if (s->objects > max_objects)
+			max_objects = s->objects;
+		if (used > max_used)
+			max_used = used;
+		if (s->objects_partial > max_partobj)
+			max_partobj = s->objects_partial;
+		if (percentage_partial_slabs > max_ppart)
+			max_ppart = percentage_partial_slabs;
+		if (percentage_partial_objs > max_ppartobj)
+			max_ppartobj = percentage_partial_objs;
+		if (s->slab_size > max_memobj)
+			max_memobj = s->slab_size;
+
+		total_partial += s->partial;
+		total_slabs += s->slabs;
+		total_size += size;
+		total_waste += wasted;
+
+		total_objects += s->objects;
+		total_used += used;
+		total_partobj += s->objects_partial;
+		total_ppart += percentage_partial_slabs;
+		total_ppartobj += percentage_partial_objs;
+
+		total_objwaste += s->objects * objwaste;
+		total_objsize += s->objects * s->slab_size;
+	}
+
+	if (!total_objects) {
+		printf("No objects\n");
+		return;
+	}
+	if (!used_slabs) {
+		printf("No slabs\n");
+		return;
+	}
+
+	/* Per slab averages */
+	avg_partial = total_partial / used_slabs;
+	avg_slabs = total_slabs / used_slabs;
+	avg_size = total_size / used_slabs;
+	avg_waste = total_waste / used_slabs;
+
+	avg_objects = total_objects / used_slabs;
+	avg_used = total_used / used_slabs;
+	avg_partobj = total_partobj / used_slabs;
+	avg_ppart = total_ppart / used_slabs;
+	avg_ppartobj = total_ppartobj / used_slabs;
+
+	/* Per object object sizes */
+	avg_objsize = total_used / total_objects;
+	avg_objwaste = total_objwaste / total_objects;
+	avg_partobj = total_partobj * 100 / total_objects;
+	avg_memobj = total_objsize / total_objects;
+
+	printf("Slabcache Totals\n");
+	printf("----------------\n");
+	printf("Slabcaches : %15d   Aliases  : %11d->%-3d  Active:    %3d\n",
+			slabs, aliases, alias_targets, used_slabs);
+
+	store_size(b1, total_size);store_size(b2, total_waste);
+	store_size(b3, total_waste * 100 / total_used);
+	printf("Memory used: %15s   # Loss   : %15s   MRatio:%6s%%\n", b1, b2, b3);
+
+	store_size(b1, total_objects);store_size(b2, total_partobj);
+	store_size(b3, total_partobj * 100 / total_objects);
+	printf("# Objects  : %15s   # PartObj: %15s   ORatio:%6s%%\n", b1, b2, b3);
+
+	printf("\n");
+	printf("Per Cache         Average              "
+		"Min              Max            Total\n");
+	printf("---------------------------------------"
+		"-------------------------------------\n");
+
+	store_size(b1, avg_objects);store_size(b2, min_objects);
+	store_size(b3, max_objects);store_size(b4, total_objects);
+	printf("#Objects  %15s  %15s  %15s  %15s\n",
+			b1,	b2,	b3,	b4);
+
+	store_size(b1, avg_slabs);store_size(b2, min_slabs);
+	store_size(b3, max_slabs);store_size(b4, total_slabs);
+	printf("#Slabs    %15s  %15s  %15s  %15s\n",
+			b1,	b2,	b3,	b4);
+
+	store_size(b1, avg_partial);store_size(b2, min_partial);
+	store_size(b3, max_partial);store_size(b4, total_partial);
+	printf("#PartSlab %15s  %15s  %15s  %15s\n",
+			b1,	b2,	b3,	b4);
+	store_size(b1, avg_ppart);store_size(b2, min_ppart);
+	store_size(b3, max_ppart);
+	store_size(b4, total_partial * 100  / total_slabs);
+	printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n",
+			b1,	b2,	b3,	b4);
+
+	store_size(b1, avg_partobj);store_size(b2, min_partobj);
+	store_size(b3, max_partobj);
+	store_size(b4, total_partobj);
+	printf("PartObjs  %15s  %15s  %15s  %15s\n",
+			b1,	b2,	b3,	b4);
+
+	store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
+	store_size(b3, max_ppartobj);
+	store_size(b4, total_partobj * 100 / total_objects);
+	printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n",
+			b1,	b2,	b3,	b4);
+
+	store_size(b1, avg_size);store_size(b2, min_size);
+	store_size(b3, max_size);store_size(b4, total_size);
+	printf("Memory    %15s  %15s  %15s  %15s\n",
+			b1,	b2,	b3,	b4);
+
+	store_size(b1, avg_used);store_size(b2, min_used);
+	store_size(b3, max_used);store_size(b4, total_used);
+	printf("Used      %15s  %15s  %15s  %15s\n",
+			b1,	b2,	b3,	b4);
+
+	store_size(b1, avg_waste);store_size(b2, min_waste);
+	store_size(b3, max_waste);store_size(b4, total_waste);
+	printf("Loss      %15s  %15s  %15s  %15s\n",
+			b1,	b2,	b3,	b4);
+
+	printf("\n");
+	printf("Per Object        Average              "
+		"Min              Max\n");
+	printf("---------------------------------------"
+		"--------------------\n");
+
+	store_size(b1, avg_memobj);store_size(b2, min_memobj);
+	store_size(b3, max_memobj);
+	printf("Memory    %15s  %15s  %15s\n",
+			b1,	b2,	b3);
+	store_size(b1, avg_objsize);store_size(b2, min_objsize);
+	store_size(b3, max_objsize);
+	printf("User      %15s  %15s  %15s\n",
+			b1,	b2,	b3);
+
+	store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
+	store_size(b3, max_objwaste);
+	printf("Loss      %15s  %15s  %15s\n",
+			b1,	b2,	b3);
+}
+
+static void sort_slabs(void)
+{
+	struct slabinfo *s1,*s2;
+
+	for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) {
+		for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
+			int result;
+
+			if (sort_size) {
+				if (slab_size(s1) == slab_size(s2))
+					result = strcasecmp(s1->name, s2->name);
+				else
+					result = slab_size(s1) < slab_size(s2);
+			} else if (sort_active) {
+				if (slab_activity(s1) == slab_activity(s2))
+					result = strcasecmp(s1->name, s2->name);
+				else
+					result = slab_activity(s1) < slab_activity(s2);
+			} else if (sort_loss) {
+				if (slab_waste(s1) == slab_waste(s2))
+					result = strcasecmp(s1->name, s2->name);
+				else
+					result = slab_waste(s1) < slab_waste(s2);
+			} else if (sort_partial) {
+				if (s1->partial == s2->partial)
+					result = strcasecmp(s1->name, s2->name);
+				else
+					result = s1->partial < s2->partial;
+			} else
+				result = strcasecmp(s1->name, s2->name);
+
+			if (show_inverted)
+				result = -result;
+
+			if (result > 0) {
+				struct slabinfo t;
+
+				memcpy(&t, s1, sizeof(struct slabinfo));
+				memcpy(s1, s2, sizeof(struct slabinfo));
+				memcpy(s2, &t, sizeof(struct slabinfo));
+			}
+		}
+	}
+}
+
+static void sort_aliases(void)
+{
+	struct aliasinfo *a1,*a2;
+
+	for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) {
+		for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) {
+			char *n1, *n2;
+
+			n1 = a1->name;
+			n2 = a2->name;
+			if (show_alias && !show_inverted) {
+				n1 = a1->ref;
+				n2 = a2->ref;
+			}
+			if (strcasecmp(n1, n2) > 0) {
+				struct aliasinfo t;
+
+				memcpy(&t, a1, sizeof(struct aliasinfo));
+				memcpy(a1, a2, sizeof(struct aliasinfo));
+				memcpy(a2, &t, sizeof(struct aliasinfo));
+			}
+		}
+	}
+}
+
+static void link_slabs(void)
+{
+	struct aliasinfo *a;
+	struct slabinfo *s;
+
+	for (a = aliasinfo; a < aliasinfo + aliases; a++) {
+
+		for (s = slabinfo; s < slabinfo + slabs; s++)
+			if (strcmp(a->ref, s->name) == 0) {
+				a->slab = s;
+				s->refs++;
+				break;
+			}
+		if (s == slabinfo + slabs)
+			fatal("Unresolved alias %s\n", a->ref);
+	}
+}
+
+static void alias(void)
+{
+	struct aliasinfo *a;
+	char *active = NULL;
+
+	sort_aliases();
+	link_slabs();
+
+	for(a = aliasinfo; a < aliasinfo + aliases; a++) {
+
+		if (!show_single_ref && a->slab->refs == 1)
+			continue;
+
+		if (!show_inverted) {
+			if (active) {
+				if (strcmp(a->slab->name, active) == 0) {
+					printf(" %s", a->name);
+					continue;
+				}
+			}
+			printf("\n%-12s <- %s", a->slab->name, a->name);
+			active = a->slab->name;
+		}
+		else
+			printf("%-15s -> %s\n", a->name, a->slab->name);
+	}
+	if (active)
+		printf("\n");
+}
+
+
+static void rename_slabs(void)
+{
+	struct slabinfo *s;
+	struct aliasinfo *a;
+
+	for (s = slabinfo; s < slabinfo + slabs; s++) {
+		if (*s->name != ':')
+			continue;
+
+		if (s->refs > 1 && !show_first_alias)
+			continue;
+
+		a = find_one_alias(s);
+
+		if (a)
+			s->name = a->name;
+		else {
+			s->name = "*";
+			actual_slabs--;
+		}
+	}
+}
+
+static int slab_mismatch(char *slab)
+{
+	return regexec(&pattern, slab, 0, NULL, 0);
+}
+
+static void read_slab_dir(void)
+{
+	DIR *dir;
+	struct dirent *de;
+	struct slabinfo *slab = slabinfo;
+	struct aliasinfo *alias = aliasinfo;
+	char *p;
+	char *t;
+	int count;
+
+	if (chdir("/sys/kernel/slab") && chdir("/sys/slab"))
+		fatal("SYSFS support for SLUB not active\n");
+
+	dir = opendir(".");
+	while ((de = readdir(dir))) {
+		if (de->d_name[0] == '.' ||
+			(de->d_name[0] != ':' && slab_mismatch(de->d_name)))
+				continue;
+		switch (de->d_type) {
+		   case DT_LNK:
+			alias->name = strdup(de->d_name);
+			count = readlink(de->d_name, buffer, sizeof(buffer)-1);
+
+			if (count < 0)
+				fatal("Cannot read symlink %s\n", de->d_name);
+
+			buffer[count] = 0;
+			p = buffer + count;
+			while (p > buffer && p[-1] != '/')
+				p--;
+			alias->ref = strdup(p);
+			alias++;
+			break;
+		   case DT_DIR:
+			if (chdir(de->d_name))
+				fatal("Unable to access slab %s\n", slab->name);
+			slab->name = strdup(de->d_name);
+			slab->alias = 0;
+			slab->refs = 0;
+			slab->aliases = get_obj("aliases");
+			slab->align = get_obj("align");
+			slab->cache_dma = get_obj("cache_dma");
+			slab->cpu_slabs = get_obj("cpu_slabs");
+			slab->destroy_by_rcu = get_obj("destroy_by_rcu");
+			slab->hwcache_align = get_obj("hwcache_align");
+			slab->object_size = get_obj("object_size");
+			slab->objects = get_obj("objects");
+			slab->objects_partial = get_obj("objects_partial");
+			slab->objects_total = get_obj("objects_total");
+			slab->objs_per_slab = get_obj("objs_per_slab");
+			slab->order = get_obj("order");
+			slab->partial = get_obj("partial");
+			slab->partial = get_obj_and_str("partial", &t);
+			decode_numa_list(slab->numa_partial, t);
+			free(t);
+			slab->poison = get_obj("poison");
+			slab->reclaim_account = get_obj("reclaim_account");
+			slab->red_zone = get_obj("red_zone");
+			slab->sanity_checks = get_obj("sanity_checks");
+			slab->slab_size = get_obj("slab_size");
+			slab->slabs = get_obj_and_str("slabs", &t);
+			decode_numa_list(slab->numa, t);
+			free(t);
+			slab->store_user = get_obj("store_user");
+			slab->trace = get_obj("trace");
+			slab->alloc_fastpath = get_obj("alloc_fastpath");
+			slab->alloc_slowpath = get_obj("alloc_slowpath");
+			slab->free_fastpath = get_obj("free_fastpath");
+			slab->free_slowpath = get_obj("free_slowpath");
+			slab->free_frozen= get_obj("free_frozen");
+			slab->free_add_partial = get_obj("free_add_partial");
+			slab->free_remove_partial = get_obj("free_remove_partial");
+			slab->alloc_from_partial = get_obj("alloc_from_partial");
+			slab->alloc_slab = get_obj("alloc_slab");
+			slab->alloc_refill = get_obj("alloc_refill");
+			slab->free_slab = get_obj("free_slab");
+			slab->cpuslab_flush = get_obj("cpuslab_flush");
+			slab->deactivate_full = get_obj("deactivate_full");
+			slab->deactivate_empty = get_obj("deactivate_empty");
+			slab->deactivate_to_head = get_obj("deactivate_to_head");
+			slab->deactivate_to_tail = get_obj("deactivate_to_tail");
+			slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
+			slab->order_fallback = get_obj("order_fallback");
+			slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail");
+			slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail");
+			slab->cpu_partial_alloc = get_obj("cpu_partial_alloc");
+			slab->cpu_partial_free = get_obj("cpu_partial_free");
+			slab->alloc_node_mismatch = get_obj("alloc_node_mismatch");
+			slab->deactivate_bypass = get_obj("deactivate_bypass");
+			chdir("..");
+			if (slab->name[0] == ':')
+				alias_targets++;
+			slab++;
+			break;
+		   default :
+			fatal("Unknown file type %lx\n", de->d_type);
+		}
+	}
+	closedir(dir);
+	slabs = slab - slabinfo;
+	actual_slabs = slabs;
+	aliases = alias - aliasinfo;
+	if (slabs > MAX_SLABS)
+		fatal("Too many slabs\n");
+	if (aliases > MAX_ALIASES)
+		fatal("Too many aliases\n");
+}
+
+static void output_slabs(void)
+{
+	struct slabinfo *slab;
+	int lines = output_lines;
+
+	for (slab = slabinfo; (slab < slabinfo + slabs) &&
+			lines != 0; slab++) {
+
+		if (slab->alias)
+			continue;
+
+		if (lines != -1)
+			lines--;
+
+		if (show_numa)
+			slab_numa(slab, 0);
+		else if (show_track)
+			show_tracking(slab);
+		else if (validate)
+			slab_validate(slab);
+		else if (shrink)
+			slab_shrink(slab);
+		else if (set_debug)
+			slab_debug(slab);
+		else if (show_ops)
+			ops(slab);
+		else if (show_slab)
+			slabcache(slab);
+		else if (show_report)
+			report(slab);
+	}
+}
+
+static void _xtotals(char *heading, char *underline,
+		     int loss, int size, int partial)
+{
+	printf("%s%s", heading, underline);
+	line = 0;
+	sort_loss = loss;
+	sort_size = size;
+	sort_partial = partial;
+	sort_slabs();
+	output_slabs();
+}
+
+static void xtotals(void)
+{
+	char *heading, *underline;
+
+	totals();
+
+	link_slabs();
+	rename_slabs();
+
+	heading = "\nSlabs sorted by size\n";
+	underline = "--------------------\n";
+	_xtotals(heading, underline, 0, 1, 0);
+
+	heading = "\nSlabs sorted by loss\n";
+	underline = "--------------------\n";
+	_xtotals(heading, underline, 1, 0, 0);
+
+	heading = "\nSlabs sorted by number of partial slabs\n";
+	underline = "---------------------------------------\n";
+	_xtotals(heading, underline, 0, 0, 1);
+
+	printf("\n");
+}
+
+struct option opts[] = {
+	{ "aliases", no_argument, NULL, 'a' },
+	{ "activity", no_argument, NULL, 'A' },
+	{ "Bytes", no_argument, NULL, 'B'},
+	{ "debug", optional_argument, NULL, 'd' },
+	{ "display-activity", no_argument, NULL, 'D' },
+	{ "empty", no_argument, NULL, 'e' },
+	{ "first-alias", no_argument, NULL, 'f' },
+	{ "help", no_argument, NULL, 'h' },
+	{ "inverted", no_argument, NULL, 'i'},
+	{ "slabs", no_argument, NULL, 'l' },
+	{ "Loss", no_argument, NULL, 'L'},
+	{ "numa", no_argument, NULL, 'n' },
+	{ "lines", required_argument, NULL, 'N'},
+	{ "ops", no_argument, NULL, 'o' },
+	{ "partial", no_argument, NULL, 'p'},
+	{ "report", no_argument, NULL, 'r' },
+	{ "shrink", no_argument, NULL, 's' },
+	{ "Size", no_argument, NULL, 'S'},
+	{ "tracking", no_argument, NULL, 't'},
+	{ "Totals", no_argument, NULL, 'T'},
+	{ "Unreclaim", no_argument, NULL, 'U'},
+	{ "validate", no_argument, NULL, 'v' },
+	{ "Xtotals", no_argument, NULL, 'X'},
+	{ "zero", no_argument, NULL, 'z' },
+	{ "1ref", no_argument, NULL, '1'},
+	{ NULL, 0, NULL, 0 }
+};
+
+int main(int argc, char *argv[])
+{
+	int c;
+	int err;
+	char *pattern_source;
+
+	page_size = getpagesize();
+
+	while ((c = getopt_long(argc, argv, "aABd::DefhilLnN:oPrsStTUvXz1",
+						opts, NULL)) != -1)
+		switch (c) {
+		case 'a':
+			show_alias = 1;
+			break;
+		case 'A':
+			sort_active = 1;
+			break;
+		case 'B':
+			show_bytes = 1;
+			break;
+		case 'd':
+			set_debug = 1;
+			if (!debug_opt_scan(optarg))
+				fatal("Invalid debug option '%s'\n", optarg);
+			break;
+		case 'D':
+			show_activity = 1;
+			break;
+		case 'e':
+			show_empty = 1;
+			break;
+		case 'f':
+			show_first_alias = 1;
+			break;
+		case 'h':
+			usage();
+			return 0;
+		case 'i':
+			show_inverted = 1;
+			break;
+		case 'l':
+			show_slab = 1;
+			break;
+		case 'L':
+			sort_loss = 1;
+			break;
+		case 'n':
+			show_numa = 1;
+			break;
+		case 'N':
+			if (optarg) {
+				output_lines = atoi(optarg);
+				if (output_lines < 1)
+					output_lines = 1;
+			}
+			break;
+		case 'o':
+			show_ops = 1;
+			break;
+		case 'r':
+			show_report = 1;
+			break;
+		case 'P':
+			sort_partial = 1;
+			break;
+		case 's':
+			shrink = 1;
+			break;
+		case 'S':
+			sort_size = 1;
+			break;
+		case 't':
+			show_track = 1;
+			break;
+		case 'T':
+			show_totals = 1;
+			break;
+		case 'U':
+			unreclaim_only = 1;
+			break;
+		case 'v':
+			validate = 1;
+			break;
+		case 'X':
+			if (output_lines == -1)
+				output_lines = 1;
+			extended_totals = 1;
+			show_bytes = 1;
+			break;
+		case 'z':
+			skip_zero = 0;
+			break;
+		case '1':
+			show_single_ref = 1;
+			break;
+		default:
+			fatal("%s: Invalid option '%c'\n", argv[0], optopt);
+
+	}
+
+	if (!show_slab && !show_alias && !show_track && !show_report
+		&& !validate && !shrink && !set_debug && !show_ops)
+			show_slab = 1;
+
+	if (argc > optind)
+		pattern_source = argv[optind];
+	else
+		pattern_source = ".*";
+
+	err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB);
+	if (err)
+		fatal("%s: Invalid pattern '%s' code %d\n",
+			argv[0], pattern_source, err);
+	read_slab_dir();
+	if (show_alias) {
+		alias();
+	} else if (extended_totals) {
+		xtotals();
+	} else if (show_totals) {
+		totals();
+	} else {
+		link_slabs();
+		rename_slabs();
+		sort_slabs();
+		output_slabs();
+	}
+	return 0;
+}
diff --git a/tools/vm/.gitignore b/tools/vm/.gitignore
deleted file mode 100644
index 922879f93fc8..000000000000
--- a/tools/vm/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-slabinfo
-page-types
-page_owner_sort
diff --git a/tools/vm/Makefile b/tools/vm/Makefile
deleted file mode 100644
index 9860622cbb15..000000000000
--- a/tools/vm/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Makefile for vm tools
-#
-include ../scripts/Makefile.include
-
-TARGETS=page-types slabinfo page_owner_sort
-
-LIB_DIR = ../lib/api
-LIBS = $(LIB_DIR)/libapi.a
-
-CFLAGS = -Wall -Wextra -I../lib/
-LDFLAGS = $(LIBS)
-
-all: $(TARGETS)
-
-$(TARGETS): $(LIBS)
-
-$(LIBS):
-	make -C $(LIB_DIR)
-
-%: %.c
-	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
-
-clean:
-	$(RM) page-types slabinfo page_owner_sort
-	make -C $(LIB_DIR) clean
-
-sbindir ?= /usr/sbin
-
-install: all
-	install -d $(DESTDIR)$(sbindir)
-	install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir)
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
deleted file mode 100644
index 381dcc00cb62..000000000000
--- a/tools/vm/page-types.c
+++ /dev/null
@@ -1,1396 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * page-types: Tool for querying page flags
- *
- * Copyright (C) 2009 Intel corporation
- *
- * Authors: Wu Fengguang <fengguang.wu@intel.com>
- */
-
-#define _FILE_OFFSET_BITS 64
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdarg.h>
-#include <string.h>
-#include <getopt.h>
-#include <limits.h>
-#include <assert.h>
-#include <ftw.h>
-#include <time.h>
-#include <setjmp.h>
-#include <signal.h>
-#include <sys/types.h>
-#include <sys/errno.h>
-#include <sys/fcntl.h>
-#include <sys/mount.h>
-#include <sys/statfs.h>
-#include <sys/mman.h>
-#include "../../include/uapi/linux/magic.h"
-#include "../../include/uapi/linux/kernel-page-flags.h"
-#include <api/fs/fs.h>
-
-#ifndef MAX_PATH
-# define MAX_PATH 256
-#endif
-
-#ifndef STR
-# define _STR(x) #x
-# define STR(x) _STR(x)
-#endif
-
-/*
- * pagemap kernel ABI bits
- */
-
-#define PM_ENTRY_BYTES		8
-#define PM_PFRAME_BITS		55
-#define PM_PFRAME_MASK		((1LL << PM_PFRAME_BITS) - 1)
-#define PM_PFRAME(x)		((x) & PM_PFRAME_MASK)
-#define MAX_SWAPFILES_SHIFT	5
-#define PM_SWAP_OFFSET(x)	(((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT)
-#define PM_SOFT_DIRTY		(1ULL << 55)
-#define PM_MMAP_EXCLUSIVE	(1ULL << 56)
-#define PM_FILE			(1ULL << 61)
-#define PM_SWAP			(1ULL << 62)
-#define PM_PRESENT		(1ULL << 63)
-
-/*
- * kernel page flags
- */
-
-#define KPF_BYTES		8
-#define PROC_KPAGEFLAGS		"/proc/kpageflags"
-#define PROC_KPAGECOUNT		"/proc/kpagecount"
-#define PROC_KPAGECGROUP	"/proc/kpagecgroup"
-
-#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap"
-
-/* [32-] kernel hacking assistances */
-#define KPF_RESERVED		32
-#define KPF_MLOCKED		33
-#define KPF_MAPPEDTODISK	34
-#define KPF_PRIVATE		35
-#define KPF_PRIVATE_2		36
-#define KPF_OWNER_PRIVATE	37
-#define KPF_ARCH		38
-#define KPF_UNCACHED		39
-#define KPF_SOFTDIRTY		40
-#define KPF_ARCH_2		41
-
-/* [47-] take some arbitrary free slots for expanding overloaded flags
- * not part of kernel API
- */
-#define KPF_ANON_EXCLUSIVE	47
-#define KPF_READAHEAD		48
-#define KPF_SLOB_FREE		49
-#define KPF_SLUB_FROZEN		50
-#define KPF_SLUB_DEBUG		51
-#define KPF_FILE		61
-#define KPF_SWAP		62
-#define KPF_MMAP_EXCLUSIVE	63
-
-#define KPF_ALL_BITS		((uint64_t)~0ULL)
-#define KPF_HACKERS_BITS	(0xffffULL << 32)
-#define KPF_OVERLOADED_BITS	(0xffffULL << 48)
-#define BIT(name)		(1ULL << KPF_##name)
-#define BITS_COMPOUND		(BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
-
-static const char * const page_flag_names[] = {
-	[KPF_LOCKED]		= "L:locked",
-	[KPF_ERROR]		= "E:error",
-	[KPF_REFERENCED]	= "R:referenced",
-	[KPF_UPTODATE]		= "U:uptodate",
-	[KPF_DIRTY]		= "D:dirty",
-	[KPF_LRU]		= "l:lru",
-	[KPF_ACTIVE]		= "A:active",
-	[KPF_SLAB]		= "S:slab",
-	[KPF_WRITEBACK]		= "W:writeback",
-	[KPF_RECLAIM]		= "I:reclaim",
-	[KPF_BUDDY]		= "B:buddy",
-
-	[KPF_MMAP]		= "M:mmap",
-	[KPF_ANON]		= "a:anonymous",
-	[KPF_SWAPCACHE]		= "s:swapcache",
-	[KPF_SWAPBACKED]	= "b:swapbacked",
-	[KPF_COMPOUND_HEAD]	= "H:compound_head",
-	[KPF_COMPOUND_TAIL]	= "T:compound_tail",
-	[KPF_HUGE]		= "G:huge",
-	[KPF_UNEVICTABLE]	= "u:unevictable",
-	[KPF_HWPOISON]		= "X:hwpoison",
-	[KPF_NOPAGE]		= "n:nopage",
-	[KPF_KSM]		= "x:ksm",
-	[KPF_THP]		= "t:thp",
-	[KPF_OFFLINE]		= "o:offline",
-	[KPF_PGTABLE]		= "g:pgtable",
-	[KPF_ZERO_PAGE]		= "z:zero_page",
-	[KPF_IDLE]              = "i:idle_page",
-
-	[KPF_RESERVED]		= "r:reserved",
-	[KPF_MLOCKED]		= "m:mlocked",
-	[KPF_MAPPEDTODISK]	= "d:mappedtodisk",
-	[KPF_PRIVATE]		= "P:private",
-	[KPF_PRIVATE_2]		= "p:private_2",
-	[KPF_OWNER_PRIVATE]	= "O:owner_private",
-	[KPF_ARCH]		= "h:arch",
-	[KPF_UNCACHED]		= "c:uncached",
-	[KPF_SOFTDIRTY]		= "f:softdirty",
-	[KPF_ARCH_2]		= "H:arch_2",
-
-	[KPF_ANON_EXCLUSIVE]	= "d:anon_exclusive",
-	[KPF_READAHEAD]		= "I:readahead",
-	[KPF_SLOB_FREE]		= "P:slob_free",
-	[KPF_SLUB_FROZEN]	= "A:slub_frozen",
-	[KPF_SLUB_DEBUG]	= "E:slub_debug",
-
-	[KPF_FILE]		= "F:file",
-	[KPF_SWAP]		= "w:swap",
-	[KPF_MMAP_EXCLUSIVE]	= "1:mmap_exclusive",
-};
-
-
-/*
- * data structures
- */
-
-static int		opt_raw;	/* for kernel developers */
-static int		opt_list;	/* list pages (in ranges) */
-static int		opt_mark_idle;	/* set accessed bit */
-static int		opt_no_summary;	/* don't show summary */
-static pid_t		opt_pid;	/* process to walk */
-const char		*opt_file;	/* file or directory path */
-static uint64_t		opt_cgroup;	/* cgroup inode */
-static int		opt_list_cgroup;/* list page cgroup */
-static int		opt_list_mapcnt;/* list page map count */
-static const char	*opt_kpageflags;/* kpageflags file to parse */
-
-#define MAX_ADDR_RANGES	1024
-static int		nr_addr_ranges;
-static unsigned long	opt_offset[MAX_ADDR_RANGES];
-static unsigned long	opt_size[MAX_ADDR_RANGES];
-
-#define MAX_VMAS	10240
-static int		nr_vmas;
-static unsigned long	pg_start[MAX_VMAS];
-static unsigned long	pg_end[MAX_VMAS];
-
-#define MAX_BIT_FILTERS	64
-static int		nr_bit_filters;
-static uint64_t		opt_mask[MAX_BIT_FILTERS];
-static uint64_t		opt_bits[MAX_BIT_FILTERS];
-
-static int		page_size;
-
-static int		pagemap_fd;
-static int		kpageflags_fd;
-static int		kpagecount_fd = -1;
-static int		kpagecgroup_fd = -1;
-static int		page_idle_fd = -1;
-
-static int		opt_hwpoison;
-static int		opt_unpoison;
-
-static const char	*hwpoison_debug_fs;
-static int		hwpoison_inject_fd;
-static int		hwpoison_forget_fd;
-
-#define HASH_SHIFT	13
-#define HASH_SIZE	(1 << HASH_SHIFT)
-#define HASH_MASK	(HASH_SIZE - 1)
-#define HASH_KEY(flags)	(flags & HASH_MASK)
-
-static unsigned long	total_pages;
-static unsigned long	nr_pages[HASH_SIZE];
-static uint64_t		page_flags[HASH_SIZE];
-
-
-/*
- * helper functions
- */
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define min_t(type, x, y) ({			\
-	type __min1 = (x);			\
-	type __min2 = (y);			\
-	__min1 < __min2 ? __min1 : __min2; })
-
-#define max_t(type, x, y) ({			\
-	type __max1 = (x);			\
-	type __max2 = (y);			\
-	__max1 > __max2 ? __max1 : __max2; })
-
-static unsigned long pages2mb(unsigned long pages)
-{
-	return (pages * page_size) >> 20;
-}
-
-static void fatal(const char *x, ...)
-{
-	va_list ap;
-
-	va_start(ap, x);
-	vfprintf(stderr, x, ap);
-	va_end(ap);
-	exit(EXIT_FAILURE);
-}
-
-static int checked_open(const char *pathname, int flags)
-{
-	int fd = open(pathname, flags);
-
-	if (fd < 0) {
-		perror(pathname);
-		exit(EXIT_FAILURE);
-	}
-
-	return fd;
-}
-
-/*
- * pagemap/kpageflags routines
- */
-
-static unsigned long do_u64_read(int fd, const char *name,
-				 uint64_t *buf,
-				 unsigned long index,
-				 unsigned long count)
-{
-	long bytes;
-
-	if (index > ULONG_MAX / 8)
-		fatal("index overflow: %lu\n", index);
-
-	bytes = pread(fd, buf, count * 8, (off_t)index * 8);
-	if (bytes < 0) {
-		perror(name);
-		exit(EXIT_FAILURE);
-	}
-	if (bytes % 8)
-		fatal("partial read: %lu bytes\n", bytes);
-
-	return bytes / 8;
-}
-
-static unsigned long kpageflags_read(uint64_t *buf,
-				     unsigned long index,
-				     unsigned long pages)
-{
-	return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages);
-}
-
-static unsigned long kpagecgroup_read(uint64_t *buf,
-				      unsigned long index,
-				      unsigned long pages)
-{
-	if (kpagecgroup_fd < 0)
-		return pages;
-
-	return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages);
-}
-
-static unsigned long kpagecount_read(uint64_t *buf,
-				     unsigned long index,
-				     unsigned long pages)
-{
-	return kpagecount_fd < 0 ? pages :
-		do_u64_read(kpagecount_fd, PROC_KPAGECOUNT,
-			    buf, index, pages);
-}
-
-static unsigned long pagemap_read(uint64_t *buf,
-				  unsigned long index,
-				  unsigned long pages)
-{
-	return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages);
-}
-
-static unsigned long pagemap_pfn(uint64_t val)
-{
-	unsigned long pfn;
-
-	if (val & PM_PRESENT)
-		pfn = PM_PFRAME(val);
-	else
-		pfn = 0;
-
-	return pfn;
-}
-
-static unsigned long pagemap_swap_offset(uint64_t val)
-{
-	return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0;
-}
-
-/*
- * page flag names
- */
-
-static char *page_flag_name(uint64_t flags)
-{
-	static char buf[65];
-	int present;
-	size_t i, j;
-
-	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
-		present = (flags >> i) & 1;
-		if (!page_flag_names[i]) {
-			if (present)
-				fatal("unknown flag bit %d\n", i);
-			continue;
-		}
-		buf[j++] = present ? page_flag_names[i][0] : '_';
-	}
-
-	return buf;
-}
-
-static char *page_flag_longname(uint64_t flags)
-{
-	static char buf[1024];
-	size_t i, n;
-
-	for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
-		if (!page_flag_names[i])
-			continue;
-		if ((flags >> i) & 1)
-			n += snprintf(buf + n, sizeof(buf) - n, "%s,",
-					page_flag_names[i] + 2);
-	}
-	if (n)
-		n--;
-	buf[n] = '\0';
-
-	return buf;
-}
-
-
-/*
- * page list and summary
- */
-
-static void show_page_range(unsigned long voffset, unsigned long offset,
-			    unsigned long size, uint64_t flags,
-			    uint64_t cgroup, uint64_t mapcnt)
-{
-	static uint64_t      flags0;
-	static uint64_t	     cgroup0;
-	static uint64_t      mapcnt0;
-	static unsigned long voff;
-	static unsigned long index;
-	static unsigned long count;
-
-	if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 &&
-	    offset == index + count && size && voffset == voff + count) {
-		count += size;
-		return;
-	}
-
-	if (count) {
-		if (opt_pid)
-			printf("%lx\t", voff);
-		if (opt_file)
-			printf("%lx\t", voff);
-		if (opt_list_cgroup)
-			printf("@%llu\t", (unsigned long long)cgroup0);
-		if (opt_list_mapcnt)
-			printf("%lu\t", mapcnt0);
-		printf("%lx\t%lx\t%s\n",
-				index, count, page_flag_name(flags0));
-	}
-
-	flags0 = flags;
-	cgroup0 = cgroup;
-	mapcnt0 = mapcnt;
-	index  = offset;
-	voff   = voffset;
-	count  = size;
-}
-
-static void flush_page_range(void)
-{
-	show_page_range(0, 0, 0, 0, 0, 0);
-}
-
-static void show_page(unsigned long voffset, unsigned long offset,
-		      uint64_t flags, uint64_t cgroup, uint64_t mapcnt)
-{
-	if (opt_pid)
-		printf("%lx\t", voffset);
-	if (opt_file)
-		printf("%lx\t", voffset);
-	if (opt_list_cgroup)
-		printf("@%llu\t", (unsigned long long)cgroup);
-	if (opt_list_mapcnt)
-		printf("%lu\t", mapcnt);
-
-	printf("%lx\t%s\n", offset, page_flag_name(flags));
-}
-
-static void show_summary(void)
-{
-	size_t i;
-
-	printf("             flags\tpage-count       MB"
-		"  symbolic-flags\t\t\tlong-symbolic-flags\n");
-
-	for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
-		if (nr_pages[i])
-			printf("0x%016llx\t%10lu %8lu  %s\t%s\n",
-				(unsigned long long)page_flags[i],
-				nr_pages[i],
-				pages2mb(nr_pages[i]),
-				page_flag_name(page_flags[i]),
-				page_flag_longname(page_flags[i]));
-	}
-
-	printf("             total\t%10lu %8lu\n",
-			total_pages, pages2mb(total_pages));
-}
-
-
-/*
- * page flag filters
- */
-
-static int bit_mask_ok(uint64_t flags)
-{
-	int i;
-
-	for (i = 0; i < nr_bit_filters; i++) {
-		if (opt_bits[i] == KPF_ALL_BITS) {
-			if ((flags & opt_mask[i]) == 0)
-				return 0;
-		} else {
-			if ((flags & opt_mask[i]) != opt_bits[i])
-				return 0;
-		}
-	}
-
-	return 1;
-}
-
-static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
-{
-	/* Anonymous pages overload PG_mappedtodisk */
-	if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK)))
-		flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE);
-
-	/* SLOB/SLUB overload several page flags */
-	if (flags & BIT(SLAB)) {
-		if (flags & BIT(PRIVATE))
-			flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
-		if (flags & BIT(ACTIVE))
-			flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
-		if (flags & BIT(ERROR))
-			flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
-	}
-
-	/* PG_reclaim is overloaded as PG_readahead in the read path */
-	if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
-		flags ^= BIT(RECLAIM) | BIT(READAHEAD);
-
-	if (pme & PM_SOFT_DIRTY)
-		flags |= BIT(SOFTDIRTY);
-	if (pme & PM_FILE)
-		flags |= BIT(FILE);
-	if (pme & PM_SWAP)
-		flags |= BIT(SWAP);
-	if (pme & PM_MMAP_EXCLUSIVE)
-		flags |= BIT(MMAP_EXCLUSIVE);
-
-	return flags;
-}
-
-static uint64_t well_known_flags(uint64_t flags)
-{
-	/* hide flags intended only for kernel hacker */
-	flags &= ~KPF_HACKERS_BITS;
-
-	/* hide non-hugeTLB compound pages */
-	if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
-		flags &= ~BITS_COMPOUND;
-
-	return flags;
-}
-
-static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme)
-{
-	if (opt_raw)
-		flags = expand_overloaded_flags(flags, pme);
-	else
-		flags = well_known_flags(flags);
-
-	return flags;
-}
-
-/*
- * page actions
- */
-
-static void prepare_hwpoison_fd(void)
-{
-	char buf[MAX_PATH + 1];
-
-	hwpoison_debug_fs = debugfs__mount();
-	if (!hwpoison_debug_fs) {
-		perror("mount debugfs");
-		exit(EXIT_FAILURE);
-	}
-
-	if (opt_hwpoison && !hwpoison_inject_fd) {
-		snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn",
-			hwpoison_debug_fs);
-		hwpoison_inject_fd = checked_open(buf, O_WRONLY);
-	}
-
-	if (opt_unpoison && !hwpoison_forget_fd) {
-		snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn",
-			hwpoison_debug_fs);
-		hwpoison_forget_fd = checked_open(buf, O_WRONLY);
-	}
-}
-
-static int hwpoison_page(unsigned long offset)
-{
-	char buf[100];
-	int len;
-
-	len = sprintf(buf, "0x%lx\n", offset);
-	len = write(hwpoison_inject_fd, buf, len);
-	if (len < 0) {
-		perror("hwpoison inject");
-		return len;
-	}
-	return 0;
-}
-
-static int unpoison_page(unsigned long offset)
-{
-	char buf[100];
-	int len;
-
-	len = sprintf(buf, "0x%lx\n", offset);
-	len = write(hwpoison_forget_fd, buf, len);
-	if (len < 0) {
-		perror("hwpoison forget");
-		return len;
-	}
-	return 0;
-}
-
-static int mark_page_idle(unsigned long offset)
-{
-	static unsigned long off;
-	static uint64_t buf;
-	int len;
-
-	if ((offset / 64 == off / 64) || buf == 0) {
-		buf |= 1UL << (offset % 64);
-		off = offset;
-		return 0;
-	}
-
-	len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64));
-	if (len < 0) {
-		perror("mark page idle");
-		return len;
-	}
-
-	buf = 1UL << (offset % 64);
-	off = offset;
-
-	return 0;
-}
-
-/*
- * page frame walker
- */
-
-static size_t hash_slot(uint64_t flags)
-{
-	size_t k = HASH_KEY(flags);
-	size_t i;
-
-	/* Explicitly reserve slot 0 for flags 0: the following logic
-	 * cannot distinguish an unoccupied slot from slot (flags==0).
-	 */
-	if (flags == 0)
-		return 0;
-
-	/* search through the remaining (HASH_SIZE-1) slots */
-	for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
-		if (!k || k >= ARRAY_SIZE(page_flags))
-			k = 1;
-		if (page_flags[k] == 0) {
-			page_flags[k] = flags;
-			return k;
-		}
-		if (page_flags[k] == flags)
-			return k;
-	}
-
-	fatal("hash table full: bump up HASH_SHIFT?\n");
-	exit(EXIT_FAILURE);
-}
-
-static void add_page(unsigned long voffset, unsigned long offset,
-		     uint64_t flags, uint64_t cgroup, uint64_t mapcnt,
-		     uint64_t pme)
-{
-	flags = kpageflags_flags(flags, pme);
-
-	if (!bit_mask_ok(flags))
-		return;
-
-	if (opt_cgroup && cgroup != (uint64_t)opt_cgroup)
-		return;
-
-	if (opt_hwpoison)
-		hwpoison_page(offset);
-	if (opt_unpoison)
-		unpoison_page(offset);
-
-	if (opt_mark_idle)
-		mark_page_idle(offset);
-
-	if (opt_list == 1)
-		show_page_range(voffset, offset, 1, flags, cgroup, mapcnt);
-	else if (opt_list == 2)
-		show_page(voffset, offset, flags, cgroup, mapcnt);
-
-	nr_pages[hash_slot(flags)]++;
-	total_pages++;
-}
-
-#define KPAGEFLAGS_BATCH	(64 << 10)	/* 64k pages */
-static void walk_pfn(unsigned long voffset,
-		     unsigned long index,
-		     unsigned long count,
-		     uint64_t pme)
-{
-	uint64_t buf[KPAGEFLAGS_BATCH];
-	uint64_t cgi[KPAGEFLAGS_BATCH];
-	uint64_t cnt[KPAGEFLAGS_BATCH];
-	unsigned long batch;
-	unsigned long pages;
-	unsigned long i;
-
-	/*
-	 * kpagecgroup_read() reads only if kpagecgroup were opened, but
-	 * /proc/kpagecgroup might even not exist, so it's better to fill
-	 * them with zeros here.
-	 */
-	if (count == 1)
-		cgi[0] = 0;
-	else
-		memset(cgi, 0, sizeof cgi);
-
-	while (count) {
-		batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
-		pages = kpageflags_read(buf, index, batch);
-		if (pages == 0)
-			break;
-
-		if (kpagecgroup_read(cgi, index, pages) != pages)
-			fatal("kpagecgroup returned fewer pages than expected");
-
-		if (kpagecount_read(cnt, index, pages) != pages)
-			fatal("kpagecount returned fewer pages than expected");
-
-		for (i = 0; i < pages; i++)
-			add_page(voffset + i, index + i,
-				 buf[i], cgi[i], cnt[i], pme);
-
-		index += pages;
-		count -= pages;
-	}
-}
-
-static void walk_swap(unsigned long voffset, uint64_t pme)
-{
-	uint64_t flags = kpageflags_flags(0, pme);
-
-	if (!bit_mask_ok(flags))
-		return;
-
-	if (opt_cgroup)
-		return;
-
-	if (opt_list == 1)
-		show_page_range(voffset, pagemap_swap_offset(pme),
-				1, flags, 0, 0);
-	else if (opt_list == 2)
-		show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0);
-
-	nr_pages[hash_slot(flags)]++;
-	total_pages++;
-}
-
-#define PAGEMAP_BATCH	(64 << 10)
-static void walk_vma(unsigned long index, unsigned long count)
-{
-	uint64_t buf[PAGEMAP_BATCH];
-	unsigned long batch;
-	unsigned long pages;
-	unsigned long pfn;
-	unsigned long i;
-
-	while (count) {
-		batch = min_t(unsigned long, count, PAGEMAP_BATCH);
-		pages = pagemap_read(buf, index, batch);
-		if (pages == 0)
-			break;
-
-		for (i = 0; i < pages; i++) {
-			pfn = pagemap_pfn(buf[i]);
-			if (pfn)
-				walk_pfn(index + i, pfn, 1, buf[i]);
-			if (buf[i] & PM_SWAP)
-				walk_swap(index + i, buf[i]);
-		}
-
-		index += pages;
-		count -= pages;
-	}
-}
-
-static void walk_task(unsigned long index, unsigned long count)
-{
-	const unsigned long end = index + count;
-	unsigned long start;
-	int i = 0;
-
-	while (index < end) {
-
-		while (pg_end[i] <= index)
-			if (++i >= nr_vmas)
-				return;
-		if (pg_start[i] >= end)
-			return;
-
-		start = max_t(unsigned long, pg_start[i], index);
-		index = min_t(unsigned long, pg_end[i], end);
-
-		assert(start < index);
-		walk_vma(start, index - start);
-	}
-}
-
-static void add_addr_range(unsigned long offset, unsigned long size)
-{
-	if (nr_addr_ranges >= MAX_ADDR_RANGES)
-		fatal("too many addr ranges\n");
-
-	opt_offset[nr_addr_ranges] = offset;
-	opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
-	nr_addr_ranges++;
-}
-
-static void walk_addr_ranges(void)
-{
-	int i;
-
-	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
-
-	if (!nr_addr_ranges)
-		add_addr_range(0, ULONG_MAX);
-
-	for (i = 0; i < nr_addr_ranges; i++)
-		if (!opt_pid)
-			walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0);
-		else
-			walk_task(opt_offset[i], opt_size[i]);
-
-	if (opt_mark_idle)
-		mark_page_idle(0);
-
-	close(kpageflags_fd);
-}
-
-
-/*
- * user interface
- */
-
-static const char *page_flag_type(uint64_t flag)
-{
-	if (flag & KPF_HACKERS_BITS)
-		return "(r)";
-	if (flag & KPF_OVERLOADED_BITS)
-		return "(o)";
-	return "   ";
-}
-
-static void usage(void)
-{
-	size_t i, j;
-
-	printf(
-"page-types [options]\n"
-"            -r|--raw                   Raw mode, for kernel developers\n"
-"            -d|--describe flags        Describe flags\n"
-"            -a|--addr    addr-spec     Walk a range of pages\n"
-"            -b|--bits    bits-spec     Walk pages with specified bits\n"
-"            -c|--cgroup  path|@inode   Walk pages within memory cgroup\n"
-"            -p|--pid     pid           Walk process address space\n"
-"            -f|--file    filename      Walk file address space\n"
-"            -i|--mark-idle             Mark pages idle\n"
-"            -l|--list                  Show page details in ranges\n"
-"            -L|--list-each             Show page details one by one\n"
-"            -C|--list-cgroup           Show cgroup inode for pages\n"
-"            -M|--list-mapcnt           Show page map count\n"
-"            -N|--no-summary            Don't show summary info\n"
-"            -X|--hwpoison              hwpoison pages\n"
-"            -x|--unpoison              unpoison pages\n"
-"            -F|--kpageflags filename   kpageflags file to parse\n"
-"            -h|--help                  Show this usage message\n"
-"flags:\n"
-"            0x10                       bitfield format, e.g.\n"
-"            anon                       bit-name, e.g.\n"
-"            0x10,anon                  comma-separated list, e.g.\n"
-"addr-spec:\n"
-"            N                          one page at offset N (unit: pages)\n"
-"            N+M                        pages range from N to N+M-1\n"
-"            N,M                        pages range from N to M-1\n"
-"            N,                         pages range from N to end\n"
-"            ,M                         pages range from 0 to M-1\n"
-"bits-spec:\n"
-"            bit1,bit2                  (flags & (bit1|bit2)) != 0\n"
-"            bit1,bit2=bit1             (flags & (bit1|bit2)) == bit1\n"
-"            bit1,~bit2                 (flags & (bit1|bit2)) == bit1\n"
-"            =bit1,bit2                 flags == (bit1|bit2)\n"
-"bit-names:\n"
-	);
-
-	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
-		if (!page_flag_names[i])
-			continue;
-		printf("%16s%s", page_flag_names[i] + 2,
-				 page_flag_type(1ULL << i));
-		if (++j > 3) {
-			j = 0;
-			putchar('\n');
-		}
-	}
-	printf("\n                                   "
-		"(r) raw mode bits  (o) overloaded bits\n");
-}
-
-static unsigned long long parse_number(const char *str)
-{
-	unsigned long long n;
-
-	n = strtoll(str, NULL, 0);
-
-	if (n == 0 && str[0] != '0')
-		fatal("invalid name or number: %s\n", str);
-
-	return n;
-}
-
-static void parse_pid(const char *str)
-{
-	FILE *file;
-	char buf[5000];
-
-	opt_pid = parse_number(str);
-
-	sprintf(buf, "/proc/%d/pagemap", opt_pid);
-	pagemap_fd = checked_open(buf, O_RDONLY);
-
-	sprintf(buf, "/proc/%d/maps", opt_pid);
-	file = fopen(buf, "r");
-	if (!file) {
-		perror(buf);
-		exit(EXIT_FAILURE);
-	}
-
-	while (fgets(buf, sizeof(buf), file) != NULL) {
-		unsigned long vm_start;
-		unsigned long vm_end;
-		unsigned long long pgoff;
-		int major, minor;
-		char r, w, x, s;
-		unsigned long ino;
-		int n;
-
-		n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
-			   &vm_start,
-			   &vm_end,
-			   &r, &w, &x, &s,
-			   &pgoff,
-			   &major, &minor,
-			   &ino);
-		if (n < 10) {
-			fprintf(stderr, "unexpected line: %s\n", buf);
-			continue;
-		}
-		pg_start[nr_vmas] = vm_start / page_size;
-		pg_end[nr_vmas] = vm_end / page_size;
-		if (++nr_vmas >= MAX_VMAS) {
-			fprintf(stderr, "too many VMAs\n");
-			break;
-		}
-	}
-	fclose(file);
-}
-
-static void show_file(const char *name, const struct stat *st)
-{
-	unsigned long long size = st->st_size;
-	char atime[64], mtime[64];
-	long now = time(NULL);
-
-	printf("%s\tInode: %u\tSize: %llu (%llu pages)\n",
-			name, (unsigned)st->st_ino,
-			size, (size + page_size - 1) / page_size);
-
-	strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
-	strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
-
-	printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
-			mtime, now - st->st_mtime,
-			atime, now - st->st_atime);
-}
-
-static sigjmp_buf sigbus_jmp;
-
-static void * volatile sigbus_addr;
-
-static void sigbus_handler(int sig, siginfo_t *info, void *ucontex)
-{
-	(void)sig;
-	(void)ucontex;
-	sigbus_addr = info ? info->si_addr : NULL;
-	siglongjmp(sigbus_jmp, 1);
-}
-
-static struct sigaction sigbus_action = {
-	.sa_sigaction = sigbus_handler,
-	.sa_flags = SA_SIGINFO,
-};
-
-static void walk_file_range(const char *name, int fd,
-			    unsigned long off, unsigned long end)
-{
-	uint8_t vec[PAGEMAP_BATCH];
-	uint64_t buf[PAGEMAP_BATCH], flags;
-	uint64_t cgroup = 0;
-	uint64_t mapcnt = 0;
-	unsigned long nr_pages, pfn, i;
-	ssize_t len;
-	void *ptr;
-	int first = 1;
-
-	for (; off < end; off += len) {
-		nr_pages = (end - off + page_size - 1) / page_size;
-		if (nr_pages > PAGEMAP_BATCH)
-			nr_pages = PAGEMAP_BATCH;
-		len = nr_pages * page_size;
-
-		ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
-		if (ptr == MAP_FAILED)
-			fatal("mmap failed: %s", name);
-
-		/* determine cached pages */
-		if (mincore(ptr, len, vec))
-			fatal("mincore failed: %s", name);
-
-		/* turn off readahead */
-		if (madvise(ptr, len, MADV_RANDOM))
-			fatal("madvice failed: %s", name);
-
-		if (sigsetjmp(sigbus_jmp, 1)) {
-			end = off + sigbus_addr ? sigbus_addr - ptr : 0;
-			fprintf(stderr, "got sigbus at offset %lld: %s\n",
-					(long long)end, name);
-			goto got_sigbus;
-		}
-
-		/* populate ptes */
-		for (i = 0; i < nr_pages ; i++) {
-			if (vec[i] & 1)
-				(void)*(volatile int *)(ptr + i * page_size);
-		}
-got_sigbus:
-
-		/* turn off harvesting reference bits */
-		if (madvise(ptr, len, MADV_SEQUENTIAL))
-			fatal("madvice failed: %s", name);
-
-		if (pagemap_read(buf, (unsigned long)ptr / page_size,
-					nr_pages) != nr_pages)
-			fatal("cannot read pagemap");
-
-		munmap(ptr, len);
-
-		for (i = 0; i < nr_pages; i++) {
-			pfn = pagemap_pfn(buf[i]);
-			if (!pfn)
-				continue;
-			if (!kpageflags_read(&flags, pfn, 1))
-				continue;
-			if (!kpagecgroup_read(&cgroup, pfn, 1))
-				fatal("kpagecgroup_read failed");
-			if (!kpagecount_read(&mapcnt, pfn, 1))
-				fatal("kpagecount_read failed");
-			if (first && opt_list) {
-				first = 0;
-				flush_page_range();
-			}
-			add_page(off / page_size + i, pfn,
-				 flags, cgroup, mapcnt, buf[i]);
-		}
-	}
-}
-
-static void walk_file(const char *name, const struct stat *st)
-{
-	int i;
-	int fd;
-
-	fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
-
-	if (!nr_addr_ranges)
-		add_addr_range(0, st->st_size / page_size);
-
-	for (i = 0; i < nr_addr_ranges; i++)
-		walk_file_range(name, fd, opt_offset[i] * page_size,
-				(opt_offset[i] + opt_size[i]) * page_size);
-
-	close(fd);
-}
-
-int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
-{
-	(void)f;
-	switch (type) {
-	case FTW_F:
-		if (S_ISREG(st->st_mode))
-			walk_file(name, st);
-		break;
-	case FTW_DNR:
-		fprintf(stderr, "cannot read dir: %s\n", name);
-		break;
-	}
-	return 0;
-}
-
-struct stat st;
-
-static void walk_page_cache(void)
-{
-	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
-	pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
-	sigaction(SIGBUS, &sigbus_action, NULL);
-
-	if (stat(opt_file, &st))
-		fatal("stat failed: %s\n", opt_file);
-
-	if (S_ISREG(st.st_mode)) {
-		walk_file(opt_file, &st);
-	} else if (S_ISDIR(st.st_mode)) {
-		/* do not follow symlinks and mountpoints */
-		if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
-			fatal("nftw failed: %s\n", opt_file);
-	} else
-		fatal("unhandled file type: %s\n", opt_file);
-
-	close(kpageflags_fd);
-	close(pagemap_fd);
-	signal(SIGBUS, SIG_DFL);
-}
-
-static void parse_file(const char *name)
-{
-	opt_file = name;
-}
-
-static void parse_cgroup(const char *path)
-{
-	if (path[0] == '@') {
-		opt_cgroup = parse_number(path + 1);
-		return;
-	}
-
-	struct stat st;
-
-	if (stat(path, &st))
-		fatal("stat failed: %s: %m\n", path);
-
-	if (!S_ISDIR(st.st_mode))
-		fatal("cgroup supposed to be a directory: %s\n", path);
-
-	opt_cgroup = st.st_ino;
-}
-
-static void parse_addr_range(const char *optarg)
-{
-	unsigned long offset;
-	unsigned long size;
-	char *p;
-
-	p = strchr(optarg, ',');
-	if (!p)
-		p = strchr(optarg, '+');
-
-	if (p == optarg) {
-		offset = 0;
-		size   = parse_number(p + 1);
-	} else if (p) {
-		offset = parse_number(optarg);
-		if (p[1] == '\0')
-			size = ULONG_MAX;
-		else {
-			size = parse_number(p + 1);
-			if (*p == ',') {
-				if (size < offset)
-					fatal("invalid range: %lu,%lu\n",
-							offset, size);
-				size -= offset;
-			}
-		}
-	} else {
-		offset = parse_number(optarg);
-		size   = 1;
-	}
-
-	add_addr_range(offset, size);
-}
-
-static void add_bits_filter(uint64_t mask, uint64_t bits)
-{
-	if (nr_bit_filters >= MAX_BIT_FILTERS)
-		fatal("too much bit filters\n");
-
-	opt_mask[nr_bit_filters] = mask;
-	opt_bits[nr_bit_filters] = bits;
-	nr_bit_filters++;
-}
-
-static uint64_t parse_flag_name(const char *str, int len)
-{
-	size_t i;
-
-	if (!*str || !len)
-		return 0;
-
-	if (len <= 8 && !strncmp(str, "compound", len))
-		return BITS_COMPOUND;
-
-	for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
-		if (!page_flag_names[i])
-			continue;
-		if (!strncmp(str, page_flag_names[i] + 2, len))
-			return 1ULL << i;
-	}
-
-	return parse_number(str);
-}
-
-static uint64_t parse_flag_names(const char *str, int all)
-{
-	const char *p    = str;
-	uint64_t   flags = 0;
-
-	while (1) {
-		if (*p == ',' || *p == '=' || *p == '\0') {
-			if ((*str != '~') || (*str == '~' && all && *++str))
-				flags |= parse_flag_name(str, p - str);
-			if (*p != ',')
-				break;
-			str = p + 1;
-		}
-		p++;
-	}
-
-	return flags;
-}
-
-static void parse_bits_mask(const char *optarg)
-{
-	uint64_t mask;
-	uint64_t bits;
-	const char *p;
-
-	p = strchr(optarg, '=');
-	if (p == optarg) {
-		mask = KPF_ALL_BITS;
-		bits = parse_flag_names(p + 1, 0);
-	} else if (p) {
-		mask = parse_flag_names(optarg, 0);
-		bits = parse_flag_names(p + 1, 0);
-	} else if (strchr(optarg, '~')) {
-		mask = parse_flag_names(optarg, 1);
-		bits = parse_flag_names(optarg, 0);
-	} else {
-		mask = parse_flag_names(optarg, 0);
-		bits = KPF_ALL_BITS;
-	}
-
-	add_bits_filter(mask, bits);
-}
-
-static void parse_kpageflags(const char *name)
-{
-	opt_kpageflags = name;
-}
-
-static void describe_flags(const char *optarg)
-{
-	uint64_t flags = parse_flag_names(optarg, 0);
-
-	printf("0x%016llx\t%s\t%s\n",
-		(unsigned long long)flags,
-		page_flag_name(flags),
-		page_flag_longname(flags));
-}
-
-static const struct option opts[] = {
-	{ "raw"       , 0, NULL, 'r' },
-	{ "pid"       , 1, NULL, 'p' },
-	{ "file"      , 1, NULL, 'f' },
-	{ "addr"      , 1, NULL, 'a' },
-	{ "bits"      , 1, NULL, 'b' },
-	{ "cgroup"    , 1, NULL, 'c' },
-	{ "describe"  , 1, NULL, 'd' },
-	{ "mark-idle" , 0, NULL, 'i' },
-	{ "list"      , 0, NULL, 'l' },
-	{ "list-each" , 0, NULL, 'L' },
-	{ "list-cgroup", 0, NULL, 'C' },
-	{ "list-mapcnt", 0, NULL, 'M' },
-	{ "no-summary", 0, NULL, 'N' },
-	{ "hwpoison"  , 0, NULL, 'X' },
-	{ "unpoison"  , 0, NULL, 'x' },
-	{ "kpageflags", 0, NULL, 'F' },
-	{ "help"      , 0, NULL, 'h' },
-	{ NULL        , 0, NULL, 0 }
-};
-
-int main(int argc, char *argv[])
-{
-	int c;
-
-	page_size = getpagesize();
-
-	while ((c = getopt_long(argc, argv,
-				"rp:f:a:b:d:c:CilLMNXxF:h",
-				opts, NULL)) != -1) {
-		switch (c) {
-		case 'r':
-			opt_raw = 1;
-			break;
-		case 'p':
-			parse_pid(optarg);
-			break;
-		case 'f':
-			parse_file(optarg);
-			break;
-		case 'a':
-			parse_addr_range(optarg);
-			break;
-		case 'b':
-			parse_bits_mask(optarg);
-			break;
-		case 'c':
-			parse_cgroup(optarg);
-			break;
-		case 'C':
-			opt_list_cgroup = 1;
-			break;
-		case 'd':
-			describe_flags(optarg);
-			exit(0);
-		case 'i':
-			opt_mark_idle = 1;
-			break;
-		case 'l':
-			opt_list = 1;
-			break;
-		case 'L':
-			opt_list = 2;
-			break;
-		case 'M':
-			opt_list_mapcnt = 1;
-			break;
-		case 'N':
-			opt_no_summary = 1;
-			break;
-		case 'X':
-			opt_hwpoison = 1;
-			prepare_hwpoison_fd();
-			break;
-		case 'x':
-			opt_unpoison = 1;
-			prepare_hwpoison_fd();
-			break;
-		case 'F':
-			parse_kpageflags(optarg);
-			break;
-		case 'h':
-			usage();
-			exit(0);
-		default:
-			usage();
-			exit(1);
-		}
-	}
-
-	if (!opt_kpageflags)
-		opt_kpageflags = PROC_KPAGEFLAGS;
-
-	if (opt_cgroup || opt_list_cgroup)
-		kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
-
-	if (opt_list && opt_list_mapcnt)
-		kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY);
-
-	if (opt_mark_idle)
-		page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR);
-
-	if (opt_list && opt_pid)
-		printf("voffset\t");
-	if (opt_list && opt_file)
-		printf("foffset\t");
-	if (opt_list && opt_list_cgroup)
-		printf("cgroup\t");
-	if (opt_list && opt_list_mapcnt)
-		printf("map-cnt\t");
-
-	if (opt_list == 1)
-		printf("offset\tlen\tflags\n");
-	if (opt_list == 2)
-		printf("offset\tflags\n");
-
-	if (opt_file)
-		walk_page_cache();
-	else
-		walk_addr_ranges();
-
-	if (opt_list == 1)
-		flush_page_range();
-
-	if (opt_no_summary)
-		return 0;
-
-	if (opt_list)
-		printf("\n\n");
-
-	if (opt_file) {
-		show_file(opt_file, &st);
-		printf("\n");
-	}
-
-	show_summary();
-
-	if (opt_list_mapcnt)
-		close(kpagecount_fd);
-
-	if (page_idle_fd >= 0)
-		close(page_idle_fd);
-
-	return 0;
-}
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
deleted file mode 100644
index 7c2ac124cdc8..000000000000
--- a/tools/vm/page_owner_sort.c
+++ /dev/null
@@ -1,897 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * User-space helper to sort the output of /sys/kernel/debug/page_owner
- *
- * Example use:
- * cat /sys/kernel/debug/page_owner > page_owner_full.txt
- * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
- * Or sort by total memory:
- * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt
- *
- * See Documentation/mm/page_owner.rst
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <string.h>
-#include <regex.h>
-#include <errno.h>
-#include <linux/types.h>
-#include <getopt.h>
-
-#define bool int
-#define true 1
-#define false 0
-#define TASK_COMM_LEN 16
-
-struct block_list {
-	char *txt;
-	char *comm; // task command name
-	char *stacktrace;
-	__u64 ts_nsec;
-	__u64 free_ts_nsec;
-	int len;
-	int num;
-	int page_num;
-	pid_t pid;
-	pid_t tgid;
-	int allocator;
-};
-enum FILTER_BIT {
-	FILTER_UNRELEASE = 1<<1,
-	FILTER_PID = 1<<2,
-	FILTER_TGID = 1<<3,
-	FILTER_COMM = 1<<4
-};
-enum CULL_BIT {
-	CULL_UNRELEASE = 1<<1,
-	CULL_PID = 1<<2,
-	CULL_TGID = 1<<3,
-	CULL_COMM = 1<<4,
-	CULL_STACKTRACE = 1<<5,
-	CULL_ALLOCATOR = 1<<6
-};
-enum ALLOCATOR_BIT {
-	ALLOCATOR_CMA = 1<<1,
-	ALLOCATOR_SLAB = 1<<2,
-	ALLOCATOR_VMALLOC = 1<<3,
-	ALLOCATOR_OTHERS = 1<<4
-};
-enum ARG_TYPE {
-	ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS,
-	ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE,
-	ARG_ALLOCATOR
-};
-enum SORT_ORDER {
-	SORT_ASC = 1,
-	SORT_DESC = -1,
-};
-struct filter_condition {
-	pid_t *pids;
-	pid_t *tgids;
-	char **comms;
-	int pids_size;
-	int tgids_size;
-	int comms_size;
-};
-struct sort_condition {
-	int (**cmps)(const void *, const void *);
-	int *signs;
-	int size;
-};
-static struct filter_condition fc;
-static struct sort_condition sc;
-static regex_t order_pattern;
-static regex_t pid_pattern;
-static regex_t tgid_pattern;
-static regex_t comm_pattern;
-static regex_t ts_nsec_pattern;
-static regex_t free_ts_nsec_pattern;
-static struct block_list *list;
-static int list_size;
-static int max_size;
-static int cull;
-static int filter;
-static bool debug_on;
-
-static void set_single_cmp(int (*cmp)(const void *, const void *), int sign);
-
-int read_block(char *buf, char *ext_buf, int buf_size, FILE *fin)
-{
-	char *curr = buf, *const buf_end = buf + buf_size;
-
-	while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) {
-		if (*curr == '\n') { /* empty line */
-			return curr - buf;
-		}
-		if (!strncmp(curr, "PFN", 3)) {
-			strcpy(ext_buf, curr);
-			continue;
-		}
-		curr += strlen(curr);
-	}
-
-	return -1; /* EOF or no space left in buf. */
-}
-
-static int compare_txt(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return strcmp(l1->txt, l2->txt);
-}
-
-static int compare_stacktrace(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return strcmp(l1->stacktrace, l2->stacktrace);
-}
-
-static int compare_num(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return l1->num - l2->num;
-}
-
-static int compare_page_num(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return l1->page_num - l2->page_num;
-}
-
-static int compare_pid(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return l1->pid - l2->pid;
-}
-
-static int compare_tgid(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return l1->tgid - l2->tgid;
-}
-
-static int compare_allocator(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return l1->allocator - l2->allocator;
-}
-
-static int compare_comm(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return strcmp(l1->comm, l2->comm);
-}
-
-static int compare_ts(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return l1->ts_nsec < l2->ts_nsec ? -1 : 1;
-}
-
-static int compare_free_ts(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1;
-}
-
-static int compare_release(const void *p1, const void *p2)
-{
-	const struct block_list *l1 = p1, *l2 = p2;
-
-	if (!l1->free_ts_nsec && !l2->free_ts_nsec)
-		return 0;
-	if (l1->free_ts_nsec && l2->free_ts_nsec)
-		return 0;
-	return l1->free_ts_nsec ? 1 : -1;
-}
-
-static int compare_cull_condition(const void *p1, const void *p2)
-{
-	if (cull == 0)
-		return compare_txt(p1, p2);
-	if ((cull & CULL_STACKTRACE) && compare_stacktrace(p1, p2))
-		return compare_stacktrace(p1, p2);
-	if ((cull & CULL_PID) && compare_pid(p1, p2))
-		return compare_pid(p1, p2);
-	if ((cull & CULL_TGID) && compare_tgid(p1, p2))
-		return compare_tgid(p1, p2);
-	if ((cull & CULL_COMM) && compare_comm(p1, p2))
-		return compare_comm(p1, p2);
-	if ((cull & CULL_UNRELEASE) && compare_release(p1, p2))
-		return compare_release(p1, p2);
-	if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2))
-		return compare_allocator(p1, p2);
-	return 0;
-}
-
-static int compare_sort_condition(const void *p1, const void *p2)
-{
-	int cmp = 0;
-
-	for (int i = 0; i < sc.size; ++i)
-		if (cmp == 0)
-			cmp = sc.signs[i] * sc.cmps[i](p1, p2);
-	return cmp;
-}
-
-static int search_pattern(regex_t *pattern, char *pattern_str, char *buf)
-{
-	int err, val_len;
-	regmatch_t pmatch[2];
-
-	err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL);
-	if (err != 0 || pmatch[1].rm_so == -1) {
-		if (debug_on)
-			fprintf(stderr, "no matching pattern in %s\n", buf);
-		return -1;
-	}
-	val_len = pmatch[1].rm_eo - pmatch[1].rm_so;
-
-	memcpy(pattern_str, buf + pmatch[1].rm_so, val_len);
-
-	return 0;
-}
-
-static bool check_regcomp(regex_t *pattern, const char *regex)
-{
-	int err;
-
-	err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE);
-	if (err != 0 || pattern->re_nsub != 1) {
-		fprintf(stderr, "Invalid pattern %s code %d\n", regex, err);
-		return false;
-	}
-	return true;
-}
-
-static char **explode(char sep, const char *str, int *size)
-{
-	int count = 0, len = strlen(str);
-	int lastindex = -1, j = 0;
-
-	for (int i = 0; i < len; i++)
-		if (str[i] == sep)
-			count++;
-	char **ret = calloc(++count, sizeof(char *));
-
-	for (int i = 0; i < len; i++) {
-		if (str[i] == sep) {
-			ret[j] = calloc(i - lastindex, sizeof(char));
-			memcpy(ret[j++], str + lastindex + 1, i - lastindex - 1);
-			lastindex = i;
-		}
-	}
-	if (lastindex <= len - 1) {
-		ret[j] = calloc(len - lastindex, sizeof(char));
-		memcpy(ret[j++], str + lastindex + 1, strlen(str) - 1 - lastindex);
-	}
-	*size = j;
-	return ret;
-}
-
-static void free_explode(char **arr, int size)
-{
-	for (int i = 0; i < size; i++)
-		free(arr[i]);
-	free(arr);
-}
-
-# define FIELD_BUFF 25
-
-static int get_page_num(char *buf)
-{
-	int order_val;
-	char order_str[FIELD_BUFF] = {0};
-	char *endptr;
-
-	search_pattern(&order_pattern, order_str, buf);
-	errno = 0;
-	order_val = strtol(order_str, &endptr, 10);
-	if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') {
-		if (debug_on)
-			fprintf(stderr, "wrong order in follow buf:\n%s\n", buf);
-		return 0;
-	}
-
-	return 1 << order_val;
-}
-
-static pid_t get_pid(char *buf)
-{
-	pid_t pid;
-	char pid_str[FIELD_BUFF] = {0};
-	char *endptr;
-
-	search_pattern(&pid_pattern, pid_str, buf);
-	errno = 0;
-	pid = strtol(pid_str, &endptr, 10);
-	if (errno != 0 || endptr == pid_str || *endptr != '\0') {
-		if (debug_on)
-			fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf);
-		return -1;
-	}
-
-	return pid;
-
-}
-
-static pid_t get_tgid(char *buf)
-{
-	pid_t tgid;
-	char tgid_str[FIELD_BUFF] = {0};
-	char *endptr;
-
-	search_pattern(&tgid_pattern, tgid_str, buf);
-	errno = 0;
-	tgid = strtol(tgid_str, &endptr, 10);
-	if (errno != 0 || endptr == tgid_str || *endptr != '\0') {
-		if (debug_on)
-			fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf);
-		return -1;
-	}
-
-	return tgid;
-
-}
-
-static __u64 get_ts_nsec(char *buf)
-{
-	__u64 ts_nsec;
-	char ts_nsec_str[FIELD_BUFF] = {0};
-	char *endptr;
-
-	search_pattern(&ts_nsec_pattern, ts_nsec_str, buf);
-	errno = 0;
-	ts_nsec = strtoull(ts_nsec_str, &endptr, 10);
-	if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') {
-		if (debug_on)
-			fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf);
-		return -1;
-	}
-
-	return ts_nsec;
-}
-
-static __u64 get_free_ts_nsec(char *buf)
-{
-	__u64 free_ts_nsec;
-	char free_ts_nsec_str[FIELD_BUFF] = {0};
-	char *endptr;
-
-	search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf);
-	errno = 0;
-	free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10);
-	if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') {
-		if (debug_on)
-			fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf);
-		return -1;
-	}
-
-	return free_ts_nsec;
-}
-
-static char *get_comm(char *buf)
-{
-	char *comm_str = malloc(TASK_COMM_LEN);
-
-	memset(comm_str, 0, TASK_COMM_LEN);
-
-	search_pattern(&comm_pattern, comm_str, buf);
-	errno = 0;
-	if (errno != 0) {
-		if (debug_on)
-			fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf);
-		return NULL;
-	}
-
-	return comm_str;
-}
-
-static int get_arg_type(const char *arg)
-{
-	if (!strcmp(arg, "pid") || !strcmp(arg, "p"))
-		return ARG_PID;
-	else if (!strcmp(arg, "tgid") || !strcmp(arg, "tg"))
-		return ARG_TGID;
-	else if (!strcmp(arg, "name") || !strcmp(arg, "n"))
-		return  ARG_COMM;
-	else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st"))
-		return ARG_STACKTRACE;
-	else if (!strcmp(arg, "free") || !strcmp(arg, "f"))
-		return ARG_FREE;
-	else if (!strcmp(arg, "txt") || !strcmp(arg, "T"))
-		return ARG_TXT;
-	else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft"))
-		return ARG_FREE_TS;
-	else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at"))
-		return ARG_ALLOC_TS;
-	else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator"))
-		return ARG_ALLOCATOR;
-	else {
-		return ARG_UNKNOWN;
-	}
-}
-
-static int get_allocator(const char *buf, const char *migrate_info)
-{
-	char *tmp, *first_line, *second_line;
-	int allocator = 0;
-
-	if (strstr(migrate_info, "CMA"))
-		allocator |= ALLOCATOR_CMA;
-	if (strstr(migrate_info, "slab"))
-		allocator |= ALLOCATOR_SLAB;
-	tmp = strstr(buf, "__vmalloc_node_range");
-	if (tmp) {
-		second_line = tmp;
-		while (*tmp != '\n')
-			tmp--;
-		tmp--;
-		while (*tmp != '\n')
-			tmp--;
-		first_line = ++tmp;
-		tmp = strstr(tmp, "alloc_pages");
-		if (tmp && first_line <= tmp && tmp < second_line)
-			allocator |= ALLOCATOR_VMALLOC;
-	}
-	if (allocator == 0)
-		allocator = ALLOCATOR_OTHERS;
-	return allocator;
-}
-
-static bool match_num_list(int num, int *list, int list_size)
-{
-	for (int i = 0; i < list_size; ++i)
-		if (list[i] == num)
-			return true;
-	return false;
-}
-
-static bool match_str_list(const char *str, char **list, int list_size)
-{
-	for (int i = 0; i < list_size; ++i)
-		if (!strcmp(list[i], str))
-			return true;
-	return false;
-}
-
-static bool is_need(char *buf)
-{
-	__u64 ts_nsec, free_ts_nsec;
-
-	ts_nsec = get_ts_nsec(buf);
-	free_ts_nsec = get_free_ts_nsec(buf);
-
-	if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec)
-		return false;
-	if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size))
-		return false;
-	if ((filter & FILTER_TGID) &&
-		!match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size))
-		return false;
-
-	char *comm = get_comm(buf);
-
-	if ((filter & FILTER_COMM) &&
-	!match_str_list(comm, fc.comms, fc.comms_size)) {
-		free(comm);
-		return false;
-	}
-	free(comm);
-	return true;
-}
-
-static bool add_list(char *buf, int len, char *ext_buf)
-{
-	if (list_size != 0 &&
-		len == list[list_size-1].len &&
-		memcmp(buf, list[list_size-1].txt, len) == 0) {
-		list[list_size-1].num++;
-		list[list_size-1].page_num += get_page_num(buf);
-		return true;
-	}
-	if (list_size == max_size) {
-		fprintf(stderr, "max_size too small??\n");
-		return false;
-	}
-	if (!is_need(buf))
-		return true;
-	list[list_size].pid = get_pid(buf);
-	list[list_size].tgid = get_tgid(buf);
-	list[list_size].comm = get_comm(buf);
-	list[list_size].txt = malloc(len+1);
-	if (!list[list_size].txt) {
-		fprintf(stderr, "Out of memory\n");
-		return false;
-	}
-	memcpy(list[list_size].txt, buf, len);
-	list[list_size].txt[len] = 0;
-	list[list_size].len = len;
-	list[list_size].num = 1;
-	list[list_size].page_num = get_page_num(buf);
-
-	list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: "";
-	if (*list[list_size].stacktrace == '\n')
-		list[list_size].stacktrace++;
-	list[list_size].ts_nsec = get_ts_nsec(buf);
-	list[list_size].free_ts_nsec = get_free_ts_nsec(buf);
-	list[list_size].allocator = get_allocator(buf, ext_buf);
-	list_size++;
-	if (list_size % 1000 == 0) {
-		printf("loaded %d\r", list_size);
-		fflush(stdout);
-	}
-	return true;
-}
-
-static bool parse_cull_args(const char *arg_str)
-{
-	int size = 0;
-	char **args = explode(',', arg_str, &size);
-
-	for (int i = 0; i < size; ++i) {
-		int arg_type = get_arg_type(args[i]);
-
-		if (arg_type == ARG_PID)
-			cull |= CULL_PID;
-		else if (arg_type == ARG_TGID)
-			cull |= CULL_TGID;
-		else if (arg_type == ARG_COMM)
-			cull |= CULL_COMM;
-		else if (arg_type == ARG_STACKTRACE)
-			cull |= CULL_STACKTRACE;
-		else if (arg_type == ARG_FREE)
-			cull |= CULL_UNRELEASE;
-		else if (arg_type == ARG_ALLOCATOR)
-			cull |= CULL_ALLOCATOR;
-		else {
-			free_explode(args, size);
-			return false;
-		}
-	}
-	free_explode(args, size);
-	if (sc.size == 0)
-		set_single_cmp(compare_num, SORT_DESC);
-	return true;
-}
-
-static void set_single_cmp(int (*cmp)(const void *, const void *), int sign)
-{
-	if (sc.signs == NULL || sc.size < 1)
-		sc.signs = calloc(1, sizeof(int));
-	sc.signs[0] = sign;
-	if (sc.cmps == NULL || sc.size < 1)
-		sc.cmps = calloc(1, sizeof(int *));
-	sc.cmps[0] = cmp;
-	sc.size = 1;
-}
-
-static bool parse_sort_args(const char *arg_str)
-{
-	int size = 0;
-
-	if (sc.size != 0) { /* reset sort_condition */
-		free(sc.signs);
-		free(sc.cmps);
-		size = 0;
-	}
-
-	char **args = explode(',', arg_str, &size);
-
-	sc.signs = calloc(size, sizeof(int));
-	sc.cmps = calloc(size, sizeof(int *));
-	for (int i = 0; i < size; ++i) {
-		int offset = 0;
-
-		sc.signs[i] = SORT_ASC;
-		if (args[i][0] == '-' || args[i][0] == '+') {
-			if (args[i][0] == '-')
-				sc.signs[i] = SORT_DESC;
-			offset = 1;
-		}
-
-		int arg_type = get_arg_type(args[i]+offset);
-
-		if (arg_type == ARG_PID)
-			sc.cmps[i] = compare_pid;
-		else if (arg_type == ARG_TGID)
-			sc.cmps[i] = compare_tgid;
-		else if (arg_type == ARG_COMM)
-			sc.cmps[i] = compare_comm;
-		else if (arg_type == ARG_STACKTRACE)
-			sc.cmps[i] = compare_stacktrace;
-		else if (arg_type == ARG_ALLOC_TS)
-			sc.cmps[i] = compare_ts;
-		else if (arg_type == ARG_FREE_TS)
-			sc.cmps[i] = compare_free_ts;
-		else if (arg_type == ARG_TXT)
-			sc.cmps[i] = compare_txt;
-		else if (arg_type == ARG_ALLOCATOR)
-			sc.cmps[i] = compare_allocator;
-		else {
-			free_explode(args, size);
-			sc.size = 0;
-			return false;
-		}
-	}
-	sc.size = size;
-	free_explode(args, size);
-	return true;
-}
-
-static int *parse_nums_list(char *arg_str, int *list_size)
-{
-	int size = 0;
-	char **args = explode(',', arg_str, &size);
-	int *list = calloc(size, sizeof(int));
-
-	errno = 0;
-	for (int i = 0; i < size; ++i) {
-		char *endptr = NULL;
-
-		list[i] = strtol(args[i], &endptr, 10);
-		if (errno != 0 || endptr == args[i] || *endptr != '\0') {
-			free(list);
-			return NULL;
-		}
-	}
-	*list_size = size;
-	free_explode(args, size);
-	return list;
-}
-
-static void print_allocator(FILE *out, int allocator)
-{
-	fprintf(out, "allocated by ");
-	if (allocator & ALLOCATOR_CMA)
-		fprintf(out, "CMA ");
-	if (allocator & ALLOCATOR_SLAB)
-		fprintf(out, "SLAB ");
-	if (allocator & ALLOCATOR_VMALLOC)
-		fprintf(out, "VMALLOC ");
-	if (allocator & ALLOCATOR_OTHERS)
-		fprintf(out, "OTHERS ");
-}
-
-#define BUF_SIZE	(128 * 1024)
-
-static void usage(void)
-{
-	printf("Usage: ./page_owner_sort [OPTIONS] <input> <output>\n"
-		"-m\t\tSort by total memory.\n"
-		"-s\t\tSort by the stack trace.\n"
-		"-t\t\tSort by times (default).\n"
-		"-p\t\tSort by pid.\n"
-		"-P\t\tSort by tgid.\n"
-		"-n\t\tSort by task command name.\n"
-		"-a\t\tSort by memory allocate time.\n"
-		"-r\t\tSort by memory release time.\n"
-		"-f\t\tFilter out the information of blocks whose memory has been released.\n"
-		"-d\t\tPrint debug information.\n"
-		"--pid <pidlist>\tSelect by pid. This selects the information of blocks whose process ID numbers appear in <pidlist>.\n"
-		"--tgid <tgidlist>\tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in <tgidlist>.\n"
-		"--name <cmdlist>\n\t\tSelect by command name. This selects the information of blocks whose command name appears in <cmdlist>.\n"
-		"--cull <rules>\tCull by user-defined rules.<rules> is a single argument in the form of a comma-separated list with some common fields predefined\n"
-		"--sort <order>\tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n"
-	);
-}
-
-int main(int argc, char **argv)
-{
-	FILE *fin, *fout;
-	char *buf, *ext_buf;
-	int i, count;
-	struct stat st;
-	int opt;
-	struct option longopts[] = {
-		{ "pid", required_argument, NULL, 1 },
-		{ "tgid", required_argument, NULL, 2 },
-		{ "name", required_argument, NULL, 3 },
-		{ "cull",  required_argument, NULL, 4 },
-		{ "sort",  required_argument, NULL, 5 },
-		{ 0, 0, 0, 0},
-	};
-
-	while ((opt = getopt_long(argc, argv, "adfmnprstP", longopts, NULL)) != -1)
-		switch (opt) {
-		case 'a':
-			set_single_cmp(compare_ts, SORT_ASC);
-			break;
-		case 'd':
-			debug_on = true;
-			break;
-		case 'f':
-			filter = filter | FILTER_UNRELEASE;
-			break;
-		case 'm':
-			set_single_cmp(compare_page_num, SORT_DESC);
-			break;
-		case 'p':
-			set_single_cmp(compare_pid, SORT_ASC);
-			break;
-		case 'r':
-			set_single_cmp(compare_free_ts, SORT_ASC);
-			break;
-		case 's':
-			set_single_cmp(compare_stacktrace, SORT_ASC);
-			break;
-		case 't':
-			set_single_cmp(compare_num, SORT_DESC);
-			break;
-		case 'P':
-			set_single_cmp(compare_tgid, SORT_ASC);
-			break;
-		case 'n':
-			set_single_cmp(compare_comm, SORT_ASC);
-			break;
-		case 1:
-			filter = filter | FILTER_PID;
-			fc.pids = parse_nums_list(optarg, &fc.pids_size);
-			if (fc.pids == NULL) {
-				fprintf(stderr, "wrong/invalid pid in from the command line:%s\n",
-						optarg);
-				exit(1);
-			}
-			break;
-		case 2:
-			filter = filter | FILTER_TGID;
-			fc.tgids = parse_nums_list(optarg, &fc.tgids_size);
-			if (fc.tgids == NULL) {
-				fprintf(stderr, "wrong/invalid tgid in from the command line:%s\n",
-						optarg);
-				exit(1);
-			}
-			break;
-		case 3:
-			filter = filter | FILTER_COMM;
-			fc.comms = explode(',', optarg, &fc.comms_size);
-			break;
-		case 4:
-			if (!parse_cull_args(optarg)) {
-				fprintf(stderr, "wrong argument after --cull option:%s\n",
-						optarg);
-				exit(1);
-			}
-			break;
-		case 5:
-			if (!parse_sort_args(optarg)) {
-				fprintf(stderr, "wrong argument after --sort option:%s\n",
-						optarg);
-				exit(1);
-			}
-			break;
-		default:
-			usage();
-			exit(1);
-		}
-
-	if (optind >= (argc - 1)) {
-		usage();
-		exit(1);
-	}
-
-	fin = fopen(argv[optind], "r");
-	fout = fopen(argv[optind + 1], "w");
-	if (!fin || !fout) {
-		usage();
-		perror("open: ");
-		exit(1);
-	}
-
-	if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),"))
-		goto out_order;
-	if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"))
-		goto out_pid;
-	if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "))
-		goto out_tgid;
-	if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"))
-		goto out_comm;
-	if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"))
-		goto out_ts;
-	if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"))
-		goto out_free_ts;
-
-	fstat(fileno(fin), &st);
-	max_size = st.st_size / 100; /* hack ... */
-
-	list = malloc(max_size * sizeof(*list));
-	buf = malloc(BUF_SIZE);
-	ext_buf = malloc(BUF_SIZE);
-	if (!list || !buf || !ext_buf) {
-		fprintf(stderr, "Out of memory\n");
-		goto out_free;
-	}
-
-	for ( ; ; ) {
-		int buf_len = read_block(buf, ext_buf, BUF_SIZE, fin);
-
-		if (buf_len < 0)
-			break;
-		if (!add_list(buf, buf_len, ext_buf))
-			goto out_free;
-	}
-
-	printf("loaded %d\n", list_size);
-
-	printf("sorting ....\n");
-
-	qsort(list, list_size, sizeof(list[0]), compare_cull_condition);
-
-	printf("culling\n");
-
-	for (i = count = 0; i < list_size; i++) {
-		if (count == 0 ||
-		    compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) {
-			list[count++] = list[i];
-		} else {
-			list[count-1].num += list[i].num;
-			list[count-1].page_num += list[i].page_num;
-		}
-	}
-
-	qsort(list, count, sizeof(list[0]), compare_sort_condition);
-
-	for (i = 0; i < count; i++) {
-		if (cull == 0) {
-			fprintf(fout, "%d times, %d pages, ", list[i].num, list[i].page_num);
-			print_allocator(fout, list[i].allocator);
-			fprintf(fout, ":\n%s\n", list[i].txt);
-		}
-		else {
-			fprintf(fout, "%d times, %d pages",
-					list[i].num, list[i].page_num);
-			if (cull & CULL_PID || filter & FILTER_PID)
-				fprintf(fout, ", PID %d", list[i].pid);
-			if (cull & CULL_TGID || filter & FILTER_TGID)
-				fprintf(fout, ", TGID %d", list[i].pid);
-			if (cull & CULL_COMM || filter & FILTER_COMM)
-				fprintf(fout, ", task_comm_name: %s", list[i].comm);
-			if (cull & CULL_ALLOCATOR) {
-				fprintf(fout, ", ");
-				print_allocator(fout, list[i].allocator);
-			}
-			if (cull & CULL_UNRELEASE)
-				fprintf(fout, " (%s)",
-						list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED");
-			if (cull & CULL_STACKTRACE)
-				fprintf(fout, ":\n%s", list[i].stacktrace);
-			fprintf(fout, "\n");
-		}
-	}
-
-out_free:
-	if (ext_buf)
-		free(ext_buf);
-	if (buf)
-		free(buf);
-	if (list)
-		free(list);
-out_free_ts:
-	regfree(&free_ts_nsec_pattern);
-out_ts:
-	regfree(&ts_nsec_pattern);
-out_comm:
-	regfree(&comm_pattern);
-out_tgid:
-	regfree(&tgid_pattern);
-out_pid:
-	regfree(&pid_pattern);
-out_order:
-	regfree(&order_pattern);
-
-	return 0;
-}
diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/vm/slabinfo-gnuplot.sh
deleted file mode 100644
index 873a892147e5..000000000000
--- a/tools/vm/slabinfo-gnuplot.sh
+++ /dev/null
@@ -1,268 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0-only
-
-# Sergey Senozhatsky, 2015
-# sergey.senozhatsky.work@gmail.com
-#
-
-
-# This program is intended to plot a `slabinfo -X' stats, collected,
-# for example, using the following command:
-#   while [ 1 ]; do slabinfo -X >> stats; sleep 1; done
-#
-# Use `slabinfo-gnuplot.sh stats' to pre-process collected records
-# and generate graphs (totals, slabs sorted by size, slabs sorted
-# by size).
-#
-# Graphs can be [individually] regenerate with different ranges and
-# size (-r %d,%d and -s %d,%d options).
-#
-# To visually compare N `totals' graphs, do
-# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals
-#
-
-min_slab_name_size=11
-xmin=0
-xmax=0
-width=1500
-height=700
-mode=preprocess
-
-usage()
-{
-	echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]"
-	echo "FILEs must contain 'slabinfo -X' samples"
-	echo "-t 			- plot totals for FILE(s)"
-	echo "-l 			- plot slabs stats for FILE(s)"
-	echo "-s %d,%d		- set image width and height"
-	echo "-r %d,%d		- use data samples from a given range"
-}
-
-check_file_exist()
-{
-	if [ ! -f "$1" ]; then
-		echo "File '$1' does not exist"
-		exit 1
-	fi
-}
-
-do_slabs_plotting()
-{
-	local file=$1
-	local out_file
-	local range="every ::$xmin"
-	local xtic=""
-	local xtic_rotate="norotate"
-	local lines=2000000
-	local wc_lines
-
-	check_file_exist "$file"
-
-	out_file=`basename "$file"`
-	if [ $xmax -ne 0 ]; then
-		range="$range::$xmax"
-		lines=$((xmax-xmin))
-	fi
-
-	wc_lines=`cat "$file" | wc -l`
-	if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then
-		wc_lines=$lines
-	fi
-
-	if [ "$wc_lines" -lt "$lines" ]; then
-		lines=$wc_lines
-	fi
-
-	if [ $((width / lines)) -gt $min_slab_name_size ]; then
-		xtic=":xtic(1)"
-		xtic_rotate=90
-	fi
-
-gnuplot -p << EOF
-#!/usr/bin/env gnuplot
-
-set terminal png enhanced size $width,$height large
-set output '$out_file.png'
-set autoscale xy
-set xlabel 'samples'
-set ylabel 'bytes'
-set style histogram columnstacked title textcolor lt -1
-set style fill solid 0.15
-set xtics rotate $xtic_rotate
-set key left above Left title reverse
-
-plot "$file" $range u 2$xtic title 'SIZE' with boxes,\
-	'' $range u 3 title 'LOSS' with boxes
-EOF
-
-	if [ $? -eq 0 ]; then
-		echo "$out_file.png"
-	fi
-}
-
-do_totals_plotting()
-{
-	local gnuplot_cmd=""
-	local range="every ::$xmin"
-	local file=""
-
-	if [ $xmax -ne 0 ]; then
-		range="$range::$xmax"
-	fi
-
-	for i in "${t_files[@]}"; do
-		check_file_exist "$i"
-
-		file="$file"`basename "$i"`
-		gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\
-			'$i Memory usage' with lines,"
-		gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \
-			'$i Loss' with lines,"
-	done
-
-gnuplot -p << EOF
-#!/usr/bin/env gnuplot
-
-set terminal png enhanced size $width,$height large
-set autoscale xy
-set output '$file.png'
-set xlabel 'samples'
-set ylabel 'bytes'
-set key left above Left title reverse
-
-plot $gnuplot_cmd
-EOF
-
-	if [ $? -eq 0 ]; then
-		echo "$file.png"
-	fi
-}
-
-do_preprocess()
-{
-	local out
-	local lines
-	local in=$1
-
-	check_file_exist "$in"
-
-	# use only 'TOP' slab (biggest memory usage or loss)
-	let lines=3
-	out=`basename "$in"`"-slabs-by-loss"
-	`cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\
-		grep -E -iv '\-\-|Name|Slabs'\
-		| awk '{print $1" "$4+$2*$3" "$4}' > "$out"`
-	if [ $? -eq 0 ]; then
-		do_slabs_plotting "$out"
-	fi
-
-	let lines=3
-	out=`basename "$in"`"-slabs-by-size"
-	`cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\
-		grep -E -iv '\-\-|Name|Slabs'\
-		| awk '{print $1" "$4" "$4-$2*$3}' > "$out"`
-	if [ $? -eq 0 ]; then
-		do_slabs_plotting "$out"
-	fi
-
-	out=`basename "$in"`"-totals"
-	`cat "$in" | grep "Memory used" |\
-		awk '{print $3" "$7}' > "$out"`
-	if [ $? -eq 0 ]; then
-		t_files[0]=$out
-		do_totals_plotting
-	fi
-}
-
-parse_opts()
-{
-	local opt
-
-	while getopts "tlr::s::h" opt; do
-		case $opt in
-			t)
-				mode=totals
-				;;
-			l)
-				mode=slabs
-				;;
-			s)
-				array=(${OPTARG//,/ })
-				width=${array[0]}
-				height=${array[1]}
-				;;
-			r)
-				array=(${OPTARG//,/ })
-				xmin=${array[0]}
-				xmax=${array[1]}
-				;;
-			h)
-				usage
-				exit 0
-				;;
-			\?)
-				echo "Invalid option: -$OPTARG" >&2
-				exit 1
-				;;
-			:)
-				echo "-$OPTARG requires an argument." >&2
-				exit 1
-				;;
-		esac
-	done
-
-	return $OPTIND
-}
-
-parse_args()
-{
-	local idx=0
-	local p
-
-	for p in "$@"; do
-		case $mode in
-			preprocess)
-				files[$idx]=$p
-				idx=$idx+1
-				;;
-			totals)
-				t_files[$idx]=$p
-				idx=$idx+1
-				;;
-			slabs)
-				files[$idx]=$p
-				idx=$idx+1
-				;;
-		esac
-	done
-}
-
-parse_opts "$@"
-argstart=$?
-parse_args "${@:$argstart}"
-
-if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then
-	usage
-	exit 1
-fi
-
-case $mode in
-	preprocess)
-		for i in "${files[@]}"; do
-			do_preprocess "$i"
-		done
-		;;
-	totals)
-		do_totals_plotting
-		;;
-	slabs)
-		for i in "${files[@]}"; do
-			do_slabs_plotting "$i"
-		done
-		;;
-	*)
-		echo "Unknown mode $mode" >&2
-		usage
-		exit 1
-	;;
-esac
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
deleted file mode 100644
index cfaeaea71042..000000000000
--- a/tools/vm/slabinfo.c
+++ /dev/null
@@ -1,1544 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Slabinfo: Tool to get reports about slabs
- *
- * (C) 2007 sgi, Christoph Lameter
- * (C) 2011 Linux Foundation, Christoph Lameter
- *
- * Compile with:
- *
- * gcc -o slabinfo slabinfo.c
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <strings.h>
-#include <string.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <getopt.h>
-#include <regex.h>
-#include <errno.h>
-
-#define MAX_SLABS 500
-#define MAX_ALIASES 500
-#define MAX_NODES 1024
-
-struct slabinfo {
-	char *name;
-	int alias;
-	int refs;
-	int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
-	unsigned int hwcache_align, object_size, objs_per_slab;
-	unsigned int sanity_checks, slab_size, store_user, trace;
-	int order, poison, reclaim_account, red_zone;
-	unsigned long partial, objects, slabs, objects_partial, objects_total;
-	unsigned long alloc_fastpath, alloc_slowpath;
-	unsigned long free_fastpath, free_slowpath;
-	unsigned long free_frozen, free_add_partial, free_remove_partial;
-	unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill;
-	unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
-	unsigned long deactivate_to_head, deactivate_to_tail;
-	unsigned long deactivate_remote_frees, order_fallback;
-	unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail;
-	unsigned long alloc_node_mismatch, deactivate_bypass;
-	unsigned long cpu_partial_alloc, cpu_partial_free;
-	int numa[MAX_NODES];
-	int numa_partial[MAX_NODES];
-} slabinfo[MAX_SLABS];
-
-struct aliasinfo {
-	char *name;
-	char *ref;
-	struct slabinfo *slab;
-} aliasinfo[MAX_ALIASES];
-
-int slabs;
-int actual_slabs;
-int aliases;
-int alias_targets;
-int highest_node;
-
-char buffer[4096];
-
-int show_empty;
-int show_report;
-int show_alias;
-int show_slab;
-int skip_zero = 1;
-int show_numa;
-int show_track;
-int show_first_alias;
-int validate;
-int shrink;
-int show_inverted;
-int show_single_ref;
-int show_totals;
-int sort_size;
-int sort_active;
-int set_debug;
-int show_ops;
-int sort_partial;
-int show_activity;
-int output_lines = -1;
-int sort_loss;
-int extended_totals;
-int show_bytes;
-int unreclaim_only;
-
-/* Debug options */
-int sanity;
-int redzone;
-int poison;
-int tracking;
-int tracing;
-
-int page_size;
-
-regex_t pattern;
-
-static void fatal(const char *x, ...)
-{
-	va_list ap;
-
-	va_start(ap, x);
-	vfprintf(stderr, x, ap);
-	va_end(ap);
-	exit(EXIT_FAILURE);
-}
-
-static void usage(void)
-{
-	printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n"
-		"slabinfo [-aABDefhilLnoPrsStTUvXz1] [N=K] [-dafzput] [slab-regexp]\n"
-		"-a|--aliases           Show aliases\n"
-		"-A|--activity          Most active slabs first\n"
-		"-B|--Bytes             Show size in bytes\n"
-		"-D|--display-active    Switch line format to activity\n"
-		"-e|--empty             Show empty slabs\n"
-		"-f|--first-alias       Show first alias\n"
-		"-h|--help              Show usage information\n"
-		"-i|--inverted          Inverted list\n"
-		"-l|--slabs             Show slabs\n"
-		"-L|--Loss              Sort by loss\n"
-		"-n|--numa              Show NUMA information\n"
-		"-N|--lines=K           Show the first K slabs\n"
-		"-o|--ops               Show kmem_cache_ops\n"
-		"-P|--partial           Sort by number of partial slabs\n"
-		"-r|--report            Detailed report on single slabs\n"
-		"-s|--shrink            Shrink slabs\n"
-		"-S|--Size              Sort by size\n"
-		"-t|--tracking          Show alloc/free information\n"
-		"-T|--Totals            Show summary information\n"
-		"-U|--Unreclaim         Show unreclaimable slabs only\n"
-		"-v|--validate          Validate slabs\n"
-		"-X|--Xtotals           Show extended summary information\n"
-		"-z|--zero              Include empty slabs\n"
-		"-1|--1ref              Single reference\n"
-
-		"\n"
-		"-d  | --debug          Switch off all debug options\n"
-		"-da | --debug=a        Switch on all debug options (--debug=FZPU)\n"
-
-		"\n"
-		"-d[afzput] | --debug=[afzput]\n"
-		"    f | F              Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n"
-		"    z | Z              Redzoning\n"
-		"    p | P              Poisoning\n"
-		"    u | U              Tracking\n"
-		"    t | T              Tracing\n"
-
-		"\nSorting options (--Loss, --Size, --Partial) are mutually exclusive\n"
-	);
-}
-
-static unsigned long read_obj(const char *name)
-{
-	FILE *f = fopen(name, "r");
-
-	if (!f) {
-		buffer[0] = 0;
-		if (errno == EACCES)
-			fatal("%s, Try using superuser\n", strerror(errno));
-	} else {
-		if (!fgets(buffer, sizeof(buffer), f))
-			buffer[0] = 0;
-		fclose(f);
-		if (buffer[strlen(buffer)] == '\n')
-			buffer[strlen(buffer)] = 0;
-	}
-	return strlen(buffer);
-}
-
-
-/*
- * Get the contents of an attribute
- */
-static unsigned long get_obj(const char *name)
-{
-	if (!read_obj(name))
-		return 0;
-
-	return atol(buffer);
-}
-
-static unsigned long get_obj_and_str(const char *name, char **x)
-{
-	unsigned long result = 0;
-	char *p;
-
-	*x = NULL;
-
-	if (!read_obj(name)) {
-		x = NULL;
-		return 0;
-	}
-	result = strtoul(buffer, &p, 10);
-	while (*p == ' ')
-		p++;
-	if (*p)
-		*x = strdup(p);
-	return result;
-}
-
-static void set_obj(struct slabinfo *s, const char *name, int n)
-{
-	char x[100];
-	FILE *f;
-
-	snprintf(x, 100, "%s/%s", s->name, name);
-	f = fopen(x, "w");
-	if (!f)
-		fatal("Cannot write to %s\n", x);
-
-	fprintf(f, "%d\n", n);
-	fclose(f);
-}
-
-static unsigned long read_slab_obj(struct slabinfo *s, const char *name)
-{
-	char x[100];
-	FILE *f;
-	size_t l;
-
-	snprintf(x, 100, "%s/%s", s->name, name);
-	f = fopen(x, "r");
-	if (!f) {
-		buffer[0] = 0;
-		l = 0;
-	} else {
-		l = fread(buffer, 1, sizeof(buffer), f);
-		buffer[l] = 0;
-		fclose(f);
-	}
-	return l;
-}
-
-static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name)
-{
-	char x[128];
-	FILE *f;
-	size_t l;
-
-	snprintf(x, 128, "/sys/kernel/debug/slab/%s/%s", s->name, name);
-	f = fopen(x, "r");
-	if (!f) {
-		buffer[0] = 0;
-		l = 0;
-	} else {
-		l = fread(buffer, 1, sizeof(buffer), f);
-		buffer[l] = 0;
-		fclose(f);
-	}
-	return l;
-}
-
-/*
- * Put a size string together
- */
-static int store_size(char *buffer, unsigned long value)
-{
-	unsigned long divisor = 1;
-	char trailer = 0;
-	int n;
-
-	if (!show_bytes) {
-		if (value > 1000000000UL) {
-			divisor = 100000000UL;
-			trailer = 'G';
-		} else if (value > 1000000UL) {
-			divisor = 100000UL;
-			trailer = 'M';
-		} else if (value > 1000UL) {
-			divisor = 100;
-			trailer = 'K';
-		}
-	}
-
-	value /= divisor;
-	n = sprintf(buffer, "%ld",value);
-	if (trailer) {
-		buffer[n] = trailer;
-		n++;
-		buffer[n] = 0;
-	}
-	if (divisor != 1) {
-		memmove(buffer + n - 2, buffer + n - 3, 4);
-		buffer[n-2] = '.';
-		n++;
-	}
-	return n;
-}
-
-static void decode_numa_list(int *numa, char *t)
-{
-	int node;
-	int nr;
-
-	memset(numa, 0, MAX_NODES * sizeof(int));
-
-	if (!t)
-		return;
-
-	while (*t == 'N') {
-		t++;
-		node = strtoul(t, &t, 10);
-		if (*t == '=') {
-			t++;
-			nr = strtoul(t, &t, 10);
-			numa[node] = nr;
-			if (node > highest_node)
-				highest_node = node;
-		}
-		while (*t == ' ')
-			t++;
-	}
-}
-
-static void slab_validate(struct slabinfo *s)
-{
-	if (strcmp(s->name, "*") == 0)
-		return;
-
-	set_obj(s, "validate", 1);
-}
-
-static void slab_shrink(struct slabinfo *s)
-{
-	if (strcmp(s->name, "*") == 0)
-		return;
-
-	set_obj(s, "shrink", 1);
-}
-
-int line = 0;
-
-static void first_line(void)
-{
-	if (show_activity)
-		printf("Name                   Objects      Alloc       Free"
-			"   %%Fast Fallb O CmpX   UL\n");
-	else
-		printf("Name                   Objects Objsize           %s "
-			"Slabs/Part/Cpu  O/S O %%Fr %%Ef Flg\n",
-			sort_loss ? " Loss" : "Space");
-}
-
-/*
- * Find the shortest alias of a slab
- */
-static struct aliasinfo *find_one_alias(struct slabinfo *find)
-{
-	struct aliasinfo *a;
-	struct aliasinfo *best = NULL;
-
-	for(a = aliasinfo;a < aliasinfo + aliases; a++) {
-		if (a->slab == find &&
-			(!best || strlen(best->name) < strlen(a->name))) {
-				best = a;
-				if (strncmp(a->name,"kmall", 5) == 0)
-					return best;
-			}
-	}
-	return best;
-}
-
-static unsigned long slab_size(struct slabinfo *s)
-{
-	return 	s->slabs * (page_size << s->order);
-}
-
-static unsigned long slab_activity(struct slabinfo *s)
-{
-	return 	s->alloc_fastpath + s->free_fastpath +
-		s->alloc_slowpath + s->free_slowpath;
-}
-
-static unsigned long slab_waste(struct slabinfo *s)
-{
-	return	slab_size(s) - s->objects * s->object_size;
-}
-
-static void slab_numa(struct slabinfo *s, int mode)
-{
-	int node;
-
-	if (strcmp(s->name, "*") == 0)
-		return;
-
-	if (!highest_node) {
-		printf("\n%s: No NUMA information available.\n", s->name);
-		return;
-	}
-
-	if (skip_zero && !s->slabs)
-		return;
-
-	if (!line) {
-		printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
-		for(node = 0; node <= highest_node; node++)
-			printf(" %4d", node);
-		printf("\n----------------------");
-		for(node = 0; node <= highest_node; node++)
-			printf("-----");
-		printf("\n");
-	}
-	printf("%-21s ", mode ? "All slabs" : s->name);
-	for(node = 0; node <= highest_node; node++) {
-		char b[20];
-
-		store_size(b, s->numa[node]);
-		printf(" %4s", b);
-	}
-	printf("\n");
-	if (mode) {
-		printf("%-21s ", "Partial slabs");
-		for(node = 0; node <= highest_node; node++) {
-			char b[20];
-
-			store_size(b, s->numa_partial[node]);
-			printf(" %4s", b);
-		}
-		printf("\n");
-	}
-	line++;
-}
-
-static void show_tracking(struct slabinfo *s)
-{
-	printf("\n%s: Kernel object allocation\n", s->name);
-	printf("-----------------------------------------------------------------------\n");
-	if (read_debug_slab_obj(s, "alloc_traces"))
-		printf("%s", buffer);
-	else if (read_slab_obj(s, "alloc_calls"))
-		printf("%s", buffer);
-	else
-		printf("No Data\n");
-
-	printf("\n%s: Kernel object freeing\n", s->name);
-	printf("------------------------------------------------------------------------\n");
-	if (read_debug_slab_obj(s, "free_traces"))
-		printf("%s", buffer);
-	else if (read_slab_obj(s, "free_calls"))
-		printf("%s", buffer);
-	else
-		printf("No Data\n");
-
-}
-
-static void ops(struct slabinfo *s)
-{
-	if (strcmp(s->name, "*") == 0)
-		return;
-
-	if (read_slab_obj(s, "ops")) {
-		printf("\n%s: kmem_cache operations\n", s->name);
-		printf("--------------------------------------------\n");
-		printf("%s", buffer);
-	} else
-		printf("\n%s has no kmem_cache operations\n", s->name);
-}
-
-static const char *onoff(int x)
-{
-	if (x)
-		return "On ";
-	return "Off";
-}
-
-static void slab_stats(struct slabinfo *s)
-{
-	unsigned long total_alloc;
-	unsigned long total_free;
-	unsigned long total;
-
-	if (!s->alloc_slab)
-		return;
-
-	total_alloc = s->alloc_fastpath + s->alloc_slowpath;
-	total_free = s->free_fastpath + s->free_slowpath;
-
-	if (!total_alloc)
-		return;
-
-	printf("\n");
-	printf("Slab Perf Counter       Alloc     Free %%Al %%Fr\n");
-	printf("--------------------------------------------------\n");
-	printf("Fastpath             %8lu %8lu %3lu %3lu\n",
-		s->alloc_fastpath, s->free_fastpath,
-		s->alloc_fastpath * 100 / total_alloc,
-		total_free ? s->free_fastpath * 100 / total_free : 0);
-	printf("Slowpath             %8lu %8lu %3lu %3lu\n",
-		total_alloc - s->alloc_fastpath, s->free_slowpath,
-		(total_alloc - s->alloc_fastpath) * 100 / total_alloc,
-		total_free ? s->free_slowpath * 100 / total_free : 0);
-	printf("Page Alloc           %8lu %8lu %3lu %3lu\n",
-		s->alloc_slab, s->free_slab,
-		s->alloc_slab * 100 / total_alloc,
-		total_free ? s->free_slab * 100 / total_free : 0);
-	printf("Add partial          %8lu %8lu %3lu %3lu\n",
-		s->deactivate_to_head + s->deactivate_to_tail,
-		s->free_add_partial,
-		(s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc,
-		total_free ? s->free_add_partial * 100 / total_free : 0);
-	printf("Remove partial       %8lu %8lu %3lu %3lu\n",
-		s->alloc_from_partial, s->free_remove_partial,
-		s->alloc_from_partial * 100 / total_alloc,
-		total_free ? s->free_remove_partial * 100 / total_free : 0);
-
-	printf("Cpu partial list     %8lu %8lu %3lu %3lu\n",
-		s->cpu_partial_alloc, s->cpu_partial_free,
-		s->cpu_partial_alloc * 100 / total_alloc,
-		total_free ? s->cpu_partial_free * 100 / total_free : 0);
-
-	printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n",
-		s->deactivate_remote_frees, s->free_frozen,
-		s->deactivate_remote_frees * 100 / total_alloc,
-		total_free ? s->free_frozen * 100 / total_free : 0);
-
-	printf("Total                %8lu %8lu\n\n", total_alloc, total_free);
-
-	if (s->cpuslab_flush)
-		printf("Flushes %8lu\n", s->cpuslab_flush);
-
-	total = s->deactivate_full + s->deactivate_empty +
-			s->deactivate_to_head + s->deactivate_to_tail + s->deactivate_bypass;
-
-	if (total) {
-		printf("\nSlab Deactivation             Occurrences %%\n");
-		printf("-------------------------------------------------\n");
-		printf("Slab full                     %7lu  %3lu%%\n",
-			s->deactivate_full, (s->deactivate_full * 100) / total);
-		printf("Slab empty                    %7lu  %3lu%%\n",
-			s->deactivate_empty, (s->deactivate_empty * 100) / total);
-		printf("Moved to head of partial list %7lu  %3lu%%\n",
-			s->deactivate_to_head, (s->deactivate_to_head * 100) / total);
-		printf("Moved to tail of partial list %7lu  %3lu%%\n",
-			s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
-		printf("Deactivation bypass           %7lu  %3lu%%\n",
-			s->deactivate_bypass, (s->deactivate_bypass * 100) / total);
-		printf("Refilled from foreign frees   %7lu  %3lu%%\n",
-			s->alloc_refill, (s->alloc_refill * 100) / total);
-		printf("Node mismatch                 %7lu  %3lu%%\n",
-			s->alloc_node_mismatch, (s->alloc_node_mismatch * 100) / total);
-	}
-
-	if (s->cmpxchg_double_fail || s->cmpxchg_double_cpu_fail) {
-		printf("\nCmpxchg_double Looping\n------------------------\n");
-		printf("Locked Cmpxchg Double redos   %lu\nUnlocked Cmpxchg Double redos %lu\n",
-			s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail);
-	}
-}
-
-static void report(struct slabinfo *s)
-{
-	if (strcmp(s->name, "*") == 0)
-		return;
-
-	printf("\nSlabcache: %-15s  Aliases: %2d Order : %2d Objects: %lu\n",
-		s->name, s->aliases, s->order, s->objects);
-	if (s->hwcache_align)
-		printf("** Hardware cacheline aligned\n");
-	if (s->cache_dma)
-		printf("** Memory is allocated in a special DMA zone\n");
-	if (s->destroy_by_rcu)
-		printf("** Slabs are destroyed via RCU\n");
-	if (s->reclaim_account)
-		printf("** Reclaim accounting active\n");
-
-	printf("\nSizes (bytes)     Slabs              Debug                Memory\n");
-	printf("------------------------------------------------------------------------\n");
-	printf("Object : %7d  Total  : %7ld   Sanity Checks : %s  Total: %7ld\n",
-			s->object_size, s->slabs, onoff(s->sanity_checks),
-			s->slabs * (page_size << s->order));
-	printf("SlabObj: %7d  Full   : %7ld   Redzoning     : %s  Used : %7ld\n",
-			s->slab_size, s->slabs - s->partial - s->cpu_slabs,
-			onoff(s->red_zone), s->objects * s->object_size);
-	printf("SlabSiz: %7d  Partial: %7ld   Poisoning     : %s  Loss : %7ld\n",
-			page_size << s->order, s->partial, onoff(s->poison),
-			s->slabs * (page_size << s->order) - s->objects * s->object_size);
-	printf("Loss   : %7d  CpuSlab: %7d   Tracking      : %s  Lalig: %7ld\n",
-			s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user),
-			(s->slab_size - s->object_size) * s->objects);
-	printf("Align  : %7d  Objects: %7d   Tracing       : %s  Lpadd: %7ld\n",
-			s->align, s->objs_per_slab, onoff(s->trace),
-			((page_size << s->order) - s->objs_per_slab * s->slab_size) *
-			s->slabs);
-
-	ops(s);
-	show_tracking(s);
-	slab_numa(s, 1);
-	slab_stats(s);
-}
-
-static void slabcache(struct slabinfo *s)
-{
-	char size_str[20];
-	char dist_str[40];
-	char flags[20];
-	char *p = flags;
-
-	if (strcmp(s->name, "*") == 0)
-		return;
-
-	if (unreclaim_only && s->reclaim_account)
-		return;
-
-	if (actual_slabs == 1) {
-		report(s);
-		return;
-	}
-
-	if (skip_zero && !show_empty && !s->slabs)
-		return;
-
-	if (show_empty && s->slabs)
-		return;
-
-	if (sort_loss == 0)
-		store_size(size_str, slab_size(s));
-	else
-		store_size(size_str, slab_waste(s));
-	snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
-						s->partial, s->cpu_slabs);
-
-	if (!line++)
-		first_line();
-
-	if (s->aliases)
-		*p++ = '*';
-	if (s->cache_dma)
-		*p++ = 'd';
-	if (s->hwcache_align)
-		*p++ = 'A';
-	if (s->poison)
-		*p++ = 'P';
-	if (s->reclaim_account)
-		*p++ = 'a';
-	if (s->red_zone)
-		*p++ = 'Z';
-	if (s->sanity_checks)
-		*p++ = 'F';
-	if (s->store_user)
-		*p++ = 'U';
-	if (s->trace)
-		*p++ = 'T';
-
-	*p = 0;
-	if (show_activity) {
-		unsigned long total_alloc;
-		unsigned long total_free;
-
-		total_alloc = s->alloc_fastpath + s->alloc_slowpath;
-		total_free = s->free_fastpath + s->free_slowpath;
-
-		printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d %4ld %4ld\n",
-			s->name, s->objects,
-			total_alloc, total_free,
-			total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
-			total_free ? (s->free_fastpath * 100 / total_free) : 0,
-			s->order_fallback, s->order, s->cmpxchg_double_fail,
-			s->cmpxchg_double_cpu_fail);
-	} else {
-		printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n",
-			s->name, s->objects, s->object_size, size_str, dist_str,
-			s->objs_per_slab, s->order,
-			s->slabs ? (s->partial * 100) / s->slabs : 100,
-			s->slabs ? (s->objects * s->object_size * 100) /
-				(s->slabs * (page_size << s->order)) : 100,
-			flags);
-	}
-}
-
-/*
- * Analyze debug options. Return false if something is amiss.
- */
-static int debug_opt_scan(char *opt)
-{
-	if (!opt || !opt[0] || strcmp(opt, "-") == 0)
-		return 1;
-
-	if (strcasecmp(opt, "a") == 0) {
-		sanity = 1;
-		poison = 1;
-		redzone = 1;
-		tracking = 1;
-		return 1;
-	}
-
-	for ( ; *opt; opt++)
-		switch (*opt) {
-		case 'F' : case 'f':
-			if (sanity)
-				return 0;
-			sanity = 1;
-			break;
-		case 'P' : case 'p':
-			if (poison)
-				return 0;
-			poison = 1;
-			break;
-
-		case 'Z' : case 'z':
-			if (redzone)
-				return 0;
-			redzone = 1;
-			break;
-
-		case 'U' : case 'u':
-			if (tracking)
-				return 0;
-			tracking = 1;
-			break;
-
-		case 'T' : case 't':
-			if (tracing)
-				return 0;
-			tracing = 1;
-			break;
-		default:
-			return 0;
-		}
-	return 1;
-}
-
-static int slab_empty(struct slabinfo *s)
-{
-	if (s->objects > 0)
-		return 0;
-
-	/*
-	 * We may still have slabs even if there are no objects. Shrinking will
-	 * remove them.
-	 */
-	if (s->slabs != 0)
-		set_obj(s, "shrink", 1);
-
-	return 1;
-}
-
-static void slab_debug(struct slabinfo *s)
-{
-	if (strcmp(s->name, "*") == 0)
-		return;
-
-	if (sanity && !s->sanity_checks) {
-		set_obj(s, "sanity_checks", 1);
-	}
-	if (!sanity && s->sanity_checks) {
-		if (slab_empty(s))
-			set_obj(s, "sanity_checks", 0);
-		else
-			fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name);
-	}
-	if (redzone && !s->red_zone) {
-		if (slab_empty(s))
-			set_obj(s, "red_zone", 1);
-		else
-			fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name);
-	}
-	if (!redzone && s->red_zone) {
-		if (slab_empty(s))
-			set_obj(s, "red_zone", 0);
-		else
-			fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name);
-	}
-	if (poison && !s->poison) {
-		if (slab_empty(s))
-			set_obj(s, "poison", 1);
-		else
-			fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name);
-	}
-	if (!poison && s->poison) {
-		if (slab_empty(s))
-			set_obj(s, "poison", 0);
-		else
-			fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name);
-	}
-	if (tracking && !s->store_user) {
-		if (slab_empty(s))
-			set_obj(s, "store_user", 1);
-		else
-			fprintf(stderr, "%s not empty cannot enable tracking\n", s->name);
-	}
-	if (!tracking && s->store_user) {
-		if (slab_empty(s))
-			set_obj(s, "store_user", 0);
-		else
-			fprintf(stderr, "%s not empty cannot disable tracking\n", s->name);
-	}
-	if (tracing && !s->trace) {
-		if (slabs == 1)
-			set_obj(s, "trace", 1);
-		else
-			fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name);
-	}
-	if (!tracing && s->trace)
-		set_obj(s, "trace", 1);
-}
-
-static void totals(void)
-{
-	struct slabinfo *s;
-
-	int used_slabs = 0;
-	char b1[20], b2[20], b3[20], b4[20];
-	unsigned long long max = 1ULL << 63;
-
-	/* Object size */
-	unsigned long long min_objsize = max, max_objsize = 0, avg_objsize;
-
-	/* Number of partial slabs in a slabcache */
-	unsigned long long min_partial = max, max_partial = 0,
-				avg_partial, total_partial = 0;
-
-	/* Number of slabs in a slab cache */
-	unsigned long long min_slabs = max, max_slabs = 0,
-				avg_slabs, total_slabs = 0;
-
-	/* Size of the whole slab */
-	unsigned long long min_size = max, max_size = 0,
-				avg_size, total_size = 0;
-
-	/* Bytes used for object storage in a slab */
-	unsigned long long min_used = max, max_used = 0,
-				avg_used, total_used = 0;
-
-	/* Waste: Bytes used for alignment and padding */
-	unsigned long long min_waste = max, max_waste = 0,
-				avg_waste, total_waste = 0;
-	/* Number of objects in a slab */
-	unsigned long long min_objects = max, max_objects = 0,
-				avg_objects, total_objects = 0;
-	/* Waste per object */
-	unsigned long long min_objwaste = max,
-				max_objwaste = 0, avg_objwaste,
-				total_objwaste = 0;
-
-	/* Memory per object */
-	unsigned long long min_memobj = max,
-				max_memobj = 0, avg_memobj,
-				total_objsize = 0;
-
-	/* Percentage of partial slabs per slab */
-	unsigned long min_ppart = 100, max_ppart = 0,
-				avg_ppart, total_ppart = 0;
-
-	/* Number of objects in partial slabs */
-	unsigned long min_partobj = max, max_partobj = 0,
-				avg_partobj, total_partobj = 0;
-
-	/* Percentage of partial objects of all objects in a slab */
-	unsigned long min_ppartobj = 100, max_ppartobj = 0,
-				avg_ppartobj, total_ppartobj = 0;
-
-
-	for (s = slabinfo; s < slabinfo + slabs; s++) {
-		unsigned long long size;
-		unsigned long used;
-		unsigned long long wasted;
-		unsigned long long objwaste;
-		unsigned long percentage_partial_slabs;
-		unsigned long percentage_partial_objs;
-
-		if (!s->slabs || !s->objects)
-			continue;
-
-		used_slabs++;
-
-		size = slab_size(s);
-		used = s->objects * s->object_size;
-		wasted = size - used;
-		objwaste = s->slab_size - s->object_size;
-
-		percentage_partial_slabs = s->partial * 100 / s->slabs;
-		if (percentage_partial_slabs > 100)
-			percentage_partial_slabs = 100;
-
-		percentage_partial_objs = s->objects_partial * 100
-							/ s->objects;
-
-		if (percentage_partial_objs > 100)
-			percentage_partial_objs = 100;
-
-		if (s->object_size < min_objsize)
-			min_objsize = s->object_size;
-		if (s->partial < min_partial)
-			min_partial = s->partial;
-		if (s->slabs < min_slabs)
-			min_slabs = s->slabs;
-		if (size < min_size)
-			min_size = size;
-		if (wasted < min_waste)
-			min_waste = wasted;
-		if (objwaste < min_objwaste)
-			min_objwaste = objwaste;
-		if (s->objects < min_objects)
-			min_objects = s->objects;
-		if (used < min_used)
-			min_used = used;
-		if (s->objects_partial < min_partobj)
-			min_partobj = s->objects_partial;
-		if (percentage_partial_slabs < min_ppart)
-			min_ppart = percentage_partial_slabs;
-		if (percentage_partial_objs < min_ppartobj)
-			min_ppartobj = percentage_partial_objs;
-		if (s->slab_size < min_memobj)
-			min_memobj = s->slab_size;
-
-		if (s->object_size > max_objsize)
-			max_objsize = s->object_size;
-		if (s->partial > max_partial)
-			max_partial = s->partial;
-		if (s->slabs > max_slabs)
-			max_slabs = s->slabs;
-		if (size > max_size)
-			max_size = size;
-		if (wasted > max_waste)
-			max_waste = wasted;
-		if (objwaste > max_objwaste)
-			max_objwaste = objwaste;
-		if (s->objects > max_objects)
-			max_objects = s->objects;
-		if (used > max_used)
-			max_used = used;
-		if (s->objects_partial > max_partobj)
-			max_partobj = s->objects_partial;
-		if (percentage_partial_slabs > max_ppart)
-			max_ppart = percentage_partial_slabs;
-		if (percentage_partial_objs > max_ppartobj)
-			max_ppartobj = percentage_partial_objs;
-		if (s->slab_size > max_memobj)
-			max_memobj = s->slab_size;
-
-		total_partial += s->partial;
-		total_slabs += s->slabs;
-		total_size += size;
-		total_waste += wasted;
-
-		total_objects += s->objects;
-		total_used += used;
-		total_partobj += s->objects_partial;
-		total_ppart += percentage_partial_slabs;
-		total_ppartobj += percentage_partial_objs;
-
-		total_objwaste += s->objects * objwaste;
-		total_objsize += s->objects * s->slab_size;
-	}
-
-	if (!total_objects) {
-		printf("No objects\n");
-		return;
-	}
-	if (!used_slabs) {
-		printf("No slabs\n");
-		return;
-	}
-
-	/* Per slab averages */
-	avg_partial = total_partial / used_slabs;
-	avg_slabs = total_slabs / used_slabs;
-	avg_size = total_size / used_slabs;
-	avg_waste = total_waste / used_slabs;
-
-	avg_objects = total_objects / used_slabs;
-	avg_used = total_used / used_slabs;
-	avg_partobj = total_partobj / used_slabs;
-	avg_ppart = total_ppart / used_slabs;
-	avg_ppartobj = total_ppartobj / used_slabs;
-
-	/* Per object object sizes */
-	avg_objsize = total_used / total_objects;
-	avg_objwaste = total_objwaste / total_objects;
-	avg_partobj = total_partobj * 100 / total_objects;
-	avg_memobj = total_objsize / total_objects;
-
-	printf("Slabcache Totals\n");
-	printf("----------------\n");
-	printf("Slabcaches : %15d   Aliases  : %11d->%-3d  Active:    %3d\n",
-			slabs, aliases, alias_targets, used_slabs);
-
-	store_size(b1, total_size);store_size(b2, total_waste);
-	store_size(b3, total_waste * 100 / total_used);
-	printf("Memory used: %15s   # Loss   : %15s   MRatio:%6s%%\n", b1, b2, b3);
-
-	store_size(b1, total_objects);store_size(b2, total_partobj);
-	store_size(b3, total_partobj * 100 / total_objects);
-	printf("# Objects  : %15s   # PartObj: %15s   ORatio:%6s%%\n", b1, b2, b3);
-
-	printf("\n");
-	printf("Per Cache         Average              "
-		"Min              Max            Total\n");
-	printf("---------------------------------------"
-		"-------------------------------------\n");
-
-	store_size(b1, avg_objects);store_size(b2, min_objects);
-	store_size(b3, max_objects);store_size(b4, total_objects);
-	printf("#Objects  %15s  %15s  %15s  %15s\n",
-			b1,	b2,	b3,	b4);
-
-	store_size(b1, avg_slabs);store_size(b2, min_slabs);
-	store_size(b3, max_slabs);store_size(b4, total_slabs);
-	printf("#Slabs    %15s  %15s  %15s  %15s\n",
-			b1,	b2,	b3,	b4);
-
-	store_size(b1, avg_partial);store_size(b2, min_partial);
-	store_size(b3, max_partial);store_size(b4, total_partial);
-	printf("#PartSlab %15s  %15s  %15s  %15s\n",
-			b1,	b2,	b3,	b4);
-	store_size(b1, avg_ppart);store_size(b2, min_ppart);
-	store_size(b3, max_ppart);
-	store_size(b4, total_partial * 100  / total_slabs);
-	printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n",
-			b1,	b2,	b3,	b4);
-
-	store_size(b1, avg_partobj);store_size(b2, min_partobj);
-	store_size(b3, max_partobj);
-	store_size(b4, total_partobj);
-	printf("PartObjs  %15s  %15s  %15s  %15s\n",
-			b1,	b2,	b3,	b4);
-
-	store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
-	store_size(b3, max_ppartobj);
-	store_size(b4, total_partobj * 100 / total_objects);
-	printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n",
-			b1,	b2,	b3,	b4);
-
-	store_size(b1, avg_size);store_size(b2, min_size);
-	store_size(b3, max_size);store_size(b4, total_size);
-	printf("Memory    %15s  %15s  %15s  %15s\n",
-			b1,	b2,	b3,	b4);
-
-	store_size(b1, avg_used);store_size(b2, min_used);
-	store_size(b3, max_used);store_size(b4, total_used);
-	printf("Used      %15s  %15s  %15s  %15s\n",
-			b1,	b2,	b3,	b4);
-
-	store_size(b1, avg_waste);store_size(b2, min_waste);
-	store_size(b3, max_waste);store_size(b4, total_waste);
-	printf("Loss      %15s  %15s  %15s  %15s\n",
-			b1,	b2,	b3,	b4);
-
-	printf("\n");
-	printf("Per Object        Average              "
-		"Min              Max\n");
-	printf("---------------------------------------"
-		"--------------------\n");
-
-	store_size(b1, avg_memobj);store_size(b2, min_memobj);
-	store_size(b3, max_memobj);
-	printf("Memory    %15s  %15s  %15s\n",
-			b1,	b2,	b3);
-	store_size(b1, avg_objsize);store_size(b2, min_objsize);
-	store_size(b3, max_objsize);
-	printf("User      %15s  %15s  %15s\n",
-			b1,	b2,	b3);
-
-	store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
-	store_size(b3, max_objwaste);
-	printf("Loss      %15s  %15s  %15s\n",
-			b1,	b2,	b3);
-}
-
-static void sort_slabs(void)
-{
-	struct slabinfo *s1,*s2;
-
-	for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) {
-		for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
-			int result;
-
-			if (sort_size) {
-				if (slab_size(s1) == slab_size(s2))
-					result = strcasecmp(s1->name, s2->name);
-				else
-					result = slab_size(s1) < slab_size(s2);
-			} else if (sort_active) {
-				if (slab_activity(s1) == slab_activity(s2))
-					result = strcasecmp(s1->name, s2->name);
-				else
-					result = slab_activity(s1) < slab_activity(s2);
-			} else if (sort_loss) {
-				if (slab_waste(s1) == slab_waste(s2))
-					result = strcasecmp(s1->name, s2->name);
-				else
-					result = slab_waste(s1) < slab_waste(s2);
-			} else if (sort_partial) {
-				if (s1->partial == s2->partial)
-					result = strcasecmp(s1->name, s2->name);
-				else
-					result = s1->partial < s2->partial;
-			} else
-				result = strcasecmp(s1->name, s2->name);
-
-			if (show_inverted)
-				result = -result;
-
-			if (result > 0) {
-				struct slabinfo t;
-
-				memcpy(&t, s1, sizeof(struct slabinfo));
-				memcpy(s1, s2, sizeof(struct slabinfo));
-				memcpy(s2, &t, sizeof(struct slabinfo));
-			}
-		}
-	}
-}
-
-static void sort_aliases(void)
-{
-	struct aliasinfo *a1,*a2;
-
-	for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) {
-		for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) {
-			char *n1, *n2;
-
-			n1 = a1->name;
-			n2 = a2->name;
-			if (show_alias && !show_inverted) {
-				n1 = a1->ref;
-				n2 = a2->ref;
-			}
-			if (strcasecmp(n1, n2) > 0) {
-				struct aliasinfo t;
-
-				memcpy(&t, a1, sizeof(struct aliasinfo));
-				memcpy(a1, a2, sizeof(struct aliasinfo));
-				memcpy(a2, &t, sizeof(struct aliasinfo));
-			}
-		}
-	}
-}
-
-static void link_slabs(void)
-{
-	struct aliasinfo *a;
-	struct slabinfo *s;
-
-	for (a = aliasinfo; a < aliasinfo + aliases; a++) {
-
-		for (s = slabinfo; s < slabinfo + slabs; s++)
-			if (strcmp(a->ref, s->name) == 0) {
-				a->slab = s;
-				s->refs++;
-				break;
-			}
-		if (s == slabinfo + slabs)
-			fatal("Unresolved alias %s\n", a->ref);
-	}
-}
-
-static void alias(void)
-{
-	struct aliasinfo *a;
-	char *active = NULL;
-
-	sort_aliases();
-	link_slabs();
-
-	for(a = aliasinfo; a < aliasinfo + aliases; a++) {
-
-		if (!show_single_ref && a->slab->refs == 1)
-			continue;
-
-		if (!show_inverted) {
-			if (active) {
-				if (strcmp(a->slab->name, active) == 0) {
-					printf(" %s", a->name);
-					continue;
-				}
-			}
-			printf("\n%-12s <- %s", a->slab->name, a->name);
-			active = a->slab->name;
-		}
-		else
-			printf("%-15s -> %s\n", a->name, a->slab->name);
-	}
-	if (active)
-		printf("\n");
-}
-
-
-static void rename_slabs(void)
-{
-	struct slabinfo *s;
-	struct aliasinfo *a;
-
-	for (s = slabinfo; s < slabinfo + slabs; s++) {
-		if (*s->name != ':')
-			continue;
-
-		if (s->refs > 1 && !show_first_alias)
-			continue;
-
-		a = find_one_alias(s);
-
-		if (a)
-			s->name = a->name;
-		else {
-			s->name = "*";
-			actual_slabs--;
-		}
-	}
-}
-
-static int slab_mismatch(char *slab)
-{
-	return regexec(&pattern, slab, 0, NULL, 0);
-}
-
-static void read_slab_dir(void)
-{
-	DIR *dir;
-	struct dirent *de;
-	struct slabinfo *slab = slabinfo;
-	struct aliasinfo *alias = aliasinfo;
-	char *p;
-	char *t;
-	int count;
-
-	if (chdir("/sys/kernel/slab") && chdir("/sys/slab"))
-		fatal("SYSFS support for SLUB not active\n");
-
-	dir = opendir(".");
-	while ((de = readdir(dir))) {
-		if (de->d_name[0] == '.' ||
-			(de->d_name[0] != ':' && slab_mismatch(de->d_name)))
-				continue;
-		switch (de->d_type) {
-		   case DT_LNK:
-			alias->name = strdup(de->d_name);
-			count = readlink(de->d_name, buffer, sizeof(buffer)-1);
-
-			if (count < 0)
-				fatal("Cannot read symlink %s\n", de->d_name);
-
-			buffer[count] = 0;
-			p = buffer + count;
-			while (p > buffer && p[-1] != '/')
-				p--;
-			alias->ref = strdup(p);
-			alias++;
-			break;
-		   case DT_DIR:
-			if (chdir(de->d_name))
-				fatal("Unable to access slab %s\n", slab->name);
-			slab->name = strdup(de->d_name);
-			slab->alias = 0;
-			slab->refs = 0;
-			slab->aliases = get_obj("aliases");
-			slab->align = get_obj("align");
-			slab->cache_dma = get_obj("cache_dma");
-			slab->cpu_slabs = get_obj("cpu_slabs");
-			slab->destroy_by_rcu = get_obj("destroy_by_rcu");
-			slab->hwcache_align = get_obj("hwcache_align");
-			slab->object_size = get_obj("object_size");
-			slab->objects = get_obj("objects");
-			slab->objects_partial = get_obj("objects_partial");
-			slab->objects_total = get_obj("objects_total");
-			slab->objs_per_slab = get_obj("objs_per_slab");
-			slab->order = get_obj("order");
-			slab->partial = get_obj("partial");
-			slab->partial = get_obj_and_str("partial", &t);
-			decode_numa_list(slab->numa_partial, t);
-			free(t);
-			slab->poison = get_obj("poison");
-			slab->reclaim_account = get_obj("reclaim_account");
-			slab->red_zone = get_obj("red_zone");
-			slab->sanity_checks = get_obj("sanity_checks");
-			slab->slab_size = get_obj("slab_size");
-			slab->slabs = get_obj_and_str("slabs", &t);
-			decode_numa_list(slab->numa, t);
-			free(t);
-			slab->store_user = get_obj("store_user");
-			slab->trace = get_obj("trace");
-			slab->alloc_fastpath = get_obj("alloc_fastpath");
-			slab->alloc_slowpath = get_obj("alloc_slowpath");
-			slab->free_fastpath = get_obj("free_fastpath");
-			slab->free_slowpath = get_obj("free_slowpath");
-			slab->free_frozen= get_obj("free_frozen");
-			slab->free_add_partial = get_obj("free_add_partial");
-			slab->free_remove_partial = get_obj("free_remove_partial");
-			slab->alloc_from_partial = get_obj("alloc_from_partial");
-			slab->alloc_slab = get_obj("alloc_slab");
-			slab->alloc_refill = get_obj("alloc_refill");
-			slab->free_slab = get_obj("free_slab");
-			slab->cpuslab_flush = get_obj("cpuslab_flush");
-			slab->deactivate_full = get_obj("deactivate_full");
-			slab->deactivate_empty = get_obj("deactivate_empty");
-			slab->deactivate_to_head = get_obj("deactivate_to_head");
-			slab->deactivate_to_tail = get_obj("deactivate_to_tail");
-			slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
-			slab->order_fallback = get_obj("order_fallback");
-			slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail");
-			slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail");
-			slab->cpu_partial_alloc = get_obj("cpu_partial_alloc");
-			slab->cpu_partial_free = get_obj("cpu_partial_free");
-			slab->alloc_node_mismatch = get_obj("alloc_node_mismatch");
-			slab->deactivate_bypass = get_obj("deactivate_bypass");
-			chdir("..");
-			if (slab->name[0] == ':')
-				alias_targets++;
-			slab++;
-			break;
-		   default :
-			fatal("Unknown file type %lx\n", de->d_type);
-		}
-	}
-	closedir(dir);
-	slabs = slab - slabinfo;
-	actual_slabs = slabs;
-	aliases = alias - aliasinfo;
-	if (slabs > MAX_SLABS)
-		fatal("Too many slabs\n");
-	if (aliases > MAX_ALIASES)
-		fatal("Too many aliases\n");
-}
-
-static void output_slabs(void)
-{
-	struct slabinfo *slab;
-	int lines = output_lines;
-
-	for (slab = slabinfo; (slab < slabinfo + slabs) &&
-			lines != 0; slab++) {
-
-		if (slab->alias)
-			continue;
-
-		if (lines != -1)
-			lines--;
-
-		if (show_numa)
-			slab_numa(slab, 0);
-		else if (show_track)
-			show_tracking(slab);
-		else if (validate)
-			slab_validate(slab);
-		else if (shrink)
-			slab_shrink(slab);
-		else if (set_debug)
-			slab_debug(slab);
-		else if (show_ops)
-			ops(slab);
-		else if (show_slab)
-			slabcache(slab);
-		else if (show_report)
-			report(slab);
-	}
-}
-
-static void _xtotals(char *heading, char *underline,
-		     int loss, int size, int partial)
-{
-	printf("%s%s", heading, underline);
-	line = 0;
-	sort_loss = loss;
-	sort_size = size;
-	sort_partial = partial;
-	sort_slabs();
-	output_slabs();
-}
-
-static void xtotals(void)
-{
-	char *heading, *underline;
-
-	totals();
-
-	link_slabs();
-	rename_slabs();
-
-	heading = "\nSlabs sorted by size\n";
-	underline = "--------------------\n";
-	_xtotals(heading, underline, 0, 1, 0);
-
-	heading = "\nSlabs sorted by loss\n";
-	underline = "--------------------\n";
-	_xtotals(heading, underline, 1, 0, 0);
-
-	heading = "\nSlabs sorted by number of partial slabs\n";
-	underline = "---------------------------------------\n";
-	_xtotals(heading, underline, 0, 0, 1);
-
-	printf("\n");
-}
-
-struct option opts[] = {
-	{ "aliases", no_argument, NULL, 'a' },
-	{ "activity", no_argument, NULL, 'A' },
-	{ "Bytes", no_argument, NULL, 'B'},
-	{ "debug", optional_argument, NULL, 'd' },
-	{ "display-activity", no_argument, NULL, 'D' },
-	{ "empty", no_argument, NULL, 'e' },
-	{ "first-alias", no_argument, NULL, 'f' },
-	{ "help", no_argument, NULL, 'h' },
-	{ "inverted", no_argument, NULL, 'i'},
-	{ "slabs", no_argument, NULL, 'l' },
-	{ "Loss", no_argument, NULL, 'L'},
-	{ "numa", no_argument, NULL, 'n' },
-	{ "lines", required_argument, NULL, 'N'},
-	{ "ops", no_argument, NULL, 'o' },
-	{ "partial", no_argument, NULL, 'p'},
-	{ "report", no_argument, NULL, 'r' },
-	{ "shrink", no_argument, NULL, 's' },
-	{ "Size", no_argument, NULL, 'S'},
-	{ "tracking", no_argument, NULL, 't'},
-	{ "Totals", no_argument, NULL, 'T'},
-	{ "Unreclaim", no_argument, NULL, 'U'},
-	{ "validate", no_argument, NULL, 'v' },
-	{ "Xtotals", no_argument, NULL, 'X'},
-	{ "zero", no_argument, NULL, 'z' },
-	{ "1ref", no_argument, NULL, '1'},
-	{ NULL, 0, NULL, 0 }
-};
-
-int main(int argc, char *argv[])
-{
-	int c;
-	int err;
-	char *pattern_source;
-
-	page_size = getpagesize();
-
-	while ((c = getopt_long(argc, argv, "aABd::DefhilLnN:oPrsStTUvXz1",
-						opts, NULL)) != -1)
-		switch (c) {
-		case 'a':
-			show_alias = 1;
-			break;
-		case 'A':
-			sort_active = 1;
-			break;
-		case 'B':
-			show_bytes = 1;
-			break;
-		case 'd':
-			set_debug = 1;
-			if (!debug_opt_scan(optarg))
-				fatal("Invalid debug option '%s'\n", optarg);
-			break;
-		case 'D':
-			show_activity = 1;
-			break;
-		case 'e':
-			show_empty = 1;
-			break;
-		case 'f':
-			show_first_alias = 1;
-			break;
-		case 'h':
-			usage();
-			return 0;
-		case 'i':
-			show_inverted = 1;
-			break;
-		case 'l':
-			show_slab = 1;
-			break;
-		case 'L':
-			sort_loss = 1;
-			break;
-		case 'n':
-			show_numa = 1;
-			break;
-		case 'N':
-			if (optarg) {
-				output_lines = atoi(optarg);
-				if (output_lines < 1)
-					output_lines = 1;
-			}
-			break;
-		case 'o':
-			show_ops = 1;
-			break;
-		case 'r':
-			show_report = 1;
-			break;
-		case 'P':
-			sort_partial = 1;
-			break;
-		case 's':
-			shrink = 1;
-			break;
-		case 'S':
-			sort_size = 1;
-			break;
-		case 't':
-			show_track = 1;
-			break;
-		case 'T':
-			show_totals = 1;
-			break;
-		case 'U':
-			unreclaim_only = 1;
-			break;
-		case 'v':
-			validate = 1;
-			break;
-		case 'X':
-			if (output_lines == -1)
-				output_lines = 1;
-			extended_totals = 1;
-			show_bytes = 1;
-			break;
-		case 'z':
-			skip_zero = 0;
-			break;
-		case '1':
-			show_single_ref = 1;
-			break;
-		default:
-			fatal("%s: Invalid option '%c'\n", argv[0], optopt);
-
-	}
-
-	if (!show_slab && !show_alias && !show_track && !show_report
-		&& !validate && !shrink && !set_debug && !show_ops)
-			show_slab = 1;
-
-	if (argc > optind)
-		pattern_source = argv[optind];
-	else
-		pattern_source = ".*";
-
-	err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB);
-	if (err)
-		fatal("%s: Invalid pattern '%s' code %d\n",
-			argv[0], pattern_source, err);
-	read_slab_dir();
-	if (show_alias) {
-		alias();
-	} else if (extended_totals) {
-		xtotals();
-	} else if (show_totals) {
-		totals();
-	} else {
-		link_slabs();
-		rename_slabs();
-		sort_slabs();
-		output_slabs();
-	}
-	return 0;
-}
-- 
cgit 


From baa489fabd01596d5426d6e112b34ba5fb59ab82 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 3 Jan 2023 18:07:53 +0000
Subject: selftests/vm: rename selftests/vm to selftests/mm

Rename selftets/vm to selftests/mm for being more consistent with the
code, documentation, and tools directories, and won't be confused with
virtual machines.

[sj@kernel.org: convert missing vm->mm changes]
  Link: https://lkml.kernel.org/r/20230107230643.252273-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20230103180754.129637-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/hugetlbpage.rst       |    6 +-
 Documentation/core-api/pin_user_pages.rst          |    2 +-
 MAINTAINERS                                        |    4 +-
 mm/Kconfig                                         |    2 +-
 tools/testing/selftests/Makefile                   |    2 +-
 tools/testing/selftests/kselftest_deps.sh          |    6 +-
 tools/testing/selftests/mm/.gitignore              |   38 +
 tools/testing/selftests/mm/Makefile                |  180 ++
 .../selftests/mm/charge_reserved_hugetlb.sh        |  584 ++++++
 tools/testing/selftests/mm/check_config.sh         |   31 +
 tools/testing/selftests/mm/compaction_test.c       |  231 +++
 tools/testing/selftests/mm/config                  |    8 +
 tools/testing/selftests/mm/cow.c                   | 1764 +++++++++++++++++
 tools/testing/selftests/mm/gup_test.c              |  271 +++
 tools/testing/selftests/mm/hmm-tests.c             | 2054 ++++++++++++++++++++
 tools/testing/selftests/mm/hugepage-mmap.c         |   91 +
 tools/testing/selftests/mm/hugepage-mremap.c       |  188 ++
 tools/testing/selftests/mm/hugepage-shm.c          |  101 +
 tools/testing/selftests/mm/hugepage-vmemmap.c      |  144 ++
 tools/testing/selftests/mm/hugetlb-madvise.c       |  406 ++++
 .../selftests/mm/hugetlb_reparenting_test.sh       |  252 +++
 tools/testing/selftests/mm/khugepaged.c            | 1558 +++++++++++++++
 tools/testing/selftests/mm/ksm_functional_tests.c  |  279 +++
 tools/testing/selftests/mm/ksm_tests.c             |  849 ++++++++
 tools/testing/selftests/mm/madv_populate.c         |  296 +++
 tools/testing/selftests/mm/map_fixed_noreplace.c   |  231 +++
 tools/testing/selftests/mm/map_hugetlb.c           |  109 ++
 tools/testing/selftests/mm/map_populate.c          |  113 ++
 tools/testing/selftests/mm/memfd_secret.c          |  296 +++
 tools/testing/selftests/mm/migration.c             |  193 ++
 tools/testing/selftests/mm/mlock-random-test.c     |  294 +++
 tools/testing/selftests/mm/mlock2-tests.c          |  520 +++++
 tools/testing/selftests/mm/mlock2.h                |   63 +
 tools/testing/selftests/mm/mrelease_test.c         |  206 ++
 tools/testing/selftests/mm/mremap_dontunmap.c      |  364 ++++
 tools/testing/selftests/mm/mremap_test.c           |  475 +++++
 tools/testing/selftests/mm/on-fault-limit.c        |   48 +
 tools/testing/selftests/mm/pkey-helpers.h          |  226 +++
 tools/testing/selftests/mm/pkey-powerpc.h          |  133 ++
 tools/testing/selftests/mm/pkey-x86.h              |  177 ++
 tools/testing/selftests/mm/protection_keys.c       | 1788 +++++++++++++++++
 tools/testing/selftests/mm/run_vmtests.sh          |  274 +++
 tools/testing/selftests/mm/settings                |    1 +
 tools/testing/selftests/mm/soft-dirty.c            |  210 ++
 tools/testing/selftests/mm/split_huge_page_test.c  |  309 +++
 tools/testing/selftests/mm/test_hmm.sh             |  105 +
 tools/testing/selftests/mm/test_vmalloc.sh         |  177 ++
 tools/testing/selftests/mm/thuge-gen.c             |  257 +++
 tools/testing/selftests/mm/transhuge-stress.c      |  122 ++
 tools/testing/selftests/mm/userfaultfd.c           | 1858 ++++++++++++++++++
 tools/testing/selftests/mm/util.h                  |   69 +
 tools/testing/selftests/mm/va_128TBswitch.c        |  289 +++
 tools/testing/selftests/mm/va_128TBswitch.sh       |   54 +
 tools/testing/selftests/mm/virtual_address_range.c |  139 ++
 tools/testing/selftests/mm/vm_util.c               |  151 ++
 tools/testing/selftests/mm/vm_util.h               |   15 +
 tools/testing/selftests/mm/write_hugetlb_memory.sh |   23 +
 tools/testing/selftests/mm/write_to_hugetlbfs.c    |  240 +++
 tools/testing/selftests/vm/.gitignore              |   38 -
 tools/testing/selftests/vm/Makefile                |  180 --
 .../selftests/vm/charge_reserved_hugetlb.sh        |  584 ------
 tools/testing/selftests/vm/check_config.sh         |   31 -
 tools/testing/selftests/vm/compaction_test.c       |  231 ---
 tools/testing/selftests/vm/config                  |    8 -
 tools/testing/selftests/vm/cow.c                   | 1764 -----------------
 tools/testing/selftests/vm/gup_test.c              |  271 ---
 tools/testing/selftests/vm/hmm-tests.c             | 2054 --------------------
 tools/testing/selftests/vm/hugepage-mmap.c         |   91 -
 tools/testing/selftests/vm/hugepage-mremap.c       |  188 --
 tools/testing/selftests/vm/hugepage-shm.c          |  101 -
 tools/testing/selftests/vm/hugepage-vmemmap.c      |  144 --
 tools/testing/selftests/vm/hugetlb-madvise.c       |  406 ----
 .../selftests/vm/hugetlb_reparenting_test.sh       |  252 ---
 tools/testing/selftests/vm/khugepaged.c            | 1558 ---------------
 tools/testing/selftests/vm/ksm_functional_tests.c  |  279 ---
 tools/testing/selftests/vm/ksm_tests.c             |  849 --------
 tools/testing/selftests/vm/madv_populate.c         |  296 ---
 tools/testing/selftests/vm/map_fixed_noreplace.c   |  231 ---
 tools/testing/selftests/vm/map_hugetlb.c           |  109 --
 tools/testing/selftests/vm/map_populate.c          |  113 --
 tools/testing/selftests/vm/memfd_secret.c          |  296 ---
 tools/testing/selftests/vm/migration.c             |  193 --
 tools/testing/selftests/vm/mlock-random-test.c     |  294 ---
 tools/testing/selftests/vm/mlock2-tests.c          |  520 -----
 tools/testing/selftests/vm/mlock2.h                |   63 -
 tools/testing/selftests/vm/mrelease_test.c         |  206 --
 tools/testing/selftests/vm/mremap_dontunmap.c      |  364 ----
 tools/testing/selftests/vm/mremap_test.c           |  475 -----
 tools/testing/selftests/vm/on-fault-limit.c        |   48 -
 tools/testing/selftests/vm/pkey-helpers.h          |  226 ---
 tools/testing/selftests/vm/pkey-powerpc.h          |  133 --
 tools/testing/selftests/vm/pkey-x86.h              |  177 --
 tools/testing/selftests/vm/protection_keys.c       | 1788 -----------------
 tools/testing/selftests/vm/run_vmtests.sh          |  274 ---
 tools/testing/selftests/vm/settings                |    1 -
 tools/testing/selftests/vm/soft-dirty.c            |  210 --
 tools/testing/selftests/vm/split_huge_page_test.c  |  309 ---
 tools/testing/selftests/vm/test_hmm.sh             |  105 -
 tools/testing/selftests/vm/test_vmalloc.sh         |  177 --
 tools/testing/selftests/vm/thuge-gen.c             |  257 ---
 tools/testing/selftests/vm/transhuge-stress.c      |  122 --
 tools/testing/selftests/vm/userfaultfd.c           | 1858 ------------------
 tools/testing/selftests/vm/util.h                  |   69 -
 tools/testing/selftests/vm/va_128TBswitch.c        |  289 ---
 tools/testing/selftests/vm/va_128TBswitch.sh       |   54 -
 tools/testing/selftests/vm/virtual_address_range.c |  139 --
 tools/testing/selftests/vm/vm_util.c               |  151 --
 tools/testing/selftests/vm/vm_util.h               |   15 -
 tools/testing/selftests/vm/write_hugetlb_memory.sh |   23 -
 tools/testing/selftests/vm/write_to_hugetlbfs.c    |  240 ---
 110 files changed, 18865 insertions(+), 18865 deletions(-)
 create mode 100644 tools/testing/selftests/mm/.gitignore
 create mode 100644 tools/testing/selftests/mm/Makefile
 create mode 100644 tools/testing/selftests/mm/charge_reserved_hugetlb.sh
 create mode 100644 tools/testing/selftests/mm/check_config.sh
 create mode 100644 tools/testing/selftests/mm/compaction_test.c
 create mode 100644 tools/testing/selftests/mm/config
 create mode 100644 tools/testing/selftests/mm/cow.c
 create mode 100644 tools/testing/selftests/mm/gup_test.c
 create mode 100644 tools/testing/selftests/mm/hmm-tests.c
 create mode 100644 tools/testing/selftests/mm/hugepage-mmap.c
 create mode 100644 tools/testing/selftests/mm/hugepage-mremap.c
 create mode 100644 tools/testing/selftests/mm/hugepage-shm.c
 create mode 100644 tools/testing/selftests/mm/hugepage-vmemmap.c
 create mode 100644 tools/testing/selftests/mm/hugetlb-madvise.c
 create mode 100644 tools/testing/selftests/mm/hugetlb_reparenting_test.sh
 create mode 100644 tools/testing/selftests/mm/khugepaged.c
 create mode 100644 tools/testing/selftests/mm/ksm_functional_tests.c
 create mode 100644 tools/testing/selftests/mm/ksm_tests.c
 create mode 100644 tools/testing/selftests/mm/madv_populate.c
 create mode 100644 tools/testing/selftests/mm/map_fixed_noreplace.c
 create mode 100644 tools/testing/selftests/mm/map_hugetlb.c
 create mode 100644 tools/testing/selftests/mm/map_populate.c
 create mode 100644 tools/testing/selftests/mm/memfd_secret.c
 create mode 100644 tools/testing/selftests/mm/migration.c
 create mode 100644 tools/testing/selftests/mm/mlock-random-test.c
 create mode 100644 tools/testing/selftests/mm/mlock2-tests.c
 create mode 100644 tools/testing/selftests/mm/mlock2.h
 create mode 100644 tools/testing/selftests/mm/mrelease_test.c
 create mode 100644 tools/testing/selftests/mm/mremap_dontunmap.c
 create mode 100644 tools/testing/selftests/mm/mremap_test.c
 create mode 100644 tools/testing/selftests/mm/on-fault-limit.c
 create mode 100644 tools/testing/selftests/mm/pkey-helpers.h
 create mode 100644 tools/testing/selftests/mm/pkey-powerpc.h
 create mode 100644 tools/testing/selftests/mm/pkey-x86.h
 create mode 100644 tools/testing/selftests/mm/protection_keys.c
 create mode 100644 tools/testing/selftests/mm/run_vmtests.sh
 create mode 100644 tools/testing/selftests/mm/settings
 create mode 100644 tools/testing/selftests/mm/soft-dirty.c
 create mode 100644 tools/testing/selftests/mm/split_huge_page_test.c
 create mode 100644 tools/testing/selftests/mm/test_hmm.sh
 create mode 100644 tools/testing/selftests/mm/test_vmalloc.sh
 create mode 100644 tools/testing/selftests/mm/thuge-gen.c
 create mode 100644 tools/testing/selftests/mm/transhuge-stress.c
 create mode 100644 tools/testing/selftests/mm/userfaultfd.c
 create mode 100644 tools/testing/selftests/mm/util.h
 create mode 100644 tools/testing/selftests/mm/va_128TBswitch.c
 create mode 100644 tools/testing/selftests/mm/va_128TBswitch.sh
 create mode 100644 tools/testing/selftests/mm/virtual_address_range.c
 create mode 100644 tools/testing/selftests/mm/vm_util.c
 create mode 100644 tools/testing/selftests/mm/vm_util.h
 create mode 100644 tools/testing/selftests/mm/write_hugetlb_memory.sh
 create mode 100644 tools/testing/selftests/mm/write_to_hugetlbfs.c
 delete mode 100644 tools/testing/selftests/vm/.gitignore
 delete mode 100644 tools/testing/selftests/vm/Makefile
 delete mode 100644 tools/testing/selftests/vm/charge_reserved_hugetlb.sh
 delete mode 100644 tools/testing/selftests/vm/check_config.sh
 delete mode 100644 tools/testing/selftests/vm/compaction_test.c
 delete mode 100644 tools/testing/selftests/vm/config
 delete mode 100644 tools/testing/selftests/vm/cow.c
 delete mode 100644 tools/testing/selftests/vm/gup_test.c
 delete mode 100644 tools/testing/selftests/vm/hmm-tests.c
 delete mode 100644 tools/testing/selftests/vm/hugepage-mmap.c
 delete mode 100644 tools/testing/selftests/vm/hugepage-mremap.c
 delete mode 100644 tools/testing/selftests/vm/hugepage-shm.c
 delete mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c
 delete mode 100644 tools/testing/selftests/vm/hugetlb-madvise.c
 delete mode 100644 tools/testing/selftests/vm/hugetlb_reparenting_test.sh
 delete mode 100644 tools/testing/selftests/vm/khugepaged.c
 delete mode 100644 tools/testing/selftests/vm/ksm_functional_tests.c
 delete mode 100644 tools/testing/selftests/vm/ksm_tests.c
 delete mode 100644 tools/testing/selftests/vm/madv_populate.c
 delete mode 100644 tools/testing/selftests/vm/map_fixed_noreplace.c
 delete mode 100644 tools/testing/selftests/vm/map_hugetlb.c
 delete mode 100644 tools/testing/selftests/vm/map_populate.c
 delete mode 100644 tools/testing/selftests/vm/memfd_secret.c
 delete mode 100644 tools/testing/selftests/vm/migration.c
 delete mode 100644 tools/testing/selftests/vm/mlock-random-test.c
 delete mode 100644 tools/testing/selftests/vm/mlock2-tests.c
 delete mode 100644 tools/testing/selftests/vm/mlock2.h
 delete mode 100644 tools/testing/selftests/vm/mrelease_test.c
 delete mode 100644 tools/testing/selftests/vm/mremap_dontunmap.c
 delete mode 100644 tools/testing/selftests/vm/mremap_test.c
 delete mode 100644 tools/testing/selftests/vm/on-fault-limit.c
 delete mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 delete mode 100644 tools/testing/selftests/vm/pkey-powerpc.h
 delete mode 100644 tools/testing/selftests/vm/pkey-x86.h
 delete mode 100644 tools/testing/selftests/vm/protection_keys.c
 delete mode 100755 tools/testing/selftests/vm/run_vmtests.sh
 delete mode 100644 tools/testing/selftests/vm/settings
 delete mode 100644 tools/testing/selftests/vm/soft-dirty.c
 delete mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
 delete mode 100755 tools/testing/selftests/vm/test_hmm.sh
 delete mode 100755 tools/testing/selftests/vm/test_vmalloc.sh
 delete mode 100644 tools/testing/selftests/vm/thuge-gen.c
 delete mode 100644 tools/testing/selftests/vm/transhuge-stress.c
 delete mode 100644 tools/testing/selftests/vm/userfaultfd.c
 delete mode 100644 tools/testing/selftests/vm/util.h
 delete mode 100644 tools/testing/selftests/vm/va_128TBswitch.c
 delete mode 100755 tools/testing/selftests/vm/va_128TBswitch.sh
 delete mode 100644 tools/testing/selftests/vm/virtual_address_range.c
 delete mode 100644 tools/testing/selftests/vm/vm_util.c
 delete mode 100644 tools/testing/selftests/vm/vm_util.h
 delete mode 100644 tools/testing/selftests/vm/write_hugetlb_memory.sh
 delete mode 100644 tools/testing/selftests/vm/write_to_hugetlbfs.c

(limited to 'tools')

diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 19f27c0d92e0..a969a2c742b2 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -461,13 +461,13 @@ Examples
 .. _map_hugetlb:
 
 ``map_hugetlb``
-	see tools/testing/selftests/vm/map_hugetlb.c
+	see tools/testing/selftests/mm/map_hugetlb.c
 
 ``hugepage-shm``
-	see tools/testing/selftests/vm/hugepage-shm.c
+	see tools/testing/selftests/mm/hugepage-shm.c
 
 ``hugepage-mmap``
-	see tools/testing/selftests/vm/hugepage-mmap.c
+	see tools/testing/selftests/mm/hugepage-mmap.c
 
 The `libhugetlbfs`_  library provides a wide range of userspace tools
 to help with huge page usability, environment setup, and control.
diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst
index b18416f4500f..facafbdecb95 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -221,7 +221,7 @@ Unit testing
 ============
 This file::
 
- tools/testing/selftests/vm/gup_test.c
+ tools/testing/selftests/mm/gup_test.c
 
 has the following new calls to exercise the new pin*() wrapper functions:
 
diff --git a/MAINTAINERS b/MAINTAINERS
index c726adfd1f0d..8ac1472bea34 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9466,7 +9466,7 @@ F:	Documentation/mm/hmm.rst
 F:	include/linux/hmm*
 F:	lib/test_hmm*
 F:	mm/hmm*
-F:	tools/testing/selftests/vm/*hmm*
+F:	tools/testing/selftests/mm/*hmm*
 
 HOST AP DRIVER
 M:	Jouni Malinen <j@w1.fi>
@@ -13484,7 +13484,7 @@ F:	include/linux/mmzone.h
 F:	include/linux/pagewalk.h
 F:	mm/
 F:	tools/mm/
-F:	tools/testing/selftests/vm/
+F:	tools/testing/selftests/mm/
 
 VMALLOC
 M:	Andrew Morton <akpm@linux-foundation.org>
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209dec05..39df30dcabe3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1073,7 +1073,7 @@ config GUP_TEST
 	  pin_user_pages*(), or pinned via get_user_pages*(), as specified
 	  by other command line arguments.
 
-	  See tools/testing/selftests/vm/gup_test.c
+	  See tools/testing/selftests/mm/gup_test.c
 
 comment "GUP_TEST needs to have DEBUG_FS enabled"
 	depends on !GUP_TEST && !DEBUG_FS
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 41b649452560..56a29f2de8e6 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -85,7 +85,7 @@ TARGETS += tmpfs
 TARGETS += tpm2
 TARGETS += user
 TARGETS += vDSO
-TARGETS += vm
+TARGETS += mm
 TARGETS += x86
 TARGETS += zram
 #Please keep the TARGETS list alphabetically sorted
diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh
index 7424a1f5babc..4bc14d9e8ff1 100755
--- a/tools/testing/selftests/kselftest_deps.sh
+++ b/tools/testing/selftests/kselftest_deps.sh
@@ -12,9 +12,9 @@ usage()
 
 echo -e "Usage: $0 -[p] <compiler> [test_name]\n"
 echo -e "\tkselftest_deps.sh [-p] gcc"
-echo -e "\tkselftest_deps.sh [-p] gcc vm"
+echo -e "\tkselftest_deps.sh [-p] gcc mm"
 echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc"
-echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc vm\n"
+echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc mm\n"
 echo "- Should be run in selftests directory in the kernel repo."
 echo "- Checks if Kselftests can be built/cross-built on a system."
 echo "- Parses all test/sub-test Makefile to find library dependencies."
@@ -120,7 +120,7 @@ l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \
 # Level 2
 # Some tests have multiple valid LDLIBS lines for individual sub-tests
 # that need dependency checks. Find them and append them to the tests
-# e.g: vm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
+# e.g: mm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
 # Filter out VAR_LDLIBS to discard the following:
 # 	memfd/Makefile:$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS)
 # Append space at the end of the list to append more tests.
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
new file mode 100644
index 000000000000..1f8c36a9fa10
--- /dev/null
+++ b/tools/testing/selftests/mm/.gitignore
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: GPL-2.0-only
+cow
+hugepage-mmap
+hugepage-mremap
+hugepage-shm
+hugepage-vmemmap
+hugetlb-madvise
+khugepaged
+map_hugetlb
+map_populate
+thuge-gen
+compaction_test
+migration
+mlock2-tests
+mrelease_test
+mremap_dontunmap
+mremap_test
+on-fault-limit
+transhuge-stress
+protection_keys
+protection_keys_32
+protection_keys_64
+madv_populate
+userfaultfd
+mlock-intersect-test
+mlock-random-test
+virtual_address_range
+gup_test
+va_128TBswitch
+map_fixed_noreplace
+write_to_hugetlbfs
+hmm-tests
+memfd_secret
+soft-dirty
+split_huge_page_test
+ksm_tests
+local_config.h
+local_config.mk
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
new file mode 100644
index 000000000000..6a4b639b2b2b
--- /dev/null
+++ b/tools/testing/selftests/mm/Makefile
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mm selftests
+
+LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h
+
+include local_config.mk
+
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
+
+# Without this, failed build products remain, with up-to-date timestamps,
+# thus tricking Make (and you!) into believing that All Is Well, in subsequent
+# make invocations:
+.DELETE_ON_ERROR:
+
+# Avoid accidental wrong builds, due to built-in rules working just a little
+# bit too well--but not quite as well as required for our situation here.
+#
+# In other words, "make userfaultfd" is supposed to fail to build at all,
+# because this Makefile only supports either "make" (all), or "make /full/path".
+# However,  the built-in rules, if not suppressed, will pick up CFLAGS and the
+# initial LDLIBS (but not the target-specific LDLIBS, because those are only
+# set for the full path target!). This causes it to get pretty far into building
+# things despite using incorrect values such as an *occasionally* incomplete
+# LDLIBS.
+MAKEFLAGS += --no-builtin-rules
+
+CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
+LDLIBS = -lrt -lpthread
+TEST_GEN_FILES = cow
+TEST_GEN_FILES += compaction_test
+TEST_GEN_FILES += gup_test
+TEST_GEN_FILES += hmm-tests
+TEST_GEN_FILES += hugetlb-madvise
+TEST_GEN_FILES += hugepage-mmap
+TEST_GEN_FILES += hugepage-mremap
+TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += hugepage-vmemmap
+TEST_GEN_FILES += khugepaged
+TEST_GEN_PROGS = madv_populate
+TEST_GEN_FILES += map_fixed_noreplace
+TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_populate
+TEST_GEN_FILES += memfd_secret
+TEST_GEN_FILES += migration
+TEST_GEN_FILES += mlock-random-test
+TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += mrelease_test
+TEST_GEN_FILES += mremap_dontunmap
+TEST_GEN_FILES += mremap_test
+TEST_GEN_FILES += on-fault-limit
+TEST_GEN_FILES += thuge-gen
+TEST_GEN_FILES += transhuge-stress
+TEST_GEN_FILES += userfaultfd
+TEST_GEN_PROGS += soft-dirty
+TEST_GEN_PROGS += split_huge_page_test
+TEST_GEN_FILES += ksm_tests
+TEST_GEN_PROGS += ksm_functional_tests
+
+ifeq ($(MACHINE),x86_64)
+CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
+CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c)
+CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie)
+
+VMTARGETS := protection_keys
+BINARIES_32 := $(VMTARGETS:%=%_32)
+BINARIES_64 := $(VMTARGETS:%=%_64)
+
+ifeq ($(CAN_BUILD_WITH_NOPIE),1)
+CFLAGS += -no-pie
+endif
+
+ifeq ($(CAN_BUILD_I386),1)
+TEST_GEN_FILES += $(BINARIES_32)
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+TEST_GEN_FILES += $(BINARIES_64)
+endif
+else
+
+ifneq (,$(findstring $(MACHINE),ppc64))
+TEST_GEN_FILES += protection_keys
+endif
+
+endif
+
+ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64))
+TEST_GEN_FILES += va_128TBswitch
+TEST_GEN_FILES += virtual_address_range
+TEST_GEN_FILES += write_to_hugetlbfs
+endif
+
+TEST_PROGS := run_vmtests.sh
+
+TEST_FILES := test_vmalloc.sh
+TEST_FILES += test_hmm.sh
+TEST_FILES += va_128TBswitch.sh
+
+include ../lib.mk
+
+$(OUTPUT)/cow: vm_util.c
+$(OUTPUT)/khugepaged: vm_util.c
+$(OUTPUT)/ksm_functional_tests: vm_util.c
+$(OUTPUT)/madv_populate: vm_util.c
+$(OUTPUT)/soft-dirty: vm_util.c
+$(OUTPUT)/split_huge_page_test: vm_util.c
+$(OUTPUT)/userfaultfd: vm_util.c
+
+ifeq ($(MACHINE),x86_64)
+BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
+BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
+
+define gen-target-rule-32
+$(1) $(1)_32: $(OUTPUT)/$(1)_32
+.PHONY: $(1) $(1)_32
+endef
+
+define gen-target-rule-64
+$(1) $(1)_64: $(OUTPUT)/$(1)_64
+.PHONY: $(1) $(1)_64
+endef
+
+ifeq ($(CAN_BUILD_I386),1)
+$(BINARIES_32): CFLAGS += -m32 -mxsave
+$(BINARIES_32): LDLIBS += -lrt -ldl -lm
+$(BINARIES_32): $(OUTPUT)/%_32: %.c
+	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t))))
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+$(BINARIES_64): CFLAGS += -m64 -mxsave
+$(BINARIES_64): LDLIBS += -lrt -ldl
+$(BINARIES_64): $(OUTPUT)/%_64: %.c
+	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t))))
+endif
+
+# x86_64 users should be encouraged to install 32-bit libraries
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
+all: warn_32bit_failure
+
+warn_32bit_failure:
+	@echo "Warning: you seem to have a broken 32-bit build" 2>&1;		\
+	echo  "environment. This will reduce test coverage of 64-bit" 2>&1;	\
+	echo  "kernels. If you are using a Debian-like distribution," 2>&1;	\
+	echo  "try:"; 2>&1;							\
+	echo  "";								\
+	echo  "  apt-get install gcc-multilib libc6-i386 libc6-dev-i386";	\
+	echo  "";								\
+	echo  "If you are using a Fedora-like distribution, try:";		\
+	echo  "";								\
+	echo  "  yum install glibc-devel.*i686";				\
+	exit 0;
+endif
+endif
+
+# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
+$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS)
+
+$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
+
+$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
+
+$(OUTPUT)/migration: LDLIBS += -lnuma
+
+local_config.mk local_config.h: check_config.sh
+	/bin/sh ./check_config.sh $(CC)
+
+EXTRA_CLEAN += local_config.mk local_config.h
+
+ifeq ($(COW_EXTRA_LIBS),)
+all: warn_missing_liburing
+
+warn_missing_liburing:
+	@echo ; \
+	echo "Warning: missing liburing support. Some COW tests will be skipped." ; \
+	echo
+endif
diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
new file mode 100644
index 000000000000..a5cb4b09a46c
--- /dev/null
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -0,0 +1,584 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+set -e
+
+if [[ $(id -u) -ne 0 ]]; then
+  echo "This test must be run as root. Skipping..."
+  exit $ksft_skip
+fi
+
+fault_limit_file=limit_in_bytes
+reservation_limit_file=rsvd.limit_in_bytes
+fault_usage_file=usage_in_bytes
+reservation_usage_file=rsvd.usage_in_bytes
+
+if [[ "$1" == "-cgroup-v2" ]]; then
+  cgroup2=1
+  fault_limit_file=max
+  reservation_limit_file=rsvd.max
+  fault_usage_file=current
+  reservation_usage_file=rsvd.current
+fi
+
+if [[ $cgroup2 ]]; then
+  cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+  if [[ -z "$cgroup_path" ]]; then
+    cgroup_path=/dev/cgroup/memory
+    mount -t cgroup2 none $cgroup_path
+    do_umount=1
+  fi
+  echo "+hugetlb" >$cgroup_path/cgroup.subtree_control
+else
+  cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+  if [[ -z "$cgroup_path" ]]; then
+    cgroup_path=/dev/cgroup/memory
+    mount -t cgroup memory,hugetlb $cgroup_path
+    do_umount=1
+  fi
+fi
+export cgroup_path
+
+function cleanup() {
+  if [[ $cgroup2 ]]; then
+    echo $$ >$cgroup_path/cgroup.procs
+  else
+    echo $$ >$cgroup_path/tasks
+  fi
+
+  if [[ -e /mnt/huge ]]; then
+    rm -rf /mnt/huge/*
+    umount /mnt/huge || echo error
+    rmdir /mnt/huge
+  fi
+  if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then
+    rmdir $cgroup_path/hugetlb_cgroup_test
+  fi
+  if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then
+    rmdir $cgroup_path/hugetlb_cgroup_test1
+  fi
+  if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then
+    rmdir $cgroup_path/hugetlb_cgroup_test2
+  fi
+  echo 0 >/proc/sys/vm/nr_hugepages
+  echo CLEANUP DONE
+}
+
+function expect_equal() {
+  local expected="$1"
+  local actual="$2"
+  local error="$3"
+
+  if [[ "$expected" != "$actual" ]]; then
+    echo "expected ($expected) != actual ($actual): $3"
+    cleanup
+    exit 1
+  fi
+}
+
+function get_machine_hugepage_size() {
+  hpz=$(grep -i hugepagesize /proc/meminfo)
+  kb=${hpz:14:-3}
+  mb=$(($kb / 1024))
+  echo $mb
+}
+
+MB=$(get_machine_hugepage_size)
+
+function setup_cgroup() {
+  local name="$1"
+  local cgroup_limit="$2"
+  local reservation_limit="$3"
+
+  mkdir $cgroup_path/$name
+
+  echo writing cgroup limit: "$cgroup_limit"
+  echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file
+
+  echo writing reseravation limit: "$reservation_limit"
+  echo "$reservation_limit" > \
+    $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file
+
+  if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then
+    echo 0 >$cgroup_path/$name/cpuset.cpus
+  fi
+  if [ -e "$cgroup_path/$name/cpuset.mems" ]; then
+    echo 0 >$cgroup_path/$name/cpuset.mems
+  fi
+}
+
+function wait_for_hugetlb_memory_to_get_depleted() {
+  local cgroup="$1"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  # Wait for hugetlbfs memory to get depleted.
+  while [ $(cat $path) != 0 ]; do
+    echo Waiting for hugetlb memory to get depleted.
+    cat $path
+    sleep 0.5
+  done
+}
+
+function wait_for_hugetlb_memory_to_get_reserved() {
+  local cgroup="$1"
+  local size="$2"
+
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  # Wait for hugetlbfs memory to get written.
+  while [ $(cat $path) != $size ]; do
+    echo Waiting for hugetlb memory reservation to reach size $size.
+    cat $path
+    sleep 0.5
+  done
+}
+
+function wait_for_hugetlb_memory_to_get_written() {
+  local cgroup="$1"
+  local size="$2"
+
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
+  # Wait for hugetlbfs memory to get written.
+  while [ $(cat $path) != $size ]; do
+    echo Waiting for hugetlb memory to reach size $size.
+    cat $path
+    sleep 0.5
+  done
+}
+
+function write_hugetlbfs_and_get_usage() {
+  local cgroup="$1"
+  local size="$2"
+  local populate="$3"
+  local write="$4"
+  local path="$5"
+  local method="$6"
+  local private="$7"
+  local expect_failure="$8"
+  local reserve="$9"
+
+  # Function return values.
+  reservation_failed=0
+  oom_killed=0
+  hugetlb_difference=0
+  reserved_difference=0
+
+  local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file
+  local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file
+
+  local hugetlb_before=$(cat $hugetlb_usage)
+  local reserved_before=$(cat $reserved_usage)
+
+  echo
+  echo Starting:
+  echo hugetlb_usage="$hugetlb_before"
+  echo reserved_usage="$reserved_before"
+  echo expect_failure is "$expect_failure"
+
+  output=$(mktemp)
+  set +e
+  if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] ||
+    [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then
+
+    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
+      "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output &
+
+    local write_result=$?
+    local write_pid=$!
+
+    until grep -q -i "DONE" $output; do
+      echo waiting for DONE signal.
+      if ! ps $write_pid > /dev/null
+      then
+        echo "FAIL: The write died"
+        cleanup
+        exit 1
+      fi
+      sleep 0.5
+    done
+
+    echo ================= write_hugetlb_memory.sh output is:
+    cat $output
+    echo ================= end output.
+
+    if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then
+      wait_for_hugetlb_memory_to_get_written "$cgroup" "$size"
+    elif [[ "$reserve" != "-n" ]]; then
+      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
+    else
+      # This case doesn't produce visible effects, but we still have
+      # to wait for the async process to start and execute...
+      sleep 0.5
+    fi
+
+    echo write_result is $write_result
+  else
+    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
+      "$cgroup" "$path" "$method" "$private" "$reserve"
+    local write_result=$?
+
+    if [[ "$reserve" != "-n" ]]; then
+      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
+    fi
+  fi
+  set -e
+
+  if [[ "$write_result" == 1 ]]; then
+    reservation_failed=1
+  fi
+
+  # On linus/master, the above process gets SIGBUS'd on oomkill, with
+  # return code 135. On earlier kernels, it gets actual oomkill, with return
+  # code 137, so just check for both conditions in case we're testing
+  # against an earlier kernel.
+  if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then
+    oom_killed=1
+  fi
+
+  local hugetlb_after=$(cat $hugetlb_usage)
+  local reserved_after=$(cat $reserved_usage)
+
+  echo After write:
+  echo hugetlb_usage="$hugetlb_after"
+  echo reserved_usage="$reserved_after"
+
+  hugetlb_difference=$(($hugetlb_after - $hugetlb_before))
+  reserved_difference=$(($reserved_after - $reserved_before))
+}
+
+function cleanup_hugetlb_memory() {
+  set +e
+  local cgroup="$1"
+  if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then
+    echo killing write_to_hugetlbfs
+    killall -2 write_to_hugetlbfs
+    wait_for_hugetlb_memory_to_get_depleted $cgroup
+  fi
+  set -e
+
+  if [[ -e /mnt/huge ]]; then
+    rm -rf /mnt/huge/*
+    umount /mnt/huge
+    rmdir /mnt/huge
+  fi
+}
+
+function run_test() {
+  local size=$(($1 * ${MB} * 1024 * 1024))
+  local populate="$2"
+  local write="$3"
+  local cgroup_limit=$(($4 * ${MB} * 1024 * 1024))
+  local reservation_limit=$(($5 * ${MB} * 1024 * 1024))
+  local nr_hugepages="$6"
+  local method="$7"
+  local private="$8"
+  local expect_failure="$9"
+  local reserve="${10}"
+
+  # Function return values.
+  hugetlb_difference=0
+  reserved_difference=0
+  reservation_failed=0
+  oom_killed=0
+
+  echo nr hugepages = "$nr_hugepages"
+  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
+
+  setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit"
+
+  mkdir -p /mnt/huge
+  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+
+  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \
+    "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \
+    "$reserve"
+
+  cleanup_hugetlb_memory "hugetlb_cgroup_test"
+
+  local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file)
+  local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file)
+
+  echo $hugetlb_difference
+  echo $reserved_difference
+  expect_equal "0" "$final_hugetlb" "final hugetlb is not zero"
+  expect_equal "0" "$final_reservation" "final reservation is not zero"
+}
+
+function run_multiple_cgroup_test() {
+  local size1="$1"
+  local populate1="$2"
+  local write1="$3"
+  local cgroup_limit1="$4"
+  local reservation_limit1="$5"
+
+  local size2="$6"
+  local populate2="$7"
+  local write2="$8"
+  local cgroup_limit2="$9"
+  local reservation_limit2="${10}"
+
+  local nr_hugepages="${11}"
+  local method="${12}"
+  local private="${13}"
+  local expect_failure="${14}"
+  local reserve="${15}"
+
+  # Function return values.
+  hugetlb_difference1=0
+  reserved_difference1=0
+  reservation_failed1=0
+  oom_killed1=0
+
+  hugetlb_difference2=0
+  reserved_difference2=0
+  reservation_failed2=0
+  oom_killed2=0
+
+  echo nr hugepages = "$nr_hugepages"
+  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
+
+  setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1"
+  setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2"
+
+  mkdir -p /mnt/huge
+  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+
+  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \
+    "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \
+    "$expect_failure" "$reserve"
+
+  hugetlb_difference1=$hugetlb_difference
+  reserved_difference1=$reserved_difference
+  reservation_failed1=$reservation_failed
+  oom_killed1=$oom_killed
+
+  local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file
+  local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file
+  local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file
+  local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file
+
+  local usage_before_second_write=$(cat $cgroup1_hugetlb_usage)
+  local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage)
+
+  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \
+    "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \
+    "$expect_failure" "$reserve"
+
+  hugetlb_difference2=$hugetlb_difference
+  reserved_difference2=$reserved_difference
+  reservation_failed2=$reservation_failed
+  oom_killed2=$oom_killed
+
+  expect_equal "$usage_before_second_write" \
+    "$(cat $cgroup1_hugetlb_usage)" "Usage changed."
+  expect_equal "$reservation_usage_before_second_write" \
+    "$(cat $cgroup1_reservation_usage)" "Reservation usage changed."
+
+  cleanup_hugetlb_memory
+
+  local final_hugetlb=$(cat $cgroup1_hugetlb_usage)
+  local final_reservation=$(cat $cgroup1_reservation_usage)
+
+  expect_equal "0" "$final_hugetlb" \
+    "hugetlbt_cgroup_test1 final hugetlb is not zero"
+  expect_equal "0" "$final_reservation" \
+    "hugetlbt_cgroup_test1 final reservation is not zero"
+
+  local final_hugetlb=$(cat $cgroup2_hugetlb_usage)
+  local final_reservation=$(cat $cgroup2_reservation_usage)
+
+  expect_equal "0" "$final_hugetlb" \
+    "hugetlb_cgroup_test2 final hugetlb is not zero"
+  expect_equal "0" "$final_reservation" \
+    "hugetlb_cgroup_test2 final reservation is not zero"
+}
+
+cleanup
+
+for populate in "" "-o"; do
+  for method in 0 1 2; do
+    for private in "" "-r"; do
+      for reserve in "" "-n"; do
+
+        # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported.
+        if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then
+          continue
+        fi
+
+        # Skip populated shmem tests. Doesn't seem to be supported.
+        if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then
+          continue
+        fi
+
+        if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then
+          continue
+        fi
+
+        cleanup
+        echo
+        echo
+        echo
+        echo Test normal case.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+        run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve"
+
+        echo Memory charged to hugtlb=$hugetlb_difference
+        echo Memory charged to reservation=$reserved_difference
+
+        if [[ "$populate" == "-o" ]]; then
+          expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
+            "Reserved memory charged to hugetlb cgroup."
+        else
+          expect_equal "0" "$hugetlb_difference" \
+            "Reserved memory charged to hugetlb cgroup."
+        fi
+
+        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
+          expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
+            "Reserved memory not charged to reservation usage."
+        else
+          expect_equal "0" "$reserved_difference" \
+            "Reserved memory not charged to reservation usage."
+        fi
+
+        echo 'PASS'
+
+        cleanup
+        echo
+        echo
+        echo
+        echo Test normal case with write.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+        run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve"
+
+        echo Memory charged to hugtlb=$hugetlb_difference
+        echo Memory charged to reservation=$reserved_difference
+
+        expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
+          "Reserved memory charged to hugetlb cgroup."
+
+        expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
+          "Reserved memory not charged to reservation usage."
+
+        echo 'PASS'
+
+        cleanup
+        continue
+        echo
+        echo
+        echo
+        echo Test more than reservation case.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+
+        if [ "$reserve" != "-n" ]; then
+          run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \
+            "$reserve"
+
+          expect_equal "1" "$reservation_failed" "Reservation succeeded."
+        fi
+
+        echo 'PASS'
+
+        cleanup
+
+        echo
+        echo
+        echo
+        echo Test more than cgroup limit case.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+
+        # Not sure if shm memory can be cleaned up when the process gets sigbus'd.
+        if [[ "$method" != 2 ]]; then
+          run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve"
+
+          expect_equal "1" "$oom_killed" "Not oom killed."
+        fi
+        echo 'PASS'
+
+        cleanup
+
+        echo
+        echo
+        echo
+        echo Test normal case, multiple cgroups.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+        run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \
+          "$populate" "" "10" "10" "10" \
+          "$method" "$private" "0" "$reserve"
+
+        echo Memory charged to hugtlb1=$hugetlb_difference1
+        echo Memory charged to reservation1=$reserved_difference1
+        echo Memory charged to hugtlb2=$hugetlb_difference2
+        echo Memory charged to reservation2=$reserved_difference2
+
+        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
+          expect_equal "3" "$reserved_difference1" \
+            "Incorrect reservations charged to cgroup 1."
+
+          expect_equal "5" "$reserved_difference2" \
+            "Incorrect reservation charged to cgroup 2."
+
+        else
+          expect_equal "0" "$reserved_difference1" \
+            "Incorrect reservations charged to cgroup 1."
+
+          expect_equal "0" "$reserved_difference2" \
+            "Incorrect reservation charged to cgroup 2."
+        fi
+
+        if [[ "$populate" == "-o" ]]; then
+          expect_equal "3" "$hugetlb_difference1" \
+            "Incorrect hugetlb charged to cgroup 1."
+
+          expect_equal "5" "$hugetlb_difference2" \
+            "Incorrect hugetlb charged to cgroup 2."
+
+        else
+          expect_equal "0" "$hugetlb_difference1" \
+            "Incorrect hugetlb charged to cgroup 1."
+
+          expect_equal "0" "$hugetlb_difference2" \
+            "Incorrect hugetlb charged to cgroup 2."
+        fi
+        echo 'PASS'
+
+        cleanup
+        echo
+        echo
+        echo
+        echo Test normal case with write, multiple cgroups.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+        run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \
+          "$populate" "-w" "10" "10" "10" \
+          "$method" "$private" "0" "$reserve"
+
+        echo Memory charged to hugtlb1=$hugetlb_difference1
+        echo Memory charged to reservation1=$reserved_difference1
+        echo Memory charged to hugtlb2=$hugetlb_difference2
+        echo Memory charged to reservation2=$reserved_difference2
+
+        expect_equal "3" "$hugetlb_difference1" \
+          "Incorrect hugetlb charged to cgroup 1."
+
+        expect_equal "3" "$reserved_difference1" \
+          "Incorrect reservation charged to cgroup 1."
+
+        expect_equal "5" "$hugetlb_difference2" \
+          "Incorrect hugetlb charged to cgroup 2."
+
+        expect_equal "5" "$reserved_difference2" \
+          "Incorrected reservation charged to cgroup 2."
+        echo 'PASS'
+
+        cleanup
+
+      done # reserve
+    done   # private
+  done     # populate
+done       # method
+
+if [[ $do_umount ]]; then
+  umount $cgroup_path
+  rmdir $cgroup_path
+fi
diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh
new file mode 100644
index 000000000000..bcba3af0acea
--- /dev/null
+++ b/tools/testing/selftests/mm/check_config.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Probe for libraries and create header files to record the results. Both C
+# header files and Makefile include fragments are created.
+
+OUTPUT_H_FILE=local_config.h
+OUTPUT_MKFILE=local_config.mk
+
+tmpname=$(mktemp)
+tmpfile_c=${tmpname}.c
+tmpfile_o=${tmpname}.o
+
+# liburing
+echo "#include <sys/types.h>"        > $tmpfile_c
+echo "#include <liburing.h>"        >> $tmpfile_c
+echo "int func(void) { return 0; }" >> $tmpfile_c
+
+CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
+$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
+
+if [ -f $tmpfile_o ]; then
+    echo "#define LOCAL_CONFIG_HAVE_LIBURING 1"  > $OUTPUT_H_FILE
+    echo "COW_EXTRA_LIBS = -luring"              > $OUTPUT_MKFILE
+else
+    echo "// No liburing support found"          > $OUTPUT_H_FILE
+    echo "# No liburing support found, so:"      > $OUTPUT_MKFILE
+    echo "COW_EXTRA_LIBS = "                    >> $OUTPUT_MKFILE
+fi
+
+rm ${tmpname}.*
diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c
new file mode 100644
index 000000000000..9b420140ba2b
--- /dev/null
+++ b/tools/testing/selftests/mm/compaction_test.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * A test for the patch "Allow compaction of unevictable pages".
+ * With this patch we should be able to allocate at least 1/4
+ * of RAM in huge pages. Without the patch much less is
+ * allocated.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+#define MAP_SIZE_MB	100
+#define MAP_SIZE	(MAP_SIZE_MB * 1024 * 1024)
+
+struct map_list {
+	void *map;
+	struct map_list *next;
+};
+
+int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize)
+{
+	char  buffer[256] = {0};
+	char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'";
+	FILE *cmdfile = popen(cmd, "r");
+
+	if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+		perror("Failed to read meminfo\n");
+		return -1;
+	}
+
+	pclose(cmdfile);
+
+	*memfree = atoll(buffer);
+	cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'";
+	cmdfile = popen(cmd, "r");
+
+	if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+		perror("Failed to read meminfo\n");
+		return -1;
+	}
+
+	pclose(cmdfile);
+	*hugepagesize = atoll(buffer);
+
+	return 0;
+}
+
+int prereq(void)
+{
+	char allowed;
+	int fd;
+
+	fd = open("/proc/sys/vm/compact_unevictable_allowed",
+		  O_RDONLY | O_NONBLOCK);
+	if (fd < 0) {
+		perror("Failed to open\n"
+		       "/proc/sys/vm/compact_unevictable_allowed\n");
+		return -1;
+	}
+
+	if (read(fd, &allowed, sizeof(char)) != sizeof(char)) {
+		perror("Failed to read from\n"
+		       "/proc/sys/vm/compact_unevictable_allowed\n");
+		close(fd);
+		return -1;
+	}
+
+	close(fd);
+	if (allowed == '1')
+		return 0;
+
+	return -1;
+}
+
+int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
+{
+	int fd;
+	int compaction_index = 0;
+	char initial_nr_hugepages[10] = {0};
+	char nr_hugepages[10] = {0};
+
+	/* We want to test with 80% of available memory. Else, OOM killer comes
+	   in to play */
+	mem_free = mem_free * 0.8;
+
+	fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
+	if (fd < 0) {
+		perror("Failed to open /proc/sys/vm/nr_hugepages");
+		return -1;
+	}
+
+	if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) {
+		perror("Failed to read from /proc/sys/vm/nr_hugepages");
+		goto close_fd;
+	}
+
+	/* Start with the initial condition of 0 huge pages*/
+	if (write(fd, "0", sizeof(char)) != sizeof(char)) {
+		perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	lseek(fd, 0, SEEK_SET);
+
+	/* Request a large number of huge pages. The Kernel will allocate
+	   as much as it can */
+	if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
+		perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	lseek(fd, 0, SEEK_SET);
+
+	if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
+		perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	/* We should have been able to request at least 1/3 rd of the memory in
+	   huge pages */
+	compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size);
+
+	if (compaction_index > 3) {
+		printf("No of huge pages allocated = %d\n",
+		       (atoi(nr_hugepages)));
+		fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n"
+			"as huge pages\n", compaction_index);
+		goto close_fd;
+	}
+
+	printf("No of huge pages allocated = %d\n",
+	       (atoi(nr_hugepages)));
+
+	lseek(fd, 0, SEEK_SET);
+
+	if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
+	    != strlen(initial_nr_hugepages)) {
+		perror("Failed to write value to /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	close(fd);
+	return 0;
+
+ close_fd:
+	close(fd);
+	printf("Not OK. Compaction test failed.");
+	return -1;
+}
+
+
+int main(int argc, char **argv)
+{
+	struct rlimit lim;
+	struct map_list *list, *entry;
+	size_t page_size, i;
+	void *map = NULL;
+	unsigned long mem_free = 0;
+	unsigned long hugepage_size = 0;
+	long mem_fragmentable_MB = 0;
+
+	if (prereq() != 0) {
+		printf("Either the sysctl compact_unevictable_allowed is not\n"
+		       "set to 1 or couldn't read the proc file.\n"
+		       "Skipping the test\n");
+		return KSFT_SKIP;
+	}
+
+	lim.rlim_cur = RLIM_INFINITY;
+	lim.rlim_max = RLIM_INFINITY;
+	if (setrlimit(RLIMIT_MEMLOCK, &lim)) {
+		perror("Failed to set rlimit:\n");
+		return -1;
+	}
+
+	page_size = getpagesize();
+
+	list = NULL;
+
+	if (read_memory_info(&mem_free, &hugepage_size) != 0) {
+		printf("ERROR: Cannot read meminfo\n");
+		return -1;
+	}
+
+	mem_fragmentable_MB = mem_free * 0.8 / 1024;
+
+	while (mem_fragmentable_MB > 0) {
+		map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
+			   MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
+		if (map == MAP_FAILED)
+			break;
+
+		entry = malloc(sizeof(struct map_list));
+		if (!entry) {
+			munmap(map, MAP_SIZE);
+			break;
+		}
+		entry->map = map;
+		entry->next = list;
+		list = entry;
+
+		/* Write something (in this case the address of the map) to
+		 * ensure that KSM can't merge the mapped pages
+		 */
+		for (i = 0; i < MAP_SIZE; i += page_size)
+			*(unsigned long *)(map + i) = (unsigned long)map + i;
+
+		mem_fragmentable_MB -= MAP_SIZE_MB;
+	}
+
+	for (entry = list; entry != NULL; entry = entry->next) {
+		munmap(entry->map, MAP_SIZE);
+		if (!entry->next)
+			break;
+		entry = entry->next;
+	}
+
+	if (check_compaction(mem_free, hugepage_size) == 0)
+		return 0;
+
+	return -1;
+}
diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config
new file mode 100644
index 000000000000..be087c4bc396
--- /dev/null
+++ b/tools/testing/selftests/mm/config
@@ -0,0 +1,8 @@
+CONFIG_SYSVIPC=y
+CONFIG_USERFAULTFD=y
+CONFIG_TEST_VMALLOC=m
+CONFIG_DEVICE_PRIVATE=y
+CONFIG_TEST_HMM=m
+CONFIG_GUP_TEST=y
+CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_MEM_SOFT_DIRTY=y
diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
new file mode 100644
index 000000000000..16216d893d96
--- /dev/null
+++ b/tools/testing/selftests/mm/cow.c
@@ -0,0 +1,1764 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * COW (Copy On Write) tests.
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <assert.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <linux/memfd.h>
+
+#include "local_config.h"
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+#include <liburing.h>
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+#include "../../../../mm/gup_test.h"
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#ifndef MADV_COLLAPSE
+#define MADV_COLLAPSE 25
+#endif
+
+static size_t pagesize;
+static int pagemap_fd;
+static size_t thpsize;
+static int nr_hugetlbsizes;
+static size_t hugetlbsizes[10];
+static int gup_fd;
+static bool has_huge_zeropage;
+
+static void detect_thpsize(void)
+{
+	int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size",
+		      O_RDONLY);
+	size_t size = 0;
+	char buf[15];
+	int ret;
+
+	if (fd < 0)
+		return;
+
+	ret = pread(fd, buf, sizeof(buf), 0);
+	if (ret > 0 && ret < sizeof(buf)) {
+		buf[ret] = 0;
+
+		size = strtoul(buf, NULL, 10);
+		if (size < pagesize)
+			size = 0;
+		if (size > 0) {
+			thpsize = size;
+			ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
+				       thpsize / 1024);
+		}
+	}
+
+	close(fd);
+}
+
+static void detect_huge_zeropage(void)
+{
+	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
+		      O_RDONLY);
+	size_t enabled = 0;
+	char buf[15];
+	int ret;
+
+	if (fd < 0)
+		return;
+
+	ret = pread(fd, buf, sizeof(buf), 0);
+	if (ret > 0 && ret < sizeof(buf)) {
+		buf[ret] = 0;
+
+		enabled = strtoul(buf, NULL, 10);
+		if (enabled == 1) {
+			has_huge_zeropage = true;
+			ksft_print_msg("[INFO] huge zeropage is enabled\n");
+		}
+	}
+
+	close(fd);
+}
+
+static void detect_hugetlbsizes(void)
+{
+	DIR *dir = opendir("/sys/kernel/mm/hugepages/");
+
+	if (!dir)
+		return;
+
+	while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) {
+		struct dirent *entry = readdir(dir);
+		size_t kb;
+
+		if (!entry)
+			break;
+		if (entry->d_type != DT_DIR)
+			continue;
+		if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1)
+			continue;
+		hugetlbsizes[nr_hugetlbsizes] = kb * 1024;
+		nr_hugetlbsizes++;
+		ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n",
+			       kb);
+	}
+	closedir(dir);
+}
+
+static bool range_is_swapped(void *addr, size_t size)
+{
+	for (; size; addr += pagesize, size -= pagesize)
+		if (!pagemap_is_swapped(pagemap_fd, addr))
+			return false;
+	return true;
+}
+
+struct comm_pipes {
+	int child_ready[2];
+	int parent_ready[2];
+};
+
+static int setup_comm_pipes(struct comm_pipes *comm_pipes)
+{
+	if (pipe(comm_pipes->child_ready) < 0)
+		return -errno;
+	if (pipe(comm_pipes->parent_ready) < 0) {
+		close(comm_pipes->child_ready[0]);
+		close(comm_pipes->child_ready[1]);
+		return -errno;
+	}
+
+	return 0;
+}
+
+static void close_comm_pipes(struct comm_pipes *comm_pipes)
+{
+	close(comm_pipes->child_ready[0]);
+	close(comm_pipes->child_ready[1]);
+	close(comm_pipes->parent_ready[0]);
+	close(comm_pipes->parent_ready[1]);
+}
+
+static int child_memcmp_fn(char *mem, size_t size,
+			   struct comm_pipes *comm_pipes)
+{
+	char *old = malloc(size);
+	char buf;
+
+	/* Backup the original content. */
+	memcpy(old, mem, size);
+
+	/* Wait until the parent modified the page. */
+	write(comm_pipes->child_ready[1], "0", 1);
+	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+		;
+
+	/* See if we still read the old values. */
+	return memcmp(old, mem, size);
+}
+
+static int child_vmsplice_memcmp_fn(char *mem, size_t size,
+				    struct comm_pipes *comm_pipes)
+{
+	struct iovec iov = {
+		.iov_base = mem,
+		.iov_len = size,
+	};
+	ssize_t cur, total, transferred;
+	char *old, *new;
+	int fds[2];
+	char buf;
+
+	old = malloc(size);
+	new = malloc(size);
+
+	/* Backup the original content. */
+	memcpy(old, mem, size);
+
+	if (pipe(fds) < 0)
+		return -errno;
+
+	/* Trigger a read-only pin. */
+	transferred = vmsplice(fds[1], &iov, 1, 0);
+	if (transferred < 0)
+		return -errno;
+	if (transferred == 0)
+		return -EINVAL;
+
+	/* Unmap it from our page tables. */
+	if (munmap(mem, size) < 0)
+		return -errno;
+
+	/* Wait until the parent modified it. */
+	write(comm_pipes->child_ready[1], "0", 1);
+	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+		;
+
+	/* See if we still read the old values via the pipe. */
+	for (total = 0; total < transferred; total += cur) {
+		cur = read(fds[0], new + total, transferred - total);
+		if (cur < 0)
+			return -errno;
+	}
+
+	return memcmp(old, new, transferred);
+}
+
+typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
+
+static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
+				  child_fn fn)
+{
+	struct comm_pipes comm_pipes;
+	char buf;
+	int ret;
+
+	ret = setup_comm_pipes(&comm_pipes);
+	if (ret) {
+		ksft_test_result_fail("pipe() failed\n");
+		return;
+	}
+
+	ret = fork();
+	if (ret < 0) {
+		ksft_test_result_fail("fork() failed\n");
+		goto close_comm_pipes;
+	} else if (!ret) {
+		exit(fn(mem, size, &comm_pipes));
+	}
+
+	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+		;
+
+	if (do_mprotect) {
+		/*
+		 * mprotect() optimizations might try avoiding
+		 * write-faults by directly mapping pages writable.
+		 */
+		ret = mprotect(mem, size, PROT_READ);
+		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
+		if (ret) {
+			ksft_test_result_fail("mprotect() failed\n");
+			write(comm_pipes.parent_ready[1], "0", 1);
+			wait(&ret);
+			goto close_comm_pipes;
+		}
+	}
+
+	/* Modify the page. */
+	memset(mem, 0xff, size);
+	write(comm_pipes.parent_ready[1], "0", 1);
+
+	wait(&ret);
+	if (WIFEXITED(ret))
+		ret = WEXITSTATUS(ret);
+	else
+		ret = -EINVAL;
+
+	ksft_test_result(!ret, "No leak from parent into child\n");
+close_comm_pipes:
+	close_comm_pipes(&comm_pipes);
+}
+
+static void test_cow_in_parent(char *mem, size_t size)
+{
+	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
+}
+
+static void test_cow_in_parent_mprotect(char *mem, size_t size)
+{
+	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
+}
+
+static void test_vmsplice_in_child(char *mem, size_t size)
+{
+	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
+}
+
+static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
+{
+	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
+}
+
+static void do_test_vmsplice_in_parent(char *mem, size_t size,
+				       bool before_fork)
+{
+	struct iovec iov = {
+		.iov_base = mem,
+		.iov_len = size,
+	};
+	ssize_t cur, total, transferred;
+	struct comm_pipes comm_pipes;
+	char *old, *new;
+	int ret, fds[2];
+	char buf;
+
+	old = malloc(size);
+	new = malloc(size);
+
+	memcpy(old, mem, size);
+
+	ret = setup_comm_pipes(&comm_pipes);
+	if (ret) {
+		ksft_test_result_fail("pipe() failed\n");
+		goto free;
+	}
+
+	if (pipe(fds) < 0) {
+		ksft_test_result_fail("pipe() failed\n");
+		goto close_comm_pipes;
+	}
+
+	if (before_fork) {
+		transferred = vmsplice(fds[1], &iov, 1, 0);
+		if (transferred <= 0) {
+			ksft_test_result_fail("vmsplice() failed\n");
+			goto close_pipe;
+		}
+	}
+
+	ret = fork();
+	if (ret < 0) {
+		ksft_test_result_fail("fork() failed\n");
+		goto close_pipe;
+	} else if (!ret) {
+		write(comm_pipes.child_ready[1], "0", 1);
+		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+			;
+		/* Modify page content in the child. */
+		memset(mem, 0xff, size);
+		exit(0);
+	}
+
+	if (!before_fork) {
+		transferred = vmsplice(fds[1], &iov, 1, 0);
+		if (transferred <= 0) {
+			ksft_test_result_fail("vmsplice() failed\n");
+			wait(&ret);
+			goto close_pipe;
+		}
+	}
+
+	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+		;
+	if (munmap(mem, size) < 0) {
+		ksft_test_result_fail("munmap() failed\n");
+		goto close_pipe;
+	}
+	write(comm_pipes.parent_ready[1], "0", 1);
+
+	/* Wait until the child is done writing. */
+	wait(&ret);
+	if (!WIFEXITED(ret)) {
+		ksft_test_result_fail("wait() failed\n");
+		goto close_pipe;
+	}
+
+	/* See if we still read the old values. */
+	for (total = 0; total < transferred; total += cur) {
+		cur = read(fds[0], new + total, transferred - total);
+		if (cur < 0) {
+			ksft_test_result_fail("read() failed\n");
+			goto close_pipe;
+		}
+	}
+
+	ksft_test_result(!memcmp(old, new, transferred),
+			 "No leak from child into parent\n");
+close_pipe:
+	close(fds[0]);
+	close(fds[1]);
+close_comm_pipes:
+	close_comm_pipes(&comm_pipes);
+free:
+	free(old);
+	free(new);
+}
+
+static void test_vmsplice_before_fork(char *mem, size_t size)
+{
+	do_test_vmsplice_in_parent(mem, size, true);
+}
+
+static void test_vmsplice_after_fork(char *mem, size_t size)
+{
+	do_test_vmsplice_in_parent(mem, size, false);
+}
+
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+static void do_test_iouring(char *mem, size_t size, bool use_fork)
+{
+	struct comm_pipes comm_pipes;
+	struct io_uring_cqe *cqe;
+	struct io_uring_sqe *sqe;
+	struct io_uring ring;
+	ssize_t cur, total;
+	struct iovec iov;
+	char *buf, *tmp;
+	int ret, fd;
+	FILE *file;
+
+	ret = setup_comm_pipes(&comm_pipes);
+	if (ret) {
+		ksft_test_result_fail("pipe() failed\n");
+		return;
+	}
+
+	file = tmpfile();
+	if (!file) {
+		ksft_test_result_fail("tmpfile() failed\n");
+		goto close_comm_pipes;
+	}
+	fd = fileno(file);
+	assert(fd);
+
+	tmp = malloc(size);
+	if (!tmp) {
+		ksft_test_result_fail("malloc() failed\n");
+		goto close_file;
+	}
+
+	/* Skip on errors, as we might just lack kernel support. */
+	ret = io_uring_queue_init(1, &ring, 0);
+	if (ret < 0) {
+		ksft_test_result_skip("io_uring_queue_init() failed\n");
+		goto free_tmp;
+	}
+
+	/*
+	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
+	 * | FOLL_LONGTERM the range.
+	 *
+	 * Skip on errors, as we might just lack kernel support or might not
+	 * have sufficient MEMLOCK permissions.
+	 */
+	iov.iov_base = mem;
+	iov.iov_len = size;
+	ret = io_uring_register_buffers(&ring, &iov, 1);
+	if (ret) {
+		ksft_test_result_skip("io_uring_register_buffers() failed\n");
+		goto queue_exit;
+	}
+
+	if (use_fork) {
+		/*
+		 * fork() and keep the child alive until we're done. Note that
+		 * we expect the pinned page to not get shared with the child.
+		 */
+		ret = fork();
+		if (ret < 0) {
+			ksft_test_result_fail("fork() failed\n");
+			goto unregister_buffers;
+		} else if (!ret) {
+			write(comm_pipes.child_ready[1], "0", 1);
+			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+				;
+			exit(0);
+		}
+
+		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+			;
+	} else {
+		/*
+		 * Map the page R/O into the page table. Enable softdirty
+		 * tracking to stop the page from getting mapped R/W immediately
+		 * again by mprotect() optimizations. Note that we don't have an
+		 * easy way to test if that worked (the pagemap does not export
+		 * if the page is mapped R/O vs. R/W).
+		 */
+		ret = mprotect(mem, size, PROT_READ);
+		clear_softdirty();
+		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+		if (ret) {
+			ksft_test_result_fail("mprotect() failed\n");
+			goto unregister_buffers;
+		}
+	}
+
+	/*
+	 * Modify the page and write page content as observed by the fixed
+	 * buffer pin to the file so we can verify it.
+	 */
+	memset(mem, 0xff, size);
+	sqe = io_uring_get_sqe(&ring);
+	if (!sqe) {
+		ksft_test_result_fail("io_uring_get_sqe() failed\n");
+		goto quit_child;
+	}
+	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
+
+	ret = io_uring_submit(&ring);
+	if (ret < 0) {
+		ksft_test_result_fail("io_uring_submit() failed\n");
+		goto quit_child;
+	}
+
+	ret = io_uring_wait_cqe(&ring, &cqe);
+	if (ret < 0) {
+		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
+		goto quit_child;
+	}
+
+	if (cqe->res != size) {
+		ksft_test_result_fail("write_fixed failed\n");
+		goto quit_child;
+	}
+	io_uring_cqe_seen(&ring, cqe);
+
+	/* Read back the file content to the temporary buffer. */
+	total = 0;
+	while (total < size) {
+		cur = pread(fd, tmp + total, size - total, total);
+		if (cur < 0) {
+			ksft_test_result_fail("pread() failed\n");
+			goto quit_child;
+		}
+		total += cur;
+	}
+
+	/* Finally, check if we read what we expected. */
+	ksft_test_result(!memcmp(mem, tmp, size),
+			 "Longterm R/W pin is reliable\n");
+
+quit_child:
+	if (use_fork) {
+		write(comm_pipes.parent_ready[1], "0", 1);
+		wait(&ret);
+	}
+unregister_buffers:
+	io_uring_unregister_buffers(&ring);
+queue_exit:
+	io_uring_queue_exit(&ring);
+free_tmp:
+	free(tmp);
+close_file:
+	fclose(file);
+close_comm_pipes:
+	close_comm_pipes(&comm_pipes);
+}
+
+static void test_iouring_ro(char *mem, size_t size)
+{
+	do_test_iouring(mem, size, false);
+}
+
+static void test_iouring_fork(char *mem, size_t size)
+{
+	do_test_iouring(mem, size, true);
+}
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+enum ro_pin_test {
+	RO_PIN_TEST,
+	RO_PIN_TEST_SHARED,
+	RO_PIN_TEST_PREVIOUSLY_SHARED,
+	RO_PIN_TEST_RO_EXCLUSIVE,
+};
+
+static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
+			   bool fast)
+{
+	struct pin_longterm_test args;
+	struct comm_pipes comm_pipes;
+	char *tmp, buf;
+	__u64 tmp_val;
+	int ret;
+
+	if (gup_fd < 0) {
+		ksft_test_result_skip("gup_test not available\n");
+		return;
+	}
+
+	tmp = malloc(size);
+	if (!tmp) {
+		ksft_test_result_fail("malloc() failed\n");
+		return;
+	}
+
+	ret = setup_comm_pipes(&comm_pipes);
+	if (ret) {
+		ksft_test_result_fail("pipe() failed\n");
+		goto free_tmp;
+	}
+
+	switch (test) {
+	case RO_PIN_TEST:
+		break;
+	case RO_PIN_TEST_SHARED:
+	case RO_PIN_TEST_PREVIOUSLY_SHARED:
+		/*
+		 * Share the pages with our child. As the pages are not pinned,
+		 * this should just work.
+		 */
+		ret = fork();
+		if (ret < 0) {
+			ksft_test_result_fail("fork() failed\n");
+			goto close_comm_pipes;
+		} else if (!ret) {
+			write(comm_pipes.child_ready[1], "0", 1);
+			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+				;
+			exit(0);
+		}
+
+		/* Wait until our child is ready. */
+		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+			;
+
+		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
+			/*
+			 * Tell the child to quit now and wait until it quit.
+			 * The pages should now be mapped R/O into our page
+			 * tables, but they are no longer shared.
+			 */
+			write(comm_pipes.parent_ready[1], "0", 1);
+			wait(&ret);
+			if (!WIFEXITED(ret))
+				ksft_print_msg("[INFO] wait() failed\n");
+		}
+		break;
+	case RO_PIN_TEST_RO_EXCLUSIVE:
+		/*
+		 * Map the page R/O into the page table. Enable softdirty
+		 * tracking to stop the page from getting mapped R/W immediately
+		 * again by mprotect() optimizations. Note that we don't have an
+		 * easy way to test if that worked (the pagemap does not export
+		 * if the page is mapped R/O vs. R/W).
+		 */
+		ret = mprotect(mem, size, PROT_READ);
+		clear_softdirty();
+		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+		if (ret) {
+			ksft_test_result_fail("mprotect() failed\n");
+			goto close_comm_pipes;
+		}
+		break;
+	default:
+		assert(false);
+	}
+
+	/* Take a R/O pin. This should trigger unsharing. */
+	args.addr = (__u64)(uintptr_t)mem;
+	args.size = size;
+	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
+	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
+	if (ret) {
+		if (errno == EINVAL)
+			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
+		else
+			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
+		goto wait;
+	}
+
+	/* Modify the page. */
+	memset(mem, 0xff, size);
+
+	/*
+	 * Read back the content via the pin to the temporary buffer and
+	 * test if we observed the modification.
+	 */
+	tmp_val = (__u64)(uintptr_t)tmp;
+	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
+	if (ret)
+		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
+	else
+		ksft_test_result(!memcmp(mem, tmp, size),
+				 "Longterm R/O pin is reliable\n");
+
+	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
+	if (ret)
+		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
+wait:
+	switch (test) {
+	case RO_PIN_TEST_SHARED:
+		write(comm_pipes.parent_ready[1], "0", 1);
+		wait(&ret);
+		if (!WIFEXITED(ret))
+			ksft_print_msg("[INFO] wait() failed\n");
+		break;
+	default:
+		break;
+	}
+close_comm_pipes:
+	close_comm_pipes(&comm_pipes);
+free_tmp:
+	free(tmp);
+}
+
+static void test_ro_pin_on_shared(char *mem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_shared(char *mem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
+}
+
+static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
+}
+
+typedef void (*test_fn)(char *mem, size_t size);
+
+static void do_run_with_base_page(test_fn fn, bool swapout)
+{
+	char *mem;
+	int ret;
+
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return;
+	}
+
+	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
+	/* Ignore if not around on a kernel. */
+	if (ret && errno != EINVAL) {
+		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+		goto munmap;
+	}
+
+	/* Populate a base page. */
+	memset(mem, 0, pagesize);
+
+	if (swapout) {
+		madvise(mem, pagesize, MADV_PAGEOUT);
+		if (!pagemap_is_swapped(pagemap_fd, mem)) {
+			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+			goto munmap;
+		}
+	}
+
+	fn(mem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+}
+
+static void run_with_base_page(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with base page\n", desc);
+	do_run_with_base_page(fn, false);
+}
+
+static void run_with_base_page_swap(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
+	do_run_with_base_page(fn, true);
+}
+
+enum thp_run {
+	THP_RUN_PMD,
+	THP_RUN_PMD_SWAPOUT,
+	THP_RUN_PTE,
+	THP_RUN_PTE_SWAPOUT,
+	THP_RUN_SINGLE_PTE,
+	THP_RUN_SINGLE_PTE_SWAPOUT,
+	THP_RUN_PARTIAL_MREMAP,
+	THP_RUN_PARTIAL_SHARED,
+};
+
+static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
+{
+	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
+	size_t size, mmap_size, mremap_size;
+	int ret;
+
+	/* For alignment purposes, we need twice the thp size. */
+	mmap_size = 2 * thpsize;
+	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mmap_mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return;
+	}
+
+	/* We need a THP-aligned memory area. */
+	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+
+	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+	if (ret) {
+		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+		goto munmap;
+	}
+
+	/*
+	 * Try to populate a THP. Touch the first sub-page and test if we get
+	 * another sub-page populated automatically.
+	 */
+	mem[0] = 0;
+	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
+		ksft_test_result_skip("Did not get a THP populated\n");
+		goto munmap;
+	}
+	memset(mem, 0, thpsize);
+
+	size = thpsize;
+	switch (thp_run) {
+	case THP_RUN_PMD:
+	case THP_RUN_PMD_SWAPOUT:
+		break;
+	case THP_RUN_PTE:
+	case THP_RUN_PTE_SWAPOUT:
+		/*
+		 * Trigger PTE-mapping the THP by temporarily mapping a single
+		 * subpage R/O.
+		 */
+		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+		if (ret) {
+			ksft_test_result_fail("mprotect() failed\n");
+			goto munmap;
+		}
+		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+		if (ret) {
+			ksft_test_result_fail("mprotect() failed\n");
+			goto munmap;
+		}
+		break;
+	case THP_RUN_SINGLE_PTE:
+	case THP_RUN_SINGLE_PTE_SWAPOUT:
+		/*
+		 * Discard all but a single subpage of that PTE-mapped THP. What
+		 * remains is a single PTE mapping a single subpage.
+		 */
+		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
+		if (ret) {
+			ksft_test_result_fail("MADV_DONTNEED failed\n");
+			goto munmap;
+		}
+		size = pagesize;
+		break;
+	case THP_RUN_PARTIAL_MREMAP:
+		/*
+		 * Remap half of the THP. We need some new memory location
+		 * for that.
+		 */
+		mremap_size = thpsize / 2;
+		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
+				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (mem == MAP_FAILED) {
+			ksft_test_result_fail("mmap() failed\n");
+			goto munmap;
+		}
+		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
+			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
+		if (tmp != mremap_mem) {
+			ksft_test_result_fail("mremap() failed\n");
+			goto munmap;
+		}
+		size = mremap_size;
+		break;
+	case THP_RUN_PARTIAL_SHARED:
+		/*
+		 * Share the first page of the THP with a child and quit the
+		 * child. This will result in some parts of the THP never
+		 * have been shared.
+		 */
+		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
+		if (ret) {
+			ksft_test_result_fail("MADV_DONTFORK failed\n");
+			goto munmap;
+		}
+		ret = fork();
+		if (ret < 0) {
+			ksft_test_result_fail("fork() failed\n");
+			goto munmap;
+		} else if (!ret) {
+			exit(0);
+		}
+		wait(&ret);
+		/* Allow for sharing all pages again. */
+		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
+		if (ret) {
+			ksft_test_result_fail("MADV_DOFORK failed\n");
+			goto munmap;
+		}
+		break;
+	default:
+		assert(false);
+	}
+
+	switch (thp_run) {
+	case THP_RUN_PMD_SWAPOUT:
+	case THP_RUN_PTE_SWAPOUT:
+	case THP_RUN_SINGLE_PTE_SWAPOUT:
+		madvise(mem, size, MADV_PAGEOUT);
+		if (!range_is_swapped(mem, size)) {
+			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+			goto munmap;
+		}
+		break;
+	default:
+		break;
+	}
+
+	fn(mem, size);
+munmap:
+	munmap(mmap_mem, mmap_size);
+	if (mremap_mem != MAP_FAILED)
+		munmap(mremap_mem, mremap_size);
+}
+
+static void run_with_thp(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with THP\n", desc);
+	do_run_with_thp(fn, THP_RUN_PMD);
+}
+
+static void run_with_thp_swap(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
+	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
+}
+
+static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
+	do_run_with_thp(fn, THP_RUN_PTE);
+}
+
+static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
+	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
+}
+
+static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
+	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
+}
+
+static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
+	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
+}
+
+static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
+	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
+}
+
+static void run_with_partial_shared_thp(test_fn fn, const char *desc)
+{
+	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
+	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
+}
+
+static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
+{
+	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
+	char *mem, *dummy;
+
+	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
+		       hugetlbsize / 1024);
+
+	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
+
+	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_skip("need more free huge pages\n");
+		return;
+	}
+
+	/* Populate an huge page. */
+	memset(mem, 0, hugetlbsize);
+
+	/*
+	 * We need a total of two hugetlb pages to handle COW/unsharing
+	 * properly, otherwise we might get zapped by a SIGBUS.
+	 */
+	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+	if (dummy == MAP_FAILED) {
+		ksft_test_result_skip("need more free huge pages\n");
+		goto munmap;
+	}
+	munmap(dummy, hugetlbsize);
+
+	fn(mem, hugetlbsize);
+munmap:
+	munmap(mem, hugetlbsize);
+}
+
+struct test_case {
+	const char *desc;
+	test_fn fn;
+};
+
+/*
+ * Test cases that are specific to anonymous pages: pages in private mappings
+ * that may get shared via COW during fork().
+ */
+static const struct test_case anon_test_cases[] = {
+	/*
+	 * Basic COW tests for fork() without any GUP. If we miss to break COW,
+	 * either the child can observe modifications by the parent or the
+	 * other way around.
+	 */
+	{
+		"Basic COW after fork()",
+		test_cow_in_parent,
+	},
+	/*
+	 * Basic test, but do an additional mprotect(PROT_READ)+
+	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
+	 */
+	{
+		"Basic COW after fork() with mprotect() optimization",
+		test_cow_in_parent_mprotect,
+	},
+	/*
+	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
+	 * we miss to break COW, the child observes modifications by the parent.
+	 * This is CVE-2020-29374 reported by Jann Horn.
+	 */
+	{
+		"vmsplice() + unmap in child",
+		test_vmsplice_in_child
+	},
+	/*
+	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
+	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
+	 */
+	{
+		"vmsplice() + unmap in child with mprotect() optimization",
+		test_vmsplice_in_child_mprotect
+	},
+	/*
+	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
+	 * fork(); modify in the child. If we miss to break COW, the parent
+	 * observes modifications by the child.
+	 */
+	{
+		"vmsplice() before fork(), unmap in parent after fork()",
+		test_vmsplice_before_fork,
+	},
+	/*
+	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
+	 * child. If we miss to break COW, the parent observes modifications by
+	 * the child.
+	 */
+	{
+		"vmsplice() + unmap in parent after fork()",
+		test_vmsplice_after_fork,
+	},
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+	/*
+	 * Take a R/W longterm pin and then map the page R/O into the page
+	 * table to trigger a write fault on next access. When modifying the
+	 * page, the page content must be visible via the pin.
+	 */
+	{
+		"R/O-mapping a page registered as iouring fixed buffer",
+		test_iouring_ro,
+	},
+	/*
+	 * Take a R/W longterm pin and then fork() a child. When modifying the
+	 * page, the page content must be visible via the pin. We expect the
+	 * pinned page to not get shared with the child.
+	 */
+	{
+		"fork() with an iouring fixed buffer",
+		test_iouring_fork,
+	},
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+	/*
+	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
+	 * When modifying the page via the page table, the page content change
+	 * must be visible via the pin.
+	 */
+	{
+		"R/O GUP pin on R/O-mapped shared page",
+		test_ro_pin_on_shared,
+	},
+	/* Same as above, but using GUP-fast. */
+	{
+		"R/O GUP-fast pin on R/O-mapped shared page",
+		test_ro_fast_pin_on_shared,
+	},
+	/*
+	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
+	 * was previously shared. When modifying the page via the page table,
+	 * the page content change must be visible via the pin.
+	 */
+	{
+		"R/O GUP pin on R/O-mapped previously-shared page",
+		test_ro_pin_on_ro_previously_shared,
+	},
+	/* Same as above, but using GUP-fast. */
+	{
+		"R/O GUP-fast pin on R/O-mapped previously-shared page",
+		test_ro_fast_pin_on_ro_previously_shared,
+	},
+	/*
+	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
+	 * When modifying the page via the page table, the page content change
+	 * must be visible via the pin.
+	 */
+	{
+		"R/O GUP pin on R/O-mapped exclusive page",
+		test_ro_pin_on_ro_exclusive,
+	},
+	/* Same as above, but using GUP-fast. */
+	{
+		"R/O GUP-fast pin on R/O-mapped exclusive page",
+		test_ro_fast_pin_on_ro_exclusive,
+	},
+};
+
+static void run_anon_test_case(struct test_case const *test_case)
+{
+	int i;
+
+	run_with_base_page(test_case->fn, test_case->desc);
+	run_with_base_page_swap(test_case->fn, test_case->desc);
+	if (thpsize) {
+		run_with_thp(test_case->fn, test_case->desc);
+		run_with_thp_swap(test_case->fn, test_case->desc);
+		run_with_pte_mapped_thp(test_case->fn, test_case->desc);
+		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
+		run_with_single_pte_of_thp(test_case->fn, test_case->desc);
+		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
+		run_with_partial_mremap_thp(test_case->fn, test_case->desc);
+		run_with_partial_shared_thp(test_case->fn, test_case->desc);
+	}
+	for (i = 0; i < nr_hugetlbsizes; i++)
+		run_with_hugetlb(test_case->fn, test_case->desc,
+				 hugetlbsizes[i]);
+}
+
+static void run_anon_test_cases(void)
+{
+	int i;
+
+	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
+
+	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
+		run_anon_test_case(&anon_test_cases[i]);
+}
+
+static int tests_per_anon_test_case(void)
+{
+	int tests = 2 + nr_hugetlbsizes;
+
+	if (thpsize)
+		tests += 8;
+	return tests;
+}
+
+enum anon_thp_collapse_test {
+	ANON_THP_COLLAPSE_UNSHARED,
+	ANON_THP_COLLAPSE_FULLY_SHARED,
+	ANON_THP_COLLAPSE_LOWER_SHARED,
+	ANON_THP_COLLAPSE_UPPER_SHARED,
+};
+
+static void do_test_anon_thp_collapse(char *mem, size_t size,
+				      enum anon_thp_collapse_test test)
+{
+	struct comm_pipes comm_pipes;
+	char buf;
+	int ret;
+
+	ret = setup_comm_pipes(&comm_pipes);
+	if (ret) {
+		ksft_test_result_fail("pipe() failed\n");
+		return;
+	}
+
+	/*
+	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
+	 * R/O, such that we can try collapsing it later.
+	 */
+	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+	if (ret) {
+		ksft_test_result_fail("mprotect() failed\n");
+		goto close_comm_pipes;
+	}
+	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+	if (ret) {
+		ksft_test_result_fail("mprotect() failed\n");
+		goto close_comm_pipes;
+	}
+
+	switch (test) {
+	case ANON_THP_COLLAPSE_UNSHARED:
+		/* Collapse before actually COW-sharing the page. */
+		ret = madvise(mem, size, MADV_COLLAPSE);
+		if (ret) {
+			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
+					      strerror(errno));
+			goto close_comm_pipes;
+		}
+		break;
+	case ANON_THP_COLLAPSE_FULLY_SHARED:
+		/* COW-share the full PTE-mapped THP. */
+		break;
+	case ANON_THP_COLLAPSE_LOWER_SHARED:
+		/* Don't COW-share the upper part of the THP. */
+		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
+		if (ret) {
+			ksft_test_result_fail("MADV_DONTFORK failed\n");
+			goto close_comm_pipes;
+		}
+		break;
+	case ANON_THP_COLLAPSE_UPPER_SHARED:
+		/* Don't COW-share the lower part of the THP. */
+		ret = madvise(mem, size / 2, MADV_DONTFORK);
+		if (ret) {
+			ksft_test_result_fail("MADV_DONTFORK failed\n");
+			goto close_comm_pipes;
+		}
+		break;
+	default:
+		assert(false);
+	}
+
+	ret = fork();
+	if (ret < 0) {
+		ksft_test_result_fail("fork() failed\n");
+		goto close_comm_pipes;
+	} else if (!ret) {
+		switch (test) {
+		case ANON_THP_COLLAPSE_UNSHARED:
+		case ANON_THP_COLLAPSE_FULLY_SHARED:
+			exit(child_memcmp_fn(mem, size, &comm_pipes));
+			break;
+		case ANON_THP_COLLAPSE_LOWER_SHARED:
+			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
+			break;
+		case ANON_THP_COLLAPSE_UPPER_SHARED:
+			exit(child_memcmp_fn(mem + size / 2, size / 2,
+					     &comm_pipes));
+			break;
+		default:
+			assert(false);
+		}
+	}
+
+	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+		;
+
+	switch (test) {
+	case ANON_THP_COLLAPSE_UNSHARED:
+		break;
+	case ANON_THP_COLLAPSE_UPPER_SHARED:
+	case ANON_THP_COLLAPSE_LOWER_SHARED:
+		/*
+		 * Revert MADV_DONTFORK such that we merge the VMAs and are
+		 * able to actually collapse.
+		 */
+		ret = madvise(mem, size, MADV_DOFORK);
+		if (ret) {
+			ksft_test_result_fail("MADV_DOFORK failed\n");
+			write(comm_pipes.parent_ready[1], "0", 1);
+			wait(&ret);
+			goto close_comm_pipes;
+		}
+		/* FALLTHROUGH */
+	case ANON_THP_COLLAPSE_FULLY_SHARED:
+		/* Collapse before anyone modified the COW-shared page. */
+		ret = madvise(mem, size, MADV_COLLAPSE);
+		if (ret) {
+			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
+					      strerror(errno));
+			write(comm_pipes.parent_ready[1], "0", 1);
+			wait(&ret);
+			goto close_comm_pipes;
+		}
+		break;
+	default:
+		assert(false);
+	}
+
+	/* Modify the page. */
+	memset(mem, 0xff, size);
+	write(comm_pipes.parent_ready[1], "0", 1);
+
+	wait(&ret);
+	if (WIFEXITED(ret))
+		ret = WEXITSTATUS(ret);
+	else
+		ret = -EINVAL;
+
+	ksft_test_result(!ret, "No leak from parent into child\n");
+close_comm_pipes:
+	close_comm_pipes(&comm_pipes);
+}
+
+static void test_anon_thp_collapse_unshared(char *mem, size_t size)
+{
+	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
+}
+
+static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
+{
+	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
+}
+
+static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
+{
+	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
+}
+
+static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
+{
+	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
+}
+
+/*
+ * Test cases that are specific to anonymous THP: pages in private mappings
+ * that may get shared via COW during fork().
+ */
+static const struct test_case anon_thp_test_cases[] = {
+	/*
+	 * Basic COW test for fork() without any GUP when collapsing a THP
+	 * before fork().
+	 *
+	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
+	 * collapse") might easily get COW handling wrong when not collapsing
+	 * exclusivity information properly.
+	 */
+	{
+		"Basic COW after fork() when collapsing before fork()",
+		test_anon_thp_collapse_unshared,
+	},
+	/* Basic COW test, but collapse after COW-sharing a full THP. */
+	{
+		"Basic COW after fork() when collapsing after fork() (fully shared)",
+		test_anon_thp_collapse_fully_shared,
+	},
+	/*
+	 * Basic COW test, but collapse after COW-sharing the lower half of a
+	 * THP.
+	 */
+	{
+		"Basic COW after fork() when collapsing after fork() (lower shared)",
+		test_anon_thp_collapse_lower_shared,
+	},
+	/*
+	 * Basic COW test, but collapse after COW-sharing the upper half of a
+	 * THP.
+	 */
+	{
+		"Basic COW after fork() when collapsing after fork() (upper shared)",
+		test_anon_thp_collapse_upper_shared,
+	},
+};
+
+static void run_anon_thp_test_cases(void)
+{
+	int i;
+
+	if (!thpsize)
+		return;
+
+	ksft_print_msg("[INFO] Anonymous THP tests\n");
+
+	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
+		struct test_case const *test_case = &anon_thp_test_cases[i];
+
+		ksft_print_msg("[RUN] %s\n", test_case->desc);
+		do_run_with_thp(test_case->fn, THP_RUN_PMD);
+	}
+}
+
+static int tests_per_anon_thp_test_case(void)
+{
+	return thpsize ? 1 : 0;
+}
+
+typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
+
+static void test_cow(char *mem, const char *smem, size_t size)
+{
+	char *old = malloc(size);
+
+	/* Backup the original content. */
+	memcpy(old, smem, size);
+
+	/* Modify the page. */
+	memset(mem, 0xff, size);
+
+	/* See if we still read the old values via the other mapping. */
+	ksft_test_result(!memcmp(smem, old, size),
+			 "Other mapping not modified\n");
+	free(old);
+}
+
+static void test_ro_pin(char *mem, const char *smem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
+}
+
+static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
+}
+
+static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, tmp;
+
+	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
+
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return;
+	}
+
+	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Read from the page to populate the shared zeropage. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+	if (smem != MAP_FAILED)
+		munmap(smem, pagesize);
+}
+
+static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
+	size_t mmap_size;
+	int ret;
+
+	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
+
+	if (!has_huge_zeropage) {
+		ksft_test_result_skip("Huge zeropage not enabled\n");
+		return;
+	}
+
+	/* For alignment purposes, we need twice the thp size. */
+	mmap_size = 2 * thpsize;
+	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mmap_mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return;
+	}
+	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mmap_smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* We need a THP-aligned memory area. */
+	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));
+
+	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
+	if (ret) {
+		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+		goto munmap;
+	}
+
+	/*
+	 * Read from the memory to populate the huge shared zeropage. Read from
+	 * the first sub-page and test if we get another sub-page populated
+	 * automatically.
+	 */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
+	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
+		ksft_test_result_skip("Did not get THPs populated\n");
+		goto munmap;
+	}
+
+	fn(mem, smem, thpsize);
+munmap:
+	munmap(mmap_mem, mmap_size);
+	if (mmap_smem != MAP_FAILED)
+		munmap(mmap_smem, mmap_size);
+}
+
+static void run_with_memfd(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, tmp;
+	int fd;
+
+	ksft_print_msg("[RUN] %s ... with memfd\n", desc);
+
+	fd = memfd_create("test", 0);
+	if (fd < 0) {
+		ksft_test_result_fail("memfd_create() failed\n");
+		return;
+	}
+
+	/* File consists of a single page filled with zeroes. */
+	if (fallocate(fd, 0, 0, pagesize)) {
+		ksft_test_result_fail("fallocate() failed\n");
+		goto close;
+	}
+
+	/* Create a private mapping of the memfd. */
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto close;
+	}
+	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Fault the page in. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+	if (smem != MAP_FAILED)
+		munmap(smem, pagesize);
+close:
+	close(fd);
+}
+
+static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, tmp;
+	FILE *file;
+	int fd;
+
+	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
+
+	file = tmpfile();
+	if (!file) {
+		ksft_test_result_fail("tmpfile() failed\n");
+		return;
+	}
+
+	fd = fileno(file);
+	if (fd < 0) {
+		ksft_test_result_skip("fileno() failed\n");
+		return;
+	}
+
+	/* File consists of a single page filled with zeroes. */
+	if (fallocate(fd, 0, 0, pagesize)) {
+		ksft_test_result_fail("fallocate() failed\n");
+		goto close;
+	}
+
+	/* Create a private mapping of the memfd. */
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto close;
+	}
+	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Fault the page in. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+	if (smem != MAP_FAILED)
+		munmap(smem, pagesize);
+close:
+	fclose(file);
+}
+
+static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
+				   size_t hugetlbsize)
+{
+	int flags = MFD_HUGETLB;
+	char *mem, *smem, tmp;
+	int fd;
+
+	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
+		       hugetlbsize / 1024);
+
+	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
+
+	fd = memfd_create("test", flags);
+	if (fd < 0) {
+		ksft_test_result_skip("memfd_create() failed\n");
+		return;
+	}
+
+	/* File consists of a single page filled with zeroes. */
+	if (fallocate(fd, 0, 0, hugetlbsize)) {
+		ksft_test_result_skip("need more free huge pages\n");
+		goto close;
+	}
+
+	/* Create a private mapping of the memfd. */
+	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
+		   0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_skip("need more free huge pages\n");
+		goto close;
+	}
+	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Fault the page in. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, hugetlbsize);
+munmap:
+	munmap(mem, hugetlbsize);
+	if (mem != MAP_FAILED)
+		munmap(smem, hugetlbsize);
+close:
+	close(fd);
+}
+
+struct non_anon_test_case {
+	const char *desc;
+	non_anon_test_fn fn;
+};
+
+/*
+ * Test cases that target any pages in private mappings that are not anonymous:
+ * pages that may get shared via COW ndependent of fork(). This includes
+ * the shared zeropage(s), pagecache pages, ...
+ */
+static const struct non_anon_test_case non_anon_test_cases[] = {
+	/*
+	 * Basic COW test without any GUP. If we miss to break COW, changes are
+	 * visible via other private/shared mappings.
+	 */
+	{
+		"Basic COW",
+		test_cow,
+	},
+	/*
+	 * Take a R/O longterm pin. When modifying the page via the page table,
+	 * the page content change must be visible via the pin.
+	 */
+	{
+		"R/O longterm GUP pin",
+		test_ro_pin,
+	},
+	/* Same as above, but using GUP-fast. */
+	{
+		"R/O longterm GUP-fast pin",
+		test_ro_fast_pin,
+	},
+};
+
+static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
+{
+	int i;
+
+	run_with_zeropage(test_case->fn, test_case->desc);
+	run_with_memfd(test_case->fn, test_case->desc);
+	run_with_tmpfile(test_case->fn, test_case->desc);
+	if (thpsize)
+		run_with_huge_zeropage(test_case->fn, test_case->desc);
+	for (i = 0; i < nr_hugetlbsizes; i++)
+		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
+				       hugetlbsizes[i]);
+}
+
+static void run_non_anon_test_cases(void)
+{
+	int i;
+
+	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
+
+	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
+		run_non_anon_test_case(&non_anon_test_cases[i]);
+}
+
+static int tests_per_non_anon_test_case(void)
+{
+	int tests = 3 + nr_hugetlbsizes;
+
+	if (thpsize)
+		tests += 1;
+	return tests;
+}
+
+int main(int argc, char **argv)
+{
+	int err;
+
+	pagesize = getpagesize();
+	detect_thpsize();
+	detect_hugetlbsizes();
+	detect_huge_zeropage();
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
+		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
+		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
+
+	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+	if (pagemap_fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+
+	run_anon_test_cases();
+	run_anon_thp_test_cases();
+	run_non_anon_test_cases();
+
+	err = ksft_get_fail_cnt();
+	if (err)
+		ksft_exit_fail_msg("%d out of %d tests failed\n",
+				   err, ksft_test_num());
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c
new file mode 100644
index 000000000000..e43879291dac
--- /dev/null
+++ b/tools/testing/selftests/mm/gup_test.c
@@ -0,0 +1,271 @@
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <assert.h>
+#include <mm/gup_test.h>
+#include "../kselftest.h"
+
+#include "util.h"
+
+#define MB (1UL << 20)
+
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_TOUCH	0x02	/* mark page accessed */
+
+#define GUP_TEST_FILE "/sys/kernel/debug/gup_test"
+
+static unsigned long cmd = GUP_FAST_BENCHMARK;
+static int gup_fd, repeats = 1;
+static unsigned long size = 128 * MB;
+/* Serialize prints */
+static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static char *cmd_to_str(unsigned long cmd)
+{
+	switch (cmd) {
+	case GUP_FAST_BENCHMARK:
+		return "GUP_FAST_BENCHMARK";
+	case PIN_FAST_BENCHMARK:
+		return "PIN_FAST_BENCHMARK";
+	case PIN_LONGTERM_BENCHMARK:
+		return "PIN_LONGTERM_BENCHMARK";
+	case GUP_BASIC_TEST:
+		return "GUP_BASIC_TEST";
+	case PIN_BASIC_TEST:
+		return "PIN_BASIC_TEST";
+	case DUMP_USER_PAGES_TEST:
+		return "DUMP_USER_PAGES_TEST";
+	}
+	return "Unknown command";
+}
+
+void *gup_thread(void *data)
+{
+	struct gup_test gup = *(struct gup_test *)data;
+	int i;
+
+	/* Only report timing information on the *_BENCHMARK commands: */
+	if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) ||
+	     (cmd == PIN_LONGTERM_BENCHMARK)) {
+		for (i = 0; i < repeats; i++) {
+			gup.size = size;
+			if (ioctl(gup_fd, cmd, &gup))
+				perror("ioctl"), exit(1);
+
+			pthread_mutex_lock(&print_mutex);
+			printf("%s: Time: get:%lld put:%lld us",
+			       cmd_to_str(cmd), gup.get_delta_usec,
+			       gup.put_delta_usec);
+			if (gup.size != size)
+				printf(", truncated (size: %lld)", gup.size);
+			printf("\n");
+			pthread_mutex_unlock(&print_mutex);
+		}
+	} else {
+		gup.size = size;
+		if (ioctl(gup_fd, cmd, &gup)) {
+			perror("ioctl");
+			exit(1);
+		}
+
+		pthread_mutex_lock(&print_mutex);
+		printf("%s: done\n", cmd_to_str(cmd));
+		if (gup.size != size)
+			printf("Truncated (size: %lld)\n", gup.size);
+		pthread_mutex_unlock(&print_mutex);
+	}
+
+	return NULL;
+}
+
+int main(int argc, char **argv)
+{
+	struct gup_test gup = { 0 };
+	int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret;
+	int flags = MAP_PRIVATE, touch = 0;
+	char *file = "/dev/zero";
+	pthread_t *tid;
+	char *p;
+
+	while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) {
+		switch (opt) {
+		case 'a':
+			cmd = PIN_FAST_BENCHMARK;
+			break;
+		case 'b':
+			cmd = PIN_BASIC_TEST;
+			break;
+		case 'L':
+			cmd = PIN_LONGTERM_BENCHMARK;
+			break;
+		case 'c':
+			cmd = DUMP_USER_PAGES_TEST;
+			/*
+			 * Dump page 0 (index 1). May be overridden later, by
+			 * user's non-option arguments.
+			 *
+			 * .which_pages is zero-based, so that zero can mean "do
+			 * nothing".
+			 */
+			gup.which_pages[0] = 1;
+			break;
+		case 'p':
+			/* works only with DUMP_USER_PAGES_TEST */
+			gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN;
+			break;
+		case 'F':
+			/* strtol, so you can pass flags in hex form */
+			gup.gup_flags = strtol(optarg, 0, 0);
+			break;
+		case 'j':
+			nthreads = atoi(optarg);
+			break;
+		case 'm':
+			size = atoi(optarg) * MB;
+			break;
+		case 'r':
+			repeats = atoi(optarg);
+			break;
+		case 'n':
+			nr_pages = atoi(optarg);
+			break;
+		case 't':
+			thp = 1;
+			break;
+		case 'T':
+			thp = 0;
+			break;
+		case 'U':
+			cmd = GUP_BASIC_TEST;
+			break;
+		case 'u':
+			cmd = GUP_FAST_BENCHMARK;
+			break;
+		case 'w':
+			write = 1;
+			break;
+		case 'W':
+			write = 0;
+			break;
+		case 'f':
+			file = optarg;
+			break;
+		case 'S':
+			flags &= ~MAP_PRIVATE;
+			flags |= MAP_SHARED;
+			break;
+		case 'H':
+			flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
+			break;
+		case 'z':
+			/* fault pages in gup, do not fault in userland */
+			touch = 1;
+			break;
+		default:
+			return -1;
+		}
+	}
+
+	if (optind < argc) {
+		int extra_arg_count = 0;
+		/*
+		 * For example:
+		 *
+		 *   ./gup_test -c 0 1 0x1001
+		 *
+		 * ...to dump pages 0, 1, and 4097
+		 */
+
+		while ((optind < argc) &&
+		       (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) {
+			/*
+			 * Do the 1-based indexing here, so that the user can
+			 * use normal 0-based indexing on the command line.
+			 */
+			long page_index = strtol(argv[optind], 0, 0) + 1;
+
+			gup.which_pages[extra_arg_count] = page_index;
+			extra_arg_count++;
+			optind++;
+		}
+	}
+
+	filed = open(file, O_RDWR|O_CREAT);
+	if (filed < 0) {
+		perror("open");
+		exit(filed);
+	}
+
+	gup.nr_pages_per_call = nr_pages;
+	if (write)
+		gup.gup_flags |= FOLL_WRITE;
+
+	gup_fd = open(GUP_TEST_FILE, O_RDWR);
+	if (gup_fd == -1) {
+		switch (errno) {
+		case EACCES:
+			if (getuid())
+				printf("Please run this test as root\n");
+			break;
+		case ENOENT:
+			if (opendir("/sys/kernel/debug") == NULL) {
+				printf("mount debugfs at /sys/kernel/debug\n");
+				break;
+			}
+			printf("check if CONFIG_GUP_TEST is enabled in kernel config\n");
+			break;
+		default:
+			perror("failed to open " GUP_TEST_FILE);
+			break;
+		}
+		exit(KSFT_SKIP);
+	}
+
+	p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
+	if (p == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	gup.addr = (unsigned long)p;
+
+	if (thp == 1)
+		madvise(p, size, MADV_HUGEPAGE);
+	else if (thp == 0)
+		madvise(p, size, MADV_NOHUGEPAGE);
+
+	/*
+	 * FOLL_TOUCH, in gup_test, is used as an either/or case: either
+	 * fault pages in from the kernel via FOLL_TOUCH, or fault them
+	 * in here, from user space. This allows comparison of performance
+	 * between those two cases.
+	 */
+	if (touch) {
+		gup.gup_flags |= FOLL_TOUCH;
+	} else {
+		for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+			p[0] = 0;
+	}
+
+	tid = malloc(sizeof(pthread_t) * nthreads);
+	assert(tid);
+	for (i = 0; i < nthreads; i++) {
+		ret = pthread_create(&tid[i], NULL, gup_thread, &gup);
+		assert(ret == 0);
+	}
+	for (i = 0; i < nthreads; i++) {
+		ret = pthread_join(tid[i], NULL);
+		assert(ret == 0);
+	}
+	free(tid);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
new file mode 100644
index 000000000000..4adaad1b822f
--- /dev/null
+++ b/tools/testing/selftests/mm/hmm-tests.c
@@ -0,0 +1,2054 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HMM stands for Heterogeneous Memory Management, it is a helper layer inside
+ * the linux kernel to help device drivers mirror a process address space in
+ * the device. This allows the device to use the same address space which
+ * makes communication and data exchange a lot easier.
+ *
+ * This framework's sole purpose is to exercise various code paths inside
+ * the kernel to make sure that HMM performs as expected and to flush out any
+ * bugs.
+ */
+
+#include "../kselftest_harness.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <strings.h>
+#include <time.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+
+/*
+ * This is a private UAPI to the kernel test module so it isn't exported
+ * in the usual include/uapi/... directory.
+ */
+#include <lib/test_hmm_uapi.h>
+#include <mm/gup_test.h>
+
+struct hmm_buffer {
+	void		*ptr;
+	void		*mirror;
+	unsigned long	size;
+	int		fd;
+	uint64_t	cpages;
+	uint64_t	faults;
+};
+
+enum {
+	HMM_PRIVATE_DEVICE_ONE,
+	HMM_PRIVATE_DEVICE_TWO,
+	HMM_COHERENCE_DEVICE_ONE,
+	HMM_COHERENCE_DEVICE_TWO,
+};
+
+#define TWOMEG		(1 << 21)
+#define HMM_BUFFER_SIZE (1024 << 12)
+#define HMM_PATH_MAX    64
+#define NTIMES		10
+
+#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_LONGTERM   0x10000 /* mapping lifetime is indefinite */
+
+FIXTURE(hmm)
+{
+	int		fd;
+	unsigned int	page_size;
+	unsigned int	page_shift;
+};
+
+FIXTURE_VARIANT(hmm)
+{
+	int     device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+	.device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+	.device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
+FIXTURE(hmm2)
+{
+	int		fd0;
+	int		fd1;
+	unsigned int	page_size;
+	unsigned int	page_shift;
+};
+
+FIXTURE_VARIANT(hmm2)
+{
+	int     device_number0;
+	int     device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+	.device_number0 = HMM_PRIVATE_DEVICE_ONE,
+	.device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+	.device_number0 = HMM_COHERENCE_DEVICE_ONE,
+	.device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
+static int hmm_open(int unit)
+{
+	char pathname[HMM_PATH_MAX];
+	int fd;
+
+	snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit);
+	fd = open(pathname, O_RDWR, 0);
+	if (fd < 0)
+		fprintf(stderr, "could not open hmm dmirror driver (%s)\n",
+			pathname);
+	return fd;
+}
+
+static bool hmm_is_coherent_type(int dev_num)
+{
+	return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
+FIXTURE_SETUP(hmm)
+{
+	self->page_size = sysconf(_SC_PAGE_SIZE);
+	self->page_shift = ffs(self->page_size) - 1;
+
+	self->fd = hmm_open(variant->device_number);
+	if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+		SKIP(exit(0), "DEVICE_COHERENT not available");
+	ASSERT_GE(self->fd, 0);
+}
+
+FIXTURE_SETUP(hmm2)
+{
+	self->page_size = sysconf(_SC_PAGE_SIZE);
+	self->page_shift = ffs(self->page_size) - 1;
+
+	self->fd0 = hmm_open(variant->device_number0);
+	if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+		SKIP(exit(0), "DEVICE_COHERENT not available");
+	ASSERT_GE(self->fd0, 0);
+	self->fd1 = hmm_open(variant->device_number1);
+	ASSERT_GE(self->fd1, 0);
+}
+
+FIXTURE_TEARDOWN(hmm)
+{
+	int ret = close(self->fd);
+
+	ASSERT_EQ(ret, 0);
+	self->fd = -1;
+}
+
+FIXTURE_TEARDOWN(hmm2)
+{
+	int ret = close(self->fd0);
+
+	ASSERT_EQ(ret, 0);
+	self->fd0 = -1;
+
+	ret = close(self->fd1);
+	ASSERT_EQ(ret, 0);
+	self->fd1 = -1;
+}
+
+static int hmm_dmirror_cmd(int fd,
+			   unsigned long request,
+			   struct hmm_buffer *buffer,
+			   unsigned long npages)
+{
+	struct hmm_dmirror_cmd cmd;
+	int ret;
+
+	/* Simulate a device reading system memory. */
+	cmd.addr = (__u64)buffer->ptr;
+	cmd.ptr = (__u64)buffer->mirror;
+	cmd.npages = npages;
+
+	for (;;) {
+		ret = ioctl(fd, request, &cmd);
+		if (ret == 0)
+			break;
+		if (errno == EINTR)
+			continue;
+		return -errno;
+	}
+	buffer->cpages = cmd.cpages;
+	buffer->faults = cmd.faults;
+
+	return 0;
+}
+
+static void hmm_buffer_free(struct hmm_buffer *buffer)
+{
+	if (buffer == NULL)
+		return;
+
+	if (buffer->ptr)
+		munmap(buffer->ptr, buffer->size);
+	free(buffer->mirror);
+	free(buffer);
+}
+
+/*
+ * Create a temporary file that will be deleted on close.
+ */
+static int hmm_create_file(unsigned long size)
+{
+	char path[HMM_PATH_MAX];
+	int fd;
+
+	strcpy(path, "/tmp");
+	fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600);
+	if (fd >= 0) {
+		int r;
+
+		do {
+			r = ftruncate(fd, size);
+		} while (r == -1 && errno == EINTR);
+		if (!r)
+			return fd;
+		close(fd);
+	}
+	return -1;
+}
+
+/*
+ * Return a random unsigned number.
+ */
+static unsigned int hmm_random(void)
+{
+	static int fd = -1;
+	unsigned int r;
+
+	if (fd < 0) {
+		fd = open("/dev/urandom", O_RDONLY);
+		if (fd < 0) {
+			fprintf(stderr, "%s:%d failed to open /dev/urandom\n",
+					__FILE__, __LINE__);
+			return ~0U;
+		}
+	}
+	read(fd, &r, sizeof(r));
+	return r;
+}
+
+static void hmm_nanosleep(unsigned int n)
+{
+	struct timespec t;
+
+	t.tv_sec = 0;
+	t.tv_nsec = n;
+	nanosleep(&t, NULL);
+}
+
+static int hmm_migrate_sys_to_dev(int fd,
+				   struct hmm_buffer *buffer,
+				   unsigned long npages)
+{
+	return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+static int hmm_migrate_dev_to_sys(int fd,
+				   struct hmm_buffer *buffer,
+				   unsigned long npages)
+{
+	return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
+/*
+ * Simple NULL test of device open/close.
+ */
+TEST_F(hmm, open_close)
+{
+}
+
+/*
+ * Read private anonymous memory.
+ */
+TEST_F(hmm, anon_read)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	int val;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/*
+	 * Initialize buffer in system memory but leave the first two pages
+	 * zero (pte_none and pfn_zero).
+	 */
+	i = 2 * self->page_size / sizeof(*ptr);
+	for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Set buffer permission to read-only. */
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Populate the CPU page table with a special zero page. */
+	val = *(int *)(buffer->ptr + self->page_size);
+	ASSERT_EQ(val, 0);
+
+	/* Simulate a device reading system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device read. */
+	ptr = buffer->mirror;
+	for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], 0);
+	for (; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Read private anonymous memory which has been protected with
+ * mprotect() PROT_NONE.
+ */
+TEST_F(hmm, anon_read_prot)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Initialize mirror buffer so we can verify it isn't written. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = -i;
+
+	/* Protect buffer from reading. */
+	ret = mprotect(buffer->ptr, size, PROT_NONE);
+	ASSERT_EQ(ret, 0);
+
+	/* Simulate a device reading system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, -EFAULT);
+
+	/* Allow CPU to read the buffer so we can check it. */
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Write private anonymous memory.
+ */
+TEST_F(hmm, anon_write)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Write private anonymous memory which has been protected with
+ * mprotect() PROT_READ.
+ */
+TEST_F(hmm, anon_write_prot)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Simulate a device reading a zero page of memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, 1);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, -EPERM);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], 0);
+
+	/* Now allow writing and see that the zero page is replaced. */
+	ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Check that a device writing an anonymous private mapping
+ * will copy-on-write if a child process inherits the mapping.
+ */
+TEST_F(hmm, anon_write_child)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	pid_t pid;
+	int child_fd;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer->ptr so we can tell if it is written. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = -i;
+
+	pid = fork();
+	if (pid == -1)
+		ASSERT_EQ(pid, 0);
+	if (pid != 0) {
+		waitpid(pid, &ret, 0);
+		ASSERT_EQ(WIFEXITED(ret), 1);
+
+		/* Check that the parent's buffer did not change. */
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ASSERT_EQ(ptr[i], i);
+		return;
+	}
+
+	/* Check that we see the parent's values. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	/* The child process needs its own mirror to its own mm. */
+	child_fd = hmm_open(0);
+	ASSERT_GE(child_fd, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	close(child_fd);
+	exit(0);
+}
+
+/*
+ * Check that a device writing an anonymous shared mapping
+ * will not copy-on-write if a child process inherits the mapping.
+ */
+TEST_F(hmm, anon_write_child_shared)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	pid_t pid;
+	int child_fd;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer->ptr so we can tell if it is written. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = -i;
+
+	pid = fork();
+	if (pid == -1)
+		ASSERT_EQ(pid, 0);
+	if (pid != 0) {
+		waitpid(pid, &ret, 0);
+		ASSERT_EQ(WIFEXITED(ret), 1);
+
+		/* Check that the parent's buffer did change. */
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ASSERT_EQ(ptr[i], -i);
+		return;
+	}
+
+	/* Check that we see the parent's values. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	/* The child process needs its own mirror to its own mm. */
+	child_fd = hmm_open(0);
+	ASSERT_GE(child_fd, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	close(child_fd);
+	exit(0);
+}
+
+/*
+ * Write private anonymous huge page.
+ */
+TEST_F(hmm, anon_write_huge)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	void *old_ptr;
+	void *map;
+	int *ptr;
+	int ret;
+
+	size = 2 * TWOMEG;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	size = TWOMEG;
+	npages = size >> self->page_shift;
+	map = (void *)ALIGN((uintptr_t)buffer->ptr, size);
+	ret = madvise(map, size, MADV_HUGEPAGE);
+	ASSERT_EQ(ret, 0);
+	old_ptr = buffer->ptr;
+	buffer->ptr = map;
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	buffer->ptr = old_ptr;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Read numeric data from raw and tagged kernel status files.  Used to read
+ * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag).
+ */
+static long file_read_ulong(char *file, const char *tag)
+{
+	int fd;
+	char buf[2048];
+	int len;
+	char *p, *q;
+	long val;
+
+	fd = open(file, O_RDONLY);
+	if (fd < 0) {
+		/* Error opening the file */
+		return -1;
+	}
+
+	len = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (len < 0) {
+		/* Error in reading the file */
+		return -1;
+	}
+	if (len == sizeof(buf)) {
+		/* Error file is too large */
+		return -1;
+	}
+	buf[len] = '\0';
+
+	/* Search for a tag if provided */
+	if (tag) {
+		p = strstr(buf, tag);
+		if (!p)
+			return -1; /* looks like the line we want isn't there */
+		p += strlen(tag);
+	} else
+		p = buf;
+
+	val = strtol(p, &q, 0);
+	if (*q != ' ') {
+		/* Error parsing the file */
+		return -1;
+	}
+
+	return val;
+}
+
+/*
+ * Write huge TLBFS page.
+ */
+TEST_F(hmm, anon_write_hugetlbfs)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long default_hsize;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
+	if (default_hsize < 0 || default_hsize*1024 < default_hsize)
+		SKIP(return, "Huge page size could not be determined");
+	default_hsize = default_hsize*1024; /* KB to B */
+
+	size = ALIGN(TWOMEG, default_hsize);
+	npages = size >> self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+				   PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+				   -1, 0);
+	if (buffer->ptr == MAP_FAILED) {
+		free(buffer);
+		SKIP(return, "Huge page could not be allocated");
+	}
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	munmap(buffer->ptr, buffer->size);
+	buffer->ptr = NULL;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Read mmap'ed file memory.
+ */
+TEST_F(hmm, file_read)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	int fd;
+	ssize_t len;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	fd = hmm_create_file(size);
+	ASSERT_GE(fd, 0);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = fd;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Write initial contents of the file. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+	len = pwrite(fd, buffer->mirror, size, 0);
+	ASSERT_EQ(len, size);
+	memset(buffer->mirror, 0, size);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ,
+			   MAP_SHARED,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Simulate a device reading system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Write mmap'ed file memory.
+ */
+TEST_F(hmm, file_write)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	int fd;
+	ssize_t len;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	fd = hmm_create_file(size);
+	ASSERT_GE(fd, 0);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = fd;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Check that the device also wrote the file. */
+	len = pread(fd, buffer->mirror, size, 0);
+	ASSERT_EQ(len, size);
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device private memory.
+ */
+TEST_F(hmm, migrate)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device private memory and fault some of it back
+ * to system memory, then try migrating the resulting mix of system and device
+ * private memory to the device.
+ */
+TEST_F(hmm, migrate_fault)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Fault half the pages back to system memory and check them. */
+	for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Migrate memory to the device again. */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+TEST_F(hmm, migrate_release)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Release device memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+
+	/* Fault pages back to system memory and check them. */
+	for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous shared memory to device private memory.
+ */
+TEST_F(hmm, migrate_shared)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Migrate memory to device. */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, -ENOENT);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Try to migrate various memory types to device private memory.
+ */
+TEST_F(hmm2, migrate_mixed)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	int *ptr;
+	unsigned char *p;
+	int ret;
+	int val;
+
+	npages = 6;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Reserve a range of addresses. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_NONE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+	p = buffer->ptr;
+
+	/* Migrating a protected area should be an error. */
+	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
+	ASSERT_EQ(ret, -EINVAL);
+
+	/* Punch a hole after the first page address. */
+	ret = munmap(buffer->ptr + self->page_size, self->page_size);
+	ASSERT_EQ(ret, 0);
+
+	/* We expect an error if the vma doesn't cover the range. */
+	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3);
+	ASSERT_EQ(ret, -EINVAL);
+
+	/* Page 2 will be a read-only zero page. */
+	ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
+				PROT_READ);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 2 * self->page_size);
+	val = *ptr + 3;
+	ASSERT_EQ(val, 3);
+
+	/* Page 3 will be read-only. */
+	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+				PROT_READ | PROT_WRITE);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 3 * self->page_size);
+	*ptr = val;
+	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+				PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Page 4-5 will be read-write. */
+	ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size,
+				PROT_READ | PROT_WRITE);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 4 * self->page_size);
+	*ptr = val;
+	ptr = (int *)(buffer->ptr + 5 * self->page_size);
+	*ptr = val;
+
+	/* Now try to migrate pages 2-5 to device 1. */
+	buffer->ptr = p + 2 * self->page_size;
+	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, 4);
+
+	/* Page 5 won't be migrated to device 0 because it's on device 1. */
+	buffer->ptr = p + 5 * self->page_size;
+	ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
+	ASSERT_EQ(ret, -ENOENT);
+	buffer->ptr = p;
+
+	buffer->ptr = p;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device memory and back to system memory
+ * multiple times. In case of private zone configuration, this is done
+ * through fault pages accessed by CPU. In case of coherent zone configuration,
+ * the pages from the device should be explicitly migrated back to system memory.
+ * The reason is Coherent device zone has coherent access by CPU, therefore
+ * it will not generate any page fault.
+ */
+TEST_F(hmm, migrate_multiple)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	unsigned long c;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	for (c = 0; c < NTIMES; c++) {
+		buffer = malloc(sizeof(*buffer));
+		ASSERT_NE(buffer, NULL);
+
+		buffer->fd = -1;
+		buffer->size = size;
+		buffer->mirror = malloc(size);
+		ASSERT_NE(buffer->mirror, NULL);
+
+		buffer->ptr = mmap(NULL, size,
+				   PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS,
+				   buffer->fd, 0);
+		ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+		/* Initialize buffer in system memory. */
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ptr[i] = i;
+
+		/* Migrate memory to device. */
+		ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+		ASSERT_EQ(ret, 0);
+		ASSERT_EQ(buffer->cpages, npages);
+
+		/* Check what the device read. */
+		for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+			ASSERT_EQ(ptr[i], i);
+
+		/* Migrate back to system memory and check them. */
+		if (hmm_is_coherent_type(variant->device_number)) {
+			ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages);
+			ASSERT_EQ(ret, 0);
+			ASSERT_EQ(buffer->cpages, npages);
+		}
+
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ASSERT_EQ(ptr[i], i);
+
+		hmm_buffer_free(buffer);
+	}
+}
+
+/*
+ * Read anonymous memory multiple times.
+ */
+TEST_F(hmm, anon_read_multiple)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	unsigned long c;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	for (c = 0; c < NTIMES; c++) {
+		buffer = malloc(sizeof(*buffer));
+		ASSERT_NE(buffer, NULL);
+
+		buffer->fd = -1;
+		buffer->size = size;
+		buffer->mirror = malloc(size);
+		ASSERT_NE(buffer->mirror, NULL);
+
+		buffer->ptr = mmap(NULL, size,
+				   PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS,
+				   buffer->fd, 0);
+		ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+		/* Initialize buffer in system memory. */
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ptr[i] = i + c;
+
+		/* Simulate a device reading system memory. */
+		ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
+				      npages);
+		ASSERT_EQ(ret, 0);
+		ASSERT_EQ(buffer->cpages, npages);
+		ASSERT_EQ(buffer->faults, 1);
+
+		/* Check what the device read. */
+		for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+			ASSERT_EQ(ptr[i], i + c);
+
+		hmm_buffer_free(buffer);
+	}
+}
+
+void *unmap_buffer(void *p)
+{
+	struct hmm_buffer *buffer = p;
+
+	/* Delay for a bit and then unmap buffer while it is being read. */
+	hmm_nanosleep(hmm_random() % 32000);
+	munmap(buffer->ptr + buffer->size / 2, buffer->size / 2);
+	buffer->ptr = NULL;
+
+	return NULL;
+}
+
+/*
+ * Try reading anonymous memory while it is being unmapped.
+ */
+TEST_F(hmm, anon_teardown)
+{
+	unsigned long npages;
+	unsigned long size;
+	unsigned long c;
+	void *ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	for (c = 0; c < NTIMES; ++c) {
+		pthread_t thread;
+		struct hmm_buffer *buffer;
+		unsigned long i;
+		int *ptr;
+		int rc;
+
+		buffer = malloc(sizeof(*buffer));
+		ASSERT_NE(buffer, NULL);
+
+		buffer->fd = -1;
+		buffer->size = size;
+		buffer->mirror = malloc(size);
+		ASSERT_NE(buffer->mirror, NULL);
+
+		buffer->ptr = mmap(NULL, size,
+				   PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS,
+				   buffer->fd, 0);
+		ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+		/* Initialize buffer in system memory. */
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ptr[i] = i + c;
+
+		rc = pthread_create(&thread, NULL, unmap_buffer, buffer);
+		ASSERT_EQ(rc, 0);
+
+		/* Simulate a device reading system memory. */
+		rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
+				     npages);
+		if (rc == 0) {
+			ASSERT_EQ(buffer->cpages, npages);
+			ASSERT_EQ(buffer->faults, 1);
+
+			/* Check what the device read. */
+			for (i = 0, ptr = buffer->mirror;
+			     i < size / sizeof(*ptr);
+			     ++i)
+				ASSERT_EQ(ptr[i], i + c);
+		}
+
+		pthread_join(thread, &ret);
+		hmm_buffer_free(buffer);
+	}
+}
+
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ */
+TEST_F(hmm, mixedmap)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned char *m;
+	int ret;
+
+	npages = 1;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(npages);
+	ASSERT_NE(buffer->mirror, NULL);
+
+
+	/* Reserve a range of addresses. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE,
+			   self->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Simulate a device snapshotting CPU pagetables. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device saw. */
+	m = buffer->mirror;
+	ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ */
+TEST_F(hmm2, snapshot)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	int *ptr;
+	unsigned char *p;
+	unsigned char *m;
+	int ret;
+	int val;
+
+	npages = 7;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(npages);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Reserve a range of addresses. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_NONE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+	p = buffer->ptr;
+
+	/* Punch a hole after the first page address. */
+	ret = munmap(buffer->ptr + self->page_size, self->page_size);
+	ASSERT_EQ(ret, 0);
+
+	/* Page 2 will be read-only zero page. */
+	ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
+				PROT_READ);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 2 * self->page_size);
+	val = *ptr + 3;
+	ASSERT_EQ(val, 3);
+
+	/* Page 3 will be read-only. */
+	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+				PROT_READ | PROT_WRITE);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 3 * self->page_size);
+	*ptr = val;
+	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+				PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Page 4-6 will be read-write. */
+	ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size,
+				PROT_READ | PROT_WRITE);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 4 * self->page_size);
+	*ptr = val;
+
+	/* Page 5 will be migrated to device 0. */
+	buffer->ptr = p + 5 * self->page_size;
+	ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, 1);
+
+	/* Page 6 will be migrated to device 1. */
+	buffer->ptr = p + 6 * self->page_size;
+	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, 1);
+
+	/* Simulate a device snapshotting CPU pagetables. */
+	buffer->ptr = p;
+	ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device saw. */
+	m = buffer->mirror;
+	ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR);
+	ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR);
+	ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ);
+	ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ);
+	ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE);
+	if (!hmm_is_coherent_type(variant->device_number0)) {
+		ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL |
+				HMM_DMIRROR_PROT_WRITE);
+		ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE);
+	} else {
+		ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL |
+				HMM_DMIRROR_PROT_WRITE);
+		ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE |
+				HMM_DMIRROR_PROT_WRITE);
+	}
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that
+ * should be mapped by a large page table entry.
+ */
+TEST_F(hmm, compound)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long default_hsize;
+	int *ptr;
+	unsigned char *m;
+	int ret;
+	unsigned long i;
+
+	/* Skip test if we can't allocate a hugetlbfs page. */
+
+	default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
+	if (default_hsize < 0 || default_hsize*1024 < default_hsize)
+		SKIP(return, "Huge page size could not be determined");
+	default_hsize = default_hsize*1024; /* KB to B */
+
+	size = ALIGN(TWOMEG, default_hsize);
+	npages = size >> self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+				   PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+				   -1, 0);
+	if (buffer->ptr == MAP_FAILED) {
+		free(buffer);
+		return;
+	}
+
+	buffer->size = size;
+	buffer->mirror = malloc(npages);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Initialize the pages the device will snapshot in buffer->ptr. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device snapshotting CPU pagetables. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device saw. */
+	m = buffer->mirror;
+	for (i = 0; i < npages; ++i)
+		ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE |
+				HMM_DMIRROR_PROT_PMD);
+
+	/* Make the region read-only. */
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Simulate a device snapshotting CPU pagetables. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device saw. */
+	m = buffer->mirror;
+	for (i = 0; i < npages; ++i)
+		ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ |
+				HMM_DMIRROR_PROT_PMD);
+
+	munmap(buffer->ptr, buffer->size);
+	buffer->ptr = NULL;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Test two devices reading the same memory (double mapped).
+ */
+TEST_F(hmm2, double_map)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = 6;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(npages);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Reserve a range of addresses. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Make region read-only. */
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Simulate device 0 reading system memory. */
+	ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Simulate device 1 reading system memory. */
+	ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Migrate pages to device 1 and try to read from device 0. */
+	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what device 0 read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Basic check of exclusive faulting.
+ */
+TEST_F(hmm, exclusive)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Map memory exclusively for device access. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Fault pages back to system memory and check them. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i]++, i);
+
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i+1);
+
+	/* Check atomic access revoked */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+
+	hmm_buffer_free(buffer);
+}
+
+TEST_F(hmm, exclusive_mprotect)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Map memory exclusively for device access. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, -EPERM);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Check copy-on-write works.
+ */
+TEST_F(hmm, exclusive_cow)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Map memory exclusively for device access. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	fork();
+
+	/* Fault pages back to system memory and check them. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i]++, i);
+
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i+1);
+
+	hmm_buffer_free(buffer);
+}
+
+static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
+			 int npages, int size, int flags)
+{
+	struct gup_test gup = {
+		.nr_pages_per_call	= npages,
+		.addr			= addr,
+		.gup_flags		= FOLL_WRITE | flags,
+		.size			= size,
+	};
+
+	if (ioctl(gup_fd, cmd, &gup)) {
+		perror("ioctl on error\n");
+		return errno;
+	}
+
+	return 0;
+}
+
+/*
+ * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
+ * This should trigger a migration back to system memory for both, private
+ * and coherent type pages.
+ * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
+ * to your configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+	struct hmm_buffer *buffer;
+	int gup_fd;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	unsigned char *m;
+
+	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+	if (gup_fd == -1)
+		SKIP(return, "Skipping test, could not find gup_test driver");
+
+	npages = 4;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	ASSERT_EQ(gup_test_exec(gup_fd,
+				(unsigned long)buffer->ptr,
+				GUP_BASIC_TEST, 1, self->page_size, 0), 0);
+	ASSERT_EQ(gup_test_exec(gup_fd,
+				(unsigned long)buffer->ptr + 1 * self->page_size,
+				GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
+	ASSERT_EQ(gup_test_exec(gup_fd,
+				(unsigned long)buffer->ptr + 2 * self->page_size,
+				PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0);
+	ASSERT_EQ(gup_test_exec(gup_fd,
+				(unsigned long)buffer->ptr + 3 * self->page_size,
+				PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0);
+
+	/* Take snapshot to CPU pagetables */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	m = buffer->mirror;
+	if (hmm_is_coherent_type(variant->device_number)) {
+		ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]);
+		ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]);
+	} else {
+		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
+		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
+	}
+	ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]);
+	ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]);
+	/*
+	 * Check again the content on the pages. Make sure there's no
+	 * corrupted data.
+	 */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	close(gup_fd);
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Test copy-on-write in device pages.
+ * In case of writing to COW private page(s), a page fault will migrate pages
+ * back to system memory first. Then, these pages will be duplicated. In case
+ * of COW device coherent type, pages are duplicated directly from device
+ * memory.
+ */
+TEST_F(hmm, hmm_cow_in_device)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	unsigned char *m;
+	pid_t pid;
+	int status;
+
+	npages = 4;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	pid = fork();
+	if (pid == -1)
+		ASSERT_EQ(pid, 0);
+	if (!pid) {
+		/* Child process waitd for SIGTERM from the parent. */
+		while (1) {
+		}
+		perror("Should not reach this\n");
+		exit(0);
+	}
+	/* Parent process writes to COW pages(s) and gets a
+	 * new copy in system. In case of device private pages,
+	 * this write causes a migration to system mem first.
+	 */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Terminate child and wait */
+	EXPECT_EQ(0, kill(pid, SIGTERM));
+	EXPECT_EQ(pid, waitpid(pid, &status, 0));
+	EXPECT_NE(0, WIFSIGNALED(status));
+	EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+	/* Take snapshot to CPU pagetables */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	m = buffer->mirror;
+	for (i = 0; i < npages; i++)
+		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
+
+	hmm_buffer_free(buffer);
+}
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c
new file mode 100644
index 000000000000..955ef87f382c
--- /dev/null
+++ b/tools/testing/selftests/mm/hugepage-mmap.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mmap:
+ *
+ * Example of using huge page memory in a user application using the mmap
+ * system call.  Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_SHARED | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_SHARED)
+#endif
+
+static void check_bytes(char *addr)
+{
+	printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < LENGTH; i++)
+		*(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+	unsigned long i;
+
+	check_bytes(addr);
+	for (i = 0; i < LENGTH; i++)
+		if (*(addr + i) != (char)i) {
+			printf("Mismatch at %lu\n", i);
+			return 1;
+		}
+	return 0;
+}
+
+int main(void)
+{
+	void *addr;
+	int fd, ret;
+
+	fd = memfd_create("hugepage-mmap", MFD_HUGETLB);
+	if (fd < 0) {
+		perror("memfd_create() failed");
+		exit(1);
+	}
+
+	addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		close(fd);
+		exit(1);
+	}
+
+	printf("Returned address is %p\n", addr);
+	check_bytes(addr);
+	write_bytes(addr);
+	ret = read_bytes(addr);
+
+	munmap(addr, LENGTH);
+	close(fd);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
new file mode 100644
index 000000000000..e53b5eaa8fce
--- /dev/null
+++ b/tools/testing/selftests/mm/hugepage-mremap.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mremap:
+ *
+ * Example of remapping huge page memory in a user application using the
+ * mremap system call.  The path to a file in a hugetlbfs filesystem must
+ * be passed as the last argument to this test.  The amount of memory used
+ * by this test in MBs can optionally be passed as an argument.  If no memory
+ * amount is passed, the default amount is 10MB.
+ *
+ * To make sure the test triggers pmd sharing and goes through the 'unshare'
+ * path in the mremap code use 1GB (1024) or more.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <fcntl.h> /* Definition of O_* constants */
+#include <sys/syscall.h> /* Definition of SYS_* constants */
+#include <linux/userfaultfd.h>
+#include <sys/ioctl.h>
+#include <string.h>
+
+#define DEFAULT_LENGTH_MB 10UL
+#define MB_TO_BYTES(x) (x * 1024 * 1024)
+
+#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
+#define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
+
+static void check_bytes(char *addr)
+{
+	printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr, size_t len)
+{
+	unsigned long i;
+
+	for (i = 0; i < len; i++)
+		*(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr, size_t len)
+{
+	unsigned long i;
+
+	check_bytes(addr);
+	for (i = 0; i < len; i++)
+		if (*(addr + i) != (char)i) {
+			printf("Mismatch at %lu\n", i);
+			return 1;
+		}
+	return 0;
+}
+
+static void register_region_with_uffd(char *addr, size_t len)
+{
+	long uffd; /* userfaultfd file descriptor */
+	struct uffdio_api uffdio_api;
+	struct uffdio_register uffdio_register;
+
+	/* Create and enable userfaultfd object. */
+
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd == -1) {
+		perror("userfaultfd");
+		exit(1);
+	}
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = 0;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+		perror("ioctl-UFFDIO_API");
+		exit(1);
+	}
+
+	/* Create a private anonymous mapping. The memory will be
+	 * demand-zero paged--that is, not yet allocated. When we
+	 * actually touch the memory, it will be allocated via
+	 * the userfaultfd.
+	 */
+
+	addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	printf("Address returned by mmap() = %p\n", addr);
+
+	/* Register the memory range of the mapping we just created for
+	 * handling by the userfaultfd object. In mode, we request to track
+	 * missing pages (i.e., pages that have not yet been faulted in).
+	 */
+
+	uffdio_register.range.start = (unsigned long)addr;
+	uffdio_register.range.len = len;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+		perror("ioctl-UFFDIO_REGISTER");
+		exit(1);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	size_t length = 0;
+	int ret = 0, fd;
+
+	if (argc >= 2 && !strcmp(argv[1], "-h")) {
+		printf("Usage: %s [length_in_MB]\n", argv[0]);
+		exit(1);
+	}
+
+	/* Read memory length as the first arg if valid, otherwise fallback to
+	 * the default length.
+	 */
+	if (argc >= 2)
+		length = (size_t)atoi(argv[1]);
+	else
+		length = DEFAULT_LENGTH_MB;
+
+	length = MB_TO_BYTES(length);
+	fd = memfd_create(argv[0], MFD_HUGETLB);
+	if (fd < 0) {
+		perror("Open failed");
+		exit(1);
+	}
+
+	/* mmap to a PUD aligned address to hopefully trigger pmd sharing. */
+	unsigned long suggested_addr = 0x7eaa40000000;
+	void *haddr = mmap((void *)suggested_addr, length, PROTECTION,
+			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+	printf("Map haddr: Returned address is %p\n", haddr);
+	if (haddr == MAP_FAILED) {
+		perror("mmap1");
+		exit(1);
+	}
+
+	/* mmap again to a dummy address to hopefully trigger pmd sharing. */
+	suggested_addr = 0x7daa40000000;
+	void *daddr = mmap((void *)suggested_addr, length, PROTECTION,
+			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+	printf("Map daddr: Returned address is %p\n", daddr);
+	if (daddr == MAP_FAILED) {
+		perror("mmap3");
+		exit(1);
+	}
+
+	suggested_addr = 0x7faa40000000;
+	void *vaddr =
+		mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0);
+	printf("Map vaddr: Returned address is %p\n", vaddr);
+	if (vaddr == MAP_FAILED) {
+		perror("mmap2");
+		exit(1);
+	}
+
+	register_region_with_uffd(haddr, length);
+
+	void *addr = mremap(haddr, length, length,
+			    MREMAP_MAYMOVE | MREMAP_FIXED, vaddr);
+	if (addr == MAP_FAILED) {
+		perror("mremap");
+		exit(1);
+	}
+
+	printf("Mremap: Returned address is %p\n", addr);
+	check_bytes(addr);
+	write_bytes(addr, length);
+	ret = read_bytes(addr, length);
+
+	munmap(addr, length);
+
+	addr = mremap(addr, length, length, 0);
+	if (addr != MAP_FAILED) {
+		printf("mremap: Expected failure, but call succeeded\n");
+		exit(1);
+	}
+
+	close(fd);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c
new file mode 100644
index 000000000000..e2527f32005b
--- /dev/null
+++ b/tools/testing/selftests/mm/hugepage-shm.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-shm:
+ *
+ * Example of using huge page memory in a user application using Sys V shared
+ * memory system calls.  In this example the app is requesting 256MB of
+ * memory that is backed by huge pages.  The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ *
+ * Note: The default shared memory limit is quite low on many kernels,
+ * you may need to increase it via:
+ *
+ * echo 268435456 > /proc/sys/kernel/shmmax
+ *
+ * This will increase the maximum size per shared memory segment to 256MB.
+ * The other limit that you will hit eventually is shmall which is the
+ * total amount of shared memory in pages. To set it to 16GB on a system
+ * with a 4kB pagesize do:
+ *
+ * echo 4194304 > /proc/sys/kernel/shmall
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+#define LENGTH (256UL*1024*1024)
+
+#define dprintf(x)  printf(x)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define SHMAT_FLAGS (SHM_RND)
+#else
+#define ADDR (void *)(0x0UL)
+#define SHMAT_FLAGS (0)
+#endif
+
+int main(void)
+{
+	int shmid;
+	unsigned long i;
+	char *shmaddr;
+
+	shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+	if (shmid < 0) {
+		perror("shmget");
+		exit(1);
+	}
+	printf("shmid: 0x%x\n", shmid);
+
+	shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
+	if (shmaddr == (char *)-1) {
+		perror("Shared memory attach failure");
+		shmctl(shmid, IPC_RMID, NULL);
+		exit(2);
+	}
+	printf("shmaddr: %p\n", shmaddr);
+
+	dprintf("Starting the writes:\n");
+	for (i = 0; i < LENGTH; i++) {
+		shmaddr[i] = (char)(i);
+		if (!(i % (1024 * 1024)))
+			dprintf(".");
+	}
+	dprintf("\n");
+
+	dprintf("Starting the Check...");
+	for (i = 0; i < LENGTH; i++)
+		if (shmaddr[i] != (char)i) {
+			printf("\nIndex %lu mismatched\n", i);
+			exit(3);
+		}
+	dprintf("Done.\n");
+
+	if (shmdt((const void *)shmaddr) != 0) {
+		perror("Detach failure");
+		shmctl(shmid, IPC_RMID, NULL);
+		exit(4);
+	}
+
+	shmctl(shmid, IPC_RMID, NULL);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c
new file mode 100644
index 000000000000..557bdbd4f87e
--- /dev/null
+++ b/tools/testing/selftests/mm/hugepage-vmemmap.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case of using hugepage memory in a user application using the
+ * mmap system call with MAP_HUGETLB flag.  Before running this program
+ * make sure the administrator has allocated enough default sized huge
+ * pages to cover the 2 MB allocation.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define MAP_LENGTH		(2UL * 1024 * 1024)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB		0x40000	/* arch specific */
+#endif
+
+#define PAGE_SIZE		4096
+
+#define PAGE_COMPOUND_HEAD	(1UL << 15)
+#define PAGE_COMPOUND_TAIL	(1UL << 16)
+#define PAGE_HUGE		(1UL << 17)
+
+#define HEAD_PAGE_FLAGS		(PAGE_COMPOUND_HEAD | PAGE_HUGE)
+#define TAIL_PAGE_FLAGS		(PAGE_COMPOUND_TAIL | PAGE_HUGE)
+
+#define PM_PFRAME_BITS		55
+#define PM_PFRAME_MASK		~((1UL << PM_PFRAME_BITS) - 1)
+
+/*
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified.  Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#ifdef __ia64__
+#define MAP_ADDR		(void *)(0x8000000000000000UL)
+#define MAP_FLAGS		(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define MAP_ADDR		NULL
+#define MAP_FLAGS		(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+static void write_bytes(char *addr, size_t length)
+{
+	unsigned long i;
+
+	for (i = 0; i < length; i++)
+		*(addr + i) = (char)i;
+}
+
+static unsigned long virt_to_pfn(void *addr)
+{
+	int fd;
+	unsigned long pagemap;
+
+	fd = open("/proc/self/pagemap", O_RDONLY);
+	if (fd < 0)
+		return -1UL;
+
+	lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET);
+	read(fd, &pagemap, sizeof(pagemap));
+	close(fd);
+
+	return pagemap & ~PM_PFRAME_MASK;
+}
+
+static int check_page_flags(unsigned long pfn)
+{
+	int fd, i;
+	unsigned long pageflags;
+
+	fd = open("/proc/kpageflags", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	lseek(fd, pfn * sizeof(pageflags), SEEK_SET);
+
+	read(fd, &pageflags, sizeof(pageflags));
+	if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
+		close(fd);
+		printf("Head page flags (%lx) is invalid\n", pageflags);
+		return -1;
+	}
+
+	/*
+	 * pages other than the first page must be tail and shouldn't be head;
+	 * this also verifies kernel has correctly set the fake page_head to tail
+	 * while hugetlb_free_vmemmap is enabled.
+	 */
+	for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) {
+		read(fd, &pageflags, sizeof(pageflags));
+		if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
+		    (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
+			close(fd);
+			printf("Tail page flags (%lx) is invalid\n", pageflags);
+			return -1;
+		}
+	}
+
+	close(fd);
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	void *addr;
+	unsigned long pfn;
+
+	addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	/* Trigger allocation of HugeTLB page. */
+	write_bytes(addr, MAP_LENGTH);
+
+	pfn = virt_to_pfn(addr);
+	if (pfn == -1UL) {
+		munmap(addr, MAP_LENGTH);
+		perror("virt_to_pfn");
+		exit(1);
+	}
+
+	printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
+
+	if (check_page_flags(pfn) < 0) {
+		munmap(addr, MAP_LENGTH);
+		perror("check_page_flags");
+		exit(1);
+	}
+
+	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+	if (munmap(addr, MAP_LENGTH)) {
+		perror("munmap");
+		exit(1);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c
new file mode 100644
index 000000000000..a634f47d1e56
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb-madvise.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-madvise:
+ *
+ * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE
+ * on hugetlb mappings.
+ *
+ * Before running this test, make sure the administrator has pre-allocated
+ * at least MIN_FREE_PAGES hugetlb pages and they are free.  In addition,
+ * the test takes an argument that is the path to a file in a hugetlbfs
+ * filesystem.  Therefore, a hugetlbfs filesystem must be mounted on some
+ * directory.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#define __USE_GNU
+#include <fcntl.h>
+
+#define MIN_FREE_PAGES	20
+#define NR_HUGE_PAGES	10	/* common number of pages to map/allocate */
+
+#define validate_free_pages(exp_free)					\
+	do {								\
+		int fhp = get_free_hugepages();				\
+		if (fhp != (exp_free)) {				\
+			printf("Unexpected number of free huge "	\
+				"pages line %d\n", __LINE__);		\
+			exit(1);					\
+		}							\
+	} while (0)
+
+unsigned long huge_page_size;
+unsigned long base_page_size;
+
+/*
+ * default_huge_page_size copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(f);
+	return hps;
+}
+
+unsigned long get_free_hugepages(void)
+{
+	unsigned long fhp = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return fhp;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "HugePages_Free:      %lu", &fhp) == 1)
+			break;
+	}
+
+	free(line);
+	fclose(f);
+	return fhp;
+}
+
+void write_fault_pages(void *addr, unsigned long nr_pages)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)
+		*((unsigned long *)(addr + (i * huge_page_size))) = i;
+}
+
+void read_fault_pages(void *addr, unsigned long nr_pages)
+{
+	unsigned long dummy = 0;
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)
+		dummy += *((unsigned long *)(addr + (i * huge_page_size)));
+}
+
+int main(int argc, char **argv)
+{
+	unsigned long free_hugepages;
+	void *addr, *addr2;
+	int fd;
+	int ret;
+
+	huge_page_size = default_huge_page_size();
+	if (!huge_page_size) {
+		printf("Unable to determine huge page size, exiting!\n");
+		exit(1);
+	}
+	base_page_size = sysconf(_SC_PAGE_SIZE);
+	if (!huge_page_size) {
+		printf("Unable to determine base page size, exiting!\n");
+		exit(1);
+	}
+
+	free_hugepages = get_free_hugepages();
+	if (free_hugepages < MIN_FREE_PAGES) {
+		printf("Not enough free huge pages to test, exiting!\n");
+		exit(1);
+	}
+
+	fd = memfd_create(argv[0], MFD_HUGETLB);
+	if (fd < 0) {
+		perror("memfd_create() failed");
+		exit(1);
+	}
+
+	/*
+	 * Test validity of MADV_DONTNEED addr and length arguments.  mmap
+	 * size is NR_HUGE_PAGES + 2.  One page at the beginning and end of
+	 * the mapping will be unmapped so we KNOW there is nothing mapped
+	 * there.
+	 */
+	addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size,
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+			-1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	if (munmap(addr, huge_page_size) ||
+			munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size,
+				huge_page_size)) {
+		perror("munmap");
+		exit(1);
+	}
+	addr = addr + huge_page_size;
+
+	write_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/* addr before mapping should fail */
+	ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size,
+		MADV_DONTNEED);
+	if (!ret) {
+		printf("Unexpected success of madvise call with invalid addr line %d\n",
+				__LINE__);
+			exit(1);
+	}
+
+	/* addr + length after mapping should fail */
+	ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size,
+		MADV_DONTNEED);
+	if (!ret) {
+		printf("Unexpected success of madvise call with invalid length line %d\n",
+				__LINE__);
+			exit(1);
+	}
+
+	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+	/*
+	 * Test alignment of MADV_DONTNEED addr and length arguments
+	 */
+	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+			-1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	write_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/* addr is not huge page size aligned and should fail */
+	ret = madvise(addr + base_page_size,
+			NR_HUGE_PAGES * huge_page_size - base_page_size,
+			MADV_DONTNEED);
+	if (!ret) {
+		printf("Unexpected success of madvise call with unaligned start address %d\n",
+				__LINE__);
+			exit(1);
+	}
+
+	/* addr + length should be aligned down to huge page size */
+	if (madvise(addr,
+			((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size,
+			MADV_DONTNEED)) {
+		perror("madvise");
+		exit(1);
+	}
+
+	/* should free all but last page in mapping */
+	validate_free_pages(free_hugepages - 1);
+
+	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+	validate_free_pages(free_hugepages);
+
+	/*
+	 * Test MADV_DONTNEED on anonymous private mapping
+	 */
+	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+			-1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	write_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+		perror("madvise");
+		exit(1);
+	}
+
+	/* should free all pages in mapping */
+	validate_free_pages(free_hugepages);
+
+	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+	/*
+	 * Test MADV_DONTNEED on private mapping of hugetlb file
+	 */
+	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+		perror("fallocate");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE, fd, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	/* read should not consume any pages */
+	read_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/* madvise should not free any pages */
+	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+		perror("madvise");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/* writes should allocate private pages */
+	write_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+	/* madvise should free private pages */
+	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+		perror("madvise");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/* writes should allocate private pages */
+	write_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+	/*
+	 * The fallocate below certainly should free the pages associated
+	 * with the file.  However, pages in the private mapping are also
+	 * freed.  This is not the 'correct' behavior, but is expected
+	 * because this is how it has worked since the initial hugetlb
+	 * implementation.
+	 */
+	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+					0, NR_HUGE_PAGES * huge_page_size)) {
+		perror("fallocate");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages);
+
+	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+	/*
+	 * Test MADV_DONTNEED on shared mapping of hugetlb file
+	 */
+	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+		perror("fallocate");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+			PROT_READ | PROT_WRITE,
+			MAP_SHARED, fd, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	/* write should not consume any pages */
+	write_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/* madvise should not free any pages */
+	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+		perror("madvise");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/*
+	 * Test MADV_REMOVE on shared mapping of hugetlb file
+	 *
+	 * madvise is same as hole punch and should free all pages.
+	 */
+	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
+		perror("madvise");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages);
+	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+	/*
+	 * Test MADV_REMOVE on shared and private mapping of hugetlb file
+	 */
+	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+		perror("fallocate");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+			PROT_READ | PROT_WRITE,
+			MAP_SHARED, fd, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	/* shared write should not consume any additional pages */
+	write_fault_pages(addr, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE, fd, 0);
+	if (addr2 == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	/* private read should not consume any pages */
+	read_fault_pages(addr2, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/* private write should consume additional pages */
+	write_fault_pages(addr2, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+	/* madvise of shared mapping should not free any pages */
+	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+		perror("madvise");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+	/* madvise of private mapping should free private pages */
+	if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+		perror("madvise");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+	/* private write should consume additional pages again */
+	write_fault_pages(addr2, NR_HUGE_PAGES);
+	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+	/*
+	 * madvise should free both file and private pages although this is
+	 * not correct.  private pages should not be freed, but this is
+	 * expected.  See comment associated with FALLOC_FL_PUNCH_HOLE call.
+	 */
+	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
+		perror("madvise");
+		exit(1);
+	}
+	validate_free_pages(free_hugepages);
+
+	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+	(void)munmap(addr2, NR_HUGE_PAGES * huge_page_size);
+
+	close(fd);
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
new file mode 100644
index 000000000000..bf2d2a684edf
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -0,0 +1,252 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+set -e
+
+if [[ $(id -u) -ne 0 ]]; then
+  echo "This test must be run as root. Skipping..."
+  exit $ksft_skip
+fi
+
+usage_file=usage_in_bytes
+
+if [[ "$1" == "-cgroup-v2" ]]; then
+  cgroup2=1
+  usage_file=current
+fi
+
+
+if [[ $cgroup2 ]]; then
+  CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+  if [[ -z "$CGROUP_ROOT" ]]; then
+    CGROUP_ROOT=/dev/cgroup/memory
+    mount -t cgroup2 none $CGROUP_ROOT
+    do_umount=1
+  fi
+  echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control
+else
+  CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+  if [[ -z "$CGROUP_ROOT" ]]; then
+    CGROUP_ROOT=/dev/cgroup/memory
+    mount -t cgroup memory,hugetlb $CGROUP_ROOT
+    do_umount=1
+  fi
+fi
+MNT='/mnt/huge/'
+
+function get_machine_hugepage_size() {
+  hpz=$(grep -i hugepagesize /proc/meminfo)
+  kb=${hpz:14:-3}
+  mb=$(($kb / 1024))
+  echo $mb
+}
+
+MB=$(get_machine_hugepage_size)
+
+function cleanup() {
+  echo cleanup
+  set +e
+  rm -rf "$MNT"/* 2>/dev/null
+  umount "$MNT" 2>/dev/null
+  rmdir "$MNT" 2>/dev/null
+  rmdir "$CGROUP_ROOT"/a/b 2>/dev/null
+  rmdir "$CGROUP_ROOT"/a 2>/dev/null
+  rmdir "$CGROUP_ROOT"/test1 2>/dev/null
+  echo 0 >/proc/sys/vm/nr_hugepages
+  set -e
+}
+
+function assert_state() {
+  local expected_a="$1"
+  local expected_a_hugetlb="$2"
+  local expected_b=""
+  local expected_b_hugetlb=""
+
+  if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then
+    expected_b="$3"
+    expected_b_hugetlb="$4"
+  fi
+  local tolerance=$((5 * 1024 * 1024))
+
+  local actual_a
+  actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)"
+  if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] ||
+    [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then
+    echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB
+    echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB
+    echo fail
+
+    cleanup
+    exit 1
+  fi
+
+  local actual_a_hugetlb
+  actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)"
+  if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] ||
+    [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then
+    echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB
+    echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB
+    echo fail
+
+    cleanup
+    exit 1
+  fi
+
+  if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then
+    return
+  fi
+
+  local actual_b
+  actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)"
+  if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] ||
+    [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then
+    echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB
+    echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB
+    echo fail
+
+    cleanup
+    exit 1
+  fi
+
+  local actual_b_hugetlb
+  actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)"
+  if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] ||
+    [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then
+    echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB
+    echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB
+    echo fail
+
+    cleanup
+    exit 1
+  fi
+}
+
+function setup() {
+  echo 100 >/proc/sys/vm/nr_hugepages
+  mkdir "$CGROUP_ROOT"/a
+  sleep 1
+  if [[ $cgroup2 ]]; then
+    echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control
+  else
+    echo 0 >$CGROUP_ROOT/a/cpuset.mems
+    echo 0 >$CGROUP_ROOT/a/cpuset.cpus
+  fi
+
+  mkdir "$CGROUP_ROOT"/a/b
+
+  if [[ ! $cgroup2 ]]; then
+    echo 0 >$CGROUP_ROOT/a/b/cpuset.mems
+    echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus
+  fi
+
+  mkdir -p "$MNT"
+  mount -t hugetlbfs none "$MNT"
+}
+
+write_hugetlbfs() {
+  local cgroup="$1"
+  local path="$2"
+  local size="$3"
+
+  if [[ $cgroup2 ]]; then
+    echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs
+  else
+    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems
+    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus
+    echo $$ >"$CGROUP_ROOT/$cgroup/tasks"
+  fi
+  ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o
+  if [[ $cgroup2 ]]; then
+    echo $$ >$CGROUP_ROOT/cgroup.procs
+  else
+    echo $$ >"$CGROUP_ROOT/tasks"
+  fi
+  echo
+}
+
+set -e
+
+size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages.
+
+cleanup
+
+echo
+echo
+echo Test charge, rmdir, uncharge
+setup
+echo mkdir
+mkdir $CGROUP_ROOT/test1
+
+echo write
+write_hugetlbfs test1 "$MNT"/test $size
+
+echo rmdir
+rmdir $CGROUP_ROOT/test1
+mkdir $CGROUP_ROOT/test1
+
+echo uncharge
+rm -rf /mnt/huge/*
+
+cleanup
+
+echo done
+echo
+echo
+if [[ ! $cgroup2 ]]; then
+  echo "Test parent and child hugetlb usage"
+  setup
+
+  echo write
+  write_hugetlbfs a "$MNT"/test $size
+
+  echo Assert memory charged correctly for parent use.
+  assert_state 0 $size 0 0
+
+  write_hugetlbfs a/b "$MNT"/test2 $size
+
+  echo Assert memory charged correctly for child use.
+  assert_state 0 $(($size * 2)) 0 $size
+
+  rmdir "$CGROUP_ROOT"/a/b
+  sleep 5
+  echo Assert memory reparent correctly.
+  assert_state 0 $(($size * 2))
+
+  rm -rf "$MNT"/*
+  umount "$MNT"
+  echo Assert memory uncharged correctly.
+  assert_state 0 0
+
+  cleanup
+fi
+
+echo
+echo
+echo "Test child only hugetlb usage"
+echo setup
+setup
+
+echo write
+write_hugetlbfs a/b "$MNT"/test2 $size
+
+echo Assert memory charged correctly for child only use.
+assert_state 0 $(($size)) 0 $size
+
+rmdir "$CGROUP_ROOT"/a/b
+echo Assert memory reparent correctly.
+assert_state 0 $size
+
+rm -rf "$MNT"/*
+umount "$MNT"
+echo Assert memory uncharged correctly.
+assert_state 0 0
+
+cleanup
+
+echo ALL PASS
+
+umount $CGROUP_ROOT
+rm -rf $CGROUP_ROOT
diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c
new file mode 100644
index 000000000000..64126c8cd561
--- /dev/null
+++ b/tools/testing/selftests/mm/khugepaged.c
@@ -0,0 +1,1558 @@
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <dirent.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+
+#include "linux/magic.h"
+
+#include "vm_util.h"
+
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
+#ifndef MADV_POPULATE_READ
+#define MADV_POPULATE_READ 22
+#endif
+#ifndef MADV_COLLAPSE
+#define MADV_COLLAPSE 25
+#endif
+
+#define BASE_ADDR ((void *)(1UL << 30))
+static unsigned long hpage_pmd_size;
+static unsigned long page_size;
+static int hpage_pmd_nr;
+
+#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
+#define PID_SMAPS "/proc/self/smaps"
+#define TEST_FILE "collapse_test_file"
+
+#define MAX_LINE_LENGTH 500
+
+enum vma_type {
+	VMA_ANON,
+	VMA_FILE,
+	VMA_SHMEM,
+};
+
+struct mem_ops {
+	void *(*setup_area)(int nr_hpages);
+	void (*cleanup_area)(void *p, unsigned long size);
+	void (*fault)(void *p, unsigned long start, unsigned long end);
+	bool (*check_huge)(void *addr, int nr_hpages);
+	const char *name;
+};
+
+static struct mem_ops *file_ops;
+static struct mem_ops *anon_ops;
+static struct mem_ops *shmem_ops;
+
+struct collapse_context {
+	void (*collapse)(const char *msg, char *p, int nr_hpages,
+			 struct mem_ops *ops, bool expect);
+	bool enforce_pte_scan_limits;
+	const char *name;
+};
+
+static struct collapse_context *khugepaged_context;
+static struct collapse_context *madvise_context;
+
+struct file_info {
+	const char *dir;
+	char path[PATH_MAX];
+	enum vma_type type;
+	int fd;
+	char dev_queue_read_ahead_path[PATH_MAX];
+};
+
+static struct file_info finfo;
+
+enum thp_enabled {
+	THP_ALWAYS,
+	THP_MADVISE,
+	THP_NEVER,
+};
+
+static const char *thp_enabled_strings[] = {
+	"always",
+	"madvise",
+	"never",
+	NULL
+};
+
+enum thp_defrag {
+	THP_DEFRAG_ALWAYS,
+	THP_DEFRAG_DEFER,
+	THP_DEFRAG_DEFER_MADVISE,
+	THP_DEFRAG_MADVISE,
+	THP_DEFRAG_NEVER,
+};
+
+static const char *thp_defrag_strings[] = {
+	"always",
+	"defer",
+	"defer+madvise",
+	"madvise",
+	"never",
+	NULL
+};
+
+enum shmem_enabled {
+	SHMEM_ALWAYS,
+	SHMEM_WITHIN_SIZE,
+	SHMEM_ADVISE,
+	SHMEM_NEVER,
+	SHMEM_DENY,
+	SHMEM_FORCE,
+};
+
+static const char *shmem_enabled_strings[] = {
+	"always",
+	"within_size",
+	"advise",
+	"never",
+	"deny",
+	"force",
+	NULL
+};
+
+struct khugepaged_settings {
+	bool defrag;
+	unsigned int alloc_sleep_millisecs;
+	unsigned int scan_sleep_millisecs;
+	unsigned int max_ptes_none;
+	unsigned int max_ptes_swap;
+	unsigned int max_ptes_shared;
+	unsigned long pages_to_scan;
+};
+
+struct settings {
+	enum thp_enabled thp_enabled;
+	enum thp_defrag thp_defrag;
+	enum shmem_enabled shmem_enabled;
+	bool use_zero_page;
+	struct khugepaged_settings khugepaged;
+	unsigned long read_ahead_kb;
+};
+
+static struct settings saved_settings;
+static bool skip_settings_restore;
+
+static int exit_status;
+
+static void success(const char *msg)
+{
+	printf(" \e[32m%s\e[0m\n", msg);
+}
+
+static void fail(const char *msg)
+{
+	printf(" \e[31m%s\e[0m\n", msg);
+	exit_status++;
+}
+
+static void skip(const char *msg)
+{
+	printf(" \e[33m%s\e[0m\n", msg);
+}
+
+static int read_file(const char *path, char *buf, size_t buflen)
+{
+	int fd;
+	ssize_t numread;
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1)
+		return 0;
+
+	numread = read(fd, buf, buflen - 1);
+	if (numread < 1) {
+		close(fd);
+		return 0;
+	}
+
+	buf[numread] = '\0';
+	close(fd);
+
+	return (unsigned int) numread;
+}
+
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+	int fd;
+	ssize_t numwritten;
+
+	fd = open(path, O_WRONLY);
+	if (fd == -1) {
+		printf("open(%s)\n", path);
+		exit(EXIT_FAILURE);
+		return 0;
+	}
+
+	numwritten = write(fd, buf, buflen - 1);
+	close(fd);
+	if (numwritten < 1) {
+		printf("write(%s)\n", buf);
+		exit(EXIT_FAILURE);
+		return 0;
+	}
+
+	return (unsigned int) numwritten;
+}
+
+static int read_string(const char *name, const char *strings[])
+{
+	char path[PATH_MAX];
+	char buf[256];
+	char *c;
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!read_file(path, buf, sizeof(buf))) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+
+	c = strchr(buf, '[');
+	if (!c) {
+		printf("%s: Parse failure\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	c++;
+	memmove(buf, c, sizeof(buf) - (c - buf));
+
+	c = strchr(buf, ']');
+	if (!c) {
+		printf("%s: Parse failure\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	*c = '\0';
+
+	ret = 0;
+	while (strings[ret]) {
+		if (!strcmp(strings[ret], buf))
+			return ret;
+		ret++;
+	}
+
+	printf("Failed to parse %s\n", name);
+	exit(EXIT_FAILURE);
+}
+
+static void write_string(const char *name, const char *val)
+{
+	char path[PATH_MAX];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!write_file(path, val, strlen(val) + 1)) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+static const unsigned long _read_num(const char *path)
+{
+	char buf[21];
+
+	if (read_file(path, buf, sizeof(buf)) < 0) {
+		perror("read_file(read_num)");
+		exit(EXIT_FAILURE);
+	}
+
+	return strtoul(buf, NULL, 10);
+}
+
+static const unsigned long read_num(const char *name)
+{
+	char path[PATH_MAX];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	return _read_num(path);
+}
+
+static void _write_num(const char *path, unsigned long num)
+{
+	char buf[21];
+
+	sprintf(buf, "%ld", num);
+	if (!write_file(path, buf, strlen(buf) + 1)) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+static void write_num(const char *name, unsigned long num)
+{
+	char path[PATH_MAX];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	_write_num(path, num);
+}
+
+static void write_settings(struct settings *settings)
+{
+	struct khugepaged_settings *khugepaged = &settings->khugepaged;
+
+	write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
+	write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
+	write_string("shmem_enabled",
+			shmem_enabled_strings[settings->shmem_enabled]);
+	write_num("use_zero_page", settings->use_zero_page);
+
+	write_num("khugepaged/defrag", khugepaged->defrag);
+	write_num("khugepaged/alloc_sleep_millisecs",
+			khugepaged->alloc_sleep_millisecs);
+	write_num("khugepaged/scan_sleep_millisecs",
+			khugepaged->scan_sleep_millisecs);
+	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
+	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
+	write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
+	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
+
+	if (file_ops && finfo.type == VMA_FILE)
+		_write_num(finfo.dev_queue_read_ahead_path,
+			   settings->read_ahead_kb);
+}
+
+#define MAX_SETTINGS_DEPTH 4
+static struct settings settings_stack[MAX_SETTINGS_DEPTH];
+static int settings_index;
+
+static struct settings *current_settings(void)
+{
+	if (!settings_index) {
+		printf("Fail: No settings set");
+		exit(EXIT_FAILURE);
+	}
+	return settings_stack + settings_index - 1;
+}
+
+static void push_settings(struct settings *settings)
+{
+	if (settings_index >= MAX_SETTINGS_DEPTH) {
+		printf("Fail: Settings stack exceeded");
+		exit(EXIT_FAILURE);
+	}
+	settings_stack[settings_index++] = *settings;
+	write_settings(current_settings());
+}
+
+static void pop_settings(void)
+{
+	if (settings_index <= 0) {
+		printf("Fail: Settings stack empty");
+		exit(EXIT_FAILURE);
+	}
+	--settings_index;
+	write_settings(current_settings());
+}
+
+static void restore_settings(int sig)
+{
+	if (skip_settings_restore)
+		goto out;
+
+	printf("Restore THP and khugepaged settings...");
+	write_settings(&saved_settings);
+	success("OK");
+	if (sig)
+		exit(EXIT_FAILURE);
+out:
+	exit(exit_status);
+}
+
+static void save_settings(void)
+{
+	printf("Save THP and khugepaged settings...");
+	saved_settings = (struct settings) {
+		.thp_enabled = read_string("enabled", thp_enabled_strings),
+		.thp_defrag = read_string("defrag", thp_defrag_strings),
+		.shmem_enabled =
+			read_string("shmem_enabled", shmem_enabled_strings),
+		.use_zero_page = read_num("use_zero_page"),
+	};
+	saved_settings.khugepaged = (struct khugepaged_settings) {
+		.defrag = read_num("khugepaged/defrag"),
+		.alloc_sleep_millisecs =
+			read_num("khugepaged/alloc_sleep_millisecs"),
+		.scan_sleep_millisecs =
+			read_num("khugepaged/scan_sleep_millisecs"),
+		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
+		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
+		.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
+		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
+	};
+	if (file_ops && finfo.type == VMA_FILE)
+		saved_settings.read_ahead_kb =
+				_read_num(finfo.dev_queue_read_ahead_path);
+
+	success("OK");
+
+	signal(SIGTERM, restore_settings);
+	signal(SIGINT, restore_settings);
+	signal(SIGHUP, restore_settings);
+	signal(SIGQUIT, restore_settings);
+}
+
+static void get_finfo(const char *dir)
+{
+	struct stat path_stat;
+	struct statfs fs;
+	char buf[1 << 10];
+	char path[PATH_MAX];
+	char *str, *end;
+
+	finfo.dir = dir;
+	stat(finfo.dir, &path_stat);
+	if (!S_ISDIR(path_stat.st_mode)) {
+		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
+		exit(EXIT_FAILURE);
+	}
+	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
+		     finfo.dir) >= sizeof(finfo.path)) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	if (statfs(finfo.dir, &fs)) {
+		perror("statfs()");
+		exit(EXIT_FAILURE);
+	}
+	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
+	if (finfo.type == VMA_SHMEM)
+		return;
+
+	/* Find owning device's queue/read_ahead_kb control */
+	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
+		     major(path_stat.st_dev), minor(path_stat.st_dev))
+	    >= sizeof(path)) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	if (read_file(path, buf, sizeof(buf)) < 0) {
+		perror("read_file(read_num)");
+		exit(EXIT_FAILURE);
+	}
+	if (strstr(buf, "DEVTYPE=disk")) {
+		/* Found it */
+		if (snprintf(finfo.dev_queue_read_ahead_path,
+			     sizeof(finfo.dev_queue_read_ahead_path),
+			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
+			     major(path_stat.st_dev), minor(path_stat.st_dev))
+		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
+			printf("%s: Pathname is too long\n", __func__);
+			exit(EXIT_FAILURE);
+		}
+		return;
+	}
+	if (!strstr(buf, "DEVTYPE=partition")) {
+		printf("%s: Unknown device type: %s\n", __func__, path);
+		exit(EXIT_FAILURE);
+	}
+	/*
+	 * Partition of block device - need to find actual device.
+	 * Using naming convention that devnameN is partition of
+	 * device devname.
+	 */
+	str = strstr(buf, "DEVNAME=");
+	if (!str) {
+		printf("%s: Could not read: %s", __func__, path);
+		exit(EXIT_FAILURE);
+	}
+	str += 8;
+	end = str;
+	while (*end) {
+		if (isdigit(*end)) {
+			*end = '\0';
+			if (snprintf(finfo.dev_queue_read_ahead_path,
+				     sizeof(finfo.dev_queue_read_ahead_path),
+				     "/sys/block/%s/queue/read_ahead_kb",
+				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
+				printf("%s: Pathname is too long\n", __func__);
+				exit(EXIT_FAILURE);
+			}
+			return;
+		}
+		++end;
+	}
+	printf("%s: Could not read: %s\n", __func__, path);
+	exit(EXIT_FAILURE);
+}
+
+static bool check_swap(void *addr, unsigned long size)
+{
+	bool swap = false;
+	int ret;
+	FILE *fp;
+	char buffer[MAX_LINE_LENGTH];
+	char addr_pattern[MAX_LINE_LENGTH];
+
+	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+		       (unsigned long) addr);
+	if (ret >= MAX_LINE_LENGTH) {
+		printf("%s: Pattern is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+
+	fp = fopen(PID_SMAPS, "r");
+	if (!fp) {
+		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
+		exit(EXIT_FAILURE);
+	}
+	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
+		goto err_out;
+
+	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
+		       size >> 10);
+	if (ret >= MAX_LINE_LENGTH) {
+		printf("%s: Pattern is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	/*
+	 * Fetch the Swap: in the same block and check whether it got
+	 * the expected number of hugeepages next.
+	 */
+	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
+		goto err_out;
+
+	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
+		goto err_out;
+
+	swap = true;
+err_out:
+	fclose(fp);
+	return swap;
+}
+
+static void *alloc_mapping(int nr)
+{
+	void *p;
+
+	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
+		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (p != BASE_ADDR) {
+		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
+		exit(EXIT_FAILURE);
+	}
+
+	return p;
+}
+
+static void fill_memory(int *p, unsigned long start, unsigned long end)
+{
+	int i;
+
+	for (i = start / page_size; i < end / page_size; i++)
+		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
+}
+
+/*
+ * MADV_COLLAPSE is a best-effort request and may fail if an internal
+ * resource is temporarily unavailable, in which case it will set errno to
+ * EAGAIN.  In such a case, immediately reattempt the operation one more
+ * time.
+ */
+static int madvise_collapse_retry(void *p, unsigned long size)
+{
+	bool retry = true;
+	int ret;
+
+retry:
+	ret = madvise(p, size, MADV_COLLAPSE);
+	if (ret && errno == EAGAIN && retry) {
+		retry = false;
+		goto retry;
+	}
+	return ret;
+}
+
+/*
+ * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
+ * validate_memory()'able contents.
+ */
+static void *alloc_hpage(struct mem_ops *ops)
+{
+	void *p = ops->setup_area(1);
+
+	ops->fault(p, 0, hpage_pmd_size);
+
+	/*
+	 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
+	 * The latter is ineligible for collapse by MADV_COLLAPSE
+	 * while the former might cause MADV_COLLAPSE to race with
+	 * khugepaged on low-load system (like a test machine), which
+	 * would cause MADV_COLLAPSE to fail with EAGAIN.
+	 */
+	printf("Allocate huge page...");
+	if (madvise_collapse_retry(p, hpage_pmd_size)) {
+		perror("madvise(MADV_COLLAPSE)");
+		exit(EXIT_FAILURE);
+	}
+	if (!ops->check_huge(p, 1)) {
+		perror("madvise(MADV_COLLAPSE)");
+		exit(EXIT_FAILURE);
+	}
+	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
+		perror("madvise(MADV_HUGEPAGE)");
+		exit(EXIT_FAILURE);
+	}
+	success("OK");
+	return p;
+}
+
+static void validate_memory(int *p, unsigned long start, unsigned long end)
+{
+	int i;
+
+	for (i = start / page_size; i < end / page_size; i++) {
+		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
+			printf("Page %d is corrupted: %#x\n",
+					i, p[i * page_size / sizeof(*p)]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+static void *anon_setup_area(int nr_hpages)
+{
+	return alloc_mapping(nr_hpages);
+}
+
+static void anon_cleanup_area(void *p, unsigned long size)
+{
+	munmap(p, size);
+}
+
+static void anon_fault(void *p, unsigned long start, unsigned long end)
+{
+	fill_memory(p, start, end);
+}
+
+static bool anon_check_huge(void *addr, int nr_hpages)
+{
+	return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
+}
+
+static void *file_setup_area(int nr_hpages)
+{
+	int fd;
+	void *p;
+	unsigned long size;
+
+	unlink(finfo.path);  /* Cleanup from previous failed tests */
+	printf("Creating %s for collapse%s...", finfo.path,
+	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
+	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
+		  777);
+	if (fd < 0) {
+		perror("open()");
+		exit(EXIT_FAILURE);
+	}
+
+	size = nr_hpages * hpage_pmd_size;
+	p = alloc_mapping(nr_hpages);
+	fill_memory(p, 0, size);
+	write(fd, p, size);
+	close(fd);
+	munmap(p, size);
+	success("OK");
+
+	printf("Opening %s read only for collapse...", finfo.path);
+	finfo.fd = open(finfo.path, O_RDONLY, 777);
+	if (finfo.fd < 0) {
+		perror("open()");
+		exit(EXIT_FAILURE);
+	}
+	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
+		 MAP_PRIVATE, finfo.fd, 0);
+	if (p == MAP_FAILED || p != BASE_ADDR) {
+		perror("mmap()");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Drop page cache */
+	write_file("/proc/sys/vm/drop_caches", "3", 2);
+	success("OK");
+	return p;
+}
+
+static void file_cleanup_area(void *p, unsigned long size)
+{
+	munmap(p, size);
+	close(finfo.fd);
+	unlink(finfo.path);
+}
+
+static void file_fault(void *p, unsigned long start, unsigned long end)
+{
+	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
+		perror("madvise(MADV_POPULATE_READ");
+		exit(EXIT_FAILURE);
+	}
+}
+
+static bool file_check_huge(void *addr, int nr_hpages)
+{
+	switch (finfo.type) {
+	case VMA_FILE:
+		return check_huge_file(addr, nr_hpages, hpage_pmd_size);
+	case VMA_SHMEM:
+		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
+	default:
+		exit(EXIT_FAILURE);
+		return false;
+	}
+}
+
+static void *shmem_setup_area(int nr_hpages)
+{
+	void *p;
+	unsigned long size = nr_hpages * hpage_pmd_size;
+
+	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
+	if (finfo.fd < 0)  {
+		perror("memfd_create()");
+		exit(EXIT_FAILURE);
+	}
+	if (ftruncate(finfo.fd, size)) {
+		perror("ftruncate()");
+		exit(EXIT_FAILURE);
+	}
+	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
+		 0);
+	if (p != BASE_ADDR) {
+		perror("mmap()");
+		exit(EXIT_FAILURE);
+	}
+	return p;
+}
+
+static void shmem_cleanup_area(void *p, unsigned long size)
+{
+	munmap(p, size);
+	close(finfo.fd);
+}
+
+static bool shmem_check_huge(void *addr, int nr_hpages)
+{
+	return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
+}
+
+static struct mem_ops __anon_ops = {
+	.setup_area = &anon_setup_area,
+	.cleanup_area = &anon_cleanup_area,
+	.fault = &anon_fault,
+	.check_huge = &anon_check_huge,
+	.name = "anon",
+};
+
+static struct mem_ops __file_ops = {
+	.setup_area = &file_setup_area,
+	.cleanup_area = &file_cleanup_area,
+	.fault = &file_fault,
+	.check_huge = &file_check_huge,
+	.name = "file",
+};
+
+static struct mem_ops __shmem_ops = {
+	.setup_area = &shmem_setup_area,
+	.cleanup_area = &shmem_cleanup_area,
+	.fault = &anon_fault,
+	.check_huge = &shmem_check_huge,
+	.name = "shmem",
+};
+
+static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
+			       struct mem_ops *ops, bool expect)
+{
+	int ret;
+	struct settings settings = *current_settings();
+
+	printf("%s...", msg);
+
+	/*
+	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
+	 * ignores /sys/kernel/mm/transparent_hugepage/enabled
+	 */
+	settings.thp_enabled = THP_NEVER;
+	settings.shmem_enabled = SHMEM_NEVER;
+	push_settings(&settings);
+
+	/* Clear VM_NOHUGEPAGE */
+	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
+	ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
+	if (((bool)ret) == expect)
+		fail("Fail: Bad return value");
+	else if (!ops->check_huge(p, expect ? nr_hpages : 0))
+		fail("Fail: check_huge()");
+	else
+		success("OK");
+
+	pop_settings();
+}
+
+static void madvise_collapse(const char *msg, char *p, int nr_hpages,
+			     struct mem_ops *ops, bool expect)
+{
+	/* Sanity check */
+	if (!ops->check_huge(p, 0)) {
+		printf("Unexpected huge page\n");
+		exit(EXIT_FAILURE);
+	}
+	__madvise_collapse(msg, p, nr_hpages, ops, expect);
+}
+
+#define TICK 500000
+static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
+			  struct mem_ops *ops)
+{
+	int full_scans;
+	int timeout = 6; /* 3 seconds */
+
+	/* Sanity check */
+	if (!ops->check_huge(p, 0)) {
+		printf("Unexpected huge page\n");
+		exit(EXIT_FAILURE);
+	}
+
+	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
+
+	/* Wait until the second full_scan completed */
+	full_scans = read_num("khugepaged/full_scans") + 2;
+
+	printf("%s...", msg);
+	while (timeout--) {
+		if (ops->check_huge(p, nr_hpages))
+			break;
+		if (read_num("khugepaged/full_scans") >= full_scans)
+			break;
+		printf(".");
+		usleep(TICK);
+	}
+
+	madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
+
+	return timeout == -1;
+}
+
+static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
+				struct mem_ops *ops, bool expect)
+{
+	if (wait_for_scan(msg, p, nr_hpages, ops)) {
+		if (expect)
+			fail("Timeout");
+		else
+			success("OK");
+		return;
+	}
+
+	/*
+	 * For file and shmem memory, khugepaged only retracts pte entries after
+	 * putting the new hugepage in the page cache. The hugepage must be
+	 * subsequently refaulted to install the pmd mapping for the mm.
+	 */
+	if (ops != &__anon_ops)
+		ops->fault(p, 0, nr_hpages * hpage_pmd_size);
+
+	if (ops->check_huge(p, expect ? nr_hpages : 0))
+		success("OK");
+	else
+		fail("Fail");
+}
+
+static struct collapse_context __khugepaged_context = {
+	.collapse = &khugepaged_collapse,
+	.enforce_pte_scan_limits = true,
+	.name = "khugepaged",
+};
+
+static struct collapse_context __madvise_context = {
+	.collapse = &madvise_collapse,
+	.enforce_pte_scan_limits = false,
+	.name = "madvise",
+};
+
+static bool is_tmpfs(struct mem_ops *ops)
+{
+	return ops == &__file_ops && finfo.type == VMA_SHMEM;
+}
+
+static void alloc_at_fault(void)
+{
+	struct settings settings = *current_settings();
+	char *p;
+
+	settings.thp_enabled = THP_ALWAYS;
+	push_settings(&settings);
+
+	p = alloc_mapping(1);
+	*p = 1;
+	printf("Allocate huge page on fault...");
+	if (check_huge_anon(p, 1, hpage_pmd_size))
+		success("OK");
+	else
+		fail("Fail");
+
+	pop_settings();
+
+	madvise(p, page_size, MADV_DONTNEED);
+	printf("Split huge PMD on MADV_DONTNEED...");
+	if (check_huge_anon(p, 0, hpage_pmd_size))
+		success("OK");
+	else
+		fail("Fail");
+	munmap(p, hpage_pmd_size);
+}
+
+static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
+{
+	void *p;
+	int nr_hpages = 4;
+	unsigned long size = nr_hpages * hpage_pmd_size;
+
+	p = ops->setup_area(nr_hpages);
+	ops->fault(p, 0, size);
+	c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
+		    ops, true);
+	validate_memory(p, 0, size);
+	ops->cleanup_area(p, size);
+}
+
+static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
+{
+	void *p;
+
+	p = ops->setup_area(1);
+	c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
+{
+	void *p;
+
+	p = ops->setup_area(1);
+	ops->fault(p, 0, page_size);
+	c->collapse("Collapse PTE table with single PTE entry present", p,
+		    1, ops, true);
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
+{
+	int max_ptes_none = hpage_pmd_nr / 2;
+	struct settings settings = *current_settings();
+	void *p;
+
+	settings.khugepaged.max_ptes_none = max_ptes_none;
+	push_settings(&settings);
+
+	p = ops->setup_area(1);
+
+	if (is_tmpfs(ops)) {
+		/* shmem pages always in the page cache */
+		printf("tmpfs...");
+		skip("Skip");
+		goto skip;
+	}
+
+	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
+		    ops, !c->enforce_pte_scan_limits);
+	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+
+	if (c->enforce_pte_scan_limits) {
+		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
+			    true);
+		validate_memory(p, 0,
+				(hpage_pmd_nr - max_ptes_none) * page_size);
+	}
+skip:
+	ops->cleanup_area(p, hpage_pmd_size);
+	pop_settings();
+}
+
+static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
+{
+	void *p;
+
+	p = ops->setup_area(1);
+	ops->fault(p, 0, hpage_pmd_size);
+
+	printf("Swapout one page...");
+	if (madvise(p, page_size, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (check_swap(p, page_size)) {
+		success("OK");
+	} else {
+		fail("Fail");
+		goto out;
+	}
+
+	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
+		    true);
+	validate_memory(p, 0, hpage_pmd_size);
+out:
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
+{
+	int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
+	void *p;
+
+	p = ops->setup_area(1);
+	ops->fault(p, 0, hpage_pmd_size);
+
+	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
+	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
+		success("OK");
+	} else {
+		fail("Fail");
+		goto out;
+	}
+
+	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
+		    !c->enforce_pte_scan_limits);
+	validate_memory(p, 0, hpage_pmd_size);
+
+	if (c->enforce_pte_scan_limits) {
+		ops->fault(p, 0, hpage_pmd_size);
+		printf("Swapout %d of %d pages...", max_ptes_swap,
+		       hpage_pmd_nr);
+		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
+			perror("madvise(MADV_PAGEOUT)");
+			exit(EXIT_FAILURE);
+		}
+		if (check_swap(p, max_ptes_swap * page_size)) {
+			success("OK");
+		} else {
+			fail("Fail");
+			goto out;
+		}
+
+		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
+			    1, ops, true);
+		validate_memory(p, 0, hpage_pmd_size);
+	}
+out:
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+	void *p;
+
+	p = alloc_hpage(ops);
+
+	if (is_tmpfs(ops)) {
+		/* MADV_DONTNEED won't evict tmpfs pages */
+		printf("tmpfs...");
+		skip("Skip");
+		goto skip;
+	}
+
+	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+	printf("Split huge page leaving single PTE mapping compound page...");
+	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
+	if (ops->check_huge(p, 0))
+		success("OK");
+	else
+		fail("Fail");
+
+	c->collapse("Collapse PTE table with single PTE mapping compound page",
+		    p, 1, ops, true);
+	validate_memory(p, 0, page_size);
+skip:
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+	void *p;
+
+	p = alloc_hpage(ops);
+	printf("Split huge page leaving single PTE page table full of compound pages...");
+	madvise(p, page_size, MADV_NOHUGEPAGE);
+	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+	if (ops->check_huge(p, 0))
+		success("OK");
+	else
+		fail("Fail");
+
+	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
+		    true);
+	validate_memory(p, 0, hpage_pmd_size);
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
+{
+	void *p;
+	int i;
+
+	p = ops->setup_area(1);
+	for (i = 0; i < hpage_pmd_nr; i++) {
+		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
+				i + 1, hpage_pmd_nr);
+
+		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
+		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
+		if (!ops->check_huge(BASE_ADDR, 1)) {
+			printf("Failed to allocate huge page\n");
+			exit(EXIT_FAILURE);
+		}
+		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+		p = mremap(BASE_ADDR - i * page_size,
+				i * page_size + hpage_pmd_size,
+				(i + 1) * page_size,
+				MREMAP_MAYMOVE | MREMAP_FIXED,
+				BASE_ADDR + 2 * hpage_pmd_size);
+		if (p == MAP_FAILED) {
+			perror("mremap+unmap");
+			exit(EXIT_FAILURE);
+		}
+
+		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
+				(i + 1) * page_size,
+				(i + 1) * page_size + hpage_pmd_size,
+				MREMAP_MAYMOVE | MREMAP_FIXED,
+				BASE_ADDR - (i + 1) * page_size);
+		if (p == MAP_FAILED) {
+			perror("mremap+alloc");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
+	ops->fault(p, 0, hpage_pmd_size);
+	if (!ops->check_huge(p, 1))
+		success("OK");
+	else
+		fail("Fail");
+
+	c->collapse("Collapse PTE table full of different compound pages", p, 1,
+		    ops, true);
+
+	validate_memory(p, 0, hpage_pmd_size);
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
+{
+	int wstatus;
+	void *p;
+
+	p = ops->setup_area(1);
+
+	printf("Allocate small page...");
+	ops->fault(p, 0, page_size);
+	if (ops->check_huge(p, 0))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share small page over fork()...");
+	if (!fork()) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (ops->check_huge(p, 0))
+			success("OK");
+		else
+			fail("Fail");
+
+		ops->fault(p, page_size, 2 * page_size);
+		c->collapse("Collapse PTE table with single page shared with parent process",
+			    p, 1, ops, true);
+
+		validate_memory(p, 0, page_size);
+		ops->cleanup_area(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has small page...");
+	if (ops->check_huge(p, 0))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, page_size);
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+	int wstatus;
+	void *p;
+
+	p = alloc_hpage(ops);
+	printf("Share huge page over fork()...");
+	if (!fork()) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (ops->check_huge(p, 1))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Split huge page PMD in child process...");
+		madvise(p, page_size, MADV_NOHUGEPAGE);
+		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+		if (ops->check_huge(p, 0))
+			success("OK");
+		else
+			fail("Fail");
+		ops->fault(p, 0, page_size);
+
+		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
+		c->collapse("Collapse PTE table full of compound pages in child",
+			    p, 1, ops, true);
+		write_num("khugepaged/max_ptes_shared",
+			  current_settings()->khugepaged.max_ptes_shared);
+
+		validate_memory(p, 0, hpage_pmd_size);
+		ops->cleanup_area(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has huge page...");
+	if (ops->check_huge(p, 1))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
+{
+	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
+	int wstatus;
+	void *p;
+
+	p = alloc_hpage(ops);
+	printf("Share huge page over fork()...");
+	if (!fork()) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (ops->check_huge(p, 1))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Trigger CoW on page %d of %d...",
+				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
+		if (ops->check_huge(p, 0))
+			success("OK");
+		else
+			fail("Fail");
+
+		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
+			    1, ops, !c->enforce_pte_scan_limits);
+
+		if (c->enforce_pte_scan_limits) {
+			printf("Trigger CoW on page %d of %d...",
+			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
+				    page_size);
+			if (ops->check_huge(p, 0))
+				success("OK");
+			else
+				fail("Fail");
+
+			c->collapse("Collapse with max_ptes_shared PTEs shared",
+				    p, 1, ops, true);
+		}
+
+		validate_memory(p, 0, hpage_pmd_size);
+		ops->cleanup_area(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has huge page...");
+	if (ops->check_huge(p, 1))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void madvise_collapse_existing_thps(struct collapse_context *c,
+					   struct mem_ops *ops)
+{
+	void *p;
+
+	p = ops->setup_area(1);
+	ops->fault(p, 0, hpage_pmd_size);
+	c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
+	validate_memory(p, 0, hpage_pmd_size);
+
+	/* c->collapse() will find a hugepage and complain - call directly. */
+	__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
+	validate_memory(p, 0, hpage_pmd_size);
+	ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Test race with khugepaged where page tables have been retracted and
+ * pmd cleared.
+ */
+static void madvise_retracted_page_tables(struct collapse_context *c,
+					  struct mem_ops *ops)
+{
+	void *p;
+	int nr_hpages = 1;
+	unsigned long size = nr_hpages * hpage_pmd_size;
+
+	p = ops->setup_area(nr_hpages);
+	ops->fault(p, 0, size);
+
+	/* Let khugepaged collapse and leave pmd cleared */
+	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
+			  ops)) {
+		fail("Timeout");
+		return;
+	}
+	success("OK");
+	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
+		    true);
+	validate_memory(p, 0, size);
+	ops->cleanup_area(p, size);
+}
+
+static void usage(void)
+{
+	fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n");
+	fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
+	fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
+	fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
+	fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n");
+	fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n");
+	fprintf(stderr,	"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
+	fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
+	fprintf(stderr,	"\tmounted with huge=madvise option for khugepaged tests to work\n");
+	exit(1);
+}
+
+static void parse_test_type(int argc, const char **argv)
+{
+	char *buf;
+	const char *token;
+
+	if (argc == 1) {
+		/* Backwards compatibility */
+		khugepaged_context =  &__khugepaged_context;
+		madvise_context =  &__madvise_context;
+		anon_ops = &__anon_ops;
+		return;
+	}
+
+	buf = strdup(argv[1]);
+	token = strsep(&buf, ":");
+
+	if (!strcmp(token, "all")) {
+		khugepaged_context =  &__khugepaged_context;
+		madvise_context =  &__madvise_context;
+	} else if (!strcmp(token, "khugepaged")) {
+		khugepaged_context =  &__khugepaged_context;
+	} else if (!strcmp(token, "madvise")) {
+		madvise_context =  &__madvise_context;
+	} else {
+		usage();
+	}
+
+	if (!buf)
+		usage();
+
+	if (!strcmp(buf, "all")) {
+		file_ops =  &__file_ops;
+		anon_ops = &__anon_ops;
+		shmem_ops = &__shmem_ops;
+	} else if (!strcmp(buf, "anon")) {
+		anon_ops = &__anon_ops;
+	} else if (!strcmp(buf, "file")) {
+		file_ops =  &__file_ops;
+	} else if (!strcmp(buf, "shmem")) {
+		shmem_ops = &__shmem_ops;
+	} else {
+		usage();
+	}
+
+	if (!file_ops)
+		return;
+
+	if (argc != 3)
+		usage();
+}
+
+int main(int argc, const char **argv)
+{
+	struct settings default_settings = {
+		.thp_enabled = THP_MADVISE,
+		.thp_defrag = THP_DEFRAG_ALWAYS,
+		.shmem_enabled = SHMEM_ADVISE,
+		.use_zero_page = 0,
+		.khugepaged = {
+			.defrag = 1,
+			.alloc_sleep_millisecs = 10,
+			.scan_sleep_millisecs = 10,
+		},
+		/*
+		 * When testing file-backed memory, the collapse path
+		 * looks at how many pages are found in the page cache, not
+		 * what pages are mapped. Disable read ahead optimization so
+		 * pages don't find their way into the page cache unless
+		 * we mem_ops->fault() them in.
+		 */
+		.read_ahead_kb = 0,
+	};
+
+	parse_test_type(argc, argv);
+
+	if (file_ops)
+		get_finfo(argv[2]);
+
+	setbuf(stdout, NULL);
+
+	page_size = getpagesize();
+	hpage_pmd_size = read_pmd_pagesize();
+	hpage_pmd_nr = hpage_pmd_size / page_size;
+
+	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
+	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
+	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
+	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
+
+	save_settings();
+	push_settings(&default_settings);
+
+	alloc_at_fault();
+
+#define TEST(t, c, o) do { \
+	if (c && o) { \
+		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
+		t(c, o); \
+	} \
+	} while (0)
+
+	TEST(collapse_full, khugepaged_context, anon_ops);
+	TEST(collapse_full, khugepaged_context, file_ops);
+	TEST(collapse_full, khugepaged_context, shmem_ops);
+	TEST(collapse_full, madvise_context, anon_ops);
+	TEST(collapse_full, madvise_context, file_ops);
+	TEST(collapse_full, madvise_context, shmem_ops);
+
+	TEST(collapse_empty, khugepaged_context, anon_ops);
+	TEST(collapse_empty, madvise_context, anon_ops);
+
+	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
+	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
+	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
+	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
+	TEST(collapse_single_pte_entry, madvise_context, file_ops);
+	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
+
+	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
+	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
+	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
+	TEST(collapse_max_ptes_none, madvise_context, file_ops);
+
+	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
+	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
+	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
+	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
+
+	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
+	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
+	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
+	TEST(collapse_full_of_compound, madvise_context, anon_ops);
+	TEST(collapse_full_of_compound, madvise_context, file_ops);
+	TEST(collapse_full_of_compound, madvise_context, shmem_ops);
+
+	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
+	TEST(collapse_compound_extreme, madvise_context, anon_ops);
+
+	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
+	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
+
+	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
+	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
+
+	TEST(collapse_fork, khugepaged_context, anon_ops);
+	TEST(collapse_fork, madvise_context, anon_ops);
+
+	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
+	TEST(collapse_fork_compound, madvise_context, anon_ops);
+
+	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
+	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
+
+	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
+	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
+	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
+
+	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
+	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
+
+	restore_settings(0);
+}
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
new file mode 100644
index 000000000000..d8b5b4930412
--- /dev/null
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KSM functional tests
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define KiB 1024u
+#define MiB (1024 * KiB)
+
+static int ksm_fd;
+static int ksm_full_scans_fd;
+static int pagemap_fd;
+static size_t pagesize;
+
+static bool range_maps_duplicates(char *addr, unsigned long size)
+{
+	unsigned long offs_a, offs_b, pfn_a, pfn_b;
+
+	/*
+	 * There is no easy way to check if there are KSM pages mapped into
+	 * this range. We only check that the range does not map the same PFN
+	 * twice by comparing each pair of mapped pages.
+	 */
+	for (offs_a = 0; offs_a < size; offs_a += pagesize) {
+		pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
+		/* Page not present or PFN not exposed by the kernel. */
+		if (pfn_a == -1ul || !pfn_a)
+			continue;
+
+		for (offs_b = offs_a + pagesize; offs_b < size;
+		     offs_b += pagesize) {
+			pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b);
+			if (pfn_b == -1ul || !pfn_b)
+				continue;
+			if (pfn_a == pfn_b)
+				return true;
+		}
+	}
+	return false;
+}
+
+static long ksm_get_full_scans(void)
+{
+	char buf[10];
+	ssize_t ret;
+
+	ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
+	if (ret <= 0)
+		return -errno;
+	buf[ret] = 0;
+
+	return strtol(buf, NULL, 10);
+}
+
+static int ksm_merge(void)
+{
+	long start_scans, end_scans;
+
+	/* Wait for two full scans such that any possible merging happened. */
+	start_scans = ksm_get_full_scans();
+	if (start_scans < 0)
+		return start_scans;
+	if (write(ksm_fd, "1", 1) != 1)
+		return -errno;
+	do {
+		end_scans = ksm_get_full_scans();
+		if (end_scans < 0)
+			return end_scans;
+	} while (end_scans < start_scans + 2);
+
+	return 0;
+}
+
+static char *mmap_and_merge_range(char val, unsigned long size)
+{
+	char *map;
+
+	map = mmap(NULL, size, PROT_READ|PROT_WRITE,
+		   MAP_PRIVATE|MAP_ANON, -1, 0);
+	if (map == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return MAP_FAILED;
+	}
+
+	/* Don't use THP. Ignore if THP are not around on a kernel. */
+	if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
+		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+		goto unmap;
+	}
+
+	/* Make sure each page contains the same values to merge them. */
+	memset(map, val, size);
+	if (madvise(map, size, MADV_MERGEABLE)) {
+		ksft_test_result_fail("MADV_MERGEABLE failed\n");
+		goto unmap;
+	}
+
+	/* Run KSM to trigger merging and wait. */
+	if (ksm_merge()) {
+		ksft_test_result_fail("Running KSM failed\n");
+		goto unmap;
+	}
+	return map;
+unmap:
+	munmap(map, size);
+	return MAP_FAILED;
+}
+
+static void test_unmerge(void)
+{
+	const unsigned int size = 2 * MiB;
+	char *map;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	map = mmap_and_merge_range(0xcf, size);
+	if (map == MAP_FAILED)
+		return;
+
+	if (madvise(map, size, MADV_UNMERGEABLE)) {
+		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+		goto unmap;
+	}
+
+	ksft_test_result(!range_maps_duplicates(map, size),
+			 "Pages were unmerged\n");
+unmap:
+	munmap(map, size);
+}
+
+static void test_unmerge_discarded(void)
+{
+	const unsigned int size = 2 * MiB;
+	char *map;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	map = mmap_and_merge_range(0xcf, size);
+	if (map == MAP_FAILED)
+		return;
+
+	/* Discard half of all mapped pages so we have pte_none() entries. */
+	if (madvise(map, size / 2, MADV_DONTNEED)) {
+		ksft_test_result_fail("MADV_DONTNEED failed\n");
+		goto unmap;
+	}
+
+	if (madvise(map, size, MADV_UNMERGEABLE)) {
+		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+		goto unmap;
+	}
+
+	ksft_test_result(!range_maps_duplicates(map, size),
+			 "Pages were unmerged\n");
+unmap:
+	munmap(map, size);
+}
+
+#ifdef __NR_userfaultfd
+static void test_unmerge_uffd_wp(void)
+{
+	struct uffdio_writeprotect uffd_writeprotect;
+	struct uffdio_register uffdio_register;
+	const unsigned int size = 2 * MiB;
+	struct uffdio_api uffdio_api;
+	char *map;
+	int uffd;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	map = mmap_and_merge_range(0xcf, size);
+	if (map == MAP_FAILED)
+		return;
+
+	/* See if UFFD is around. */
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd < 0) {
+		ksft_test_result_skip("__NR_userfaultfd failed\n");
+		goto unmap;
+	}
+
+	/* See if UFFD-WP is around. */
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
+		ksft_test_result_fail("UFFDIO_API failed\n");
+		goto close_uffd;
+	}
+	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
+		ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n");
+		goto close_uffd;
+	}
+
+	/* Register UFFD-WP, no need for an actual handler. */
+	uffdio_register.range.start = (unsigned long) map;
+	uffdio_register.range.len = size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
+		ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
+		goto close_uffd;
+	}
+
+	/* Write-protect the range using UFFD-WP. */
+	uffd_writeprotect.range.start = (unsigned long) map;
+	uffd_writeprotect.range.len = size;
+	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
+	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
+		ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n");
+		goto close_uffd;
+	}
+
+	if (madvise(map, size, MADV_UNMERGEABLE)) {
+		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+		goto close_uffd;
+	}
+
+	ksft_test_result(!range_maps_duplicates(map, size),
+			 "Pages were unmerged\n");
+close_uffd:
+	close(uffd);
+unmap:
+	munmap(map, size);
+}
+#endif
+
+int main(int argc, char **argv)
+{
+	unsigned int tests = 2;
+	int err;
+
+#ifdef __NR_userfaultfd
+	tests++;
+#endif
+
+	ksft_print_header();
+	ksft_set_plan(tests);
+
+	pagesize = getpagesize();
+
+	ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
+	if (ksm_fd < 0)
+		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
+	ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
+	if (ksm_full_scans_fd < 0)
+		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
+	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+	if (pagemap_fd < 0)
+		ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
+
+	test_unmerge();
+	test_unmerge_discarded();
+#ifdef __NR_userfaultfd
+	test_unmerge_uffd_wp();
+#endif
+
+	err = ksft_get_fail_cnt();
+	if (err)
+		ksft_exit_fail_msg("%d out of %d tests failed\n",
+				   err, ksft_test_num());
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
new file mode 100644
index 000000000000..f9eb4d67e0dd
--- /dev/null
+++ b/tools/testing/selftests/mm/ksm_tests.c
@@ -0,0 +1,849 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <time.h>
+#include <string.h>
+#include <numa.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <err.h>
+
+#include "../kselftest.h"
+#include <include/vdso/time64.h>
+#include "util.h"
+
+#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
+#define KSM_FP(s) (KSM_SYSFS_PATH s)
+#define KSM_SCAN_LIMIT_SEC_DEFAULT 120
+#define KSM_PAGE_COUNT_DEFAULT 10l
+#define KSM_PROT_STR_DEFAULT "rw"
+#define KSM_USE_ZERO_PAGES_DEFAULT false
+#define KSM_MERGE_ACROSS_NODES_DEFAULT true
+#define MB (1ul << 20)
+
+struct ksm_sysfs {
+	unsigned long max_page_sharing;
+	unsigned long merge_across_nodes;
+	unsigned long pages_to_scan;
+	unsigned long run;
+	unsigned long sleep_millisecs;
+	unsigned long stable_node_chains_prune_millisecs;
+	unsigned long use_zero_pages;
+};
+
+enum ksm_test_name {
+	CHECK_KSM_MERGE,
+	CHECK_KSM_UNMERGE,
+	CHECK_KSM_ZERO_PAGE_MERGE,
+	CHECK_KSM_NUMA_MERGE,
+	KSM_MERGE_TIME,
+	KSM_MERGE_TIME_HUGE_PAGES,
+	KSM_UNMERGE_TIME,
+	KSM_COW_TIME
+};
+
+static int ksm_write_sysfs(const char *file_path, unsigned long val)
+{
+	FILE *f = fopen(file_path, "w");
+
+	if (!f) {
+		fprintf(stderr, "f %s\n", file_path);
+		perror("fopen");
+		return 1;
+	}
+	if (fprintf(f, "%lu", val) < 0) {
+		perror("fprintf");
+		fclose(f);
+		return 1;
+	}
+	fclose(f);
+
+	return 0;
+}
+
+static int ksm_read_sysfs(const char *file_path, unsigned long *val)
+{
+	FILE *f = fopen(file_path, "r");
+
+	if (!f) {
+		fprintf(stderr, "f %s\n", file_path);
+		perror("fopen");
+		return 1;
+	}
+	if (fscanf(f, "%lu", val) != 1) {
+		perror("fscanf");
+		fclose(f);
+		return 1;
+	}
+	fclose(f);
+
+	return 0;
+}
+
+static int str_to_prot(char *prot_str)
+{
+	int prot = 0;
+
+	if ((strchr(prot_str, 'r')) != NULL)
+		prot |= PROT_READ;
+	if ((strchr(prot_str, 'w')) != NULL)
+		prot |= PROT_WRITE;
+	if ((strchr(prot_str, 'x')) != NULL)
+		prot |= PROT_EXEC;
+
+	return prot;
+}
+
+static void print_help(void)
+{
+	printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n"
+	       "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n");
+
+	printf("Supported <test type>:\n"
+	       " -M (page merging)\n"
+	       " -Z (zero pages merging)\n"
+	       " -N (merging of pages in different NUMA nodes)\n"
+	       " -U (page unmerging)\n"
+	       " -P evaluate merging time and speed.\n"
+	       "    For this test, the size of duplicated memory area (in MiB)\n"
+	       "    must be provided using -s option\n"
+	       " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
+	       "    For this test, the size of duplicated memory area (in MiB)\n"
+	       "    must be provided using -s option\n"
+	       " -D evaluate unmerging time and speed when disabling KSM.\n"
+	       "    For this test, the size of duplicated memory area (in MiB)\n"
+	       "    must be provided using -s option\n"
+	       " -C evaluate the time required to break COW of merged pages.\n\n");
+
+	printf(" -a: specify the access protections of pages.\n"
+	       "     <prot> must be of the form [rwx].\n"
+	       "     Default: %s\n", KSM_PROT_STR_DEFAULT);
+	printf(" -p: specify the number of pages to test.\n"
+	       "     Default: %ld\n", KSM_PAGE_COUNT_DEFAULT);
+	printf(" -l: limit the maximum running time (in seconds) for a test.\n"
+	       "     Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT);
+	printf(" -z: change use_zero_pages tunable\n"
+	       "     Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
+	printf(" -m: change merge_across_nodes tunable\n"
+	       "     Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
+	printf(" -s: the size of duplicated memory area (in MiB)\n");
+
+	exit(0);
+}
+
+static void  *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size)
+{
+	void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0);
+
+	if (!map_ptr) {
+		perror("mmap");
+		return NULL;
+	}
+	memset(map_ptr, data, map_size);
+	if (mprotect(map_ptr, map_size, prot)) {
+		perror("mprotect");
+		munmap(map_ptr, map_size);
+		return NULL;
+	}
+
+	return map_ptr;
+}
+
+static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
+{
+	struct timespec cur_time;
+	unsigned long cur_scan, init_scan;
+
+	if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan))
+		return 1;
+	cur_scan = init_scan;
+
+	while (cur_scan < init_scan + scan_count) {
+		if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan))
+			return 1;
+		if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) {
+			perror("clock_gettime");
+			return 1;
+		}
+		if ((cur_time.tv_sec - start_time.tv_sec) > timeout) {
+			printf("Scan time limit exceeded\n");
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout)
+{
+	if (madvise(addr, size, MADV_MERGEABLE)) {
+		perror("madvise");
+		return 1;
+	}
+	if (ksm_write_sysfs(KSM_FP("run"), 1))
+		return 1;
+
+	/* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */
+	if (ksm_do_scan(2, start_time, timeout))
+		return 1;
+
+	return 0;
+}
+
+static int ksm_unmerge_pages(void *addr, size_t size,
+			     struct timespec start_time, int timeout)
+{
+	if (madvise(addr, size, MADV_UNMERGEABLE)) {
+		perror("madvise");
+		return 1;
+	}
+	return 0;
+}
+
+static bool assert_ksm_pages_count(long dupl_page_count)
+{
+	unsigned long max_page_sharing, pages_sharing, pages_shared;
+
+	if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
+	    ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
+	    ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
+		return false;
+
+	/*
+	 * Since there must be at least 2 pages for merging and 1 page can be
+	 * shared with the limited number of pages (max_page_sharing), sometimes
+	 * there are 'leftover' pages that cannot be merged. For example, if there
+	 * are 11 pages and max_page_sharing = 10, then only 10 pages will be
+	 * merged and the 11th page won't be affected. As a result, when the number
+	 * of duplicate pages is divided by max_page_sharing and the remainder is 1,
+	 * pages_shared and pages_sharing values will be equal between dupl_page_count
+	 * and dupl_page_count - 1.
+	 */
+	if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) {
+		if (pages_shared == dupl_page_count / max_page_sharing &&
+		    pages_sharing == pages_shared * (max_page_sharing - 1))
+			return true;
+	} else {
+		if (pages_shared == (dupl_page_count / max_page_sharing + 1) &&
+		    pages_sharing == dupl_page_count - pages_shared)
+			return true;
+	}
+
+	return false;
+}
+
+static int ksm_save_def(struct ksm_sysfs *ksm_sysfs)
+{
+	if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) ||
+	    numa_available() ? 0 :
+		ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) ||
+	    ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) ||
+	    ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) ||
+	    ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) ||
+	    ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+			   &ksm_sysfs->stable_node_chains_prune_millisecs) ||
+	    ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages))
+		return 1;
+
+	return 0;
+}
+
+static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
+{
+	if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) ||
+	    numa_available() ? 0 :
+		ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) ||
+	    ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) ||
+	    ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) ||
+	    ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) ||
+	    ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+			    ksm_sysfs->stable_node_chains_prune_millisecs) ||
+	    ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages))
+		return 1;
+
+	return 0;
+}
+
+static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size)
+{
+	void *map_ptr;
+	struct timespec start_time;
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		return KSFT_FAIL;
+	}
+
+	/* fill pages with the same data and merge them */
+	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+	if (!map_ptr)
+		return KSFT_FAIL;
+
+	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+		goto err_out;
+
+	/* verify that the right number of pages are merged */
+	if (assert_ksm_pages_count(page_count)) {
+		printf("OK\n");
+		munmap(map_ptr, page_size * page_count);
+		return KSFT_PASS;
+	}
+
+err_out:
+	printf("Not OK\n");
+	munmap(map_ptr, page_size * page_count);
+	return KSFT_FAIL;
+}
+
+static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size)
+{
+	void *map_ptr;
+	struct timespec start_time;
+	int page_count = 2;
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		return KSFT_FAIL;
+	}
+
+	/* fill pages with the same data and merge them */
+	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+	if (!map_ptr)
+		return KSFT_FAIL;
+
+	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+		goto err_out;
+
+	/* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */
+	memset(map_ptr, '-', 1);
+	memset(map_ptr + page_size, '+', 1);
+
+	/* get at least 1 scan, so KSM can detect that the pages were modified */
+	if (ksm_do_scan(1, start_time, timeout))
+		goto err_out;
+
+	/* check that unmerging was successful and 0 pages are currently merged */
+	if (assert_ksm_pages_count(0)) {
+		printf("OK\n");
+		munmap(map_ptr, page_size * page_count);
+		return KSFT_PASS;
+	}
+
+err_out:
+	printf("Not OK\n");
+	munmap(map_ptr, page_size * page_count);
+	return KSFT_FAIL;
+}
+
+static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout,
+				     bool use_zero_pages, size_t page_size)
+{
+	void *map_ptr;
+	struct timespec start_time;
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		return KSFT_FAIL;
+	}
+
+	if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages))
+		return KSFT_FAIL;
+
+	/* fill pages with zero and try to merge them */
+	map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count);
+	if (!map_ptr)
+		return KSFT_FAIL;
+
+	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+		goto err_out;
+
+       /*
+	* verify that the right number of pages are merged:
+	* 1) if use_zero_pages is set to 1, empty pages are merged
+	*    with the kernel zero page instead of with each other;
+	* 2) if use_zero_pages is set to 0, empty pages are not treated specially
+	*    and merged as usual.
+	*/
+	if (use_zero_pages && !assert_ksm_pages_count(0))
+		goto err_out;
+	else if (!use_zero_pages && !assert_ksm_pages_count(page_count))
+		goto err_out;
+
+	printf("OK\n");
+	munmap(map_ptr, page_size * page_count);
+	return KSFT_PASS;
+
+err_out:
+	printf("Not OK\n");
+	munmap(map_ptr, page_size * page_count);
+	return KSFT_FAIL;
+}
+
+static int get_next_mem_node(int node)
+{
+
+	long node_size;
+	int mem_node = 0;
+	int i, max_node = numa_max_node();
+
+	for (i = node + 1; i <= max_node + node; i++) {
+		mem_node = i % (max_node + 1);
+		node_size = numa_node_size(mem_node, NULL);
+		if (node_size > 0)
+			break;
+	}
+	return mem_node;
+}
+
+static int get_first_mem_node(void)
+{
+	return get_next_mem_node(numa_max_node());
+}
+
+static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
+				size_t page_size)
+{
+	void *numa1_map_ptr, *numa2_map_ptr;
+	struct timespec start_time;
+	int page_count = 2;
+	int first_node;
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		return KSFT_FAIL;
+	}
+
+	if (numa_available() < 0) {
+		perror("NUMA support not enabled");
+		return KSFT_SKIP;
+	}
+	if (numa_num_configured_nodes() <= 1) {
+		printf("At least 2 NUMA nodes must be available\n");
+		return KSFT_SKIP;
+	}
+	if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes))
+		return KSFT_FAIL;
+
+	/* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
+	first_node = get_first_mem_node();
+	numa1_map_ptr = numa_alloc_onnode(page_size, first_node);
+	numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node));
+	if (!numa1_map_ptr || !numa2_map_ptr) {
+		perror("numa_alloc_onnode");
+		return KSFT_FAIL;
+	}
+
+	memset(numa1_map_ptr, '*', page_size);
+	memset(numa2_map_ptr, '*', page_size);
+
+	/* try to merge the pages */
+	if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) ||
+	    ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout))
+		goto err_out;
+
+       /*
+	* verify that the right number of pages are merged:
+	* 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged;
+	* 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is
+	*    only 1 unique page in each node and they can't be shared.
+	*/
+	if (merge_across_nodes && !assert_ksm_pages_count(page_count))
+		goto err_out;
+	else if (!merge_across_nodes && !assert_ksm_pages_count(0))
+		goto err_out;
+
+	numa_free(numa1_map_ptr, page_size);
+	numa_free(numa2_map_ptr, page_size);
+	printf("OK\n");
+	return KSFT_PASS;
+
+err_out:
+	numa_free(numa1_map_ptr, page_size);
+	numa_free(numa2_map_ptr, page_size);
+	printf("Not OK\n");
+	return KSFT_FAIL;
+}
+
+static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size)
+{
+	void *map_ptr, *map_ptr_orig;
+	struct timespec start_time, end_time;
+	unsigned long scan_time_ns;
+	int pagemap_fd, n_normal_pages, n_huge_pages;
+
+	map_size *= MB;
+	size_t len = map_size;
+
+	len -= len % HPAGE_SIZE;
+	map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
+	map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE;
+
+	if (map_ptr_orig == MAP_FAILED)
+		err(2, "initial mmap");
+
+	if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE))
+		err(2, "MADV_HUGEPAGE");
+
+	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+	if (pagemap_fd < 0)
+		err(2, "open pagemap");
+
+	n_normal_pages = 0;
+	n_huge_pages = 0;
+	for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) {
+		if (allocate_transhuge(p, pagemap_fd) < 0)
+			n_normal_pages++;
+		else
+			n_huge_pages++;
+	}
+	printf("Number of normal pages:    %d\n", n_normal_pages);
+	printf("Number of huge pages:    %d\n", n_huge_pages);
+
+	memset(map_ptr, '*', len);
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		goto err_out;
+	}
+	if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+		goto err_out;
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+		perror("clock_gettime");
+		goto err_out;
+	}
+
+	scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+		       (end_time.tv_nsec - start_time.tv_nsec);
+
+	printf("Total size:    %lu MiB\n", map_size / MB);
+	printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+	       scan_time_ns % NSEC_PER_SEC);
+	printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
+					       ((double)scan_time_ns / NSEC_PER_SEC));
+
+	munmap(map_ptr_orig, len + HPAGE_SIZE);
+	return KSFT_PASS;
+
+err_out:
+	printf("Not OK\n");
+	munmap(map_ptr_orig, len + HPAGE_SIZE);
+	return KSFT_FAIL;
+}
+
+static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
+{
+	void *map_ptr;
+	struct timespec start_time, end_time;
+	unsigned long scan_time_ns;
+
+	map_size *= MB;
+
+	map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+	if (!map_ptr)
+		return KSFT_FAIL;
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		goto err_out;
+	}
+	if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+		goto err_out;
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+		perror("clock_gettime");
+		goto err_out;
+	}
+
+	scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+		       (end_time.tv_nsec - start_time.tv_nsec);
+
+	printf("Total size:    %lu MiB\n", map_size / MB);
+	printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+	       scan_time_ns % NSEC_PER_SEC);
+	printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
+					       ((double)scan_time_ns / NSEC_PER_SEC));
+
+	munmap(map_ptr, map_size);
+	return KSFT_PASS;
+
+err_out:
+	printf("Not OK\n");
+	munmap(map_ptr, map_size);
+	return KSFT_FAIL;
+}
+
+static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size)
+{
+	void *map_ptr;
+	struct timespec start_time, end_time;
+	unsigned long scan_time_ns;
+
+	map_size *= MB;
+
+	map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+	if (!map_ptr)
+		return KSFT_FAIL;
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		goto err_out;
+	}
+	if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+		goto err_out;
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		goto err_out;
+	}
+	if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout))
+		goto err_out;
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+		perror("clock_gettime");
+		goto err_out;
+	}
+
+	scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+		       (end_time.tv_nsec - start_time.tv_nsec);
+
+	printf("Total size:    %lu MiB\n", map_size / MB);
+	printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+	       scan_time_ns % NSEC_PER_SEC);
+	printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
+					       ((double)scan_time_ns / NSEC_PER_SEC));
+
+	munmap(map_ptr, map_size);
+	return KSFT_PASS;
+
+err_out:
+	printf("Not OK\n");
+	munmap(map_ptr, map_size);
+	return KSFT_FAIL;
+}
+
+static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
+{
+	void *map_ptr;
+	struct timespec start_time, end_time;
+	unsigned long cow_time_ns;
+
+	/* page_count must be less than 2*page_size */
+	size_t page_count = 4000;
+
+	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+	if (!map_ptr)
+		return KSFT_FAIL;
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		return KSFT_FAIL;
+	}
+	for (size_t i = 0; i < page_count - 1; i = i + 2)
+		memset(map_ptr + page_size * i, '-', 1);
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+		perror("clock_gettime");
+		return KSFT_FAIL;
+	}
+
+	cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+		       (end_time.tv_nsec - start_time.tv_nsec);
+
+	printf("Total size:    %lu MiB\n\n", (page_size * page_count) / MB);
+	printf("Not merged pages:\n");
+	printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+	       cow_time_ns % NSEC_PER_SEC);
+	printf("Average speed:  %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) /
+					       ((double)cow_time_ns / NSEC_PER_SEC));
+
+	/* Create 2000 pairs of duplicate pages */
+	for (size_t i = 0; i < page_count - 1; i = i + 2) {
+		memset(map_ptr + page_size * i, '+', i / 2 + 1);
+		memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1);
+	}
+	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+		goto err_out;
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+		perror("clock_gettime");
+		goto err_out;
+	}
+	for (size_t i = 0; i < page_count - 1; i = i + 2)
+		memset(map_ptr + page_size * i, '-', 1);
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+		perror("clock_gettime");
+		goto err_out;
+	}
+
+	cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+		       (end_time.tv_nsec - start_time.tv_nsec);
+
+	printf("Merged pages:\n");
+	printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+	       cow_time_ns % NSEC_PER_SEC);
+	printf("Average speed:  %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) /
+					       ((double)cow_time_ns / NSEC_PER_SEC));
+
+	munmap(map_ptr, page_size * page_count);
+	return KSFT_PASS;
+
+err_out:
+	printf("Not OK\n");
+	munmap(map_ptr, page_size * page_count);
+	return KSFT_FAIL;
+}
+
+int main(int argc, char *argv[])
+{
+	int ret, opt;
+	int prot = 0;
+	int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
+	long page_count = KSM_PAGE_COUNT_DEFAULT;
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	struct ksm_sysfs ksm_sysfs_old;
+	int test_name = CHECK_KSM_MERGE;
+	bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT;
+	bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
+	long size_MB = 0;
+
+	while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) {
+		switch (opt) {
+		case 'a':
+			prot = str_to_prot(optarg);
+			break;
+		case 'p':
+			page_count = atol(optarg);
+			if (page_count <= 0) {
+				printf("The number of pages must be greater than 0\n");
+				return KSFT_FAIL;
+			}
+			break;
+		case 'l':
+			ksm_scan_limit_sec = atoi(optarg);
+			if (ksm_scan_limit_sec <= 0) {
+				printf("Timeout value must be greater than 0\n");
+				return KSFT_FAIL;
+			}
+			break;
+		case 'h':
+			print_help();
+			break;
+		case 'z':
+			if (strcmp(optarg, "0") == 0)
+				use_zero_pages = 0;
+			else
+				use_zero_pages = 1;
+			break;
+		case 'm':
+			if (strcmp(optarg, "0") == 0)
+				merge_across_nodes = 0;
+			else
+				merge_across_nodes = 1;
+			break;
+		case 's':
+			size_MB = atoi(optarg);
+			if (size_MB <= 0) {
+				printf("Size must be greater than 0\n");
+				return KSFT_FAIL;
+			}
+		case 'M':
+			break;
+		case 'U':
+			test_name = CHECK_KSM_UNMERGE;
+			break;
+		case 'Z':
+			test_name = CHECK_KSM_ZERO_PAGE_MERGE;
+			break;
+		case 'N':
+			test_name = CHECK_KSM_NUMA_MERGE;
+			break;
+		case 'P':
+			test_name = KSM_MERGE_TIME;
+			break;
+		case 'H':
+			test_name = KSM_MERGE_TIME_HUGE_PAGES;
+			break;
+		case 'D':
+			test_name = KSM_UNMERGE_TIME;
+			break;
+		case 'C':
+			test_name = KSM_COW_TIME;
+			break;
+		default:
+			return KSFT_FAIL;
+		}
+	}
+
+	if (prot == 0)
+		prot = str_to_prot(KSM_PROT_STR_DEFAULT);
+
+	if (access(KSM_SYSFS_PATH, F_OK)) {
+		printf("Config KSM not enabled\n");
+		return KSFT_SKIP;
+	}
+
+	if (ksm_save_def(&ksm_sysfs_old)) {
+		printf("Cannot save default tunables\n");
+		return KSFT_FAIL;
+	}
+
+	if (ksm_write_sysfs(KSM_FP("run"), 2) ||
+	    ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) ||
+	    numa_available() ? 0 :
+		ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) ||
+	    ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count))
+		return KSFT_FAIL;
+
+	switch (test_name) {
+	case CHECK_KSM_MERGE:
+		ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+				      ksm_scan_limit_sec, page_size);
+		break;
+	case CHECK_KSM_UNMERGE:
+		ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+					page_size);
+		break;
+	case CHECK_KSM_ZERO_PAGE_MERGE:
+		ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+						ksm_scan_limit_sec, use_zero_pages, page_size);
+		break;
+	case CHECK_KSM_NUMA_MERGE:
+		ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+					   merge_across_nodes, page_size);
+		break;
+	case KSM_MERGE_TIME:
+		if (size_MB == 0) {
+			printf("Option '-s' is required.\n");
+			return KSFT_FAIL;
+		}
+		ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+				     size_MB);
+		break;
+	case KSM_MERGE_TIME_HUGE_PAGES:
+		if (size_MB == 0) {
+			printf("Option '-s' is required.\n");
+			return KSFT_FAIL;
+		}
+		ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
+				ksm_scan_limit_sec, size_MB);
+		break;
+	case KSM_UNMERGE_TIME:
+		if (size_MB == 0) {
+			printf("Option '-s' is required.\n");
+			return KSFT_FAIL;
+		}
+		ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
+				       ksm_scan_limit_sec, size_MB);
+		break;
+	case KSM_COW_TIME:
+		ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+				   page_size);
+		break;
+	}
+
+	if (ksm_restore(&ksm_sysfs_old)) {
+		printf("Cannot restore default tunables\n");
+		return KSFT_FAIL;
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c
new file mode 100644
index 000000000000..262eae6b58f2
--- /dev/null
+++ b/tools/testing/selftests/mm/madv_populate.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
+ *
+ * Copyright 2021, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#ifndef MADV_POPULATE_READ
+#define MADV_POPULATE_READ	22
+#endif /* MADV_POPULATE_READ */
+#ifndef MADV_POPULATE_WRITE
+#define MADV_POPULATE_WRITE	23
+#endif /* MADV_POPULATE_WRITE */
+
+/*
+ * For now, we're using 2 MiB of private anonymous memory for all tests.
+ */
+#define SIZE (2 * 1024 * 1024)
+
+static size_t pagesize;
+
+static void sense_support(void)
+{
+	char *addr;
+	int ret;
+
+	addr = mmap(0, pagesize, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (!addr)
+		ksft_exit_fail_msg("mmap failed\n");
+
+	ret = madvise(addr, pagesize, MADV_POPULATE_READ);
+	if (ret)
+		ksft_exit_skip("MADV_POPULATE_READ is not available\n");
+
+	ret = madvise(addr, pagesize, MADV_POPULATE_WRITE);
+	if (ret)
+		ksft_exit_skip("MADV_POPULATE_WRITE is not available\n");
+
+	munmap(addr, pagesize);
+}
+
+static void test_prot_read(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+	ksft_test_result(ret == -1 && errno == EINVAL,
+			 "MADV_POPULATE_WRITE with PROT_READ\n");
+
+	munmap(addr, SIZE);
+}
+
+static void test_prot_write(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(ret == -1 && errno == EINVAL,
+			 "MADV_POPULATE_READ with PROT_WRITE\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+	ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n");
+
+	munmap(addr, SIZE);
+}
+
+static void test_holes(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+	ret = munmap(addr + pagesize, pagesize);
+	if (ret)
+		ksft_exit_fail_msg("munmap failed\n");
+
+	/* Hole in the middle */
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_READ with holes in the middle\n");
+	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_WRITE with holes in the middle\n");
+
+	/* Hole at end */
+	ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_READ with holes at the end\n");
+	ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_WRITE with holes at the end\n");
+
+	/* Hole at beginning */
+	ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_READ with holes at the beginning\n");
+	ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_WRITE with holes at the beginning\n");
+
+	munmap(addr, SIZE);
+}
+
+static bool range_is_populated(char *start, ssize_t size)
+{
+	int fd = open("/proc/self/pagemap", O_RDONLY);
+	bool ret = true;
+
+	if (fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+	for (; size > 0 && ret; size -= pagesize, start += pagesize)
+		if (!pagemap_is_populated(fd, start))
+			ret = false;
+	close(fd);
+	return ret;
+}
+
+static bool range_is_not_populated(char *start, ssize_t size)
+{
+	int fd = open("/proc/self/pagemap", O_RDONLY);
+	bool ret = true;
+
+	if (fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+	for (; size > 0 && ret; size -= pagesize, start += pagesize)
+		if (pagemap_is_populated(fd, start))
+			ret = false;
+	close(fd);
+	return ret;
+}
+
+static void test_populate_read(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+	ksft_test_result(range_is_not_populated(addr, SIZE),
+			 "range initially not populated\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+	ksft_test_result(range_is_populated(addr, SIZE),
+			 "range is populated\n");
+
+	munmap(addr, SIZE);
+}
+
+static void test_populate_write(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+	ksft_test_result(range_is_not_populated(addr, SIZE),
+			 "range initially not populated\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+	ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
+	ksft_test_result(range_is_populated(addr, SIZE),
+			 "range is populated\n");
+
+	munmap(addr, SIZE);
+}
+
+static bool range_is_softdirty(char *start, ssize_t size)
+{
+	int fd = open("/proc/self/pagemap", O_RDONLY);
+	bool ret = true;
+
+	if (fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+	for (; size > 0 && ret; size -= pagesize, start += pagesize)
+		if (!pagemap_is_softdirty(fd, start))
+			ret = false;
+	close(fd);
+	return ret;
+}
+
+static bool range_is_not_softdirty(char *start, ssize_t size)
+{
+	int fd = open("/proc/self/pagemap", O_RDONLY);
+	bool ret = true;
+
+	if (fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+	for (; size > 0 && ret; size -= pagesize, start += pagesize)
+		if (pagemap_is_softdirty(fd, start))
+			ret = false;
+	close(fd);
+	return ret;
+}
+
+static void test_softdirty(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+
+	/* Clear any softdirty bits. */
+	clear_softdirty();
+	ksft_test_result(range_is_not_softdirty(addr, SIZE),
+			 "range is not softdirty\n");
+
+	/* Populating READ should set softdirty. */
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+	ksft_test_result(range_is_not_softdirty(addr, SIZE),
+			 "range is not softdirty\n");
+
+	/* Populating WRITE should set softdirty. */
+	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+	ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
+	ksft_test_result(range_is_softdirty(addr, SIZE),
+			 "range is softdirty\n");
+
+	munmap(addr, SIZE);
+}
+
+int main(int argc, char **argv)
+{
+	int err;
+
+	pagesize = getpagesize();
+
+	ksft_print_header();
+	ksft_set_plan(21);
+
+	sense_support();
+	test_prot_read();
+	test_prot_write();
+	test_holes();
+	test_populate_read();
+	test_populate_write();
+	test_softdirty();
+
+	err = ksft_get_fail_cnt();
+	if (err)
+		ksft_exit_fail_msg("%d out of %d tests failed\n",
+				   err, ksft_test_num());
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c
new file mode 100644
index 000000000000..eed44322d1a6
--- /dev/null
+++ b/tools/testing/selftests/mm/map_fixed_noreplace.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test that MAP_FIXED_NOREPLACE works.
+ *
+ * Copyright 2018, Jann Horn <jannh@google.com>
+ * Copyright 2018, Michael Ellerman, IBM Corporation.
+ */
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#ifndef MAP_FIXED_NOREPLACE
+#define MAP_FIXED_NOREPLACE 0x100000
+#endif
+
+static void dump_maps(void)
+{
+	char cmd[32];
+
+	snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+	system(cmd);
+}
+
+static unsigned long find_base_addr(unsigned long size)
+{
+	void *addr;
+	unsigned long flags;
+
+	flags = MAP_PRIVATE | MAP_ANONYMOUS;
+	addr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+	if (addr == MAP_FAILED) {
+		printf("Error: couldn't map the space we need for the test\n");
+		return 0;
+	}
+
+	if (munmap(addr, size) != 0) {
+		printf("Error: couldn't map the space we need for the test\n");
+		return 0;
+	}
+	return (unsigned long)addr;
+}
+
+int main(void)
+{
+	unsigned long base_addr;
+	unsigned long flags, addr, size, page_size;
+	char *p;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	//let's find a base addr that is free before we start the tests
+	size = 5 * page_size;
+	base_addr = find_base_addr(size);
+	if (!base_addr) {
+		printf("Error: couldn't map the space we need for the test\n");
+		return 1;
+	}
+
+	flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
+
+	// Check we can map all the areas we need below
+	errno = 0;
+	addr = base_addr;
+	size = 5 * page_size;
+	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+
+	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+	if (p == MAP_FAILED) {
+		dump_maps();
+		printf("Error: couldn't map the space we need for the test\n");
+		return 1;
+	}
+
+	errno = 0;
+	if (munmap((void *)addr, 5 * page_size) != 0) {
+		dump_maps();
+		printf("Error: munmap failed!?\n");
+		return 1;
+	}
+	printf("unmap() successful\n");
+
+	errno = 0;
+	addr = base_addr + page_size;
+	size = 3 * page_size;
+	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+	if (p == MAP_FAILED) {
+		dump_maps();
+		printf("Error: first mmap() failed unexpectedly\n");
+		return 1;
+	}
+
+	/*
+	 * Exact same mapping again:
+	 *   base |  free  | new
+	 *     +1 | mapped | new
+	 *     +2 | mapped | new
+	 *     +3 | mapped | new
+	 *     +4 |  free  | new
+	 */
+	errno = 0;
+	addr = base_addr;
+	size = 5 * page_size;
+	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+	if (p != MAP_FAILED) {
+		dump_maps();
+		printf("Error:1: mmap() succeeded when it shouldn't have\n");
+		return 1;
+	}
+
+	/*
+	 * Second mapping contained within first:
+	 *
+	 *   base |  free  |
+	 *     +1 | mapped |
+	 *     +2 | mapped | new
+	 *     +3 | mapped |
+	 *     +4 |  free  |
+	 */
+	errno = 0;
+	addr = base_addr + (2 * page_size);
+	size = page_size;
+	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+	if (p != MAP_FAILED) {
+		dump_maps();
+		printf("Error:2: mmap() succeeded when it shouldn't have\n");
+		return 1;
+	}
+
+	/*
+	 * Overlap end of existing mapping:
+	 *   base |  free  |
+	 *     +1 | mapped |
+	 *     +2 | mapped |
+	 *     +3 | mapped | new
+	 *     +4 |  free  | new
+	 */
+	errno = 0;
+	addr = base_addr + (3 * page_size);
+	size = 2 * page_size;
+	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+	if (p != MAP_FAILED) {
+		dump_maps();
+		printf("Error:3: mmap() succeeded when it shouldn't have\n");
+		return 1;
+	}
+
+	/*
+	 * Overlap start of existing mapping:
+	 *   base |  free  | new
+	 *     +1 | mapped | new
+	 *     +2 | mapped |
+	 *     +3 | mapped |
+	 *     +4 |  free  |
+	 */
+	errno = 0;
+	addr = base_addr;
+	size = 2 * page_size;
+	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+	if (p != MAP_FAILED) {
+		dump_maps();
+		printf("Error:4: mmap() succeeded when it shouldn't have\n");
+		return 1;
+	}
+
+	/*
+	 * Adjacent to start of existing mapping:
+	 *   base |  free  | new
+	 *     +1 | mapped |
+	 *     +2 | mapped |
+	 *     +3 | mapped |
+	 *     +4 |  free  |
+	 */
+	errno = 0;
+	addr = base_addr;
+	size = page_size;
+	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+	if (p == MAP_FAILED) {
+		dump_maps();
+		printf("Error:5: mmap() failed when it shouldn't have\n");
+		return 1;
+	}
+
+	/*
+	 * Adjacent to end of existing mapping:
+	 *   base |  free  |
+	 *     +1 | mapped |
+	 *     +2 | mapped |
+	 *     +3 | mapped |
+	 *     +4 |  free  |  new
+	 */
+	errno = 0;
+	addr = base_addr + (4 * page_size);
+	size = page_size;
+	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+	if (p == MAP_FAILED) {
+		dump_maps();
+		printf("Error:6: mmap() failed when it shouldn't have\n");
+		return 1;
+	}
+
+	addr = base_addr;
+	size = 5 * page_size;
+	if (munmap((void *)addr, size) != 0) {
+		dump_maps();
+		printf("Error: munmap failed!?\n");
+		return 1;
+	}
+	printf("unmap() successful\n");
+
+	printf("OK\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c
new file mode 100644
index 000000000000..312889edb84a
--- /dev/null
+++ b/tools/testing/selftests/mm/map_hugetlb.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Example of using hugepage memory in a user application using the mmap
+ * system call with MAP_HUGETLB flag.  Before running this program make
+ * sure the administrator has allocated enough default sized huge pages
+ * to cover the 256 MB allocation.
+ *
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified.  Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000 /* arch specific */
+#endif
+
+#ifndef MAP_HUGE_SHIFT
+#define MAP_HUGE_SHIFT 26
+#endif
+
+#ifndef MAP_HUGE_MASK
+#define MAP_HUGE_MASK 0x3f
+#endif
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+static void check_bytes(char *addr)
+{
+	printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr, size_t length)
+{
+	unsigned long i;
+
+	for (i = 0; i < length; i++)
+		*(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr, size_t length)
+{
+	unsigned long i;
+
+	check_bytes(addr);
+	for (i = 0; i < length; i++)
+		if (*(addr + i) != (char)i) {
+			printf("Mismatch at %lu\n", i);
+			return 1;
+		}
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	void *addr;
+	int ret;
+	size_t length = LENGTH;
+	int flags = FLAGS;
+	int shift = 0;
+
+	if (argc > 1)
+		length = atol(argv[1]) << 20;
+	if (argc > 2) {
+		shift = atoi(argv[2]);
+		if (shift)
+			flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+	}
+
+	if (shift)
+		printf("%u kB hugepages\n", 1 << (shift - 10));
+	else
+		printf("Default size hugepages\n");
+	printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
+
+	addr = mmap(ADDR, length, PROTECTION, flags, -1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	printf("Returned address is %p\n", addr);
+	check_bytes(addr);
+	write_bytes(addr, length);
+	ret = read_bytes(addr, length);
+
+	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+	if (munmap(addr, length)) {
+		perror("munmap");
+		exit(1);
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c
new file mode 100644
index 000000000000..6b8aeaa0bf7a
--- /dev/null
+++ b/tools/testing/selftests/mm/map_populate.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Dmitry Safonov, Arista Networks
+ *
+ * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifndef MMAP_SZ
+#define MMAP_SZ		4096
+#endif
+
+#define BUG_ON(condition, description)					\
+	do {								\
+		if (condition) {					\
+			fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
+				__LINE__, (description), strerror(errno)); \
+			exit(1);					\
+		}							\
+	} while (0)
+
+static int parent_f(int sock, unsigned long *smap, int child)
+{
+	int status, ret;
+
+	ret = read(sock, &status, sizeof(int));
+	BUG_ON(ret <= 0, "read(sock)");
+
+	*smap = 0x22222BAD;
+	ret = msync(smap, MMAP_SZ, MS_SYNC);
+	BUG_ON(ret, "msync()");
+
+	ret = write(sock, &status, sizeof(int));
+	BUG_ON(ret <= 0, "write(sock)");
+
+	waitpid(child, &status, 0);
+	BUG_ON(!WIFEXITED(status), "child in unexpected state");
+
+	return WEXITSTATUS(status);
+}
+
+static int child_f(int sock, unsigned long *smap, int fd)
+{
+	int ret, buf = 0;
+
+	smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_POPULATE, fd, 0);
+	BUG_ON(smap == MAP_FAILED, "mmap()");
+
+	BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
+
+	ret = write(sock, &buf, sizeof(int));
+	BUG_ON(ret <= 0, "write(sock)");
+
+	ret = read(sock, &buf, sizeof(int));
+	BUG_ON(ret <= 0, "read(sock)");
+
+	BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
+	BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int sock[2], child, ret;
+	FILE *ftmp;
+	unsigned long *smap;
+
+	ftmp = tmpfile();
+	BUG_ON(ftmp == 0, "tmpfile()");
+
+	ret = ftruncate(fileno(ftmp), MMAP_SZ);
+	BUG_ON(ret, "ftruncate()");
+
+	smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+			MAP_SHARED, fileno(ftmp), 0);
+	BUG_ON(smap == MAP_FAILED, "mmap()");
+
+	*smap = 0xdeadbabe;
+	/* Probably unnecessary, but let it be. */
+	ret = msync(smap, MMAP_SZ, MS_SYNC);
+	BUG_ON(ret, "msync()");
+
+	ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
+	BUG_ON(ret, "socketpair()");
+
+	child = fork();
+	BUG_ON(child == -1, "fork()");
+
+	if (child) {
+		ret = close(sock[0]);
+		BUG_ON(ret, "close()");
+
+		return parent_f(sock[1], smap, child);
+	}
+
+	ret = close(sock[1]);
+	BUG_ON(ret, "close()");
+
+	return child_f(sock[0], smap, fileno(ftmp));
+}
diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c
new file mode 100644
index 000000000000..957b9e18c729
--- /dev/null
+++ b/tools/testing/selftests/mm/memfd_secret.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2021
+ *
+ * Author: Mike Rapoport <rppt@linux.ibm.com>
+ */
+
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <sys/syscall.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+
+#include "../kselftest.h"
+
+#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__)
+#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__)
+#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__)
+
+#ifdef __NR_memfd_secret
+
+#define PATTERN	0x55
+
+static const int prot = PROT_READ | PROT_WRITE;
+static const int mode = MAP_SHARED;
+
+static unsigned long page_size;
+static unsigned long mlock_limit_cur;
+static unsigned long mlock_limit_max;
+
+static int memfd_secret(unsigned int flags)
+{
+	return syscall(__NR_memfd_secret, flags);
+}
+
+static void test_file_apis(int fd)
+{
+	char buf[64];
+
+	if ((read(fd, buf, sizeof(buf)) >= 0) ||
+	    (write(fd, buf, sizeof(buf)) >= 0) ||
+	    (pread(fd, buf, sizeof(buf), 0) >= 0) ||
+	    (pwrite(fd, buf, sizeof(buf), 0) >= 0))
+		fail("unexpected file IO\n");
+	else
+		pass("file IO is blocked as expected\n");
+}
+
+static void test_mlock_limit(int fd)
+{
+	size_t len;
+	char *mem;
+
+	len = mlock_limit_cur;
+	mem = mmap(NULL, len, prot, mode, fd, 0);
+	if (mem == MAP_FAILED) {
+		fail("unable to mmap secret memory\n");
+		return;
+	}
+	munmap(mem, len);
+
+	len = mlock_limit_max * 2;
+	mem = mmap(NULL, len, prot, mode, fd, 0);
+	if (mem != MAP_FAILED) {
+		fail("unexpected mlock limit violation\n");
+		munmap(mem, len);
+		return;
+	}
+
+	pass("mlock limit is respected\n");
+}
+
+static void try_process_vm_read(int fd, int pipefd[2])
+{
+	struct iovec liov, riov;
+	char buf[64];
+	char *mem;
+
+	if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
+		fail("pipe write: %s\n", strerror(errno));
+		exit(KSFT_FAIL);
+	}
+
+	liov.iov_len = riov.iov_len = sizeof(buf);
+	liov.iov_base = buf;
+	riov.iov_base = mem;
+
+	if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) {
+		if (errno == ENOSYS)
+			exit(KSFT_SKIP);
+		exit(KSFT_PASS);
+	}
+
+	exit(KSFT_FAIL);
+}
+
+static void try_ptrace(int fd, int pipefd[2])
+{
+	pid_t ppid = getppid();
+	int status;
+	char *mem;
+	long ret;
+
+	if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
+		perror("pipe write");
+		exit(KSFT_FAIL);
+	}
+
+	ret = ptrace(PTRACE_ATTACH, ppid, 0, 0);
+	if (ret) {
+		perror("ptrace_attach");
+		exit(KSFT_FAIL);
+	}
+
+	ret = waitpid(ppid, &status, WUNTRACED);
+	if ((ret != ppid) || !(WIFSTOPPED(status))) {
+		fprintf(stderr, "weird waitppid result %ld stat %x\n",
+			ret, status);
+		exit(KSFT_FAIL);
+	}
+
+	if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0))
+		exit(KSFT_PASS);
+
+	exit(KSFT_FAIL);
+}
+
+static void check_child_status(pid_t pid, const char *name)
+{
+	int status;
+
+	waitpid(pid, &status, 0);
+
+	if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) {
+		skip("%s is not supported\n", name);
+		return;
+	}
+
+	if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) ||
+	    WIFSIGNALED(status)) {
+		pass("%s is blocked as expected\n", name);
+		return;
+	}
+
+	fail("%s: unexpected memory access\n", name);
+}
+
+static void test_remote_access(int fd, const char *name,
+			       void (*func)(int fd, int pipefd[2]))
+{
+	int pipefd[2];
+	pid_t pid;
+	char *mem;
+
+	if (pipe(pipefd)) {
+		fail("pipe failed: %s\n", strerror(errno));
+		return;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		fail("fork failed: %s\n", strerror(errno));
+		return;
+	}
+
+	if (pid == 0) {
+		func(fd, pipefd);
+		return;
+	}
+
+	mem = mmap(NULL, page_size, prot, mode, fd, 0);
+	if (mem == MAP_FAILED) {
+		fail("Unable to mmap secret memory\n");
+		return;
+	}
+
+	ftruncate(fd, page_size);
+	memset(mem, PATTERN, page_size);
+
+	if (write(pipefd[1], &mem, sizeof(mem)) < 0) {
+		fail("pipe write: %s\n", strerror(errno));
+		return;
+	}
+
+	check_child_status(pid, name);
+}
+
+static void test_process_vm_read(int fd)
+{
+	test_remote_access(fd, "process_vm_read", try_process_vm_read);
+}
+
+static void test_ptrace(int fd)
+{
+	test_remote_access(fd, "ptrace", try_ptrace);
+}
+
+static int set_cap_limits(rlim_t max)
+{
+	struct rlimit new;
+	cap_t cap = cap_init();
+
+	new.rlim_cur = max;
+	new.rlim_max = max;
+	if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+		perror("setrlimit() returns error");
+		return -1;
+	}
+
+	/* drop capabilities including CAP_IPC_LOCK */
+	if (cap_set_proc(cap)) {
+		perror("cap_set_proc() returns error");
+		return -2;
+	}
+
+	return 0;
+}
+
+static void prepare(void)
+{
+	struct rlimit rlim;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+	if (!page_size)
+		ksft_exit_fail_msg("Failed to get page size %s\n",
+				   strerror(errno));
+
+	if (getrlimit(RLIMIT_MEMLOCK, &rlim))
+		ksft_exit_fail_msg("Unable to detect mlock limit: %s\n",
+				   strerror(errno));
+
+	mlock_limit_cur = rlim.rlim_cur;
+	mlock_limit_max = rlim.rlim_max;
+
+	printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n",
+	       page_size, mlock_limit_cur, mlock_limit_max);
+
+	if (page_size > mlock_limit_cur)
+		mlock_limit_cur = page_size;
+	if (page_size > mlock_limit_max)
+		mlock_limit_max = page_size;
+
+	if (set_cap_limits(mlock_limit_max))
+		ksft_exit_fail_msg("Unable to set mlock limit: %s\n",
+				   strerror(errno));
+}
+
+#define NUM_TESTS 4
+
+int main(int argc, char *argv[])
+{
+	int fd;
+
+	prepare();
+
+	ksft_print_header();
+	ksft_set_plan(NUM_TESTS);
+
+	fd = memfd_secret(0);
+	if (fd < 0) {
+		if (errno == ENOSYS)
+			ksft_exit_skip("memfd_secret is not supported\n");
+		else
+			ksft_exit_fail_msg("memfd_secret failed: %s\n",
+					   strerror(errno));
+	}
+
+	test_mlock_limit(fd);
+	test_file_apis(fd);
+	test_process_vm_read(fd);
+	test_ptrace(fd);
+
+	close(fd);
+
+	ksft_finished();
+}
+
+#else /* __NR_memfd_secret */
+
+int main(int argc, char *argv[])
+{
+	printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n");
+	return KSFT_SKIP;
+}
+
+#endif /* __NR_memfd_secret */
diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c
new file mode 100644
index 000000000000..1cec8425e3ca
--- /dev/null
+++ b/tools/testing/selftests/mm/migration.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * The main purpose of the tests here is to exercise the migration entry code
+ * paths in the kernel.
+ */
+
+#include "../kselftest_harness.h"
+#include <strings.h>
+#include <pthread.h>
+#include <numa.h>
+#include <numaif.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <time.h>
+
+#define TWOMEG (2<<20)
+#define RUNTIME (60)
+
+#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+
+FIXTURE(migration)
+{
+	pthread_t *threads;
+	pid_t *pids;
+	int nthreads;
+	int n1;
+	int n2;
+};
+
+FIXTURE_SETUP(migration)
+{
+	int n;
+
+	ASSERT_EQ(numa_available(), 0);
+	self->nthreads = numa_num_task_cpus() - 1;
+	self->n1 = -1;
+	self->n2 = -1;
+
+	for (n = 0; n < numa_max_possible_node(); n++)
+		if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) {
+			if (self->n1 == -1) {
+				self->n1 = n;
+			} else {
+				self->n2 = n;
+				break;
+			}
+		}
+
+	self->threads = malloc(self->nthreads * sizeof(*self->threads));
+	ASSERT_NE(self->threads, NULL);
+	self->pids = malloc(self->nthreads * sizeof(*self->pids));
+	ASSERT_NE(self->pids, NULL);
+};
+
+FIXTURE_TEARDOWN(migration)
+{
+	free(self->threads);
+	free(self->pids);
+}
+
+int migrate(uint64_t *ptr, int n1, int n2)
+{
+	int ret, tmp;
+	int status = 0;
+	struct timespec ts1, ts2;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &ts1))
+		return -1;
+
+	while (1) {
+		if (clock_gettime(CLOCK_MONOTONIC, &ts2))
+			return -1;
+
+		if (ts2.tv_sec - ts1.tv_sec >= RUNTIME)
+			return 0;
+
+		ret = move_pages(0, 1, (void **) &ptr, &n2, &status,
+				MPOL_MF_MOVE_ALL);
+		if (ret) {
+			if (ret > 0)
+				printf("Didn't migrate %d pages\n", ret);
+			else
+				perror("Couldn't migrate pages");
+			return -2;
+		}
+
+		tmp = n2;
+		n2 = n1;
+		n1 = tmp;
+	}
+
+	return 0;
+}
+
+void *access_mem(void *ptr)
+{
+	uint64_t y = 0;
+	volatile uint64_t *x = ptr;
+
+	while (1) {
+		pthread_testcancel();
+		y += *x;
+	}
+
+	return NULL;
+}
+
+/*
+ * Basic migration entry testing. One thread will move pages back and forth
+ * between nodes whilst other threads try and access them triggering the
+ * migration entry wait paths in the kernel.
+ */
+TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME)
+{
+	uint64_t *ptr;
+	int i;
+
+	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+		SKIP(return, "Not enough threads or NUMA nodes available");
+
+	ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	memset(ptr, 0xde, TWOMEG);
+	for (i = 0; i < self->nthreads - 1; i++)
+		if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
+			perror("Couldn't create thread");
+
+	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+	for (i = 0; i < self->nthreads - 1; i++)
+		ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
+}
+
+/*
+ * Same as the previous test but with shared memory.
+ */
+TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME)
+{
+	pid_t pid;
+	uint64_t *ptr;
+	int i;
+
+	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+		SKIP(return, "Not enough threads or NUMA nodes available");
+
+	ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
+		MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	memset(ptr, 0xde, TWOMEG);
+	for (i = 0; i < self->nthreads - 1; i++) {
+		pid = fork();
+		if (!pid)
+			access_mem(ptr);
+		else
+			self->pids[i] = pid;
+	}
+
+	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+	for (i = 0; i < self->nthreads - 1; i++)
+		ASSERT_EQ(kill(self->pids[i], SIGTERM), 0);
+}
+
+/*
+ * Tests the pmd migration entry paths.
+ */
+TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME)
+{
+	uint64_t *ptr;
+	int i;
+
+	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+		SKIP(return, "Not enough threads or NUMA nodes available");
+
+	ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG);
+	ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0);
+	memset(ptr, 0xde, TWOMEG);
+	for (i = 0; i < self->nthreads - 1; i++)
+		if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
+			perror("Couldn't create thread");
+
+	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+	for (i = 0; i < self->nthreads - 1; i++)
+		ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c
new file mode 100644
index 000000000000..782ea94dee2f
--- /dev/null
+++ b/tools/testing/selftests/mm/mlock-random-test.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * It tests the mlock/mlock2() when they are invoked
+ * on randomly memory region.
+ */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <time.h>
+#include "mlock2.h"
+
+#define CHUNK_UNIT (128 * 1024)
+#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
+#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
+#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
+
+#define TEST_LOOP 100
+#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
+
+int set_cap_limits(rlim_t max)
+{
+	struct rlimit new;
+	cap_t cap = cap_init();
+
+	new.rlim_cur = max;
+	new.rlim_max = max;
+	if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+		perror("setrlimit() returns error\n");
+		return -1;
+	}
+
+	/* drop capabilities including CAP_IPC_LOCK */
+	if (cap_set_proc(cap)) {
+		perror("cap_set_proc() returns error\n");
+		return -2;
+	}
+
+	return 0;
+}
+
+int get_proc_locked_vm_size(void)
+{
+	FILE *f;
+	int ret = -1;
+	char line[1024] = {0};
+	unsigned long lock_size = 0;
+
+	f = fopen("/proc/self/status", "r");
+	if (!f) {
+		perror("fopen");
+		return -1;
+	}
+
+	while (fgets(line, 1024, f)) {
+		if (strstr(line, "VmLck")) {
+			ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
+			if (ret <= 0) {
+				printf("sscanf() on VmLck error: %s: %d\n",
+						line, ret);
+				fclose(f);
+				return -1;
+			}
+			fclose(f);
+			return (int)(lock_size << 10);
+		}
+	}
+
+	perror("cannot parse VmLck in /proc/self/status\n");
+	fclose(f);
+	return -1;
+}
+
+/*
+ * Get the MMUPageSize of the memory region including input
+ * address from proc file.
+ *
+ * return value: on error case, 0 will be returned.
+ * Otherwise the page size(in bytes) is returned.
+ */
+int get_proc_page_size(unsigned long addr)
+{
+	FILE *smaps;
+	char *line;
+	unsigned long mmupage_size = 0;
+	size_t size;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
+		printf("Unable to parse /proc/self/smaps\n");
+		return 0;
+	}
+
+	while (getline(&line, &size, smaps) > 0) {
+		if (!strstr(line, "MMUPageSize")) {
+			free(line);
+			line = NULL;
+			size = 0;
+			continue;
+		}
+
+		/* found the MMUPageSize of this section */
+		if (sscanf(line, "MMUPageSize:    %8lu kB",
+					&mmupage_size) < 1) {
+			printf("Unable to parse smaps entry for Size:%s\n",
+					line);
+			break;
+		}
+
+	}
+	free(line);
+	if (smaps)
+		fclose(smaps);
+	return mmupage_size << 10;
+}
+
+/*
+ * Test mlock/mlock2() on provided memory chunk.
+ * It expects the mlock/mlock2() to be successful (within rlimit)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will choose start/len randomly to perform mlock/mlock2
+ * [start, start +  len] memory range. The range is within range
+ * of the allocated chunk.
+ *
+ * The memory region size alloc_size is within the rlimit.
+ * So we always expect a success of mlock/mlock2.
+ *
+ * VmLck is assumed to be 0 before this test.
+ *
+ *    return value: 0 - success
+ *    else: failure
+ */
+int test_mlock_within_limit(char *p, int alloc_size)
+{
+	int i;
+	int ret = 0;
+	int locked_vm_size = 0;
+	struct rlimit cur;
+	int page_size = 0;
+
+	getrlimit(RLIMIT_MEMLOCK, &cur);
+	if (cur.rlim_cur < alloc_size) {
+		printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
+				alloc_size, (unsigned int)cur.rlim_cur);
+		return -1;
+	}
+
+	srand(time(NULL));
+	for (i = 0; i < TEST_LOOP; i++) {
+		/*
+		 * - choose mlock/mlock2 randomly
+		 * - choose lock_size randomly but lock_size < alloc_size
+		 * - choose start_offset randomly but p+start_offset+lock_size
+		 *   < p+alloc_size
+		 */
+		int is_mlock = !!(rand() % 2);
+		int lock_size = rand() % alloc_size;
+		int start_offset = rand() % (alloc_size - lock_size);
+
+		if (is_mlock)
+			ret = mlock(p + start_offset, lock_size);
+		else
+			ret = mlock2_(p + start_offset, lock_size,
+				       MLOCK_ONFAULT);
+
+		if (ret) {
+			printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
+					is_mlock ? "mlock" : "mlock2",
+					p, alloc_size,
+					p + start_offset, lock_size);
+			return ret;
+		}
+	}
+
+	/*
+	 * Check VmLck left by the tests.
+	 */
+	locked_vm_size = get_proc_locked_vm_size();
+	page_size = get_proc_page_size((unsigned long)p);
+	if (page_size == 0) {
+		printf("cannot get proc MMUPageSize\n");
+		return -1;
+	}
+
+	if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
+		printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
+				locked_vm_size, alloc_size);
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * We expect the mlock/mlock2() to be fail (outof limitation)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will randomly choose start/len and perform mlock/mlock2
+ * on [start, start+len] range.
+ *
+ * The memory region size alloc_size is above the rlimit.
+ * And the len to be locked is higher than rlimit.
+ * So we always expect a failure of mlock/mlock2.
+ * No locked page number should be increased as a side effect.
+ *
+ *    return value: 0 - success
+ *    else: failure
+ */
+int test_mlock_outof_limit(char *p, int alloc_size)
+{
+	int i;
+	int ret = 0;
+	int locked_vm_size = 0, old_locked_vm_size = 0;
+	struct rlimit cur;
+
+	getrlimit(RLIMIT_MEMLOCK, &cur);
+	if (cur.rlim_cur >= alloc_size) {
+		printf("alloc_size[%d] >%u rlimit, violates test condition\n",
+				alloc_size, (unsigned int)cur.rlim_cur);
+		return -1;
+	}
+
+	old_locked_vm_size = get_proc_locked_vm_size();
+	srand(time(NULL));
+	for (i = 0; i < TEST_LOOP; i++) {
+		int is_mlock = !!(rand() % 2);
+		int lock_size = (rand() % (alloc_size - cur.rlim_cur))
+			+ cur.rlim_cur;
+		int start_offset = rand() % (alloc_size - lock_size);
+
+		if (is_mlock)
+			ret = mlock(p + start_offset, lock_size);
+		else
+			ret = mlock2_(p + start_offset, lock_size,
+					MLOCK_ONFAULT);
+		if (ret == 0) {
+			printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+					is_mlock ? "mlock" : "mlock2",
+					p, alloc_size,
+					p + start_offset, lock_size);
+			return -1;
+		}
+	}
+
+	locked_vm_size = get_proc_locked_vm_size();
+	if (locked_vm_size != old_locked_vm_size) {
+		printf("tests leads to new mlocked page: old[%d], new[%d]\n",
+				old_locked_vm_size,
+				locked_vm_size);
+		return -1;
+	}
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	char *p = NULL;
+	int ret = 0;
+
+	if (set_cap_limits(MLOCK_RLIMIT_SIZE))
+		return -1;
+
+	p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
+	if (p == NULL) {
+		perror("malloc() failure\n");
+		return -1;
+	}
+	ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
+	if (ret)
+		return ret;
+	munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
+	free(p);
+
+
+	p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
+	if (p == NULL) {
+		perror("malloc() failure\n");
+		return -1;
+	}
+	ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
+	if (ret)
+		return ret;
+	munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
+	free(p);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
new file mode 100644
index 000000000000..11b2301f3aa3
--- /dev/null
+++ b/tools/testing/selftests/mm/mlock2-tests.c
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "mlock2.h"
+
+#include "../kselftest.h"
+
+struct vm_boundaries {
+	unsigned long start;
+	unsigned long end;
+};
+
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+	FILE *file;
+	int ret = 1;
+	char line[1024] = {0};
+	char *end_addr;
+	char *stop;
+	unsigned long start;
+	unsigned long end;
+
+	if (!area)
+		return ret;
+
+	file = fopen("/proc/self/maps", "r");
+	if (!file) {
+		perror("fopen");
+		return ret;
+	}
+
+	memset(area, 0, sizeof(struct vm_boundaries));
+
+	while(fgets(line, 1024, file)) {
+		end_addr = strchr(line, '-');
+		if (!end_addr) {
+			printf("cannot parse /proc/self/maps\n");
+			goto out;
+		}
+		*end_addr = '\0';
+		end_addr++;
+		stop = strchr(end_addr, ' ');
+		if (!stop) {
+			printf("cannot parse /proc/self/maps\n");
+			goto out;
+		}
+		stop = '\0';
+
+		sscanf(line, "%lx", &start);
+		sscanf(end_addr, "%lx", &end);
+
+		if (start <= addr && end > addr) {
+			area->start = start;
+			area->end = end;
+			ret = 0;
+			goto out;
+		}
+	}
+out:
+	fclose(file);
+	return ret;
+}
+
+#define VMFLAGS "VmFlags:"
+
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+	char *line = NULL;
+	char *flags;
+	size_t size = 0;
+	bool ret = false;
+	FILE *smaps;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
+		printf("Unable to parse /proc/self/smaps\n");
+		goto out;
+	}
+
+	while (getline(&line, &size, smaps) > 0) {
+		if (!strstr(line, VMFLAGS)) {
+			free(line);
+			line = NULL;
+			size = 0;
+			continue;
+		}
+
+		flags = line + strlen(VMFLAGS);
+		ret = (strstr(flags, vmflag) != NULL);
+		goto out;
+	}
+
+out:
+	free(line);
+	fclose(smaps);
+	return ret;
+}
+
+#define SIZE "Size:"
+#define RSS  "Rss:"
+#define LOCKED "lo"
+
+static unsigned long get_value_for_name(unsigned long addr, const char *name)
+{
+	char *line = NULL;
+	size_t size = 0;
+	char *value_ptr;
+	FILE *smaps = NULL;
+	unsigned long value = -1UL;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
+		printf("Unable to parse /proc/self/smaps\n");
+		goto out;
+	}
+
+	while (getline(&line, &size, smaps) > 0) {
+		if (!strstr(line, name)) {
+			free(line);
+			line = NULL;
+			size = 0;
+			continue;
+		}
+
+		value_ptr = line + strlen(name);
+		if (sscanf(value_ptr, "%lu kB", &value) < 1) {
+			printf("Unable to parse smaps entry for Size\n");
+			goto out;
+		}
+		break;
+	}
+
+out:
+	if (smaps)
+		fclose(smaps);
+	free(line);
+	return value;
+}
+
+static bool is_vma_lock_on_fault(unsigned long addr)
+{
+	bool locked;
+	unsigned long vma_size, vma_rss;
+
+	locked = is_vmflag_set(addr, LOCKED);
+	if (!locked)
+		return false;
+
+	vma_size = get_value_for_name(addr, SIZE);
+	vma_rss = get_value_for_name(addr, RSS);
+
+	/* only one page is faulted in */
+	return (vma_rss < vma_size);
+}
+
+#define PRESENT_BIT     0x8000000000000000ULL
+#define PFN_MASK        0x007FFFFFFFFFFFFFULL
+#define UNEVICTABLE_BIT (1UL << 18)
+
+static int lock_check(unsigned long addr)
+{
+	bool locked;
+	unsigned long vma_size, vma_rss;
+
+	locked = is_vmflag_set(addr, LOCKED);
+	if (!locked)
+		return false;
+
+	vma_size = get_value_for_name(addr, SIZE);
+	vma_rss = get_value_for_name(addr, RSS);
+
+	return (vma_rss == vma_size);
+}
+
+static int unlock_lock_check(char *map)
+{
+	if (is_vmflag_set((unsigned long)map, LOCKED)) {
+		printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int test_mlock_lock()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (map == MAP_FAILED) {
+		perror("test_mlock_locked mmap");
+		goto out;
+	}
+
+	if (mlock2_(map, 2 * page_size, 0)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("mlock2(0)");
+		goto unmap;
+	}
+
+	if (!lock_check((unsigned long)map))
+		goto unmap;
+
+	/* Now unlock and recheck attributes */
+	if (munlock(map, 2 * page_size)) {
+		perror("munlock()");
+		goto unmap;
+	}
+
+	ret = unlock_lock_check(map);
+
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	return ret;
+}
+
+static int onfault_check(char *map)
+{
+	*map = 'a';
+	if (!is_vma_lock_on_fault((unsigned long)map)) {
+		printf("VMA is not marked for lock on fault\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static int unlock_onfault_check(char *map)
+{
+	unsigned long page_size = getpagesize();
+
+	if (is_vma_lock_on_fault((unsigned long)map) ||
+	    is_vma_lock_on_fault((unsigned long)map + page_size)) {
+		printf("VMA is still lock on fault after unlock\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static int test_mlock_onfault()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (map == MAP_FAILED) {
+		perror("test_mlock_locked mmap");
+		goto out;
+	}
+
+	if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("mlock2(MLOCK_ONFAULT)");
+		goto unmap;
+	}
+
+	if (onfault_check(map))
+		goto unmap;
+
+	/* Now unlock and recheck attributes */
+	if (munlock(map, 2 * page_size)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("munlock()");
+		goto unmap;
+	}
+
+	ret = unlock_onfault_check(map);
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	return ret;
+}
+
+static int test_lock_onfault_of_present()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (map == MAP_FAILED) {
+		perror("test_mlock_locked mmap");
+		goto out;
+	}
+
+	*map = 'a';
+
+	if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("mlock2(MLOCK_ONFAULT)");
+		goto unmap;
+	}
+
+	if (!is_vma_lock_on_fault((unsigned long)map) ||
+	    !is_vma_lock_on_fault((unsigned long)map + page_size)) {
+		printf("VMA with present pages is not marked lock on fault\n");
+		goto unmap;
+	}
+	ret = 0;
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	return ret;
+}
+
+static int test_munlockall()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	if (map == MAP_FAILED) {
+		perror("test_munlockall mmap");
+		goto out;
+	}
+
+	if (mlockall(MCL_CURRENT)) {
+		perror("mlockall(MCL_CURRENT)");
+		goto out;
+	}
+
+	if (!lock_check((unsigned long)map))
+		goto unmap;
+
+	if (munlockall()) {
+		perror("munlockall()");
+		goto unmap;
+	}
+
+	if (unlock_lock_check(map))
+		goto unmap;
+
+	munmap(map, 2 * page_size);
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	if (map == MAP_FAILED) {
+		perror("test_munlockall second mmap");
+		goto out;
+	}
+
+	if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
+		perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
+		goto unmap;
+	}
+
+	if (onfault_check(map))
+		goto unmap;
+
+	if (munlockall()) {
+		perror("munlockall()");
+		goto unmap;
+	}
+
+	if (unlock_onfault_check(map))
+		goto unmap;
+
+	if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+		perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+		goto out;
+	}
+
+	if (!lock_check((unsigned long)map))
+		goto unmap;
+
+	if (munlockall()) {
+		perror("munlockall()");
+		goto unmap;
+	}
+
+	ret = unlock_lock_check(map);
+
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	munlockall();
+	return ret;
+}
+
+static int test_vma_management(bool call_mlock)
+{
+	int ret = 1;
+	void *map;
+	unsigned long page_size = getpagesize();
+	struct vm_boundaries page1;
+	struct vm_boundaries page2;
+	struct vm_boundaries page3;
+
+	map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (map == MAP_FAILED) {
+		perror("mmap()");
+		return ret;
+	}
+
+	if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("mlock(ONFAULT)\n");
+		goto out;
+	}
+
+	if (get_vm_area((unsigned long)map, &page1) ||
+	    get_vm_area((unsigned long)map + page_size, &page2) ||
+	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+		printf("couldn't find mapping in /proc/self/maps\n");
+		goto out;
+	}
+
+	/*
+	 * Before we unlock a portion, we need to that all three pages are in
+	 * the same VMA.  If they are not we abort this test (Note that this is
+	 * not a failure)
+	 */
+	if (page1.start != page2.start || page2.start != page3.start) {
+		printf("VMAs are not merged to start, aborting test\n");
+		ret = 0;
+		goto out;
+	}
+
+	if (munlock(map + page_size, page_size)) {
+		perror("munlock()");
+		goto out;
+	}
+
+	if (get_vm_area((unsigned long)map, &page1) ||
+	    get_vm_area((unsigned long)map + page_size, &page2) ||
+	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+		printf("couldn't find mapping in /proc/self/maps\n");
+		goto out;
+	}
+
+	/* All three VMAs should be different */
+	if (page1.start == page2.start || page2.start == page3.start) {
+		printf("failed to split VMA for munlock\n");
+		goto out;
+	}
+
+	/* Now unlock the first and third page and check the VMAs again */
+	if (munlock(map, page_size * 3)) {
+		perror("munlock()");
+		goto out;
+	}
+
+	if (get_vm_area((unsigned long)map, &page1) ||
+	    get_vm_area((unsigned long)map + page_size, &page2) ||
+	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+		printf("couldn't find mapping in /proc/self/maps\n");
+		goto out;
+	}
+
+	/* Now all three VMAs should be the same */
+	if (page1.start != page2.start || page2.start != page3.start) {
+		printf("failed to merge VMAs after munlock\n");
+		goto out;
+	}
+
+	ret = 0;
+out:
+	munmap(map, 3 * page_size);
+	return ret;
+}
+
+static int test_mlockall(int (test_function)(bool call_mlock))
+{
+	int ret = 1;
+
+	if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+		perror("mlockall");
+		return ret;
+	}
+
+	ret = test_function(false);
+	munlockall();
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int ret = 0;
+	ret += test_mlock_lock();
+	ret += test_mlock_onfault();
+	ret += test_munlockall();
+	ret += test_lock_onfault_of_present();
+	ret += test_vma_management(true);
+	ret += test_mlockall(test_vma_management);
+	return ret;
+}
diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h
new file mode 100644
index 000000000000..2a6e76c226bc
--- /dev/null
+++ b/tools/testing/selftests/mm/mlock2.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+	return syscall(__NR_mlock2, start, len, flags);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+	FILE *file;
+	char *line = NULL;
+	size_t size = 0;
+	unsigned long start, end;
+	char perms[5];
+	unsigned long offset;
+	char dev[32];
+	unsigned long inode;
+	char path[BUFSIZ];
+
+	file = fopen("/proc/self/smaps", "r");
+	if (!file) {
+		perror("fopen smaps");
+		_exit(1);
+	}
+
+	while (getline(&line, &size, file) > 0) {
+		if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+			   &start, &end, perms, &offset, dev, &inode, path) < 6)
+			goto next;
+
+		if (start <= addr && addr < end)
+			goto out;
+
+next:
+		free(line);
+		line = NULL;
+		size = 0;
+	}
+
+	fclose(file);
+	file = NULL;
+
+out:
+	free(line);
+	return file;
+}
diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c
new file mode 100644
index 000000000000..6c62966ab5db
--- /dev/null
+++ b/tools/testing/selftests/mm/mrelease_test.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2022 Google LLC
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "util.h"
+
+#include "../kselftest.h"
+
+#ifndef __NR_pidfd_open
+#define __NR_pidfd_open -1
+#endif
+
+#ifndef __NR_process_mrelease
+#define __NR_process_mrelease -1
+#endif
+
+#define MB(x) (x << 20)
+#define MAX_SIZE_MB 1024
+
+static int alloc_noexit(unsigned long nr_pages, int pipefd)
+{
+	int ppid = getppid();
+	int timeout = 10; /* 10sec timeout to get killed */
+	unsigned long i;
+	char *buf;
+
+	buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANON, 0, 0);
+	if (buf == MAP_FAILED) {
+		perror("mmap failed, halting the test");
+		return KSFT_FAIL;
+	}
+
+	for (i = 0; i < nr_pages; i++)
+		*((unsigned long *)(buf + (i * PAGE_SIZE))) = i;
+
+	/* Signal the parent that the child is ready */
+	if (write(pipefd, "", 1) < 0) {
+		perror("write");
+		return KSFT_FAIL;
+	}
+
+	/* Wait to be killed (when reparenting happens) */
+	while (getppid() == ppid && timeout > 0) {
+		sleep(1);
+		timeout--;
+	}
+
+	munmap(buf, nr_pages * PAGE_SIZE);
+
+	return (timeout > 0) ? KSFT_PASS : KSFT_FAIL;
+}
+
+/* The process_mrelease calls in this test are expected to fail */
+static void run_negative_tests(int pidfd)
+{
+	int res;
+	/* Test invalid flags. Expect to fail with EINVAL error code. */
+	if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) ||
+			errno != EINVAL) {
+		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+		perror("process_mrelease with wrong flags");
+		exit(res);
+	}
+	/*
+	 * Test reaping while process is alive with no pending SIGKILL.
+	 * Expect to fail with EINVAL error code.
+	 */
+	if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) {
+		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+		perror("process_mrelease on a live process");
+		exit(res);
+	}
+}
+
+static int child_main(int pipefd[], size_t size)
+{
+	int res;
+
+	/* Allocate and fault-in memory and wait to be killed */
+	close(pipefd[0]);
+	res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]);
+	close(pipefd[1]);
+	return res;
+}
+
+int main(void)
+{
+	int pipefd[2], pidfd;
+	bool success, retry;
+	size_t size;
+	pid_t pid;
+	char byte;
+	int res;
+
+	/* Test a wrong pidfd */
+	if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) {
+		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+		perror("process_mrelease with wrong pidfd");
+		exit(res);
+	}
+
+	/* Start the test with 1MB child memory allocation */
+	size = 1;
+retry:
+	/*
+	 * Pipe for the child to signal when it's done allocating
+	 * memory
+	 */
+	if (pipe(pipefd)) {
+		perror("pipe");
+		exit(KSFT_FAIL);
+	}
+	pid = fork();
+	if (pid < 0) {
+		perror("fork");
+		close(pipefd[0]);
+		close(pipefd[1]);
+		exit(KSFT_FAIL);
+	}
+
+	if (pid == 0) {
+		/* Child main routine */
+		res = child_main(pipefd, size);
+		exit(res);
+	}
+
+	/*
+	 * Parent main routine:
+	 * Wait for the child to finish allocations, then kill and reap
+	 */
+	close(pipefd[1]);
+	/* Block until the child is ready */
+	res = read(pipefd[0], &byte, 1);
+	close(pipefd[0]);
+	if (res < 0) {
+		perror("read");
+		if (!kill(pid, SIGKILL))
+			waitpid(pid, NULL, 0);
+		exit(KSFT_FAIL);
+	}
+
+	pidfd = syscall(__NR_pidfd_open, pid, 0);
+	if (pidfd < 0) {
+		perror("pidfd_open");
+		if (!kill(pid, SIGKILL))
+			waitpid(pid, NULL, 0);
+		exit(KSFT_FAIL);
+	}
+
+	/* Run negative tests which require a live child */
+	run_negative_tests(pidfd);
+
+	if (kill(pid, SIGKILL)) {
+		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+		perror("kill");
+		exit(res);
+	}
+
+	success = (syscall(__NR_process_mrelease, pidfd, 0) == 0);
+	if (!success) {
+		/*
+		 * If we failed to reap because the child exited too soon,
+		 * before we could call process_mrelease. Double child's memory
+		 * which causes it to spend more time on cleanup and increases
+		 * our chances of reaping its memory before it exits.
+		 * Retry until we succeed or reach MAX_SIZE_MB.
+		 */
+		if (errno == ESRCH) {
+			retry = (size <= MAX_SIZE_MB);
+		} else {
+			res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+			perror("process_mrelease");
+			waitpid(pid, NULL, 0);
+			exit(res);
+		}
+	}
+
+	/* Cleanup to prevent zombies */
+	if (waitpid(pid, NULL, 0) < 0) {
+		perror("waitpid");
+		exit(KSFT_FAIL);
+	}
+	close(pidfd);
+
+	if (!success) {
+		if (retry) {
+			size *= 2;
+			goto retry;
+		}
+		printf("All process_mrelease attempts failed!\n");
+		exit(KSFT_FAIL);
+	}
+
+	printf("Success reaping a child with %zuMB of memory allocations\n",
+	       size);
+	return KSFT_PASS;
+}
diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c
new file mode 100644
index 000000000000..f01dc4a85b0b
--- /dev/null
+++ b/tools/testing/selftests/mm/mremap_dontunmap.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Tests for mremap w/ MREMAP_DONTUNMAP.
+ *
+ * Copyright 2020, Brian Geffon <bgeffon@google.com>
+ */
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#ifndef MREMAP_DONTUNMAP
+#define MREMAP_DONTUNMAP 4
+#endif
+
+unsigned long page_size;
+char *page_buffer;
+
+static void dump_maps(void)
+{
+	char cmd[32];
+
+	snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+	system(cmd);
+}
+
+#define BUG_ON(condition, description)					      \
+	do {								      \
+		if (condition) {					      \
+			fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
+				__LINE__, (description), strerror(errno));    \
+			dump_maps();					  \
+			exit(1);					      \
+		} 							      \
+	} while (0)
+
+// Try a simple operation for to "test" for kernel support this prevents
+// reporting tests as failed when it's run on an older kernel.
+static int kernel_support_for_mremap_dontunmap()
+{
+	int ret = 0;
+	unsigned long num_pages = 1;
+	void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
+				    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+	// This simple remap should only fail if MREMAP_DONTUNMAP isn't
+	// supported.
+	void *dest_mapping =
+	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0);
+	if (dest_mapping == MAP_FAILED) {
+		ret = errno;
+	} else {
+		BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+		       "unable to unmap destination mapping");
+	}
+
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+	return ret;
+}
+
+// This helper will just validate that an entire mapping contains the expected
+// byte.
+static int check_region_contains_byte(void *addr, unsigned long size, char byte)
+{
+	BUG_ON(size & (page_size - 1),
+	       "check_region_contains_byte expects page multiples");
+	BUG_ON((unsigned long)addr & (page_size - 1),
+	       "check_region_contains_byte expects page alignment");
+
+	memset(page_buffer, byte, page_size);
+
+	unsigned long num_pages = size / page_size;
+	unsigned long i;
+
+	// Compare each page checking that it contains our expected byte.
+	for (i = 0; i < num_pages; ++i) {
+		int ret =
+		    memcmp(addr + (i * page_size), page_buffer, page_size);
+		if (ret) {
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving
+// the source mapping mapped.
+static void mremap_dontunmap_simple()
+{
+	unsigned long num_pages = 5;
+
+	void *source_mapping =
+	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+	memset(source_mapping, 'a', num_pages * page_size);
+
+	// Try to just move the whole mapping anywhere (not fixed).
+	void *dest_mapping =
+	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+	// Validate that the pages have been moved, we know they were moved if
+	// the dest_mapping contains a's.
+	BUG_ON(check_region_contains_byte
+	       (dest_mapping, num_pages * page_size, 'a') != 0,
+	       "pages did not migrate");
+	BUG_ON(check_region_contains_byte
+	       (source_mapping, num_pages * page_size, 0) != 0,
+	       "source should have no ptes");
+
+	BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected.
+static void mremap_dontunmap_simple_shmem()
+{
+	unsigned long num_pages = 5;
+
+	int mem_fd = memfd_create("memfd", MFD_CLOEXEC);
+	BUG_ON(mem_fd < 0, "memfd_create");
+
+	BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0,
+			"ftruncate");
+
+	void *source_mapping =
+	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+		 MAP_FILE | MAP_SHARED, mem_fd, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+	BUG_ON(close(mem_fd) < 0, "close");
+
+	memset(source_mapping, 'a', num_pages * page_size);
+
+	// Try to just move the whole mapping anywhere (not fixed).
+	void *dest_mapping =
+	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+	if (dest_mapping == MAP_FAILED && errno == EINVAL) {
+		// Old kernel which doesn't support MREMAP_DONTUNMAP on shmem.
+		BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+			"unable to unmap source mapping");
+		return;
+	}
+
+	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+	// Validate that the pages have been moved, we know they were moved if
+	// the dest_mapping contains a's.
+	BUG_ON(check_region_contains_byte
+	       (dest_mapping, num_pages * page_size, 'a') != 0,
+	       "pages did not migrate");
+
+	// Because the region is backed by shmem, we will actually see the same
+	// memory at the source location still.
+	BUG_ON(check_region_contains_byte
+	       (source_mapping, num_pages * page_size, 'a') != 0,
+	       "source should have no ptes");
+
+	BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+// This test validates MREMAP_DONTUNMAP will move page tables to a specific
+// destination using MREMAP_FIXED, also while validating that the source
+// remains intact.
+static void mremap_dontunmap_simple_fixed()
+{
+	unsigned long num_pages = 5;
+
+	// Since we want to guarantee that we can remap to a point, we will
+	// create a mapping up front.
+	void *dest_mapping =
+	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+	memset(dest_mapping, 'X', num_pages * page_size);
+
+	void *source_mapping =
+	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+	memset(source_mapping, 'a', num_pages * page_size);
+
+	void *remapped_mapping =
+	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+		   MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
+		   dest_mapping);
+	BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+	BUG_ON(remapped_mapping != dest_mapping,
+	       "mremap should have placed the remapped mapping at dest_mapping");
+
+	// The dest mapping will have been unmap by mremap so we expect the Xs
+	// to be gone and replaced with a's.
+	BUG_ON(check_region_contains_byte
+	       (dest_mapping, num_pages * page_size, 'a') != 0,
+	       "pages did not migrate");
+
+	// And the source mapping will have had its ptes dropped.
+	BUG_ON(check_region_contains_byte
+	       (source_mapping, num_pages * page_size, 0) != 0,
+	       "source should have no ptes");
+
+	BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+// This test validates that we can MREMAP_DONTUNMAP for a portion of an
+// existing mapping.
+static void mremap_dontunmap_partial_mapping()
+{
+	/*
+	 *  source mapping:
+	 *  --------------
+	 *  | aaaaaaaaaa |
+	 *  --------------
+	 *  to become:
+	 *  --------------
+	 *  | aaaaa00000 |
+	 *  --------------
+	 *  With the destination mapping containing 5 pages of As.
+	 *  ---------
+	 *  | aaaaa |
+	 *  ---------
+	 */
+	unsigned long num_pages = 10;
+	void *source_mapping =
+	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+	memset(source_mapping, 'a', num_pages * page_size);
+
+	// We will grab the last 5 pages of the source and move them.
+	void *dest_mapping =
+	    mremap(source_mapping + (5 * page_size), 5 * page_size,
+		   5 * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+	// We expect the first 5 pages of the source to contain a's and the
+	// final 5 pages to contain zeros.
+	BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') !=
+	       0, "first 5 pages of source should have original pages");
+	BUG_ON(check_region_contains_byte
+	       (source_mapping + (5 * page_size), 5 * page_size, 0) != 0,
+	       "final 5 pages of source should have no ptes");
+
+	// Finally we expect the destination to have 5 pages worth of a's.
+	BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') !=
+	       0, "dest mapping should contain ptes from the source");
+
+	BUG_ON(munmap(dest_mapping, 5 * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+// This test validates that we can remap over only a portion of a mapping.
+static void mremap_dontunmap_partial_mapping_overwrite(void)
+{
+	/*
+	 *  source mapping:
+	 *  ---------
+	 *  |aaaaa|
+	 *  ---------
+	 *  dest mapping initially:
+	 *  -----------
+	 *  |XXXXXXXXXX|
+	 *  ------------
+	 *  Source to become:
+	 *  ---------
+	 *  |00000|
+	 *  ---------
+	 *  With the destination mapping containing 5 pages of As.
+	 *  ------------
+	 *  |aaaaaXXXXX|
+	 *  ------------
+	 */
+	void *source_mapping =
+	    mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+	memset(source_mapping, 'a', 5 * page_size);
+
+	void *dest_mapping =
+	    mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+	memset(dest_mapping, 'X', 10 * page_size);
+
+	// We will grab the last 5 pages of the source and move them.
+	void *remapped_mapping =
+	    mremap(source_mapping, 5 * page_size,
+		   5 * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping);
+	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+	BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping");
+
+	BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) !=
+	       0, "first 5 pages of source should have no ptes");
+
+	// Finally we expect the destination to have 5 pages worth of a's.
+	BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0,
+			"dest mapping should contain ptes from the source");
+
+	// Finally the last 5 pages shouldn't have been touched.
+	BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size),
+				5 * page_size, 'X') != 0,
+			"dest mapping should have retained the last 5 pages");
+
+	BUG_ON(munmap(dest_mapping, 10 * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+int main(void)
+{
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	// test for kernel support for MREMAP_DONTUNMAP skipping the test if
+	// not.
+	if (kernel_support_for_mremap_dontunmap() != 0) {
+		printf("No kernel support for MREMAP_DONTUNMAP\n");
+		return KSFT_SKIP;
+	}
+
+	// Keep a page sized buffer around for when we need it.
+	page_buffer =
+	    mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page.");
+
+	mremap_dontunmap_simple();
+	mremap_dontunmap_simple_shmem();
+	mremap_dontunmap_simple_fixed();
+	mremap_dontunmap_partial_mapping();
+	mremap_dontunmap_partial_mapping_overwrite();
+
+	BUG_ON(munmap(page_buffer, page_size) == -1,
+	       "unable to unmap page buffer");
+
+	printf("OK\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
new file mode 100644
index 000000000000..9496346973d4
--- /dev/null
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -0,0 +1,475 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 Google LLC
+ */
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+
+#define EXPECT_SUCCESS 0
+#define EXPECT_FAILURE 1
+#define NON_OVERLAPPING 0
+#define OVERLAPPING 1
+#define NS_PER_SEC 1000000000ULL
+#define VALIDATION_DEFAULT_THRESHOLD 4	/* 4MB */
+#define VALIDATION_NO_THRESHOLD 0	/* Verify the entire region */
+
+#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
+
+struct config {
+	unsigned long long src_alignment;
+	unsigned long long dest_alignment;
+	unsigned long long region_size;
+	int overlapping;
+};
+
+struct test {
+	const char *name;
+	struct config config;
+	int expect_failure;
+};
+
+enum {
+	_1KB = 1ULL << 10,	/* 1KB -> not page aligned */
+	_4KB = 4ULL << 10,
+	_8KB = 8ULL << 10,
+	_1MB = 1ULL << 20,
+	_2MB = 2ULL << 20,
+	_4MB = 4ULL << 20,
+	_1GB = 1ULL << 30,
+	_2GB = 2ULL << 30,
+	PMD = _2MB,
+	PUD = _1GB,
+};
+
+#define PTE page_size
+
+#define MAKE_TEST(source_align, destination_align, size,	\
+		  overlaps, should_fail, test_name)		\
+(struct test){							\
+	.name = test_name,					\
+	.config = {						\
+		.src_alignment = source_align,			\
+		.dest_alignment = destination_align,		\
+		.region_size = size,				\
+		.overlapping = overlaps,			\
+	},							\
+	.expect_failure = should_fail				\
+}
+
+/*
+ * Returns false if the requested remap region overlaps with an
+ * existing mapping (e.g text, stack) else returns true.
+ */
+static bool is_remap_region_valid(void *addr, unsigned long long size)
+{
+	void *remap_addr = NULL;
+	bool ret = true;
+
+	/* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */
+	remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE,
+					 MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+					 -1, 0);
+
+	if (remap_addr == MAP_FAILED) {
+		if (errno == EEXIST)
+			ret = false;
+	} else {
+		munmap(remap_addr, size);
+	}
+
+	return ret;
+}
+
+/* Returns mmap_min_addr sysctl tunable from procfs */
+static unsigned long long get_mmap_min_addr(void)
+{
+	FILE *fp;
+	int n_matched;
+	static unsigned long long addr;
+
+	if (addr)
+		return addr;
+
+	fp = fopen("/proc/sys/vm/mmap_min_addr", "r");
+	if (fp == NULL) {
+		ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n",
+			strerror(errno));
+		exit(KSFT_SKIP);
+	}
+
+	n_matched = fscanf(fp, "%llu", &addr);
+	if (n_matched != 1) {
+		ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n",
+			strerror(errno));
+		fclose(fp);
+		exit(KSFT_SKIP);
+	}
+
+	fclose(fp);
+	return addr;
+}
+
+/*
+ * This test validates that merge is called when expanding a mapping.
+ * Mapping containing three pages is created, middle page is unmapped
+ * and then the mapping containing the first page is expanded so that
+ * it fills the created hole. The two parts should merge creating
+ * single mapping with three pages.
+ */
+static void mremap_expand_merge(unsigned long page_size)
+{
+	char *test_name = "mremap expand merge";
+	FILE *fp;
+	char *line = NULL;
+	size_t len = 0;
+	bool success = false;
+	char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	munmap(start + page_size, page_size);
+	mremap(start, page_size, 2 * page_size, 0);
+
+	fp = fopen("/proc/self/maps", "r");
+	if (fp == NULL) {
+		ksft_test_result_fail("%s\n", test_name);
+		return;
+	}
+
+	while (getline(&line, &len, fp) != -1) {
+		char *first = strtok(line, "- ");
+		void *first_val = (void *)strtol(first, NULL, 16);
+		char *second = strtok(NULL, "- ");
+		void *second_val = (void *) strtol(second, NULL, 16);
+
+		if (first_val == start && second_val == start + 3 * page_size) {
+			success = true;
+			break;
+		}
+	}
+	if (success)
+		ksft_test_result_pass("%s\n", test_name);
+	else
+		ksft_test_result_fail("%s\n", test_name);
+	fclose(fp);
+}
+
+/*
+ * Returns the start address of the mapping on success, else returns
+ * NULL on failure.
+ */
+static void *get_source_mapping(struct config c)
+{
+	unsigned long long addr = 0ULL;
+	void *src_addr = NULL;
+	unsigned long long mmap_min_addr;
+
+	mmap_min_addr = get_mmap_min_addr();
+
+retry:
+	addr += c.src_alignment;
+	if (addr < mmap_min_addr)
+		goto retry;
+
+	src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
+					MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+					-1, 0);
+	if (src_addr == MAP_FAILED) {
+		if (errno == EPERM || errno == EEXIST)
+			goto retry;
+		goto error;
+	}
+	/*
+	 * Check that the address is aligned to the specified alignment.
+	 * Addresses which have alignments that are multiples of that
+	 * specified are not considered valid. For instance, 1GB address is
+	 * 2MB-aligned, however it will not be considered valid for a
+	 * requested alignment of 2MB. This is done to reduce coincidental
+	 * alignment in the tests.
+	 */
+	if (((unsigned long long) src_addr & (c.src_alignment - 1)) ||
+			!((unsigned long long) src_addr & c.src_alignment)) {
+		munmap(src_addr, c.region_size);
+		goto retry;
+	}
+
+	if (!src_addr)
+		goto error;
+
+	return src_addr;
+error:
+	ksft_print_msg("Failed to map source region: %s\n",
+			strerror(errno));
+	return NULL;
+}
+
+/* Returns the time taken for the remap on success else returns -1. */
+static long long remap_region(struct config c, unsigned int threshold_mb,
+			      char pattern_seed)
+{
+	void *addr, *src_addr, *dest_addr;
+	unsigned long long i;
+	struct timespec t_start = {0, 0}, t_end = {0, 0};
+	long long  start_ns, end_ns, align_mask, ret, offset;
+	unsigned long long threshold;
+
+	if (threshold_mb == VALIDATION_NO_THRESHOLD)
+		threshold = c.region_size;
+	else
+		threshold = MIN(threshold_mb * _1MB, c.region_size);
+
+	src_addr = get_source_mapping(c);
+	if (!src_addr) {
+		ret = -1;
+		goto out;
+	}
+
+	/* Set byte pattern */
+	srand(pattern_seed);
+	for (i = 0; i < threshold; i++)
+		memset((char *) src_addr + i, (char) rand(), 1);
+
+	/* Mask to zero out lower bits of address for alignment */
+	align_mask = ~(c.dest_alignment - 1);
+	/* Offset of destination address from the end of the source region */
+	offset = (c.overlapping) ? -c.dest_alignment : c.dest_alignment;
+	addr = (void *) (((unsigned long long) src_addr + c.region_size
+			  + offset) & align_mask);
+
+	/* See comment in get_source_mapping() */
+	if (!((unsigned long long) addr & c.dest_alignment))
+		addr = (void *) ((unsigned long long) addr | c.dest_alignment);
+
+	/* Don't destroy existing mappings unless expected to overlap */
+	while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
+		/* Check for unsigned overflow */
+		if (addr + c.dest_alignment < addr) {
+			ksft_print_msg("Couldn't find a valid region to remap to\n");
+			ret = -1;
+			goto out;
+		}
+		addr += c.dest_alignment;
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &t_start);
+	dest_addr = mremap(src_addr, c.region_size, c.region_size,
+					  MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
+	clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+	if (dest_addr == MAP_FAILED) {
+		ksft_print_msg("mremap failed: %s\n", strerror(errno));
+		ret = -1;
+		goto clean_up_src;
+	}
+
+	/* Verify byte pattern after remapping */
+	srand(pattern_seed);
+	for (i = 0; i < threshold; i++) {
+		char c = (char) rand();
+
+		if (((char *) dest_addr)[i] != c) {
+			ksft_print_msg("Data after remap doesn't match at offset %d\n",
+				       i);
+			ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
+					((char *) dest_addr)[i] & 0xff);
+			ret = -1;
+			goto clean_up_dest;
+		}
+	}
+
+	start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec;
+	end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec;
+	ret = end_ns - start_ns;
+
+/*
+ * Since the destination address is specified using MREMAP_FIXED, subsequent
+ * mremap will unmap any previous mapping at the address range specified by
+ * dest_addr and region_size. This significantly affects the remap time of
+ * subsequent tests. So we clean up mappings after each test.
+ */
+clean_up_dest:
+	munmap(dest_addr, c.region_size);
+clean_up_src:
+	munmap(src_addr, c.region_size);
+out:
+	return ret;
+}
+
+static void run_mremap_test_case(struct test test_case, int *failures,
+				 unsigned int threshold_mb,
+				 unsigned int pattern_seed)
+{
+	long long remap_time = remap_region(test_case.config, threshold_mb,
+					    pattern_seed);
+
+	if (remap_time < 0) {
+		if (test_case.expect_failure)
+			ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
+					      test_case.name);
+		else {
+			ksft_test_result_fail("%s\n", test_case.name);
+			*failures += 1;
+		}
+	} else {
+		/*
+		 * Comparing mremap time is only applicable if entire region
+		 * was faulted in.
+		 */
+		if (threshold_mb == VALIDATION_NO_THRESHOLD ||
+		    test_case.config.region_size <= threshold_mb * _1MB)
+			ksft_test_result_pass("%s\n\tmremap time: %12lldns\n",
+					      test_case.name, remap_time);
+		else
+			ksft_test_result_pass("%s\n", test_case.name);
+	}
+}
+
+static void usage(const char *cmd)
+{
+	fprintf(stderr,
+		"Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n"
+		"-t\t only validate threshold_mb of the remapped region\n"
+		"  \t if 0 is supplied no threshold is used; all tests\n"
+		"  \t are run and remapped regions validated fully.\n"
+		"  \t The default threshold used is 4MB.\n"
+		"-p\t provide a seed to generate the random pattern for\n"
+		"  \t validating the remapped region.\n", cmd);
+}
+
+static int parse_args(int argc, char **argv, unsigned int *threshold_mb,
+		      unsigned int *pattern_seed)
+{
+	const char *optstr = "t:p:";
+	int opt;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+		switch (opt) {
+		case 't':
+			*threshold_mb = atoi(optarg);
+			break;
+		case 'p':
+			*pattern_seed = atoi(optarg);
+			break;
+		default:
+			usage(argv[0]);
+			return -1;
+		}
+	}
+
+	if (optind < argc) {
+		usage(argv[0]);
+		return -1;
+	}
+
+	return 0;
+}
+
+#define MAX_TEST 13
+#define MAX_PERF_TEST 3
+int main(int argc, char **argv)
+{
+	int failures = 0;
+	int i, run_perf_tests;
+	unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
+	unsigned int pattern_seed;
+	int num_expand_tests = 1;
+	struct test test_cases[MAX_TEST];
+	struct test perf_test_cases[MAX_PERF_TEST];
+	int page_size;
+	time_t t;
+
+	pattern_seed = (unsigned int) time(&t);
+
+	if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0)
+		exit(EXIT_FAILURE);
+
+	ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
+		       threshold_mb, pattern_seed);
+
+	page_size = sysconf(_SC_PAGESIZE);
+
+	/* Expected mremap failures */
+	test_cases[0] =	MAKE_TEST(page_size, page_size, page_size,
+				  OVERLAPPING, EXPECT_FAILURE,
+				  "mremap - Source and Destination Regions Overlapping");
+
+	test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size,
+				  NON_OVERLAPPING, EXPECT_FAILURE,
+				  "mremap - Destination Address Misaligned (1KB-aligned)");
+	test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size,
+				  NON_OVERLAPPING, EXPECT_FAILURE,
+				  "mremap - Source Address Misaligned (1KB-aligned)");
+
+	/* Src addr PTE aligned */
+	test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2,
+				  NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "8KB mremap - Source PTE-aligned, Destination PTE-aligned");
+
+	/* Src addr 1MB aligned */
+	test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "2MB mremap - Source 1MB-aligned, Destination PTE-aligned");
+	test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
+
+	/* Src addr PMD aligned */
+	test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "4MB mremap - Source PMD-aligned, Destination PTE-aligned");
+	test_cases[7] =	MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "4MB mremap - Source PMD-aligned, Destination 1MB-aligned");
+	test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "4MB mremap - Source PMD-aligned, Destination PMD-aligned");
+
+	/* Src addr PUD aligned */
+	test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "2GB mremap - Source PUD-aligned, Destination PTE-aligned");
+	test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				   "2GB mremap - Source PUD-aligned, Destination 1MB-aligned");
+	test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				   "2GB mremap - Source PUD-aligned, Destination PMD-aligned");
+	test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				   "2GB mremap - Source PUD-aligned, Destination PUD-aligned");
+
+	perf_test_cases[0] =  MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+					"1GB mremap - Source PTE-aligned, Destination PTE-aligned");
+	/*
+	 * mremap 1GB region - Page table level aligned time
+	 * comparison.
+	 */
+	perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				       "1GB mremap - Source PMD-aligned, Destination PMD-aligned");
+	perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				       "1GB mremap - Source PUD-aligned, Destination PUD-aligned");
+
+	run_perf_tests =  (threshold_mb == VALIDATION_NO_THRESHOLD) ||
+				(threshold_mb * _1MB >= _1GB);
+
+	ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ?
+		      ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests);
+
+	for (i = 0; i < ARRAY_SIZE(test_cases); i++)
+		run_mremap_test_case(test_cases[i], &failures, threshold_mb,
+				     pattern_seed);
+
+	mremap_expand_merge(page_size);
+
+	if (run_perf_tests) {
+		ksft_print_msg("\n%s\n",
+		 "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:");
+		for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++)
+			run_mremap_test_case(perf_test_cases[i], &failures,
+					     threshold_mb, pattern_seed);
+	}
+
+	if (failures > 0)
+		ksft_exit_fail();
+	else
+		ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c
new file mode 100644
index 000000000000..634d87dfb2a4
--- /dev/null
+++ b/tools/testing/selftests/mm/on-fault-limit.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int test_limit(void)
+{
+	int ret = 1;
+	struct rlimit lims;
+	void *map;
+
+	if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
+		perror("getrlimit");
+		return ret;
+	}
+
+	if (mlockall(MCL_ONFAULT | MCL_FUTURE)) {
+		perror("mlockall");
+		return ret;
+	}
+
+	map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+	if (map != MAP_FAILED)
+		printf("mmap should have failed, but didn't\n");
+	else {
+		ret = 0;
+		munmap(map, 2 * lims.rlim_max);
+	}
+
+	munlockall();
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int ret = 0;
+
+	ret += test_limit();
+	return ret;
+}
diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h
new file mode 100644
index 000000000000..92f3be3dd8e5
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey-helpers.h
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+
+/* Define some kernel-like types */
+#define  u8 __u8
+#define u16 __u16
+#define u32 __u32
+#define u64 __u64
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+extern int test_nr;
+extern int iteration_nr;
+
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
+static inline void sigsafe_printf(const char *format, ...)
+{
+	va_list ap;
+
+	if (!dprint_in_signal) {
+		va_start(ap, format);
+		vprintf(format, ap);
+		va_end(ap);
+	} else {
+		int ret;
+		/*
+		 * No printf() functions are signal-safe.
+		 * They deadlock easily. Write the format
+		 * string to get some output, even if
+		 * incomplete.
+		 */
+		ret = write(1, format, strlen(format));
+		if (ret < 0)
+			exit(1);
+	}
+}
+#define dprintf_level(level, args...) do {	\
+	if (level <= DEBUG_LEVEL)		\
+		sigsafe_printf(args);		\
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern void abort_hooks(void);
+#define pkey_assert(condition) do {		\
+	if (!(condition)) {			\
+		dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+				__FILE__, __LINE__,	\
+				test_nr, iteration_nr);	\
+		dprintf0("errno at assert: %d", errno);	\
+		abort_hooks();			\
+		exit(__LINE__);			\
+	}					\
+} while (0)
+
+__attribute__((noinline)) int read_ptr(int *ptr);
+void expected_pkey_fault(int pkey);
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
+int sys_pkey_free(unsigned long pkey);
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+		unsigned long pkey);
+void record_pkey_malloc(void *ptr, long size, int prot);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#include "pkey-x86.h"
+#elif defined(__powerpc64__) /* arch */
+#include "pkey-powerpc.h"
+#else /* arch */
+#error Architecture not supported
+#endif /* arch */
+
+#define PKEY_MASK	(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
+
+static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
+{
+	u32 shift = pkey_bit_position(pkey);
+	/* mask out bits from pkey in old value */
+	reg &= ~((u64)PKEY_MASK << shift);
+	/* OR in new bits for pkey */
+	reg |= (flags & PKEY_MASK) << shift;
+	return reg;
+}
+
+static inline u64 get_pkey_bits(u64 reg, int pkey)
+{
+	u32 shift = pkey_bit_position(pkey);
+	/*
+	 * shift down the relevant bits to the lowest two, then
+	 * mask off all the other higher bits
+	 */
+	return ((reg >> shift) & PKEY_MASK);
+}
+
+extern u64 shadow_pkey_reg;
+
+static inline u64 _read_pkey_reg(int line)
+{
+	u64 pkey_reg = __read_pkey_reg();
+
+	dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx"
+			" shadow: %016llx\n",
+			line, pkey_reg, shadow_pkey_reg);
+	assert(pkey_reg == shadow_pkey_reg);
+
+	return pkey_reg;
+}
+
+#define read_pkey_reg() _read_pkey_reg(__LINE__)
+
+static inline void write_pkey_reg(u64 pkey_reg)
+{
+	dprintf4("%s() changing %016llx to %016llx\n", __func__,
+			__read_pkey_reg(), pkey_reg);
+	/* will do the shadow check for us: */
+	read_pkey_reg();
+	__write_pkey_reg(pkey_reg);
+	shadow_pkey_reg = pkey_reg;
+	dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__,
+			pkey_reg, __read_pkey_reg());
+}
+
+/*
+ * These are technically racy. since something could
+ * change PKEY register between the read and the write.
+ */
+static inline void __pkey_access_allow(int pkey, int do_allow)
+{
+	u64 pkey_reg = read_pkey_reg();
+	int bit = pkey * 2;
+
+	if (do_allow)
+		pkey_reg &= (1<<bit);
+	else
+		pkey_reg |= (1<<bit);
+
+	dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
+	write_pkey_reg(pkey_reg);
+}
+
+static inline void __pkey_write_allow(int pkey, int do_allow_write)
+{
+	u64 pkey_reg = read_pkey_reg();
+	int bit = pkey * 2 + 1;
+
+	if (do_allow_write)
+		pkey_reg &= (1<<bit);
+	else
+		pkey_reg |= (1<<bit);
+
+	write_pkey_reg(pkey_reg);
+	dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
+}
+
+#define ALIGN_UP(x, align_to)	(((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to)	\
+	((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to)	\
+	((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
+#define __stringify_1(x...)     #x
+#define __stringify(x...)       __stringify_1(x)
+
+static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
+{
+#ifdef si_pkey
+	return &si->si_pkey;
+#else
+	return (u32 *)(((u8 *)si) + si_pkey_offset);
+#endif
+}
+
+static inline int kernel_has_pkeys(void)
+{
+	/* try allocating a key and see if it succeeds */
+	int ret = sys_pkey_alloc(0, 0);
+	if (ret <= 0) {
+		return 0;
+	}
+	sys_pkey_free(ret);
+	return 1;
+}
+
+static inline int is_pkeys_supported(void)
+{
+	/* check if the cpu supports pkeys */
+	if (!cpu_has_pkeys()) {
+		dprintf1("SKIP: %s: no CPU support\n", __func__);
+		return 0;
+	}
+
+	/* check if the kernel supports pkeys */
+	if (!kernel_has_pkeys()) {
+		dprintf1("SKIP: %s: no kernel support\n", __func__);
+		return 0;
+	}
+
+	return 1;
+}
+
+#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h
new file mode 100644
index 000000000000..1ebb586b2fbc
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey-powerpc.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_POWERPC_H
+#define _PKEYS_POWERPC_H
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key	386
+#endif
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc		384
+# define SYS_pkey_free		385
+#endif
+#define REG_IP_IDX		PT_NIP
+#define REG_TRAPNO		PT_TRAP
+#define gregs			gp_regs
+#define fpregs			fp_regs
+#define si_pkey_offset		0x20
+
+#undef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS	0x3  /* disable read and write */
+
+#undef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE	0x2
+
+#define NR_PKEYS		32
+#define NR_RESERVED_PKEYS_4K	27 /* pkey-0, pkey-1, exec-only-pkey
+				      and 24 other keys that cannot be
+				      represented in the PTE */
+#define NR_RESERVED_PKEYS_64K_3KEYS	3 /* PowerNV and KVM: pkey-0,
+					     pkey-1 and exec-only key */
+#define NR_RESERVED_PKEYS_64K_4KEYS	4 /* PowerVM: pkey-0, pkey-1,
+					     pkey-31 and exec-only key */
+#define PKEY_BITS_PER_PKEY	2
+#define HPAGE_SIZE		(1UL << 24)
+#define PAGE_SIZE		sysconf(_SC_PAGESIZE)
+
+static inline u32 pkey_bit_position(int pkey)
+{
+	return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+}
+
+static inline u64 __read_pkey_reg(void)
+{
+	u64 pkey_reg;
+
+	asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
+
+	return pkey_reg;
+}
+
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+	u64 amr = pkey_reg;
+
+	dprintf4("%s() changing %016llx to %016llx\n",
+			 __func__, __read_pkey_reg(), pkey_reg);
+
+	asm volatile("isync; mtspr 0xd, %0; isync"
+		     : : "r" ((unsigned long)(amr)) : "memory");
+
+	dprintf4("%s() pkey register after changing %016llx to %016llx\n",
+			__func__, __read_pkey_reg(), pkey_reg);
+}
+
+static inline int cpu_has_pkeys(void)
+{
+	/* No simple way to determine this */
+	return 1;
+}
+
+static inline bool arch_is_powervm()
+{
+	struct stat buf;
+
+	if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) &&
+	    (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) &&
+	    (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) )
+		return true;
+
+	return false;
+}
+
+static inline int get_arch_reserved_keys(void)
+{
+	if (sysconf(_SC_PAGESIZE) == 4096)
+		return NR_RESERVED_PKEYS_4K;
+	else
+		if (arch_is_powervm())
+			return NR_RESERVED_PKEYS_64K_4KEYS;
+		else
+			return NR_RESERVED_PKEYS_64K_3KEYS;
+}
+
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+	/*
+	 * powerpc does not allow userspace to change permissions of exec-only
+	 * keys since those keys are not allocated by userspace. The signal
+	 * handler wont be able to reset the permissions, which means the code
+	 * will infinitely continue to segfault here.
+	 */
+	return;
+}
+
+/* 4-byte instructions * 16384 = 64K page */
+#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
+
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+	void *ptr;
+	int ret;
+
+	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+			size, prot, pkey);
+	pkey_assert(pkey < NR_PKEYS);
+	ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+	pkey_assert(ptr != (void *)-1);
+
+	ret = syscall(__NR_subpage_prot, ptr, size, NULL);
+	if (ret) {
+		perror("subpage_perm");
+		return PTR_ERR_ENOTSUP;
+	}
+
+	ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+	pkey_assert(!ret);
+	record_pkey_malloc(ptr, size, prot);
+
+	dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+	return ptr;
+}
+
+#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h
new file mode 100644
index 000000000000..72c14cd3ddc7
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey-x86.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_X86_H
+#define _PKEYS_X86_H
+
+#ifdef __i386__
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key	380
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc		381
+# define SYS_pkey_free		382
+#endif
+
+#define REG_IP_IDX		REG_EIP
+#define si_pkey_offset		0x14
+
+#else
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key	329
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc		330
+# define SYS_pkey_free		331
+#endif
+
+#define REG_IP_IDX		REG_RIP
+#define si_pkey_offset		0x20
+
+#endif
+
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS	0x1
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE	0x2
+#endif
+
+#define NR_PKEYS		16
+#define NR_RESERVED_PKEYS	2 /* pkey-0 and exec-only-pkey */
+#define PKEY_BITS_PER_PKEY	2
+#define HPAGE_SIZE		(1UL<<21)
+#define PAGE_SIZE		4096
+#define MB			(1<<20)
+
+static inline void __page_o_noops(void)
+{
+	/* 8-bytes of instruction * 512 bytes = 1 page */
+	asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
+}
+
+static inline u64 __read_pkey_reg(void)
+{
+	unsigned int eax, edx;
+	unsigned int ecx = 0;
+	unsigned pkey_reg;
+
+	asm volatile(".byte 0x0f,0x01,0xee\n\t"
+		     : "=a" (eax), "=d" (edx)
+		     : "c" (ecx));
+	pkey_reg = eax;
+	return pkey_reg;
+}
+
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+	unsigned int eax = pkey_reg;
+	unsigned int ecx = 0;
+	unsigned int edx = 0;
+
+	dprintf4("%s() changing %016llx to %016llx\n", __func__,
+			__read_pkey_reg(), pkey_reg);
+	asm volatile(".byte 0x0f,0x01,0xef\n\t"
+		     : : "a" (eax), "c" (ecx), "d" (edx));
+	assert(pkey_reg == __read_pkey_reg());
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
+#define X86_FEATURE_PKU        (1<<3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE      (1<<4) /* OS Protection Keys Enable */
+
+static inline int cpu_has_pkeys(void)
+{
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+
+	__cpuid_count(0x7, 0x0, eax, ebx, ecx, edx);
+
+	if (!(ecx & X86_FEATURE_PKU)) {
+		dprintf2("cpu does not have PKU\n");
+		return 0;
+	}
+	if (!(ecx & X86_FEATURE_OSPKE)) {
+		dprintf2("cpu does not have OSPKE\n");
+		return 0;
+	}
+	return 1;
+}
+
+static inline int cpu_max_xsave_size(void)
+{
+	unsigned long XSTATE_CPUID = 0xd;
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+
+	__cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx);
+	return ecx;
+}
+
+static inline u32 pkey_bit_position(int pkey)
+{
+	return pkey * PKEY_BITS_PER_PKEY;
+}
+
+#define XSTATE_PKEY_BIT	(9)
+#define XSTATE_PKEY	0x200
+#define XSTATE_BV_OFFSET	512
+
+int pkey_reg_xstate_offset(void)
+{
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+	int xstate_offset;
+	int xstate_size;
+	unsigned long XSTATE_CPUID = 0xd;
+	int leaf;
+
+	/* assume that XSTATE_PKEY is set in XCR0 */
+	leaf = XSTATE_PKEY_BIT;
+	{
+		__cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx);
+
+		if (leaf == XSTATE_PKEY_BIT) {
+			xstate_offset = ebx;
+			xstate_size = eax;
+		}
+	}
+
+	if (xstate_size == 0) {
+		printf("could not find size/offset of PKEY in xsave state\n");
+		return 0;
+	}
+
+	return xstate_offset;
+}
+
+static inline int get_arch_reserved_keys(void)
+{
+	return NR_RESERVED_PKEYS;
+}
+
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+	int ptr_contents;
+
+	ptr_contents = read_ptr(p1);
+	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+	expected_pkey_fault(pkey);
+}
+
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+	return PTR_ERR_ENOTSUP;
+}
+
+#endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c
new file mode 100644
index 000000000000..95f403a0c46d
--- /dev/null
+++ b/tools/testing/selftests/mm/protection_keys.c
@@ -0,0 +1,1788 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
+ *
+ * There are examples in here of:
+ *  * how to set protection keys on memory
+ *  * how to set/clear bits in pkey registers (the rights register)
+ *  * how to handle SEGV_PKUERR signals and extract pkey-relevant
+ *    information from the siginfo
+ *
+ * Things to add:
+ *	make sure KSM and KSM COW breaking works
+ *	prefault pages in at malloc, or not
+ *	protect MPX bounds tables with protection keys?
+ *	make sure VMA splitting/merging is working correctly
+ *	OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
+ *	look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
+ *	do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
+ *
+ * Compile like this:
+ *	gcc -mxsave      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ *	gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <errno.h>
+#include <linux/elf.h>
+#include <linux/futex.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <setjmp.h>
+
+#include "pkey-helpers.h"
+
+int iteration_nr = 1;
+int test_nr;
+
+u64 shadow_pkey_reg;
+int dprint_in_signal;
+char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+void cat_into_file(char *str, char *file)
+{
+	int fd = open(file, O_RDWR);
+	int ret;
+
+	dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
+	/*
+	 * these need to be raw because they are called under
+	 * pkey_assert()
+	 */
+	if (fd < 0) {
+		fprintf(stderr, "error opening '%s'\n", str);
+		perror("error: ");
+		exit(__LINE__);
+	}
+
+	ret = write(fd, str, strlen(str));
+	if (ret != strlen(str)) {
+		perror("write to file failed");
+		fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
+		exit(__LINE__);
+	}
+	close(fd);
+}
+
+#if CONTROL_TRACING > 0
+static int warned_tracing;
+int tracing_root_ok(void)
+{
+	if (geteuid() != 0) {
+		if (!warned_tracing)
+			fprintf(stderr, "WARNING: not run as root, "
+					"can not do tracing control\n");
+		warned_tracing = 1;
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+void tracing_on(void)
+{
+#if CONTROL_TRACING > 0
+#define TRACEDIR "/sys/kernel/debug/tracing"
+	char pidstr[32];
+
+	if (!tracing_root_ok())
+		return;
+
+	sprintf(pidstr, "%d", getpid());
+	cat_into_file("0", TRACEDIR "/tracing_on");
+	cat_into_file("\n", TRACEDIR "/trace");
+	if (1) {
+		cat_into_file("function_graph", TRACEDIR "/current_tracer");
+		cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
+	} else {
+		cat_into_file("nop", TRACEDIR "/current_tracer");
+	}
+	cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
+	cat_into_file("1", TRACEDIR "/tracing_on");
+	dprintf1("enabled tracing\n");
+#endif
+}
+
+void tracing_off(void)
+{
+#if CONTROL_TRACING > 0
+	if (!tracing_root_ok())
+		return;
+	cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
+#endif
+}
+
+void abort_hooks(void)
+{
+	fprintf(stderr, "running %s()...\n", __func__);
+	tracing_off();
+#ifdef SLEEP_ON_ABORT
+	sleep(SLEEP_ON_ABORT);
+#endif
+}
+
+/*
+ * This attempts to have roughly a page of instructions followed by a few
+ * instructions that do a write, and another page of instructions.  That
+ * way, we are pretty sure that the write is in the second page of
+ * instructions and has at least a page of padding behind it.
+ *
+ * *That* lets us be sure to madvise() away the write instruction, which
+ * will then fault, which makes sure that the fault code handles
+ * execute-only memory properly.
+ */
+#ifdef __powerpc64__
+/* This way, both 4K and 64K alignment are maintained */
+__attribute__((__aligned__(65536)))
+#else
+__attribute__((__aligned__(PAGE_SIZE)))
+#endif
+void lots_o_noops_around_write(int *write_to_me)
+{
+	dprintf3("running %s()\n", __func__);
+	__page_o_noops();
+	/* Assume this happens in the second page of instructions: */
+	*write_to_me = __LINE__;
+	/* pad out by another page: */
+	__page_o_noops();
+	dprintf3("%s() done\n", __func__);
+}
+
+void dump_mem(void *dumpme, int len_bytes)
+{
+	char *c = (void *)dumpme;
+	int i;
+
+	for (i = 0; i < len_bytes; i += sizeof(u64)) {
+		u64 *ptr = (u64 *)(c + i);
+		dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr);
+	}
+}
+
+static u32 hw_pkey_get(int pkey, unsigned long flags)
+{
+	u64 pkey_reg = __read_pkey_reg();
+
+	dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
+			__func__, pkey, flags, 0, 0);
+	dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg);
+
+	return (u32) get_pkey_bits(pkey_reg, pkey);
+}
+
+static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
+{
+	u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+	u64 old_pkey_reg = __read_pkey_reg();
+	u64 new_pkey_reg;
+
+	/* make sure that 'rights' only contains the bits we expect: */
+	assert(!(rights & ~mask));
+
+	/* modify bits accordingly in old pkey_reg and assign it */
+	new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
+
+	__write_pkey_reg(new_pkey_reg);
+
+	dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x"
+		" pkey_reg now: %016llx old_pkey_reg: %016llx\n",
+		__func__, pkey, rights, flags, 0, __read_pkey_reg(),
+		old_pkey_reg);
+	return 0;
+}
+
+void pkey_disable_set(int pkey, int flags)
+{
+	unsigned long syscall_flags = 0;
+	int ret;
+	int pkey_rights;
+	u64 orig_pkey_reg = read_pkey_reg();
+
+	dprintf1("START->%s(%d, 0x%x)\n", __func__,
+		pkey, flags);
+	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+	pkey_rights = hw_pkey_get(pkey, syscall_flags);
+
+	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+			pkey, pkey, pkey_rights);
+
+	pkey_assert(pkey_rights >= 0);
+
+	pkey_rights |= flags;
+
+	ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
+	assert(!ret);
+	/* pkey_reg and flags have the same format */
+	shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+	dprintf1("%s(%d) shadow: 0x%016llx\n",
+		__func__, pkey, shadow_pkey_reg);
+
+	pkey_assert(ret >= 0);
+
+	pkey_rights = hw_pkey_get(pkey, syscall_flags);
+	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+			pkey, pkey, pkey_rights);
+
+	dprintf1("%s(%d) pkey_reg: 0x%016llx\n",
+		__func__, pkey, read_pkey_reg());
+	if (flags)
+		pkey_assert(read_pkey_reg() >= orig_pkey_reg);
+	dprintf1("END<---%s(%d, 0x%x)\n", __func__,
+		pkey, flags);
+}
+
+void pkey_disable_clear(int pkey, int flags)
+{
+	unsigned long syscall_flags = 0;
+	int ret;
+	int pkey_rights = hw_pkey_get(pkey, syscall_flags);
+	u64 orig_pkey_reg = read_pkey_reg();
+
+	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+			pkey, pkey, pkey_rights);
+	pkey_assert(pkey_rights >= 0);
+
+	pkey_rights &= ~flags;
+
+	ret = hw_pkey_set(pkey, pkey_rights, 0);
+	shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+	pkey_assert(ret >= 0);
+
+	pkey_rights = hw_pkey_get(pkey, syscall_flags);
+	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+			pkey, pkey, pkey_rights);
+
+	dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__,
+			pkey, read_pkey_reg());
+	if (flags)
+		assert(read_pkey_reg() <= orig_pkey_reg);
+}
+
+void pkey_write_allow(int pkey)
+{
+	pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_write_deny(int pkey)
+{
+	pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_access_allow(int pkey)
+{
+	pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
+}
+void pkey_access_deny(int pkey)
+{
+	pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
+}
+
+/* Failed address bound checks: */
+#ifndef SEGV_BNDERR
+# define SEGV_BNDERR		3
+#endif
+
+#ifndef SEGV_PKUERR
+# define SEGV_PKUERR		4
+#endif
+
+static char *si_code_str(int si_code)
+{
+	if (si_code == SEGV_MAPERR)
+		return "SEGV_MAPERR";
+	if (si_code == SEGV_ACCERR)
+		return "SEGV_ACCERR";
+	if (si_code == SEGV_BNDERR)
+		return "SEGV_BNDERR";
+	if (si_code == SEGV_PKUERR)
+		return "SEGV_PKUERR";
+	return "UNKNOWN";
+}
+
+int pkey_faults;
+int last_si_pkey = -1;
+void signal_handler(int signum, siginfo_t *si, void *vucontext)
+{
+	ucontext_t *uctxt = vucontext;
+	int trapno;
+	unsigned long ip;
+	char *fpregs;
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+	u32 *pkey_reg_ptr;
+	int pkey_reg_offset;
+#endif /* arch */
+	u64 siginfo_pkey;
+	u32 *si_pkey_ptr;
+
+	dprint_in_signal = 1;
+	dprintf1(">>>>===============SIGSEGV============================\n");
+	dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+			__func__, __LINE__,
+			__read_pkey_reg(), shadow_pkey_reg);
+
+	trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
+	ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
+	fpregs = (char *) uctxt->uc_mcontext.fpregs;
+
+	dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
+			__func__, trapno, ip, si_code_str(si->si_code),
+			si->si_code);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#ifdef __i386__
+	/*
+	 * 32-bit has some extra padding so that userspace can tell whether
+	 * the XSTATE header is present in addition to the "legacy" FPU
+	 * state.  We just assume that it is here.
+	 */
+	fpregs += 0x70;
+#endif /* i386 */
+	pkey_reg_offset = pkey_reg_xstate_offset();
+	pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);
+
+	/*
+	 * If we got a PKEY fault, we *HAVE* to have at least one bit set in
+	 * here.
+	 */
+	dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset());
+	if (DEBUG_LEVEL > 4)
+		dump_mem(pkey_reg_ptr - 128, 256);
+	pkey_assert(*pkey_reg_ptr);
+#endif /* arch */
+
+	dprintf1("siginfo: %p\n", si);
+	dprintf1(" fpregs: %p\n", fpregs);
+
+	if ((si->si_code == SEGV_MAPERR) ||
+	    (si->si_code == SEGV_ACCERR) ||
+	    (si->si_code == SEGV_BNDERR)) {
+		printf("non-PK si_code, exiting...\n");
+		exit(4);
+	}
+
+	si_pkey_ptr = siginfo_get_pkey_ptr(si);
+	dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
+	dump_mem((u8 *)si_pkey_ptr - 8, 24);
+	siginfo_pkey = *si_pkey_ptr;
+	pkey_assert(siginfo_pkey < NR_PKEYS);
+	last_si_pkey = siginfo_pkey;
+
+	/*
+	 * need __read_pkey_reg() version so we do not do shadow_pkey_reg
+	 * checking
+	 */
+	dprintf1("signal pkey_reg from  pkey_reg: %016llx\n",
+			__read_pkey_reg());
+	dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey);
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+	dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
+	*(u64 *)pkey_reg_ptr = 0x00000000;
+	dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n");
+#elif defined(__powerpc64__) /* arch */
+	/* restore access and let the faulting instruction continue */
+	pkey_access_allow(siginfo_pkey);
+#endif /* arch */
+	pkey_faults++;
+	dprintf1("<<<<==================================================\n");
+	dprint_in_signal = 0;
+}
+
+int wait_all_children(void)
+{
+	int status;
+	return waitpid(-1, &status, 0);
+}
+
+void sig_chld(int x)
+{
+	dprint_in_signal = 1;
+	dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
+	dprint_in_signal = 0;
+}
+
+void setup_sigsegv_handler(void)
+{
+	int r, rs;
+	struct sigaction newact;
+	struct sigaction oldact;
+
+	/* #PF is mapped to sigsegv */
+	int signum  = SIGSEGV;
+
+	newact.sa_handler = 0;
+	newact.sa_sigaction = signal_handler;
+
+	/*sigset_t - signals to block while in the handler */
+	/* get the old signal mask. */
+	rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
+	pkey_assert(rs == 0);
+
+	/* call sa_sigaction, not sa_handler*/
+	newact.sa_flags = SA_SIGINFO;
+
+	newact.sa_restorer = 0;  /* void(*)(), obsolete */
+	r = sigaction(signum, &newact, &oldact);
+	r = sigaction(SIGALRM, &newact, &oldact);
+	pkey_assert(r == 0);
+}
+
+void setup_handlers(void)
+{
+	signal(SIGCHLD, &sig_chld);
+	setup_sigsegv_handler();
+}
+
+pid_t fork_lazy_child(void)
+{
+	pid_t forkret;
+
+	forkret = fork();
+	pkey_assert(forkret >= 0);
+	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+	if (!forkret) {
+		/* in the child */
+		while (1) {
+			dprintf1("child sleeping...\n");
+			sleep(30);
+		}
+	}
+	return forkret;
+}
+
+int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+		unsigned long pkey)
+{
+	int sret;
+
+	dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
+			ptr, size, orig_prot, pkey);
+
+	errno = 0;
+	sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
+	if (errno) {
+		dprintf2("SYS_mprotect_key sret: %d\n", sret);
+		dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
+		dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
+		if (DEBUG_LEVEL >= 2)
+			perror("SYS_mprotect_pkey");
+	}
+	return sret;
+}
+
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+{
+	int ret = syscall(SYS_pkey_alloc, flags, init_val);
+	dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
+			__func__, flags, init_val, ret, errno);
+	return ret;
+}
+
+int alloc_pkey(void)
+{
+	int ret;
+	unsigned long init_val = 0x0;
+
+	dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+			__func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
+	ret = sys_pkey_alloc(0, init_val);
+	/*
+	 * pkey_alloc() sets PKEY register, so we need to reflect it in
+	 * shadow_pkey_reg:
+	 */
+	dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+			" shadow: 0x%016llx\n",
+			__func__, __LINE__, ret, __read_pkey_reg(),
+			shadow_pkey_reg);
+	if (ret > 0) {
+		/* clear both the bits: */
+		shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+						~PKEY_MASK);
+		dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+				" shadow: 0x%016llx\n",
+				__func__,
+				__LINE__, ret, __read_pkey_reg(),
+				shadow_pkey_reg);
+		/*
+		 * move the new state in from init_val
+		 * (remember, we cheated and init_val == pkey_reg format)
+		 */
+		shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+						init_val);
+	}
+	dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+			" shadow: 0x%016llx\n",
+			__func__, __LINE__, ret, __read_pkey_reg(),
+			shadow_pkey_reg);
+	dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno);
+	/* for shadow checking: */
+	read_pkey_reg();
+	dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+		 " shadow: 0x%016llx\n",
+		__func__, __LINE__, ret, __read_pkey_reg(),
+		shadow_pkey_reg);
+	return ret;
+}
+
+int sys_pkey_free(unsigned long pkey)
+{
+	int ret = syscall(SYS_pkey_free, pkey);
+	dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
+	return ret;
+}
+
+/*
+ * I had a bug where pkey bits could be set by mprotect() but
+ * not cleared.  This ensures we get lots of random bit sets
+ * and clears on the vma and pte pkey bits.
+ */
+int alloc_random_pkey(void)
+{
+	int max_nr_pkey_allocs;
+	int ret;
+	int i;
+	int alloced_pkeys[NR_PKEYS];
+	int nr_alloced = 0;
+	int random_index;
+	memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+
+	/* allocate every possible key and make a note of which ones we got */
+	max_nr_pkey_allocs = NR_PKEYS;
+	for (i = 0; i < max_nr_pkey_allocs; i++) {
+		int new_pkey = alloc_pkey();
+		if (new_pkey < 0)
+			break;
+		alloced_pkeys[nr_alloced++] = new_pkey;
+	}
+
+	pkey_assert(nr_alloced > 0);
+	/* select a random one out of the allocated ones */
+	random_index = rand() % nr_alloced;
+	ret = alloced_pkeys[random_index];
+	/* now zero it out so we don't free it next */
+	alloced_pkeys[random_index] = 0;
+
+	/* go through the allocated ones that we did not want and free them */
+	for (i = 0; i < nr_alloced; i++) {
+		int free_ret;
+		if (!alloced_pkeys[i])
+			continue;
+		free_ret = sys_pkey_free(alloced_pkeys[i]);
+		pkey_assert(!free_ret);
+	}
+	dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+			 " shadow: 0x%016llx\n", __func__,
+			__LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
+	return ret;
+}
+
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+		unsigned long pkey)
+{
+	int nr_iterations = random() % 100;
+	int ret;
+
+	while (0) {
+		int rpkey = alloc_random_pkey();
+		ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+		dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+				ptr, size, orig_prot, pkey, ret);
+		if (nr_iterations-- < 0)
+			break;
+
+		dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+			" shadow: 0x%016llx\n",
+			__func__, __LINE__, ret, __read_pkey_reg(),
+			shadow_pkey_reg);
+		sys_pkey_free(rpkey);
+		dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+			" shadow: 0x%016llx\n",
+			__func__, __LINE__, ret, __read_pkey_reg(),
+			shadow_pkey_reg);
+	}
+	pkey_assert(pkey < NR_PKEYS);
+
+	ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+	dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+			ptr, size, orig_prot, pkey, ret);
+	pkey_assert(!ret);
+	dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+			" shadow: 0x%016llx\n", __func__,
+			__LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
+	return ret;
+}
+
+struct pkey_malloc_record {
+	void *ptr;
+	long size;
+	int prot;
+};
+struct pkey_malloc_record *pkey_malloc_records;
+struct pkey_malloc_record *pkey_last_malloc_record;
+long nr_pkey_malloc_records;
+void record_pkey_malloc(void *ptr, long size, int prot)
+{
+	long i;
+	struct pkey_malloc_record *rec = NULL;
+
+	for (i = 0; i < nr_pkey_malloc_records; i++) {
+		rec = &pkey_malloc_records[i];
+		/* find a free record */
+		if (rec)
+			break;
+	}
+	if (!rec) {
+		/* every record is full */
+		size_t old_nr_records = nr_pkey_malloc_records;
+		size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
+		size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
+		dprintf2("new_nr_records: %zd\n", new_nr_records);
+		dprintf2("new_size: %zd\n", new_size);
+		pkey_malloc_records = realloc(pkey_malloc_records, new_size);
+		pkey_assert(pkey_malloc_records != NULL);
+		rec = &pkey_malloc_records[nr_pkey_malloc_records];
+		/*
+		 * realloc() does not initialize memory, so zero it from
+		 * the first new record all the way to the end.
+		 */
+		for (i = 0; i < new_nr_records - old_nr_records; i++)
+			memset(rec + i, 0, sizeof(*rec));
+	}
+	dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
+		(int)(rec - pkey_malloc_records), rec, ptr, size);
+	rec->ptr = ptr;
+	rec->size = size;
+	rec->prot = prot;
+	pkey_last_malloc_record = rec;
+	nr_pkey_malloc_records++;
+}
+
+void free_pkey_malloc(void *ptr)
+{
+	long i;
+	int ret;
+	dprintf3("%s(%p)\n", __func__, ptr);
+	for (i = 0; i < nr_pkey_malloc_records; i++) {
+		struct pkey_malloc_record *rec = &pkey_malloc_records[i];
+		dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
+				ptr, i, rec, rec->ptr, rec->size);
+		if ((ptr <  rec->ptr) ||
+		    (ptr >= rec->ptr + rec->size))
+			continue;
+
+		dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
+				ptr, i, rec, rec->ptr, rec->size);
+		nr_pkey_malloc_records--;
+		ret = munmap(rec->ptr, rec->size);
+		dprintf3("munmap ret: %d\n", ret);
+		pkey_assert(!ret);
+		dprintf3("clearing rec->ptr, rec: %p\n", rec);
+		rec->ptr = NULL;
+		dprintf3("done clearing rec->ptr, rec: %p\n", rec);
+		return;
+	}
+	pkey_assert(false);
+}
+
+
+void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
+{
+	void *ptr;
+	int ret;
+
+	read_pkey_reg();
+	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+			size, prot, pkey);
+	pkey_assert(pkey < NR_PKEYS);
+	ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+	pkey_assert(ptr != (void *)-1);
+	ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+	pkey_assert(!ret);
+	record_pkey_malloc(ptr, size, prot);
+	read_pkey_reg();
+
+	dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+	return ptr;
+}
+
+void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
+{
+	int ret;
+	void *ptr;
+
+	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+			size, prot, pkey);
+	/*
+	 * Guarantee we can fit at least one huge page in the resulting
+	 * allocation by allocating space for 2:
+	 */
+	size = ALIGN_UP(size, HPAGE_SIZE * 2);
+	ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+	pkey_assert(ptr != (void *)-1);
+	record_pkey_malloc(ptr, size, prot);
+	mprotect_pkey(ptr, size, prot, pkey);
+
+	dprintf1("unaligned ptr: %p\n", ptr);
+	ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
+	dprintf1("  aligned ptr: %p\n", ptr);
+	ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
+	dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
+	ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
+	dprintf1("MADV_WILLNEED ret: %d\n", ret);
+	memset(ptr, 0, HPAGE_SIZE);
+
+	dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
+	return ptr;
+}
+
+int hugetlb_setup_ok;
+#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
+#define GET_NR_HUGE_PAGES 10
+void setup_hugetlbfs(void)
+{
+	int err;
+	int fd;
+	char buf[256];
+	long hpagesz_kb;
+	long hpagesz_mb;
+
+	if (geteuid() != 0) {
+		fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
+		return;
+	}
+
+	cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
+
+	/*
+	 * Now go make sure that we got the pages and that they
+	 * are PMD-level pages. Someone might have made PUD-level
+	 * pages the default.
+	 */
+	hpagesz_kb = HPAGE_SIZE / 1024;
+	hpagesz_mb = hpagesz_kb / 1024;
+	sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
+	fd = open(buf, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
+			hpagesz_mb, strerror(errno));
+		return;
+	}
+
+	/* -1 to guarantee leaving the trailing \0 */
+	err = read(fd, buf, sizeof(buf)-1);
+	close(fd);
+	if (err <= 0) {
+		fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
+			hpagesz_mb, strerror(errno));
+		return;
+	}
+
+	if (atoi(buf) != GET_NR_HUGE_PAGES) {
+		fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
+			hpagesz_mb, buf, GET_NR_HUGE_PAGES);
+		return;
+	}
+
+	hugetlb_setup_ok = 1;
+}
+
+void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
+{
+	void *ptr;
+	int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
+
+	if (!hugetlb_setup_ok)
+		return PTR_ERR_ENOTSUP;
+
+	dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
+	size = ALIGN_UP(size, HPAGE_SIZE * 2);
+	pkey_assert(pkey < NR_PKEYS);
+	ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+	pkey_assert(ptr != (void *)-1);
+	mprotect_pkey(ptr, size, prot, pkey);
+
+	record_pkey_malloc(ptr, size, prot);
+
+	dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
+	return ptr;
+}
+
+void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
+{
+	void *ptr;
+	int fd;
+
+	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+			size, prot, pkey);
+	pkey_assert(pkey < NR_PKEYS);
+	fd = open("/dax/foo", O_RDWR);
+	pkey_assert(fd >= 0);
+
+	ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
+	pkey_assert(ptr != (void *)-1);
+
+	mprotect_pkey(ptr, size, prot, pkey);
+
+	record_pkey_malloc(ptr, size, prot);
+
+	dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
+	close(fd);
+	return ptr;
+}
+
+void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
+
+	malloc_pkey_with_mprotect,
+	malloc_pkey_with_mprotect_subpage,
+	malloc_pkey_anon_huge,
+	malloc_pkey_hugetlb
+/* can not do direct with the pkey_mprotect() API:
+	malloc_pkey_mmap_direct,
+	malloc_pkey_mmap_dax,
+*/
+};
+
+void *malloc_pkey(long size, int prot, u16 pkey)
+{
+	void *ret;
+	static int malloc_type;
+	int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
+
+	pkey_assert(pkey < NR_PKEYS);
+
+	while (1) {
+		pkey_assert(malloc_type < nr_malloc_types);
+
+		ret = pkey_malloc[malloc_type](size, prot, pkey);
+		pkey_assert(ret != (void *)-1);
+
+		malloc_type++;
+		if (malloc_type >= nr_malloc_types)
+			malloc_type = (random()%nr_malloc_types);
+
+		/* try again if the malloc_type we tried is unsupported */
+		if (ret == PTR_ERR_ENOTSUP)
+			continue;
+
+		break;
+	}
+
+	dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
+			size, prot, pkey, ret);
+	return ret;
+}
+
+int last_pkey_faults;
+#define UNKNOWN_PKEY -2
+void expected_pkey_fault(int pkey)
+{
+	dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
+			__func__, last_pkey_faults, pkey_faults);
+	dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
+	pkey_assert(last_pkey_faults + 1 == pkey_faults);
+
+       /*
+	* For exec-only memory, we do not know the pkey in
+	* advance, so skip this check.
+	*/
+	if (pkey != UNKNOWN_PKEY)
+		pkey_assert(last_si_pkey == pkey);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+	/*
+	 * The signal handler shold have cleared out PKEY register to let the
+	 * test program continue.  We now have to restore it.
+	 */
+	if (__read_pkey_reg() != 0)
+#else /* arch */
+	if (__read_pkey_reg() != shadow_pkey_reg)
+#endif /* arch */
+		pkey_assert(0);
+
+	__write_pkey_reg(shadow_pkey_reg);
+	dprintf1("%s() set pkey_reg=%016llx to restore state after signal "
+		       "nuked it\n", __func__, shadow_pkey_reg);
+	last_pkey_faults = pkey_faults;
+	last_si_pkey = -1;
+}
+
+#define do_not_expect_pkey_fault(msg)	do {			\
+	if (last_pkey_faults != pkey_faults)			\
+		dprintf0("unexpected PKey fault: %s\n", msg);	\
+	pkey_assert(last_pkey_faults == pkey_faults);		\
+} while (0)
+
+int test_fds[10] = { -1 };
+int nr_test_fds;
+void __save_test_fd(int fd)
+{
+	pkey_assert(fd >= 0);
+	pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
+	test_fds[nr_test_fds] = fd;
+	nr_test_fds++;
+}
+
+int get_test_read_fd(void)
+{
+	int test_fd = open("/etc/passwd", O_RDONLY);
+	__save_test_fd(test_fd);
+	return test_fd;
+}
+
+void close_test_fds(void)
+{
+	int i;
+
+	for (i = 0; i < nr_test_fds; i++) {
+		if (test_fds[i] < 0)
+			continue;
+		close(test_fds[i]);
+		test_fds[i] = -1;
+	}
+	nr_test_fds = 0;
+}
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+__attribute__((noinline)) int read_ptr(int *ptr)
+{
+	/*
+	 * Keep GCC from optimizing this away somehow
+	 */
+	barrier();
+	return *ptr;
+}
+
+void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
+{
+	int i, err;
+	int max_nr_pkey_allocs;
+	int alloced_pkeys[NR_PKEYS];
+	int nr_alloced = 0;
+	long size;
+
+	pkey_assert(pkey_last_malloc_record);
+	size = pkey_last_malloc_record->size;
+	/*
+	 * This is a bit of a hack.  But mprotect() requires
+	 * huge-page-aligned sizes when operating on hugetlbfs.
+	 * So, make sure that we use something that's a multiple
+	 * of a huge page when we can.
+	 */
+	if (size >= HPAGE_SIZE)
+		size = HPAGE_SIZE;
+
+	/* allocate every possible key and make sure key-0 never got allocated */
+	max_nr_pkey_allocs = NR_PKEYS;
+	for (i = 0; i < max_nr_pkey_allocs; i++) {
+		int new_pkey = alloc_pkey();
+		pkey_assert(new_pkey != 0);
+
+		if (new_pkey < 0)
+			break;
+		alloced_pkeys[nr_alloced++] = new_pkey;
+	}
+	/* free all the allocated keys */
+	for (i = 0; i < nr_alloced; i++) {
+		int free_ret;
+
+		if (!alloced_pkeys[i])
+			continue;
+		free_ret = sys_pkey_free(alloced_pkeys[i]);
+		pkey_assert(!free_ret);
+	}
+
+	/* attach key-0 in various modes */
+	err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
+	pkey_assert(!err);
+	err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
+	pkey_assert(!err);
+	err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
+	pkey_assert(!err);
+	err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
+	pkey_assert(!err);
+	err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
+	pkey_assert(!err);
+}
+
+void test_read_of_write_disabled_region(int *ptr, u16 pkey)
+{
+	int ptr_contents;
+
+	dprintf1("disabling write access to PKEY[1], doing read\n");
+	pkey_write_deny(pkey);
+	ptr_contents = read_ptr(ptr);
+	dprintf1("*ptr: %d\n", ptr_contents);
+	dprintf1("\n");
+}
+void test_read_of_access_disabled_region(int *ptr, u16 pkey)
+{
+	int ptr_contents;
+
+	dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
+	read_pkey_reg();
+	pkey_access_deny(pkey);
+	ptr_contents = read_ptr(ptr);
+	dprintf1("*ptr: %d\n", ptr_contents);
+	expected_pkey_fault(pkey);
+}
+
+void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
+		u16 pkey)
+{
+	int ptr_contents;
+
+	dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
+				pkey, ptr);
+	ptr_contents = read_ptr(ptr);
+	dprintf1("reading ptr before disabling the read : %d\n",
+			ptr_contents);
+	read_pkey_reg();
+	pkey_access_deny(pkey);
+	ptr_contents = read_ptr(ptr);
+	dprintf1("*ptr: %d\n", ptr_contents);
+	expected_pkey_fault(pkey);
+}
+
+void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
+		u16 pkey)
+{
+	*ptr = __LINE__;
+	dprintf1("disabling write access; after accessing the page, "
+		"to PKEY[%02d], doing write\n", pkey);
+	pkey_write_deny(pkey);
+	*ptr = __LINE__;
+	expected_pkey_fault(pkey);
+}
+
+void test_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+	dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
+	pkey_write_deny(pkey);
+	*ptr = __LINE__;
+	expected_pkey_fault(pkey);
+}
+void test_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+	dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
+	pkey_access_deny(pkey);
+	*ptr = __LINE__;
+	expected_pkey_fault(pkey);
+}
+
+void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
+			u16 pkey)
+{
+	*ptr = __LINE__;
+	dprintf1("disabling access; after accessing the page, "
+		" to PKEY[%02d], doing write\n", pkey);
+	pkey_access_deny(pkey);
+	*ptr = __LINE__;
+	expected_pkey_fault(pkey);
+}
+
+void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+	int ret;
+	int test_fd = get_test_read_fd();
+
+	dprintf1("disabling access to PKEY[%02d], "
+		 "having kernel read() to buffer\n", pkey);
+	pkey_access_deny(pkey);
+	ret = read(test_fd, ptr, 1);
+	dprintf1("read ret: %d\n", ret);
+	pkey_assert(ret);
+}
+void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+	int ret;
+	int test_fd = get_test_read_fd();
+
+	pkey_write_deny(pkey);
+	ret = read(test_fd, ptr, 100);
+	dprintf1("read ret: %d\n", ret);
+	if (ret < 0 && (DEBUG_LEVEL > 0))
+		perror("verbose read result (OK for this to be bad)");
+	pkey_assert(ret);
+}
+
+void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
+{
+	int pipe_ret, vmsplice_ret;
+	struct iovec iov;
+	int pipe_fds[2];
+
+	pipe_ret = pipe(pipe_fds);
+
+	pkey_assert(pipe_ret == 0);
+	dprintf1("disabling access to PKEY[%02d], "
+		 "having kernel vmsplice from buffer\n", pkey);
+	pkey_access_deny(pkey);
+	iov.iov_base = ptr;
+	iov.iov_len = PAGE_SIZE;
+	vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
+	dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
+	pkey_assert(vmsplice_ret == -1);
+
+	close(pipe_fds[0]);
+	close(pipe_fds[1]);
+}
+
+void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
+{
+	int ignored = 0xdada;
+	int futex_ret;
+	int some_int = __LINE__;
+
+	dprintf1("disabling write to PKEY[%02d], "
+		 "doing futex gunk in buffer\n", pkey);
+	*ptr = some_int;
+	pkey_write_deny(pkey);
+	futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
+			&ignored, ignored);
+	if (DEBUG_LEVEL > 0)
+		perror("futex");
+	dprintf1("futex() ret: %d\n", futex_ret);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
+{
+	int err;
+	int i;
+
+	/* Note: 0 is the default pkey, so don't mess with it */
+	for (i = 1; i < NR_PKEYS; i++) {
+		if (pkey == i)
+			continue;
+
+		dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
+		err = sys_pkey_free(i);
+		pkey_assert(err);
+
+		err = sys_pkey_free(i);
+		pkey_assert(err);
+
+		err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
+		pkey_assert(err);
+	}
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
+{
+	int err;
+	int bad_pkey = NR_PKEYS+99;
+
+	/* pass a known-invalid pkey in: */
+	err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
+	pkey_assert(err);
+}
+
+void become_child(void)
+{
+	pid_t forkret;
+
+	forkret = fork();
+	pkey_assert(forkret >= 0);
+	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+	if (!forkret) {
+		/* in the child */
+		return;
+	}
+	exit(0);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
+{
+	int err;
+	int allocated_pkeys[NR_PKEYS] = {0};
+	int nr_allocated_pkeys = 0;
+	int i;
+
+	for (i = 0; i < NR_PKEYS*3; i++) {
+		int new_pkey;
+		dprintf1("%s() alloc loop: %d\n", __func__, i);
+		new_pkey = alloc_pkey();
+		dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
+				" shadow: 0x%016llx\n",
+				__func__, __LINE__, err, __read_pkey_reg(),
+				shadow_pkey_reg);
+		read_pkey_reg(); /* for shadow checking */
+		dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
+		if ((new_pkey == -1) && (errno == ENOSPC)) {
+			dprintf2("%s() failed to allocate pkey after %d tries\n",
+				__func__, nr_allocated_pkeys);
+		} else {
+			/*
+			 * Ensure the number of successes never
+			 * exceeds the number of keys supported
+			 * in the hardware.
+			 */
+			pkey_assert(nr_allocated_pkeys < NR_PKEYS);
+			allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+		}
+
+		/*
+		 * Make sure that allocation state is properly
+		 * preserved across fork().
+		 */
+		if (i == NR_PKEYS*2)
+			become_child();
+	}
+
+	dprintf3("%s()::%d\n", __func__, __LINE__);
+
+	/*
+	 * On x86:
+	 * There are 16 pkeys supported in hardware.  Three are
+	 * allocated by the time we get here:
+	 *   1. The default key (0)
+	 *   2. One possibly consumed by an execute-only mapping.
+	 *   3. One allocated by the test code and passed in via
+	 *      'pkey' to this function.
+	 * Ensure that we can allocate at least another 13 (16-3).
+	 *
+	 * On powerpc:
+	 * There are either 5, 28, 29 or 32 pkeys supported in
+	 * hardware depending on the page size (4K or 64K) and
+	 * platform (powernv or powervm). Four are allocated by
+	 * the time we get here. These include pkey-0, pkey-1,
+	 * exec-only pkey and the one allocated by the test code.
+	 * Ensure that we can allocate the remaining.
+	 */
+	pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
+
+	for (i = 0; i < nr_allocated_pkeys; i++) {
+		err = sys_pkey_free(allocated_pkeys[i]);
+		pkey_assert(!err);
+		read_pkey_reg(); /* for shadow checking */
+	}
+}
+
+void arch_force_pkey_reg_init(void)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+	u64 *buf;
+
+	/*
+	 * All keys should be allocated and set to allow reads and
+	 * writes, so the register should be all 0.  If not, just
+	 * skip the test.
+	 */
+	if (read_pkey_reg())
+		return;
+
+	/*
+	 * Just allocate an absurd about of memory rather than
+	 * doing the XSAVE size enumeration dance.
+	 */
+	buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+
+	/* These __builtins require compiling with -mxsave */
+
+	/* XSAVE to build a valid buffer: */
+	__builtin_ia32_xsave(buf, XSTATE_PKEY);
+	/* Clear XSTATE_BV[PKRU]: */
+	buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY;
+	/* XRSTOR will likely get PKRU back to the init state: */
+	__builtin_ia32_xrstor(buf, XSTATE_PKEY);
+
+	munmap(buf, 1*MB);
+#endif
+}
+
+
+/*
+ * This is mostly useless on ppc for now.  But it will not
+ * hurt anything and should give some better coverage as
+ * a long-running test that continually checks the pkey
+ * register.
+ */
+void test_pkey_init_state(int *ptr, u16 pkey)
+{
+	int err;
+	int allocated_pkeys[NR_PKEYS] = {0};
+	int nr_allocated_pkeys = 0;
+	int i;
+
+	for (i = 0; i < NR_PKEYS; i++) {
+		int new_pkey = alloc_pkey();
+
+		if (new_pkey < 0)
+			continue;
+		allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+	}
+
+	dprintf3("%s()::%d\n", __func__, __LINE__);
+
+	arch_force_pkey_reg_init();
+
+	/*
+	 * Loop for a bit, hoping to get exercise the kernel
+	 * context switch code.
+	 */
+	for (i = 0; i < 1000000; i++)
+		read_pkey_reg();
+
+	for (i = 0; i < nr_allocated_pkeys; i++) {
+		err = sys_pkey_free(allocated_pkeys[i]);
+		pkey_assert(!err);
+		read_pkey_reg(); /* for shadow checking */
+	}
+}
+
+/*
+ * pkey 0 is special.  It is allocated by default, so you do not
+ * have to call pkey_alloc() to use it first.  Make sure that it
+ * is usable.
+ */
+void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
+{
+	long size;
+	int prot;
+
+	assert(pkey_last_malloc_record);
+	size = pkey_last_malloc_record->size;
+	/*
+	 * This is a bit of a hack.  But mprotect() requires
+	 * huge-page-aligned sizes when operating on hugetlbfs.
+	 * So, make sure that we use something that's a multiple
+	 * of a huge page when we can.
+	 */
+	if (size >= HPAGE_SIZE)
+		size = HPAGE_SIZE;
+	prot = pkey_last_malloc_record->prot;
+
+	/* Use pkey 0 */
+	mprotect_pkey(ptr, size, prot, 0);
+
+	/* Make sure that we can set it back to the original pkey. */
+	mprotect_pkey(ptr, size, prot, pkey);
+}
+
+void test_ptrace_of_child(int *ptr, u16 pkey)
+{
+	__attribute__((__unused__)) int peek_result;
+	pid_t child_pid;
+	void *ignored = 0;
+	long ret;
+	int status;
+	/*
+	 * This is the "control" for our little expermient.  Make sure
+	 * we can always access it when ptracing.
+	 */
+	int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
+	int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
+
+	/*
+	 * Fork a child which is an exact copy of this process, of course.
+	 * That means we can do all of our tests via ptrace() and then plain
+	 * memory access and ensure they work differently.
+	 */
+	child_pid = fork_lazy_child();
+	dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
+
+	ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
+	if (ret)
+		perror("attach");
+	dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
+	pkey_assert(ret != -1);
+	ret = waitpid(child_pid, &status, WUNTRACED);
+	if ((ret != child_pid) || !(WIFSTOPPED(status))) {
+		fprintf(stderr, "weird waitpid result %ld stat %x\n",
+				ret, status);
+		pkey_assert(0);
+	}
+	dprintf2("waitpid ret: %ld\n", ret);
+	dprintf2("waitpid status: %d\n", status);
+
+	pkey_access_deny(pkey);
+	pkey_write_deny(pkey);
+
+	/* Write access, untested for now:
+	ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
+	pkey_assert(ret != -1);
+	dprintf1("poke at %p: %ld\n", peek_at, ret);
+	*/
+
+	/*
+	 * Try to access the pkey-protected "ptr" via ptrace:
+	 */
+	ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
+	/* expect it to work, without an error: */
+	pkey_assert(ret != -1);
+	/* Now access from the current task, and expect an exception: */
+	peek_result = read_ptr(ptr);
+	expected_pkey_fault(pkey);
+
+	/*
+	 * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
+	 */
+	ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
+	/* expect it to work, without an error: */
+	pkey_assert(ret != -1);
+	/* Now access from the current task, and expect NO exception: */
+	peek_result = read_ptr(plain_ptr);
+	do_not_expect_pkey_fault("read plain pointer after ptrace");
+
+	ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
+	pkey_assert(ret != -1);
+
+	ret = kill(child_pid, SIGKILL);
+	pkey_assert(ret != -1);
+
+	wait(&status);
+
+	free(plain_ptr_unaligned);
+}
+
+void *get_pointer_to_instructions(void)
+{
+	void *p1;
+
+	p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
+	dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
+	/* lots_o_noops_around_write should be page-aligned already */
+	assert(p1 == &lots_o_noops_around_write);
+
+	/* Point 'p1' at the *second* page of the function: */
+	p1 += PAGE_SIZE;
+
+	/*
+	 * Try to ensure we fault this in on next touch to ensure
+	 * we get an instruction fault as opposed to a data one
+	 */
+	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+
+	return p1;
+}
+
+void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
+{
+	void *p1;
+	int scratch;
+	int ptr_contents;
+	int ret;
+
+	p1 = get_pointer_to_instructions();
+	lots_o_noops_around_write(&scratch);
+	ptr_contents = read_ptr(p1);
+	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+	ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
+	pkey_assert(!ret);
+	pkey_access_deny(pkey);
+
+	dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
+
+	/*
+	 * Make sure this is an *instruction* fault
+	 */
+	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+	lots_o_noops_around_write(&scratch);
+	do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+	expect_fault_on_read_execonly_key(p1, pkey);
+}
+
+void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
+{
+	void *p1;
+	int scratch;
+	int ptr_contents;
+	int ret;
+
+	dprintf1("%s() start\n", __func__);
+
+	p1 = get_pointer_to_instructions();
+	lots_o_noops_around_write(&scratch);
+	ptr_contents = read_ptr(p1);
+	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+	/* Use a *normal* mprotect(), not mprotect_pkey(): */
+	ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
+	pkey_assert(!ret);
+
+	/*
+	 * Reset the shadow, assuming that the above mprotect()
+	 * correctly changed PKRU, but to an unknown value since
+	 * the actual allocated pkey is unknown.
+	 */
+	shadow_pkey_reg = __read_pkey_reg();
+
+	dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
+
+	/* Make sure this is an *instruction* fault */
+	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+	lots_o_noops_around_write(&scratch);
+	do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+	expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
+
+	/*
+	 * Put the memory back to non-PROT_EXEC.  Should clear the
+	 * exec-only pkey off the VMA and allow it to be readable
+	 * again.  Go to PROT_NONE first to check for a kernel bug
+	 * that did not clear the pkey when doing PROT_NONE.
+	 */
+	ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
+	pkey_assert(!ret);
+
+	ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
+	pkey_assert(!ret);
+	ptr_contents = read_ptr(p1);
+	do_not_expect_pkey_fault("plain read on recently PROT_EXEC area");
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+void test_ptrace_modifies_pkru(int *ptr, u16 pkey)
+{
+	u32 new_pkru;
+	pid_t child;
+	int status, ret;
+	int pkey_offset = pkey_reg_xstate_offset();
+	size_t xsave_size = cpu_max_xsave_size();
+	void *xsave;
+	u32 *pkey_register;
+	u64 *xstate_bv;
+	struct iovec iov;
+
+	new_pkru = ~read_pkey_reg();
+	/* Don't make PROT_EXEC mappings inaccessible */
+	new_pkru &= ~3;
+
+	child = fork();
+	pkey_assert(child >= 0);
+	dprintf3("[%d] fork() ret: %d\n", getpid(), child);
+	if (!child) {
+		ptrace(PTRACE_TRACEME, 0, 0, 0);
+		/* Stop and allow the tracer to modify PKRU directly */
+		raise(SIGSTOP);
+
+		/*
+		 * need __read_pkey_reg() version so we do not do shadow_pkey_reg
+		 * checking
+		 */
+		if (__read_pkey_reg() != new_pkru)
+			exit(1);
+
+		/* Stop and allow the tracer to clear XSTATE_BV for PKRU */
+		raise(SIGSTOP);
+
+		if (__read_pkey_reg() != 0)
+			exit(1);
+
+		/* Stop and allow the tracer to examine PKRU */
+		raise(SIGSTOP);
+
+		exit(0);
+	}
+
+	pkey_assert(child == waitpid(child, &status, 0));
+	dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+	pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+	xsave = (void *)malloc(xsave_size);
+	pkey_assert(xsave > 0);
+
+	/* Modify the PKRU register directly */
+	iov.iov_base = xsave;
+	iov.iov_len = xsave_size;
+	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+	pkey_assert(ret == 0);
+
+	pkey_register = (u32 *)(xsave + pkey_offset);
+	pkey_assert(*pkey_register == read_pkey_reg());
+
+	*pkey_register = new_pkru;
+
+	ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+	pkey_assert(ret == 0);
+
+	/* Test that the modification is visible in ptrace before any execution */
+	memset(xsave, 0xCC, xsave_size);
+	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+	pkey_assert(ret == 0);
+	pkey_assert(*pkey_register == new_pkru);
+
+	/* Execute the tracee */
+	ret = ptrace(PTRACE_CONT, child, 0, 0);
+	pkey_assert(ret == 0);
+
+	/* Test that the tracee saw the PKRU value change */
+	pkey_assert(child == waitpid(child, &status, 0));
+	dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+	pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+	/* Test that the modification is visible in ptrace after execution */
+	memset(xsave, 0xCC, xsave_size);
+	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+	pkey_assert(ret == 0);
+	pkey_assert(*pkey_register == new_pkru);
+
+	/* Clear the PKRU bit from XSTATE_BV */
+	xstate_bv = (u64 *)(xsave + 512);
+	*xstate_bv &= ~(1 << 9);
+
+	ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+	pkey_assert(ret == 0);
+
+	/* Test that the modification is visible in ptrace before any execution */
+	memset(xsave, 0xCC, xsave_size);
+	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+	pkey_assert(ret == 0);
+	pkey_assert(*pkey_register == 0);
+
+	ret = ptrace(PTRACE_CONT, child, 0, 0);
+	pkey_assert(ret == 0);
+
+	/* Test that the tracee saw the PKRU value go to 0 */
+	pkey_assert(child == waitpid(child, &status, 0));
+	dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+	pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+	/* Test that the modification is visible in ptrace after execution */
+	memset(xsave, 0xCC, xsave_size);
+	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+	pkey_assert(ret == 0);
+	pkey_assert(*pkey_register == 0);
+
+	ret = ptrace(PTRACE_CONT, child, 0, 0);
+	pkey_assert(ret == 0);
+	pkey_assert(child == waitpid(child, &status, 0));
+	dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+	pkey_assert(WIFEXITED(status));
+	pkey_assert(WEXITSTATUS(status) == 0);
+	free(xsave);
+}
+#endif
+
+void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
+{
+	int size = PAGE_SIZE;
+	int sret;
+
+	if (cpu_has_pkeys()) {
+		dprintf1("SKIP: %s: no CPU support\n", __func__);
+		return;
+	}
+
+	sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
+	pkey_assert(sret < 0);
+}
+
+void (*pkey_tests[])(int *ptr, u16 pkey) = {
+	test_read_of_write_disabled_region,
+	test_read_of_access_disabled_region,
+	test_read_of_access_disabled_region_with_page_already_mapped,
+	test_write_of_write_disabled_region,
+	test_write_of_write_disabled_region_with_page_already_mapped,
+	test_write_of_access_disabled_region,
+	test_write_of_access_disabled_region_with_page_already_mapped,
+	test_kernel_write_of_access_disabled_region,
+	test_kernel_write_of_write_disabled_region,
+	test_kernel_gup_of_access_disabled_region,
+	test_kernel_gup_write_to_write_disabled_region,
+	test_executing_on_unreadable_memory,
+	test_implicit_mprotect_exec_only_memory,
+	test_mprotect_with_pkey_0,
+	test_ptrace_of_child,
+	test_pkey_init_state,
+	test_pkey_syscalls_on_non_allocated_pkey,
+	test_pkey_syscalls_bad_args,
+	test_pkey_alloc_exhaust,
+	test_pkey_alloc_free_attach_pkey0,
+#if defined(__i386__) || defined(__x86_64__)
+	test_ptrace_modifies_pkru,
+#endif
+};
+
+void run_tests_once(void)
+{
+	int *ptr;
+	int prot = PROT_READ|PROT_WRITE;
+
+	for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
+		int pkey;
+		int orig_pkey_faults = pkey_faults;
+
+		dprintf1("======================\n");
+		dprintf1("test %d preparing...\n", test_nr);
+
+		tracing_on();
+		pkey = alloc_random_pkey();
+		dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
+		ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
+		dprintf1("test %d starting...\n", test_nr);
+		pkey_tests[test_nr](ptr, pkey);
+		dprintf1("freeing test memory: %p\n", ptr);
+		free_pkey_malloc(ptr);
+		sys_pkey_free(pkey);
+
+		dprintf1("pkey_faults: %d\n", pkey_faults);
+		dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults);
+
+		tracing_off();
+		close_test_fds();
+
+		printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
+		dprintf1("======================\n\n");
+	}
+	iteration_nr++;
+}
+
+void pkey_setup_shadow(void)
+{
+	shadow_pkey_reg = __read_pkey_reg();
+}
+
+int main(void)
+{
+	int nr_iterations = 22;
+	int pkeys_supported = is_pkeys_supported();
+
+	srand((unsigned int)time(NULL));
+
+	setup_handlers();
+
+	printf("has pkeys: %d\n", pkeys_supported);
+
+	if (!pkeys_supported) {
+		int size = PAGE_SIZE;
+		int *ptr;
+
+		printf("running PKEY tests for unsupported CPU/OS\n");
+
+		ptr  = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+		assert(ptr != (void *)-1);
+		test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
+		exit(0);
+	}
+
+	pkey_setup_shadow();
+	printf("startup pkey_reg: %016llx\n", read_pkey_reg());
+	setup_hugetlbfs();
+
+	while (nr_iterations-- > 0)
+		run_tests_once();
+
+	printf("done (all tests OK)\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
new file mode 100644
index 000000000000..8984e0bb58c7
--- /dev/null
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -0,0 +1,274 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Please run as root
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+exitcode=0
+
+usage() {
+	cat <<EOF
+usage: ${BASH_SOURCE[0]:-$0} [ -h | -t "<categories>"]
+  -t: specify specific categories to tests to run
+  -h: display this message
+
+The default behavior is to run all tests.
+
+Alternatively, specific groups tests can be run by passing a string
+to the -t argument containing one or more of the following categories
+separated by spaces:
+- mmap
+	tests for mmap(2)
+- gup_test
+	tests for gup using gup_test interface
+- userfaultfd
+	tests for  userfaultfd(2)
+- compaction
+	a test for the patch "Allow compaction of unevictable pages"
+- mlock
+	tests for mlock(2)
+- mremap
+	tests for mremap(2)
+- hugevm
+	tests for very large virtual address space
+- vmalloc
+	vmalloc smoke tests
+- hmm
+	hmm smoke tests
+- madv_populate
+	test memadvise(2) MADV_POPULATE_{READ,WRITE} options
+- memfd_secret
+	test memfd_secret(2)
+- process_mrelease
+	test process_mrelease(2)
+- ksm
+	ksm tests that do not require >=2 NUMA nodes
+- ksm_numa
+	ksm tests that require >=2 NUMA nodes
+- pkey
+	memory protection key tests
+- soft_dirty
+	test soft dirty page bit semantics
+- cow
+	test copy-on-write semantics
+example: ./run_vmtests.sh -t "hmm mmap ksm"
+EOF
+	exit 0
+}
+
+
+while getopts "ht:" OPT; do
+	case ${OPT} in
+		"h") usage ;;
+		"t") VM_SELFTEST_ITEMS=${OPTARG} ;;
+	esac
+done
+shift $((OPTIND -1))
+
+# default behavior: run all tests
+VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default}
+
+test_selected() {
+	if [ "$VM_SELFTEST_ITEMS" == "default" ]; then
+		# If no VM_SELFTEST_ITEMS are specified, run all tests
+		return 0
+	fi
+	# If test selected argument is one of the test items
+	if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then
+	        return 0
+	else
+	        return 1
+	fi
+}
+
+# get huge pagesize and freepages from /proc/meminfo
+while read -r name size unit; do
+	if [ "$name" = "HugePages_Free:" ]; then
+		freepgs="$size"
+	fi
+	if [ "$name" = "Hugepagesize:" ]; then
+		hpgsize_KB="$size"
+	fi
+done < /proc/meminfo
+
+# Simple hugetlbfs tests have a hardcoded minimum requirement of
+# huge pages totaling 256MB (262144KB) in size.  The userfaultfd
+# hugetlb test requires a minimum of 2 * nr_cpus huge pages.  Take
+# both of these requirements into account and attempt to increase
+# number of huge pages available.
+nr_cpus=$(nproc)
+hpgsize_MB=$((hpgsize_KB / 1024))
+half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
+needmem_KB=$((half_ufd_size_MB * 2 * 1024))
+
+# set proper nr_hugepages
+if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
+	nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+	needpgs=$((needmem_KB / hpgsize_KB))
+	tries=2
+	while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do
+		lackpgs=$((needpgs - freepgs))
+		echo 3 > /proc/sys/vm/drop_caches
+		if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then
+			echo "Please run this test as root"
+			exit $ksft_skip
+		fi
+		while read -r name size unit; do
+			if [ "$name" = "HugePages_Free:" ]; then
+				freepgs=$size
+			fi
+		done < /proc/meminfo
+		tries=$((tries - 1))
+	done
+	if [ "$freepgs" -lt "$needpgs" ]; then
+		printf "Not enough huge pages available (%d < %d)\n" \
+		       "$freepgs" "$needpgs"
+		exit 1
+	fi
+else
+	echo "no hugetlbfs support in kernel?"
+	exit 1
+fi
+
+# filter 64bit architectures
+ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64"
+if [ -z "$ARCH" ]; then
+	ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/')
+fi
+VADDR64=0
+echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1
+
+# Usage: run_test [test binary] [arbitrary test arguments...]
+run_test() {
+	if test_selected ${CATEGORY}; then
+		local title="running $*"
+		local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
+		printf "%s\n%s\n%s\n" "$sep" "$title" "$sep"
+
+		"$@"
+		local ret=$?
+		if [ $ret -eq 0 ]; then
+			echo "[PASS]"
+		elif [ $ret -eq $ksft_skip ]; then
+			echo "[SKIP]"
+			exitcode=$ksft_skip
+		else
+			echo "[FAIL]"
+			exitcode=1
+		fi
+	fi # test_selected
+}
+
+CATEGORY="hugetlb" run_test ./hugepage-mmap
+
+shmmax=$(cat /proc/sys/kernel/shmmax)
+shmall=$(cat /proc/sys/kernel/shmall)
+echo 268435456 > /proc/sys/kernel/shmmax
+echo 4194304 > /proc/sys/kernel/shmall
+CATEGORY="hugetlb" run_test ./hugepage-shm
+echo "$shmmax" > /proc/sys/kernel/shmmax
+echo "$shmall" > /proc/sys/kernel/shmall
+
+CATEGORY="hugetlb" run_test ./map_hugetlb
+CATEGORY="hugetlb" run_test ./hugepage-mremap
+CATEGORY="hugetlb" run_test ./hugepage-vmemmap
+CATEGORY="hugetlb" run_test ./hugetlb-madvise
+
+if test_selected "hugetlb"; then
+	echo "NOTE: These hugetlb tests provide minimal coverage.  Use"
+	echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
+	echo "      hugetlb regression testing."
+fi
+
+CATEGORY="mmap" run_test ./map_fixed_noreplace
+
+# get_user_pages_fast() benchmark
+CATEGORY="gup_test" run_test ./gup_test -u
+# pin_user_pages_fast() benchmark
+CATEGORY="gup_test" run_test ./gup_test -a
+# Dump pages 0, 19, and 4096, using pin_user_pages:
+CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000
+
+uffd_mods=("" ":dev")
+for mod in "${uffd_mods[@]}"; do
+	CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16
+	# Hugetlb tests require source and destination huge pages. Pass in half
+	# the size ($half_ufd_size_MB), which is used for *each*.
+	CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
+	CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
+	CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16
+done
+
+#cleanup
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
+
+CATEGORY="compaction" run_test ./compaction_test
+
+CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
+
+CATEGORY="mmap" run_test ./map_populate
+
+CATEGORY="mlock" run_test ./mlock-random-test
+
+CATEGORY="mlock" run_test ./mlock2-tests
+
+CATEGORY="process_mrelease" run_test ./mrelease_test
+
+CATEGORY="mremap" run_test ./mremap_test
+
+CATEGORY="hugetlb" run_test ./thuge-gen
+
+if [ $VADDR64 -ne 0 ]; then
+	CATEGORY="hugevm" run_test ./virtual_address_range
+
+	# virtual address 128TB switch test
+	CATEGORY="hugevm" run_test ./va_128TBswitch.sh
+fi # VADDR64
+
+# vmalloc stability smoke test
+CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke
+
+CATEGORY="mremap" run_test ./mremap_dontunmap
+
+CATEGORY="hmm" run_test ./test_hmm.sh smoke
+
+# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
+CATEGORY="madv_populate" run_test ./madv_populate
+
+CATEGORY="memfd_secret" run_test ./memfd_secret
+
+# KSM MADV_MERGEABLE test with 10 identical pages
+CATEGORY="ksm" run_test ./ksm_tests -M -p 10
+# KSM unmerge test
+CATEGORY="ksm" run_test ./ksm_tests -U
+# KSM test with 10 zero pages and use_zero_pages = 0
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0
+# KSM test with 10 zero pages and use_zero_pages = 1
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1
+# KSM test with 2 NUMA nodes and merge_across_nodes = 1
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1
+# KSM test with 2 NUMA nodes and merge_across_nodes = 0
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
+
+CATEGORY="ksm" run_test ./ksm_functional_tests
+
+run_test ./ksm_functional_tests
+
+# protection_keys tests
+if [ -x ./protection_keys_32 ]
+then
+	CATEGORY="pkey" run_test ./protection_keys_32
+fi
+
+if [ -x ./protection_keys_64 ]
+then
+	CATEGORY="pkey" run_test ./protection_keys_64
+fi
+
+CATEGORY="soft_dirty" run_test ./soft-dirty
+
+# COW tests
+CATEGORY="cow" run_test ./cow
+
+exit $exitcode
diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings
new file mode 100644
index 000000000000..9abfc60e9e6f
--- /dev/null
+++ b/tools/testing/selftests/mm/settings
@@ -0,0 +1 @@
+timeout=45
diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
new file mode 100644
index 000000000000..21d8830c5f24
--- /dev/null
+++ b/tools/testing/selftests/mm/soft-dirty.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <malloc.h>
+#include <sys/mman.h>
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define PAGEMAP_FILE_PATH "/proc/self/pagemap"
+#define TEST_ITERATIONS 10000
+
+static void test_simple(int pagemap_fd, int pagesize)
+{
+	int i;
+	char *map;
+
+	map = aligned_alloc(pagesize, pagesize);
+	if (!map)
+		ksft_exit_fail_msg("mmap failed\n");
+
+	clear_softdirty();
+
+	for (i = 0 ; i < TEST_ITERATIONS; i++) {
+		if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
+			ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
+			break;
+		}
+
+		clear_softdirty();
+		// Write something to the page to get the dirty bit enabled on the page
+		map[0]++;
+
+		if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
+			ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
+			break;
+		}
+
+		clear_softdirty();
+	}
+	free(map);
+
+	ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__);
+}
+
+static void test_vma_reuse(int pagemap_fd, int pagesize)
+{
+	char *map, *map2;
+
+	map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed");
+
+	// The kernel always marks new regions as soft dirty
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+			 "Test %s dirty bit of allocated page\n", __func__);
+
+	clear_softdirty();
+	munmap(map, pagesize);
+
+	map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
+	if (map2 == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed");
+
+	// Dirty bit is set for new regions even if they are reused
+	if (map == map2)
+		ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1,
+				 "Test %s dirty bit of reused address page\n", __func__);
+	else
+		ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__);
+
+	munmap(map2, pagesize);
+}
+
+static void test_hugepage(int pagemap_fd, int pagesize)
+{
+	char *map;
+	int i, ret;
+	size_t hpage_len = read_pmd_pagesize();
+
+	map = memalign(hpage_len, hpage_len);
+	if (!map)
+		ksft_exit_fail_msg("memalign failed\n");
+
+	ret = madvise(map, hpage_len, MADV_HUGEPAGE);
+	if (ret)
+		ksft_exit_fail_msg("madvise failed %d\n", ret);
+
+	for (i = 0; i < hpage_len; i++)
+		map[i] = (char)i;
+
+	if (check_huge_anon(map, 1, hpage_len)) {
+		ksft_test_result_pass("Test %s huge page allocation\n", __func__);
+
+		clear_softdirty();
+		for (i = 0 ; i < TEST_ITERATIONS ; i++) {
+			if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
+				ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
+				break;
+			}
+
+			clear_softdirty();
+			// Write something to the page to get the dirty bit enabled on the page
+			map[0]++;
+
+			if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
+				ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
+				break;
+			}
+			clear_softdirty();
+		}
+
+		ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__);
+	} else {
+		// hugepage allocation failed. skip these tests
+		ksft_test_result_skip("Test %s huge page allocation\n", __func__);
+		ksft_test_result_skip("Test %s huge page dirty bit\n", __func__);
+	}
+	free(map);
+}
+
+static void test_mprotect(int pagemap_fd, int pagesize, bool anon)
+{
+	const char *type[] = {"file", "anon"};
+	const char *fname = "./soft-dirty-test-file";
+	int test_fd;
+	char *map;
+
+	if (anon) {
+		map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
+			   MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+		if (!map)
+			ksft_exit_fail_msg("anon mmap failed\n");
+	} else {
+		test_fd = open(fname, O_RDWR | O_CREAT);
+		if (test_fd < 0) {
+			ksft_test_result_skip("Test %s open() file failed\n", __func__);
+			return;
+		}
+		unlink(fname);
+		ftruncate(test_fd, pagesize);
+		map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
+			   MAP_SHARED, test_fd, 0);
+		if (!map)
+			ksft_exit_fail_msg("file mmap failed\n");
+	}
+
+	*map = 1;
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+			 "Test %s-%s dirty bit of new written page\n",
+			 __func__, type[anon]);
+	clear_softdirty();
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+			 "Test %s-%s soft-dirty clear after clear_refs\n",
+			 __func__, type[anon]);
+	mprotect(map, pagesize, PROT_READ);
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+			 "Test %s-%s soft-dirty clear after marking RO\n",
+			 __func__, type[anon]);
+	mprotect(map, pagesize, PROT_READ|PROT_WRITE);
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+			 "Test %s-%s soft-dirty clear after marking RW\n",
+			 __func__, type[anon]);
+	*map = 2;
+	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+			 "Test %s-%s soft-dirty after rewritten\n",
+			 __func__, type[anon]);
+
+	munmap(map, pagesize);
+
+	if (!anon)
+		close(test_fd);
+}
+
+static void test_mprotect_anon(int pagemap_fd, int pagesize)
+{
+	test_mprotect(pagemap_fd, pagesize, true);
+}
+
+static void test_mprotect_file(int pagemap_fd, int pagesize)
+{
+	test_mprotect(pagemap_fd, pagesize, false);
+}
+
+int main(int argc, char **argv)
+{
+	int pagemap_fd;
+	int pagesize;
+
+	ksft_print_header();
+	ksft_set_plan(15);
+
+	pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY);
+	if (pagemap_fd < 0)
+		ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH);
+
+	pagesize = getpagesize();
+
+	test_simple(pagemap_fd, pagesize);
+	test_vma_reuse(pagemap_fd, pagesize);
+	test_hugepage(pagemap_fd, pagesize);
+	test_mprotect_anon(pagemap_fd, pagesize);
+	test_mprotect_file(pagemap_fd, pagesize);
+
+	close(pagemap_fd);
+
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
new file mode 100644
index 000000000000..76e1c36dd9e5
--- /dev/null
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
+ * address range in a process via <debugfs>/split_huge_pages interface.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <malloc.h>
+#include <stdbool.h>
+#include "vm_util.h"
+
+uint64_t pagesize;
+unsigned int pageshift;
+uint64_t pmd_pagesize;
+
+#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
+#define INPUT_MAX 80
+
+#define PID_FMT "%d,0x%lx,0x%lx"
+#define PATH_FMT "%s,0x%lx,0x%lx"
+
+#define PFN_MASK     ((1UL<<55)-1)
+#define KPF_THP      (1UL<<22)
+
+int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
+{
+	uint64_t paddr;
+	uint64_t page_flags;
+
+	if (pagemap_file) {
+		pread(pagemap_file, &paddr, sizeof(paddr),
+			((long)vaddr >> pageshift) * sizeof(paddr));
+
+		if (kpageflags_file) {
+			pread(kpageflags_file, &page_flags, sizeof(page_flags),
+				(paddr & PFN_MASK) * sizeof(page_flags));
+
+			return !!(page_flags & KPF_THP);
+		}
+	}
+	return 0;
+}
+
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+	int fd;
+	ssize_t numwritten;
+
+	fd = open(path, O_WRONLY);
+	if (fd == -1)
+		return 0;
+
+	numwritten = write(fd, buf, buflen - 1);
+	close(fd);
+	if (numwritten < 1)
+		return 0;
+
+	return (unsigned int) numwritten;
+}
+
+static void write_debugfs(const char *fmt, ...)
+{
+	char input[INPUT_MAX];
+	int ret;
+	va_list argp;
+
+	va_start(argp, fmt);
+	ret = vsnprintf(input, INPUT_MAX, fmt, argp);
+	va_end(argp);
+
+	if (ret >= INPUT_MAX) {
+		printf("%s: Debugfs input is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
+		perror(SPLIT_DEBUGFS);
+		exit(EXIT_FAILURE);
+	}
+}
+
+void split_pmd_thp(void)
+{
+	char *one_page;
+	size_t len = 4 * pmd_pagesize;
+	size_t i;
+
+	one_page = memalign(pmd_pagesize, len);
+
+	if (!one_page) {
+		printf("Fail to allocate memory\n");
+		exit(EXIT_FAILURE);
+	}
+
+	madvise(one_page, len, MADV_HUGEPAGE);
+
+	for (i = 0; i < len; i++)
+		one_page[i] = (char)i;
+
+	if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
+		printf("No THP is allocated\n");
+		exit(EXIT_FAILURE);
+	}
+
+	/* split all THPs */
+	write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+		(uint64_t)one_page + len);
+
+	for (i = 0; i < len; i++)
+		if (one_page[i] != (char)i) {
+			printf("%ld byte corrupted\n", i);
+			exit(EXIT_FAILURE);
+		}
+
+
+	if (check_huge_anon(one_page, 0, pmd_pagesize)) {
+		printf("Still AnonHugePages not split\n");
+		exit(EXIT_FAILURE);
+	}
+
+	printf("Split huge pages successful\n");
+	free(one_page);
+}
+
+void split_pte_mapped_thp(void)
+{
+	char *one_page, *pte_mapped, *pte_mapped2;
+	size_t len = 4 * pmd_pagesize;
+	uint64_t thp_size;
+	size_t i;
+	const char *pagemap_template = "/proc/%d/pagemap";
+	const char *kpageflags_proc = "/proc/kpageflags";
+	char pagemap_proc[255];
+	int pagemap_fd;
+	int kpageflags_fd;
+
+	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
+		perror("get pagemap proc error");
+		exit(EXIT_FAILURE);
+	}
+	pagemap_fd = open(pagemap_proc, O_RDONLY);
+
+	if (pagemap_fd == -1) {
+		perror("read pagemap:");
+		exit(EXIT_FAILURE);
+	}
+
+	kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+
+	if (kpageflags_fd == -1) {
+		perror("read kpageflags:");
+		exit(EXIT_FAILURE);
+	}
+
+	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	madvise(one_page, len, MADV_HUGEPAGE);
+
+	for (i = 0; i < len; i++)
+		one_page[i] = (char)i;
+
+	if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
+		printf("No THP is allocated\n");
+		exit(EXIT_FAILURE);
+	}
+
+	/* remap the first pagesize of first THP */
+	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
+
+	/* remap the Nth pagesize of Nth THP */
+	for (i = 1; i < 4; i++) {
+		pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
+				     pagesize, pagesize,
+				     MREMAP_MAYMOVE|MREMAP_FIXED,
+				     pte_mapped + pagesize * i);
+		if (pte_mapped2 == (char *)-1) {
+			perror("mremap failed");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	/* smap does not show THPs after mremap, use kpageflags instead */
+	thp_size = 0;
+	for (i = 0; i < pagesize * 4; i++)
+		if (i % pagesize == 0 &&
+		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+			thp_size++;
+
+	if (thp_size != 4) {
+		printf("Some THPs are missing during mremap\n");
+		exit(EXIT_FAILURE);
+	}
+
+	/* split all remapped THPs */
+	write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
+		      (uint64_t)pte_mapped + pagesize * 4);
+
+	/* smap does not show THPs after mremap, use kpageflags instead */
+	thp_size = 0;
+	for (i = 0; i < pagesize * 4; i++) {
+		if (pte_mapped[i] != (char)i) {
+			printf("%ld byte corrupted\n", i);
+			exit(EXIT_FAILURE);
+		}
+		if (i % pagesize == 0 &&
+		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+			thp_size++;
+	}
+
+	if (thp_size) {
+		printf("Still %ld THPs not split\n", thp_size);
+		exit(EXIT_FAILURE);
+	}
+
+	printf("Split PTE-mapped huge pages successful\n");
+	munmap(one_page, len);
+	close(pagemap_fd);
+	close(kpageflags_fd);
+}
+
+void split_file_backed_thp(void)
+{
+	int status;
+	int fd;
+	ssize_t num_written;
+	char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
+	const char *tmpfs_loc = mkdtemp(tmpfs_template);
+	char testfile[INPUT_MAX];
+	uint64_t pgoff_start = 0, pgoff_end = 1024;
+
+	printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n");
+
+	status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
+
+	if (status) {
+		printf("Unable to create a tmpfs for testing\n");
+		exit(EXIT_FAILURE);
+	}
+
+	status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
+	if (status >= INPUT_MAX) {
+		printf("Fail to create file-backed THP split testing file\n");
+		goto cleanup;
+	}
+
+	fd = open(testfile, O_CREAT|O_WRONLY);
+	if (fd == -1) {
+		perror("Cannot open testing file\n");
+		goto cleanup;
+	}
+
+	/* write something to the file, so a file-backed THP can be allocated */
+	num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1);
+	close(fd);
+
+	if (num_written < 1) {
+		printf("Fail to write data to testing file\n");
+		goto cleanup;
+	}
+
+	/* split the file-backed THP */
+	write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end);
+
+	status = unlink(testfile);
+	if (status)
+		perror("Cannot remove testing file\n");
+
+cleanup:
+	status = umount(tmpfs_loc);
+	if (status) {
+		printf("Unable to umount %s\n", tmpfs_loc);
+		exit(EXIT_FAILURE);
+	}
+	status = rmdir(tmpfs_loc);
+	if (status) {
+		perror("cannot remove tmp dir");
+		exit(EXIT_FAILURE);
+	}
+
+	printf("file-backed THP split test done, please check dmesg for more information\n");
+}
+
+int main(int argc, char **argv)
+{
+	if (geteuid() != 0) {
+		printf("Please run the benchmark as root\n");
+		exit(EXIT_FAILURE);
+	}
+
+	pagesize = getpagesize();
+	pageshift = ffs(pagesize) - 1;
+	pmd_pagesize = read_pmd_pagesize();
+
+	split_pmd_thp();
+	split_pte_mapped_thp();
+	split_file_backed_thp();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/test_hmm.sh b/tools/testing/selftests/mm/test_hmm.sh
new file mode 100644
index 000000000000..46e19b5d648d
--- /dev/null
+++ b/tools/testing/selftests/mm/test_hmm.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver to analyse vmalloc
+# allocator. Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+#     a) analyse performance of vmalloc allocations;
+#     b) stressing and stability check of vmalloc subsystem.
+
+TEST_NAME="test_hmm"
+DRIVER="test_hmm"
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+check_test_requirements()
+{
+	uid=$(id -u)
+	if [ $uid -ne 0 ]; then
+		echo "$0: Must be run as root"
+		exit $ksft_skip
+	fi
+
+	if ! which modprobe > /dev/null 2>&1; then
+		echo "$0: You need modprobe installed"
+		exit $ksft_skip
+	fi
+
+	if ! modinfo $DRIVER > /dev/null 2>&1; then
+		echo "$0: You must have the following enabled in your kernel:"
+		echo "CONFIG_TEST_HMM=m"
+		exit $ksft_skip
+	fi
+}
+
+load_driver()
+{
+	if [ $# -eq 0 ]; then
+		modprobe $DRIVER > /dev/null 2>&1
+	else
+		if [ $# -eq 2 ]; then
+			modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2
+				> /dev/null 2>&1
+		else
+			echo "Missing module parameters. Make sure pass"\
+			"spm_addr_dev0 and spm_addr_dev1"
+			usage
+		fi
+	fi
+}
+
+unload_driver()
+{
+	modprobe -r $DRIVER > /dev/null 2>&1
+}
+
+run_smoke()
+{
+	echo "Running smoke test. Note, this test provides basic coverage."
+
+	load_driver $1 $2
+	$(dirname "${BASH_SOURCE[0]}")/hmm-tests
+	unload_driver
+}
+
+usage()
+{
+	echo -n "Usage: $0"
+	echo
+	echo "Example usage:"
+	echo
+	echo "# Shows help message"
+	echo "./${TEST_NAME}.sh"
+	echo
+	echo "# Smoke testing"
+	echo "./${TEST_NAME}.sh smoke"
+	echo
+	echo "# Smoke testing with SPM enabled"
+	echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
+	echo
+	exit 0
+}
+
+function run_test()
+{
+	if [ $# -eq 0 ]; then
+		usage
+	else
+		if [ "$1" = "smoke" ]; then
+			run_smoke $2 $3
+		else
+			usage
+		fi
+	fi
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh
new file mode 100644
index 000000000000..d73b846736f1
--- /dev/null
+++ b/tools/testing/selftests/mm/test_vmalloc.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver to analyse vmalloc
+# allocator. Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+#     a) analyse performance of vmalloc allocations;
+#     b) stressing and stability check of vmalloc subsystem.
+
+TEST_NAME="vmalloc"
+DRIVER="test_${TEST_NAME}"
+NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+#
+# Static templates for performance, stressing and smoke tests.
+# Also it is possible to pass any supported parameters manualy.
+#
+PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
+SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
+STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
+
+check_test_requirements()
+{
+	uid=$(id -u)
+	if [ $uid -ne 0 ]; then
+		echo "$0: Must be run as root"
+		exit $ksft_skip
+	fi
+
+	if ! which modprobe > /dev/null 2>&1; then
+		echo "$0: You need modprobe installed"
+		exit $ksft_skip
+	fi
+
+	if ! modinfo $DRIVER > /dev/null 2>&1; then
+		echo "$0: You must have the following enabled in your kernel:"
+		echo "CONFIG_TEST_VMALLOC=m"
+		exit $ksft_skip
+	fi
+}
+
+run_perfformance_check()
+{
+	echo "Run performance tests to evaluate how fast vmalloc allocation is."
+	echo "It runs all test cases on one single CPU with sequential order."
+
+	modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+	echo "Done."
+	echo "Ccheck the kernel message buffer to see the summary."
+}
+
+run_stability_check()
+{
+	echo "Run stability tests. In order to stress vmalloc subsystem all"
+	echo "available test cases are run by NUM_CPUS workers simultaneously."
+	echo "It will take time, so be patient."
+
+	modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+run_smoke_check()
+{
+	echo "Run smoke test. Note, this test provides basic coverage."
+	echo "Please check $0 output how it can be used"
+	echo "for deep performance analysis as well as stress testing."
+
+	modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+usage()
+{
+	echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | "
+	echo "manual parameters"
+	echo
+	echo "Valid tests and parameters:"
+	echo
+	modinfo $DRIVER
+	echo
+	echo "Example usage:"
+	echo
+	echo "# Shows help message"
+	echo "./${DRIVER}.sh"
+	echo
+	echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers"
+	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5"
+	echo
+	echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
+	echo "sequential order"
+	echo -n "./${DRIVER}.sh sequential_test_order=1 "
+	echo "run_test_mask=23"
+	echo
+	echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats "
+	echo "20 times"
+	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20"
+	echo
+	echo "# Performance analysis"
+	echo "./${DRIVER}.sh performance"
+	echo
+	echo "# Stress testing"
+	echo "./${DRIVER}.sh stress"
+	echo
+	exit 0
+}
+
+function validate_passed_args()
+{
+	VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+	#
+	# Something has been passed, check it.
+	#
+	for passed_arg in $@; do
+		key=${passed_arg//=*/}
+		val="${passed_arg:$((${#key}+1))}"
+		valid=0
+
+		for valid_arg in $VALID_ARGS; do
+			if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
+				valid=1
+				break
+			fi
+		done
+
+		if [[ $valid -ne 1 ]]; then
+			echo "Error: key or value is not correct: ${key} $val"
+			exit $exitcode
+		fi
+	done
+}
+
+function run_manual_check()
+{
+	#
+	# Validate passed parameters. If there is wrong one,
+	# the script exists and does not execute further.
+	#
+	validate_passed_args $@
+
+	echo "Run the test with following parameters: $@"
+	modprobe $DRIVER $@ > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+function run_test()
+{
+	if [ $# -eq 0 ]; then
+		usage
+	else
+		if [[ "$1" = "performance" ]]; then
+			run_perfformance_check
+		elif [[ "$1" = "stress" ]]; then
+			run_stability_check
+		elif [[ "$1" = "smoke" ]]; then
+			run_smoke_check
+		else
+			run_manual_check $@
+		fi
+	fi
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
new file mode 100644
index 000000000000..361ef7192cc6
--- /dev/null
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test selecting other page sizes for mmap/shmget.
+
+   Before running this huge pages for each huge page size must have been
+   reserved.
+   For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
+   Also shmmax must be increased.
+   And you need to run as root to work around some weird permissions in shm.
+   And nothing using huge pages should run in parallel.
+   When the program aborts you may need to clean up the shm segments with
+   ipcrm -m by hand, like this
+   sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
+   (warning this will remove all if someone else uses them) */
+
+#define _GNU_SOURCE 1
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <glob.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <string.h>
+
+#define err(x) perror(x), exit(1)
+
+#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_SHIFT  26
+#define MAP_HUGE_MASK   0x3f
+#if !defined(MAP_HUGETLB)
+#define MAP_HUGETLB	0x40000
+#endif
+
+#define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
+#define SHM_HUGE_SHIFT  26
+#define SHM_HUGE_MASK   0x3f
+#define SHM_HUGE_2MB    (21 << SHM_HUGE_SHIFT)
+#define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
+
+#define NUM_PAGESIZES   5
+
+#define NUM_PAGES 4
+
+#define Dprintf(fmt...) // printf(fmt)
+
+unsigned long page_sizes[NUM_PAGESIZES];
+int num_page_sizes;
+
+int ilog2(unsigned long v)
+{
+	int l = 0;
+	while ((1UL << l) < v)
+		l++;
+	return l;
+}
+
+void find_pagesizes(void)
+{
+	glob_t g;
+	int i;
+	glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
+	assert(g.gl_pathc <= NUM_PAGESIZES);
+	for (i = 0; i < g.gl_pathc; i++) {
+		sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
+				&page_sizes[i]);
+		page_sizes[i] <<= 10;
+		printf("Found %luMB\n", page_sizes[i] >> 20);
+	}
+	num_page_sizes = g.gl_pathc;
+	globfree(&g);
+}
+
+unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+	free(line);
+	return hps;
+}
+
+void show(unsigned long ps)
+{
+	char buf[100];
+	if (ps == getpagesize())
+		return;
+	printf("%luMB: ", ps >> 20);
+	fflush(stdout);
+	snprintf(buf, sizeof buf,
+		"cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+		ps >> 10);
+	system(buf);
+}
+
+unsigned long read_sysfs(int warn, char *fmt, ...)
+{
+	char *line = NULL;
+	size_t linelen = 0;
+	char buf[100];
+	FILE *f;
+	va_list ap;
+	unsigned long val = 0;
+
+	va_start(ap, fmt);
+	vsnprintf(buf, sizeof buf, fmt, ap);
+	va_end(ap);
+
+	f = fopen(buf, "r");
+	if (!f) {
+		if (warn)
+			printf("missing %s\n", buf);
+		return 0;
+	}
+	if (getline(&line, &linelen, f) > 0) {
+		sscanf(line, "%lu", &val);
+	}
+	fclose(f);
+	free(line);
+	return val;
+}
+
+unsigned long read_free(unsigned long ps)
+{
+	return read_sysfs(ps != getpagesize(),
+			"/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+			ps >> 10);
+}
+
+void test_mmap(unsigned long size, unsigned flags)
+{
+	char *map;
+	unsigned long before, after;
+	int err;
+
+	before = read_free(size);
+	map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
+			MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
+
+	if (map == (char *)-1) err("mmap");
+	memset(map, 0xff, size*NUM_PAGES);
+	after = read_free(size);
+	Dprintf("before %lu after %lu diff %ld size %lu\n",
+		before, after, before - after, size);
+	assert(size == getpagesize() || (before - after) == NUM_PAGES);
+	show(size);
+	err = munmap(map, size);
+	assert(!err);
+}
+
+void test_shmget(unsigned long size, unsigned flags)
+{
+	int id;
+	unsigned long before, after;
+	int err;
+
+	before = read_free(size);
+	id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
+	if (id < 0) err("shmget");
+
+	struct shm_info i;
+	if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
+	Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
+
+
+	Dprintf("id %d\n", id);
+	char *map = shmat(id, NULL, 0600);
+	if (map == (char*)-1) err("shmat");
+
+	shmctl(id, IPC_RMID, NULL);
+
+	memset(map, 0xff, size*NUM_PAGES);
+	after = read_free(size);
+
+	Dprintf("before %lu after %lu diff %ld size %lu\n",
+		before, after, before - after, size);
+	assert(size == getpagesize() || (before - after) == NUM_PAGES);
+	show(size);
+	err = shmdt(map);
+	assert(!err);
+}
+
+void sanity_checks(void)
+{
+	int i;
+	unsigned long largest = getpagesize();
+
+	for (i = 0; i < num_page_sizes; i++) {
+		if (page_sizes[i] > largest)
+			largest = page_sizes[i];
+
+		if (read_free(page_sizes[i]) < NUM_PAGES) {
+			printf("Not enough huge pages for page size %lu MB, need %u\n",
+				page_sizes[i] >> 20,
+				NUM_PAGES);
+			exit(0);
+		}
+	}
+
+	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
+		printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES);
+		exit(0);
+	}
+
+#if defined(__x86_64__)
+	if (largest != 1U<<30) {
+		printf("No GB pages available on x86-64\n"
+		       "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
+		exit(0);
+	}
+#endif
+}
+
+int main(void)
+{
+	int i;
+	unsigned default_hps = default_huge_page_size();
+
+	find_pagesizes();
+
+	sanity_checks();
+
+	for (i = 0; i < num_page_sizes; i++) {
+		unsigned long ps = page_sizes[i];
+		int arg = ilog2(ps) << MAP_HUGE_SHIFT;
+		printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
+		test_mmap(ps, MAP_HUGETLB | arg);
+	}
+	printf("Testing default huge mmap\n");
+	test_mmap(default_hps, SHM_HUGETLB);
+
+	puts("Testing non-huge shmget");
+	test_shmget(getpagesize(), 0);
+
+	for (i = 0; i < num_page_sizes; i++) {
+		unsigned long ps = page_sizes[i];
+		int arg = ilog2(ps) << SHM_HUGE_SHIFT;
+		printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
+		test_shmget(ps, SHM_HUGETLB | arg);
+	}
+	puts("default huge shmget");
+	test_shmget(default_hps, SHM_HUGETLB);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
new file mode 100644
index 000000000000..e3f00adb1b82
--- /dev/null
+++ b/tools/testing/selftests/mm/transhuge-stress.c
@@ -0,0 +1,122 @@
+/*
+ * Stress test for transparent huge pages, memory compaction and migration.
+ *
+ * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
+ *
+ * This is free and unencumbered software released into the public domain.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include "util.h"
+
+int backing_fd = -1;
+int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
+#define PROT_RW (PROT_READ | PROT_WRITE)
+
+int main(int argc, char **argv)
+{
+	size_t ram, len;
+	void *ptr, *p;
+	struct timespec a, b;
+	int i = 0;
+	char *name = NULL;
+	double s;
+	uint8_t *map;
+	size_t map_len;
+	int pagemap_fd;
+
+	ram = sysconf(_SC_PHYS_PAGES);
+	if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
+		ram = SIZE_MAX / 4;
+	else
+		ram *= sysconf(_SC_PAGESIZE);
+	len = ram;
+
+	while (++i < argc) {
+		if (!strcmp(argv[i], "-h"))
+			errx(1, "usage: %s [size in MiB]", argv[0]);
+		else if (!strcmp(argv[i], "-f"))
+			name = argv[++i];
+		else
+			len = atoll(argv[i]) << 20;
+	}
+
+	if (name) {
+		backing_fd = open(name, O_RDWR);
+		if (backing_fd == -1)
+			errx(2, "open %s", name);
+		mmap_flags = MAP_SHARED;
+	}
+
+	warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
+	      " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
+	      ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
+
+	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+	if (pagemap_fd < 0)
+		err(2, "open pagemap");
+
+	len -= len % HPAGE_SIZE;
+	ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0);
+	if (ptr == MAP_FAILED)
+		err(2, "initial mmap");
+	ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
+
+	if (madvise(ptr, len, MADV_HUGEPAGE))
+		err(2, "MADV_HUGEPAGE");
+
+	map_len = ram >> (HPAGE_SHIFT - 1);
+	map = malloc(map_len);
+	if (!map)
+		errx(2, "map malloc");
+
+	while (1) {
+		int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
+
+		memset(map, 0, map_len);
+
+		clock_gettime(CLOCK_MONOTONIC, &a);
+		for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
+			int64_t pfn;
+
+			pfn = allocate_transhuge(p, pagemap_fd);
+
+			if (pfn < 0) {
+				nr_failed++;
+			} else {
+				size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);
+
+				nr_succeed++;
+				if (idx >= map_len) {
+					map = realloc(map, idx + 1);
+					if (!map)
+						errx(2, "map realloc");
+					memset(map + map_len, 0, idx + 1 - map_len);
+					map_len = idx + 1;
+				}
+				if (!map[idx])
+					nr_pages++;
+				map[idx] = 1;
+			}
+
+			/* split transhuge page, keep last page */
+			if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
+				err(2, "MADV_DONTNEED");
+		}
+		clock_gettime(CLOCK_MONOTONIC, &b);
+		s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
+
+		warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
+		      "%4d succeed, %4d failed, %4d different pages",
+		      s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
+		      nr_succeed, nr_failed, nr_pages);
+	}
+}
diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c
new file mode 100644
index 000000000000..7f22844ed704
--- /dev/null
+++ b/tools/testing/selftests/mm/userfaultfd.c
@@ -0,0 +1,1858 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stress userfaultfd syscall.
+ *
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ *    page of the area_dst (while the physical page may still be in
+ *    area_src), and increments a per-page counter in the same page,
+ *    and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ *    thread 1 above. userfaultfd blocking reads or poll() modes are
+ *    exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ *    at maximum bandwidth (if not already transferred by thread
+ *    2). Each cpu thread takes cares of transferring a portion of the
+ *    area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * per-CPU threads 1 by triggering userfaults inside
+ * pthread_mutex_lock will also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <sys/random.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#ifdef __NR_userfaultfd
+
+static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
+
+#define BOUNCE_RANDOM		(1<<0)
+#define BOUNCE_RACINGFAULTS	(1<<1)
+#define BOUNCE_VERIFY		(1<<2)
+#define BOUNCE_POLL		(1<<3)
+static int bounces;
+
+#define TEST_ANON	1
+#define TEST_HUGETLB	2
+#define TEST_SHMEM	3
+static int test_type;
+
+#define UFFD_FLAGS	(O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
+
+#define BASE_PMD_ADDR ((void *)(1UL << 30))
+
+/* test using /dev/userfaultfd, instead of userfaultfd(2) */
+static bool test_dev_userfaultfd;
+
+/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
+#define ALARM_INTERVAL_SECS 10
+static volatile bool test_uffdio_copy_eexist = true;
+static volatile bool test_uffdio_zeropage_eexist = true;
+/* Whether to test uffd write-protection */
+static bool test_uffdio_wp = true;
+/* Whether to test uffd minor faults */
+static bool test_uffdio_minor = false;
+static bool map_shared;
+static int mem_fd;
+static unsigned long long *count_verify;
+static int uffd = -1;
+static int uffd_flags, finished, *pipefd;
+static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
+static char *zeropage;
+pthread_attr_t attr;
+static bool test_collapse;
+
+/* Userfaultfd test statistics */
+struct uffd_stats {
+	int cpu;
+	unsigned long missing_faults;
+	unsigned long wp_faults;
+	unsigned long minor_faults;
+};
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr)					\
+	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr)					\
+	((volatile unsigned long long *) ((unsigned long)		\
+				 ((___area) + (___nr)*page_size +	\
+				  sizeof(pthread_mutex_t) +		\
+				  sizeof(unsigned long long) - 1) &	\
+				 ~(unsigned long)(sizeof(unsigned long long) \
+						  -  1)))
+
+#define swap(a, b) \
+	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1)))
+
+const char *examples =
+    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
+    "./userfaultfd anon 100 99999\n\n"
+    "# Run the same anonymous memory test, but using /dev/userfaultfd:\n"
+    "./userfaultfd anon:dev 100 99999\n\n"
+    "# Run share memory test on 1GiB region with 99 bounces:\n"
+    "./userfaultfd shmem 1000 99\n\n"
+    "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
+    "./userfaultfd hugetlb 256 50\n\n"
+    "# Run the same hugetlb test but using shared file:\n"
+    "./userfaultfd hugetlb_shared 256 50\n\n"
+    "# 10MiB-~6GiB 999 bounces anonymous test, "
+    "continue forever unless an error triggers\n"
+    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
+
+static void usage(void)
+{
+	fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
+		"[hugetlbfs_file]\n\n");
+	fprintf(stderr, "Supported <test type>: anon, hugetlb, "
+		"hugetlb_shared, shmem\n\n");
+	fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. "
+		"Supported mods:\n");
+	fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n");
+	fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n");
+	fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n"
+		"memory\n");
+	fprintf(stderr, "\nExample test mod usage:\n");
+	fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n");
+	fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n");
+
+	fprintf(stderr, "Examples:\n\n");
+	fprintf(stderr, "%s", examples);
+	exit(1);
+}
+
+#define _err(fmt, ...)						\
+	do {							\
+		int ret = errno;				\
+		fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);	\
+		fprintf(stderr, " (errno=%d, line=%d)\n",	\
+			ret, __LINE__);				\
+	} while (0)
+
+#define errexit(exitcode, fmt, ...)		\
+	do {					\
+		_err(fmt, ##__VA_ARGS__);	\
+		exit(exitcode);			\
+	} while (0)
+
+#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
+
+static void uffd_stats_reset(struct uffd_stats *uffd_stats,
+			     unsigned long n_cpus)
+{
+	int i;
+
+	for (i = 0; i < n_cpus; i++) {
+		uffd_stats[i].cpu = i;
+		uffd_stats[i].missing_faults = 0;
+		uffd_stats[i].wp_faults = 0;
+		uffd_stats[i].minor_faults = 0;
+	}
+}
+
+static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
+{
+	int i;
+	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
+
+	for (i = 0; i < n_cpus; i++) {
+		miss_total += stats[i].missing_faults;
+		wp_total += stats[i].wp_faults;
+		minor_total += stats[i].minor_faults;
+	}
+
+	printf("userfaults: ");
+	if (miss_total) {
+		printf("%llu missing (", miss_total);
+		for (i = 0; i < n_cpus; i++)
+			printf("%lu+", stats[i].missing_faults);
+		printf("\b) ");
+	}
+	if (wp_total) {
+		printf("%llu wp (", wp_total);
+		for (i = 0; i < n_cpus; i++)
+			printf("%lu+", stats[i].wp_faults);
+		printf("\b) ");
+	}
+	if (minor_total) {
+		printf("%llu minor (", minor_total);
+		for (i = 0; i < n_cpus; i++)
+			printf("%lu+", stats[i].minor_faults);
+		printf("\b)");
+	}
+	printf("\n");
+}
+
+static void anon_release_pages(char *rel_area)
+{
+	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+		err("madvise(MADV_DONTNEED) failed");
+}
+
+static void anon_allocate_area(void **alloc_area, bool is_src)
+{
+	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+}
+
+static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+}
+
+static void hugetlb_release_pages(char *rel_area)
+{
+	if (!map_shared) {
+		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+			err("madvise(MADV_DONTNEED) failed");
+	} else {
+		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+			err("madvise(MADV_REMOVE) failed");
+	}
+}
+
+static void hugetlb_allocate_area(void **alloc_area, bool is_src)
+{
+	off_t size = nr_pages * page_size;
+	off_t offset = is_src ? 0 : size;
+	void *area_alias = NULL;
+	char **alloc_area_alias;
+
+	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+			   (is_src ? 0 : MAP_NORESERVE),
+			   mem_fd, offset);
+	if (*alloc_area == MAP_FAILED)
+		err("mmap of hugetlbfs file failed");
+
+	if (map_shared) {
+		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
+				  MAP_SHARED, mem_fd, offset);
+		if (area_alias == MAP_FAILED)
+			err("mmap of hugetlb file alias failed");
+	}
+
+	if (is_src) {
+		alloc_area_alias = &area_src_alias;
+	} else {
+		alloc_area_alias = &area_dst_alias;
+	}
+	if (area_alias)
+		*alloc_area_alias = area_alias;
+}
+
+static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+	if (!map_shared)
+		return;
+
+	*start = (unsigned long) area_dst_alias + offset;
+}
+
+static void shmem_release_pages(char *rel_area)
+{
+	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+		err("madvise(MADV_REMOVE) failed");
+}
+
+static void shmem_allocate_area(void **alloc_area, bool is_src)
+{
+	void *area_alias = NULL;
+	size_t bytes = nr_pages * page_size;
+	unsigned long offset = is_src ? 0 : bytes;
+	char *p = NULL, *p_alias = NULL;
+
+	if (test_collapse) {
+		p = BASE_PMD_ADDR;
+		if (!is_src)
+			/* src map + alias + interleaved hpages */
+			p += 2 * (bytes + hpage_size);
+		p_alias = p;
+		p_alias += bytes;
+		p_alias += hpage_size;  /* Prevent src/dst VMA merge */
+	}
+
+	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+			   mem_fd, offset);
+	if (*alloc_area == MAP_FAILED)
+		err("mmap of memfd failed");
+	if (test_collapse && *alloc_area != p)
+		err("mmap of memfd failed at %p", p);
+
+	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+			  mem_fd, offset);
+	if (area_alias == MAP_FAILED)
+		err("mmap of memfd alias failed");
+	if (test_collapse && area_alias != p_alias)
+		err("mmap of anonymous memory failed at %p", p_alias);
+
+	if (is_src)
+		area_src_alias = area_alias;
+	else
+		area_dst_alias = area_alias;
+}
+
+static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+	*start = (unsigned long)area_dst_alias + offset;
+}
+
+static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
+{
+	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size))
+		err("Did not find expected %d number of hugepages",
+		    expect_nr_hpages);
+}
+
+struct uffd_test_ops {
+	void (*allocate_area)(void **alloc_area, bool is_src);
+	void (*release_pages)(char *rel_area);
+	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
+	void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
+};
+
+static struct uffd_test_ops anon_uffd_test_ops = {
+	.allocate_area	= anon_allocate_area,
+	.release_pages	= anon_release_pages,
+	.alias_mapping = noop_alias_mapping,
+	.check_pmd_mapping = NULL,
+};
+
+static struct uffd_test_ops shmem_uffd_test_ops = {
+	.allocate_area	= shmem_allocate_area,
+	.release_pages	= shmem_release_pages,
+	.alias_mapping = shmem_alias_mapping,
+	.check_pmd_mapping = shmem_check_pmd_mapping,
+};
+
+static struct uffd_test_ops hugetlb_uffd_test_ops = {
+	.allocate_area	= hugetlb_allocate_area,
+	.release_pages	= hugetlb_release_pages,
+	.alias_mapping = hugetlb_alias_mapping,
+	.check_pmd_mapping = NULL,
+};
+
+static struct uffd_test_ops *uffd_test_ops;
+
+static inline uint64_t uffd_minor_feature(void)
+{
+	if (test_type == TEST_HUGETLB && map_shared)
+		return UFFD_FEATURE_MINOR_HUGETLBFS;
+	else if (test_type == TEST_SHMEM)
+		return UFFD_FEATURE_MINOR_SHMEM;
+	else
+		return 0;
+}
+
+static uint64_t get_expected_ioctls(uint64_t mode)
+{
+	uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
+
+	if (test_type == TEST_HUGETLB)
+		ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
+
+	if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
+		ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
+
+	if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
+		ioctls &= ~(1 << _UFFDIO_CONTINUE);
+
+	return ioctls;
+}
+
+static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
+{
+	uint64_t expected = get_expected_ioctls(mode);
+	uint64_t actual = ioctls & expected;
+
+	if (actual != expected) {
+		err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
+		    expected, actual);
+	}
+}
+
+static int __userfaultfd_open_dev(void)
+{
+	int fd, _uffd;
+
+	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
+	if (fd < 0)
+		errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
+
+	_uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
+	if (_uffd < 0)
+		errexit(errno == ENOTTY ? KSFT_SKIP : 1,
+			"creating userfaultfd failed");
+	close(fd);
+	return _uffd;
+}
+
+static void userfaultfd_open(uint64_t *features)
+{
+	struct uffdio_api uffdio_api;
+
+	if (test_dev_userfaultfd)
+		uffd = __userfaultfd_open_dev();
+	else {
+		uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
+		if (uffd < 0)
+			errexit(errno == ENOSYS ? KSFT_SKIP : 1,
+				"creating userfaultfd failed");
+	}
+	uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = *features;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
+		err("UFFDIO_API failed.\nPlease make sure to "
+		    "run with either root or ptrace capability.");
+	if (uffdio_api.api != UFFD_API)
+		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
+
+	*features = uffdio_api.features;
+}
+
+static inline void munmap_area(void **area)
+{
+	if (*area)
+		if (munmap(*area, nr_pages * page_size))
+			err("munmap");
+
+	*area = NULL;
+}
+
+static void uffd_test_ctx_clear(void)
+{
+	size_t i;
+
+	if (pipefd) {
+		for (i = 0; i < nr_cpus * 2; ++i) {
+			if (close(pipefd[i]))
+				err("close pipefd");
+		}
+		free(pipefd);
+		pipefd = NULL;
+	}
+
+	if (count_verify) {
+		free(count_verify);
+		count_verify = NULL;
+	}
+
+	if (uffd != -1) {
+		if (close(uffd))
+			err("close uffd");
+		uffd = -1;
+	}
+
+	munmap_area((void **)&area_src);
+	munmap_area((void **)&area_src_alias);
+	munmap_area((void **)&area_dst);
+	munmap_area((void **)&area_dst_alias);
+	munmap_area((void **)&area_remap);
+}
+
+static void uffd_test_ctx_init(uint64_t features)
+{
+	unsigned long nr, cpu;
+
+	uffd_test_ctx_clear();
+
+	uffd_test_ops->allocate_area((void **)&area_src, true);
+	uffd_test_ops->allocate_area((void **)&area_dst, false);
+
+	userfaultfd_open(&features);
+
+	count_verify = malloc(nr_pages * sizeof(unsigned long long));
+	if (!count_verify)
+		err("count_verify");
+
+	for (nr = 0; nr < nr_pages; nr++) {
+		*area_mutex(area_src, nr) =
+			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
+		count_verify[nr] = *area_count(area_src, nr) = 1;
+		/*
+		 * In the transition between 255 to 256, powerpc will
+		 * read out of order in my_bcmp and see both bytes as
+		 * zero, so leave a placeholder below always non-zero
+		 * after the count, to avoid my_bcmp to trigger false
+		 * positives.
+		 */
+		*(area_count(area_src, nr) + 1) = 1;
+	}
+
+	/*
+	 * After initialization of area_src, we must explicitly release pages
+	 * for area_dst to make sure it's fully empty.  Otherwise we could have
+	 * some area_dst pages be errornously initialized with zero pages,
+	 * hence we could hit memory corruption later in the test.
+	 *
+	 * One example is when THP is globally enabled, above allocate_area()
+	 * calls could have the two areas merged into a single VMA (as they
+	 * will have the same VMA flags so they're mergeable).  When we
+	 * initialize the area_src above, it's possible that some part of
+	 * area_dst could have been faulted in via one huge THP that will be
+	 * shared between area_src and area_dst.  It could cause some of the
+	 * area_dst won't be trapped by missing userfaults.
+	 *
+	 * This release_pages() will guarantee even if that happened, we'll
+	 * proactively split the thp and drop any accidentally initialized
+	 * pages within area_dst.
+	 */
+	uffd_test_ops->release_pages(area_dst);
+
+	pipefd = malloc(sizeof(int) * nr_cpus * 2);
+	if (!pipefd)
+		err("pipefd");
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
+			err("pipe");
+}
+
+static int my_bcmp(char *str1, char *str2, size_t n)
+{
+	unsigned long i;
+	for (i = 0; i < n; i++)
+		if (str1[i] != str2[i])
+			return 1;
+	return 0;
+}
+
+static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
+{
+	struct uffdio_writeprotect prms;
+
+	/* Write protection page faults */
+	prms.range.start = start;
+	prms.range.len = len;
+	/* Undo write-protect, do wakeup after that */
+	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
+
+	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
+		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
+}
+
+static void continue_range(int ufd, __u64 start, __u64 len)
+{
+	struct uffdio_continue req;
+	int ret;
+
+	req.range.start = start;
+	req.range.len = len;
+	req.mode = 0;
+
+	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
+		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
+		    (uint64_t)start);
+
+	/*
+	 * Error handling within the kernel for continue is subtly different
+	 * from copy or zeropage, so it may be a source of bugs. Trigger an
+	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
+	 */
+	req.mapped = 0;
+	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
+	if (ret >= 0 || req.mapped != -EEXIST)
+		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
+		    ret, (int64_t) req.mapped);
+}
+
+static void *locking_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	unsigned long page_nr;
+	unsigned long long count;
+
+	if (!(bounces & BOUNCE_RANDOM)) {
+		page_nr = -bounces;
+		if (!(bounces & BOUNCE_RACINGFAULTS))
+			page_nr += cpu * nr_pages_per_cpu;
+	}
+
+	while (!finished) {
+		if (bounces & BOUNCE_RANDOM) {
+			if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
+				err("getrandom failed");
+		} else
+			page_nr += 1;
+		page_nr %= nr_pages;
+		pthread_mutex_lock(area_mutex(area_dst, page_nr));
+		count = *area_count(area_dst, page_nr);
+		if (count != count_verify[page_nr])
+			err("page_nr %lu memory corruption %llu %llu",
+			    page_nr, count, count_verify[page_nr]);
+		count++;
+		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+	}
+
+	return NULL;
+}
+
+static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
+			    unsigned long offset)
+{
+	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
+				     uffdio_copy->len,
+				     offset);
+	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
+		/* real retval in ufdio_copy.copy */
+		if (uffdio_copy->copy != -EEXIST)
+			err("UFFDIO_COPY retry error: %"PRId64,
+			    (int64_t)uffdio_copy->copy);
+	} else {
+		err("UFFDIO_COPY retry unexpected: %"PRId64,
+		    (int64_t)uffdio_copy->copy);
+	}
+}
+
+static void wake_range(int ufd, unsigned long addr, unsigned long len)
+{
+	struct uffdio_range uffdio_wake;
+
+	uffdio_wake.start = addr;
+	uffdio_wake.len = len;
+
+	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
+		fprintf(stderr, "error waking %lu\n",
+			addr), exit(1);
+}
+
+static int __copy_page(int ufd, unsigned long offset, bool retry)
+{
+	struct uffdio_copy uffdio_copy;
+
+	if (offset >= nr_pages * page_size)
+		err("unexpected offset %lu\n", offset);
+	uffdio_copy.dst = (unsigned long) area_dst + offset;
+	uffdio_copy.src = (unsigned long) area_src + offset;
+	uffdio_copy.len = page_size;
+	if (test_uffdio_wp)
+		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
+	else
+		uffdio_copy.mode = 0;
+	uffdio_copy.copy = 0;
+	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
+		/* real retval in ufdio_copy.copy */
+		if (uffdio_copy.copy != -EEXIST)
+			err("UFFDIO_COPY error: %"PRId64,
+			    (int64_t)uffdio_copy.copy);
+		wake_range(ufd, uffdio_copy.dst, page_size);
+	} else if (uffdio_copy.copy != page_size) {
+		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
+	} else {
+		if (test_uffdio_copy_eexist && retry) {
+			test_uffdio_copy_eexist = false;
+			retry_copy_page(ufd, &uffdio_copy, offset);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int copy_page_retry(int ufd, unsigned long offset)
+{
+	return __copy_page(ufd, offset, true);
+}
+
+static int copy_page(int ufd, unsigned long offset)
+{
+	return __copy_page(ufd, offset, false);
+}
+
+static int uffd_read_msg(int ufd, struct uffd_msg *msg)
+{
+	int ret = read(uffd, msg, sizeof(*msg));
+
+	if (ret != sizeof(*msg)) {
+		if (ret < 0) {
+			if (errno == EAGAIN || errno == EINTR)
+				return 1;
+			err("blocking read error");
+		} else {
+			err("short read");
+		}
+	}
+
+	return 0;
+}
+
+static void uffd_handle_page_fault(struct uffd_msg *msg,
+				   struct uffd_stats *stats)
+{
+	unsigned long offset;
+
+	if (msg->event != UFFD_EVENT_PAGEFAULT)
+		err("unexpected msg event %u", msg->event);
+
+	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+		/* Write protect page faults */
+		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
+		stats->wp_faults++;
+	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
+		uint8_t *area;
+		int b;
+
+		/*
+		 * Minor page faults
+		 *
+		 * To prove we can modify the original range for testing
+		 * purposes, we're going to bit flip this range before
+		 * continuing.
+		 *
+		 * Note that this requires all minor page fault tests operate on
+		 * area_dst (non-UFFD-registered) and area_dst_alias
+		 * (UFFD-registered).
+		 */
+
+		area = (uint8_t *)(area_dst +
+				   ((char *)msg->arg.pagefault.address -
+				    area_dst_alias));
+		for (b = 0; b < page_size; ++b)
+			area[b] = ~area[b];
+		continue_range(uffd, msg->arg.pagefault.address, page_size);
+		stats->minor_faults++;
+	} else {
+		/*
+		 * Missing page faults.
+		 *
+		 * Here we force a write check for each of the missing mode
+		 * faults.  It's guaranteed because the only threads that
+		 * will trigger uffd faults are the locking threads, and
+		 * their first instruction to touch the missing page will
+		 * always be pthread_mutex_lock().
+		 *
+		 * Note that here we relied on an NPTL glibc impl detail to
+		 * always read the lock type at the entry of the lock op
+		 * (pthread_mutex_t.__data.__type, offset 0x10) before
+		 * doing any locking operations to guarantee that.  It's
+		 * actually not good to rely on this impl detail because
+		 * logically a pthread-compatible lib can implement the
+		 * locks without types and we can fail when linking with
+		 * them.  However since we used to find bugs with this
+		 * strict check we still keep it around.  Hopefully this
+		 * could be a good hint when it fails again.  If one day
+		 * it'll break on some other impl of glibc we'll revisit.
+		 */
+		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+			err("unexpected write fault");
+
+		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+		offset &= ~(page_size-1);
+
+		if (copy_page(uffd, offset))
+			stats->missing_faults++;
+	}
+}
+
+static void *uffd_poll_thread(void *arg)
+{
+	struct uffd_stats *stats = (struct uffd_stats *)arg;
+	unsigned long cpu = stats->cpu;
+	struct pollfd pollfd[2];
+	struct uffd_msg msg;
+	struct uffdio_register uffd_reg;
+	int ret;
+	char tmp_chr;
+
+	pollfd[0].fd = uffd;
+	pollfd[0].events = POLLIN;
+	pollfd[1].fd = pipefd[cpu*2];
+	pollfd[1].events = POLLIN;
+
+	for (;;) {
+		ret = poll(pollfd, 2, -1);
+		if (ret <= 0) {
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			err("poll error: %d", ret);
+		}
+		if (pollfd[1].revents & POLLIN) {
+			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+				err("read pipefd error");
+			break;
+		}
+		if (!(pollfd[0].revents & POLLIN))
+			err("pollfd[0].revents %d", pollfd[0].revents);
+		if (uffd_read_msg(uffd, &msg))
+			continue;
+		switch (msg.event) {
+		default:
+			err("unexpected msg event %u\n", msg.event);
+			break;
+		case UFFD_EVENT_PAGEFAULT:
+			uffd_handle_page_fault(&msg, stats);
+			break;
+		case UFFD_EVENT_FORK:
+			close(uffd);
+			uffd = msg.arg.fork.ufd;
+			pollfd[0].fd = uffd;
+			break;
+		case UFFD_EVENT_REMOVE:
+			uffd_reg.range.start = msg.arg.remove.start;
+			uffd_reg.range.len = msg.arg.remove.end -
+				msg.arg.remove.start;
+			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
+				err("remove failure");
+			break;
+		case UFFD_EVENT_REMAP:
+			area_remap = area_dst;  /* save for later unmap */
+			area_dst = (char *)(unsigned long)msg.arg.remap.to;
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void *uffd_read_thread(void *arg)
+{
+	struct uffd_stats *stats = (struct uffd_stats *)arg;
+	struct uffd_msg msg;
+
+	pthread_mutex_unlock(&uffd_read_mutex);
+	/* from here cancellation is ok */
+
+	for (;;) {
+		if (uffd_read_msg(uffd, &msg))
+			continue;
+		uffd_handle_page_fault(&msg, stats);
+	}
+
+	return NULL;
+}
+
+static void *background_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	unsigned long page_nr, start_nr, mid_nr, end_nr;
+
+	start_nr = cpu * nr_pages_per_cpu;
+	end_nr = (cpu+1) * nr_pages_per_cpu;
+	mid_nr = (start_nr + end_nr) / 2;
+
+	/* Copy the first half of the pages */
+	for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
+		copy_page_retry(uffd, page_nr * page_size);
+
+	/*
+	 * If we need to test uffd-wp, set it up now.  Then we'll have
+	 * at least the first half of the pages mapped already which
+	 * can be write-protected for testing
+	 */
+	if (test_uffdio_wp)
+		wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
+			nr_pages_per_cpu * page_size, true);
+
+	/*
+	 * Continue the 2nd half of the page copying, handling write
+	 * protection faults if any
+	 */
+	for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
+		copy_page_retry(uffd, page_nr * page_size);
+
+	return NULL;
+}
+
+static int stress(struct uffd_stats *uffd_stats)
+{
+	unsigned long cpu;
+	pthread_t locking_threads[nr_cpus];
+	pthread_t uffd_threads[nr_cpus];
+	pthread_t background_threads[nr_cpus];
+
+	finished = 0;
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (pthread_create(&locking_threads[cpu], &attr,
+				   locking_thread, (void *)cpu))
+			return 1;
+		if (bounces & BOUNCE_POLL) {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_poll_thread,
+					   (void *)&uffd_stats[cpu]))
+				return 1;
+		} else {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_read_thread,
+					   (void *)&uffd_stats[cpu]))
+				return 1;
+			pthread_mutex_lock(&uffd_read_mutex);
+		}
+		if (pthread_create(&background_threads[cpu], &attr,
+				   background_thread, (void *)cpu))
+			return 1;
+	}
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(background_threads[cpu], NULL))
+			return 1;
+
+	/*
+	 * Be strict and immediately zap area_src, the whole area has
+	 * been transferred already by the background treads. The
+	 * area_src could then be faulted in a racy way by still
+	 * running uffdio_threads reading zeropages after we zapped
+	 * area_src (but they're guaranteed to get -EEXIST from
+	 * UFFDIO_COPY without writing zero pages into area_dst
+	 * because the background threads already completed).
+	 */
+	uffd_test_ops->release_pages(area_src);
+
+	finished = 1;
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(locking_threads[cpu], NULL))
+			return 1;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		char c;
+		if (bounces & BOUNCE_POLL) {
+			if (write(pipefd[cpu*2+1], &c, 1) != 1)
+				err("pipefd write error");
+			if (pthread_join(uffd_threads[cpu],
+					 (void *)&uffd_stats[cpu]))
+				return 1;
+		} else {
+			if (pthread_cancel(uffd_threads[cpu]))
+				return 1;
+			if (pthread_join(uffd_threads[cpu], NULL))
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+sigjmp_buf jbuf, *sigbuf;
+
+static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
+{
+	if (sig == SIGBUS) {
+		if (sigbuf)
+			siglongjmp(*sigbuf, 1);
+		abort();
+	}
+}
+
+/*
+ * For non-cooperative userfaultfd test we fork() a process that will
+ * generate pagefaults, will mremap the area monitored by the
+ * userfaultfd and at last this process will release the monitored
+ * area.
+ * For the anonymous and shared memory the area is divided into two
+ * parts, the first part is accessed before mremap, and the second
+ * part is accessed after mremap. Since hugetlbfs does not support
+ * mremap, the entire monitored area is accessed in a single pass for
+ * HUGETLB_TEST.
+ * The release of the pages currently generates event for shmem and
+ * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
+ * for hugetlb.
+ * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
+ * monitored area, generate pagefaults and test that signal is delivered.
+ * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
+ * test robustness use case - we release monitored area, fork a process
+ * that will generate pagefaults and verify signal is generated.
+ * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
+ * feature. Using monitor thread, verify no userfault events are generated.
+ */
+static int faulting_process(int signal_test)
+{
+	unsigned long nr;
+	unsigned long long count;
+	unsigned long split_nr_pages;
+	unsigned long lastnr;
+	struct sigaction act;
+	volatile unsigned long signalled = 0;
+
+	split_nr_pages = (nr_pages + 1) / 2;
+
+	if (signal_test) {
+		sigbuf = &jbuf;
+		memset(&act, 0, sizeof(act));
+		act.sa_sigaction = sighndl;
+		act.sa_flags = SA_SIGINFO;
+		if (sigaction(SIGBUS, &act, 0))
+			err("sigaction");
+		lastnr = (unsigned long)-1;
+	}
+
+	for (nr = 0; nr < split_nr_pages; nr++) {
+		volatile int steps = 1;
+		unsigned long offset = nr * page_size;
+
+		if (signal_test) {
+			if (sigsetjmp(*sigbuf, 1) != 0) {
+				if (steps == 1 && nr == lastnr)
+					err("Signal repeated");
+
+				lastnr = nr;
+				if (signal_test == 1) {
+					if (steps == 1) {
+						/* This is a MISSING request */
+						steps++;
+						if (copy_page(uffd, offset))
+							signalled++;
+					} else {
+						/* This is a WP request */
+						assert(steps == 2);
+						wp_range(uffd,
+							 (__u64)area_dst +
+							 offset,
+							 page_size, false);
+					}
+				} else {
+					signalled++;
+					continue;
+				}
+			}
+		}
+
+		count = *area_count(area_dst, nr);
+		if (count != count_verify[nr])
+			err("nr %lu memory corruption %llu %llu\n",
+			    nr, count, count_verify[nr]);
+		/*
+		 * Trigger write protection if there is by writing
+		 * the same value back.
+		 */
+		*area_count(area_dst, nr) = count;
+	}
+
+	if (signal_test)
+		return signalled != split_nr_pages;
+
+	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
+			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
+	if (area_dst == MAP_FAILED)
+		err("mremap");
+	/* Reset area_src since we just clobbered it */
+	area_src = NULL;
+
+	for (; nr < nr_pages; nr++) {
+		count = *area_count(area_dst, nr);
+		if (count != count_verify[nr]) {
+			err("nr %lu memory corruption %llu %llu\n",
+			    nr, count, count_verify[nr]);
+		}
+		/*
+		 * Trigger write protection if there is by writing
+		 * the same value back.
+		 */
+		*area_count(area_dst, nr) = count;
+	}
+
+	uffd_test_ops->release_pages(area_dst);
+
+	for (nr = 0; nr < nr_pages; nr++)
+		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
+			err("nr %lu is not zero", nr);
+
+	return 0;
+}
+
+static void retry_uffdio_zeropage(int ufd,
+				  struct uffdio_zeropage *uffdio_zeropage,
+				  unsigned long offset)
+{
+	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
+				     uffdio_zeropage->range.len,
+				     offset);
+	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
+		if (uffdio_zeropage->zeropage != -EEXIST)
+			err("UFFDIO_ZEROPAGE error: %"PRId64,
+			    (int64_t)uffdio_zeropage->zeropage);
+	} else {
+		err("UFFDIO_ZEROPAGE error: %"PRId64,
+		    (int64_t)uffdio_zeropage->zeropage);
+	}
+}
+
+static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
+{
+	struct uffdio_zeropage uffdio_zeropage;
+	int ret;
+	bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
+	__s64 res;
+
+	if (offset >= nr_pages * page_size)
+		err("unexpected offset %lu", offset);
+	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
+	uffdio_zeropage.range.len = page_size;
+	uffdio_zeropage.mode = 0;
+	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
+	res = uffdio_zeropage.zeropage;
+	if (ret) {
+		/* real retval in ufdio_zeropage.zeropage */
+		if (has_zeropage)
+			err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
+		else if (res != -EINVAL)
+			err("UFFDIO_ZEROPAGE not -EINVAL");
+	} else if (has_zeropage) {
+		if (res != page_size) {
+			err("UFFDIO_ZEROPAGE unexpected size");
+		} else {
+			if (test_uffdio_zeropage_eexist && retry) {
+				test_uffdio_zeropage_eexist = false;
+				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
+						      offset);
+			}
+			return 1;
+		}
+	} else
+		err("UFFDIO_ZEROPAGE succeeded");
+
+	return 0;
+}
+
+static int uffdio_zeropage(int ufd, unsigned long offset)
+{
+	return __uffdio_zeropage(ufd, offset, false);
+}
+
+/* exercise UFFDIO_ZEROPAGE */
+static int userfaultfd_zeropage_test(void)
+{
+	struct uffdio_register uffdio_register;
+
+	printf("testing UFFDIO_ZEROPAGE: ");
+	fflush(stdout);
+
+	uffd_test_ctx_init(0);
+
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (test_uffdio_wp)
+		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+		err("register failure");
+
+	assert_expected_ioctls_present(
+		uffdio_register.mode, uffdio_register.ioctls);
+
+	if (uffdio_zeropage(uffd, 0))
+		if (my_bcmp(area_dst, zeropage, page_size))
+			err("zeropage is not zero");
+
+	printf("done.\n");
+	return 0;
+}
+
+static int userfaultfd_events_test(void)
+{
+	struct uffdio_register uffdio_register;
+	pthread_t uffd_mon;
+	int err, features;
+	pid_t pid;
+	char c;
+	struct uffd_stats stats = { 0 };
+
+	printf("testing events (fork, remap, remove): ");
+	fflush(stdout);
+
+	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
+		UFFD_FEATURE_EVENT_REMOVE;
+	uffd_test_ctx_init(features);
+
+	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (test_uffdio_wp)
+		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+		err("register failure");
+
+	assert_expected_ioctls_present(
+		uffdio_register.mode, uffdio_register.ioctls);
+
+	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
+		err("uffd_poll_thread create");
+
+	pid = fork();
+	if (pid < 0)
+		err("fork");
+
+	if (!pid)
+		exit(faulting_process(0));
+
+	waitpid(pid, &err, 0);
+	if (err)
+		err("faulting process failed");
+	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+		err("pipe write");
+	if (pthread_join(uffd_mon, NULL))
+		return 1;
+
+	uffd_stats_report(&stats, 1);
+
+	return stats.missing_faults != nr_pages;
+}
+
+static int userfaultfd_sig_test(void)
+{
+	struct uffdio_register uffdio_register;
+	unsigned long userfaults;
+	pthread_t uffd_mon;
+	int err, features;
+	pid_t pid;
+	char c;
+	struct uffd_stats stats = { 0 };
+
+	printf("testing signal delivery: ");
+	fflush(stdout);
+
+	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
+	uffd_test_ctx_init(features);
+
+	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (test_uffdio_wp)
+		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+		err("register failure");
+
+	assert_expected_ioctls_present(
+		uffdio_register.mode, uffdio_register.ioctls);
+
+	if (faulting_process(1))
+		err("faulting process failed");
+
+	uffd_test_ops->release_pages(area_dst);
+
+	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
+		err("uffd_poll_thread create");
+
+	pid = fork();
+	if (pid < 0)
+		err("fork");
+
+	if (!pid)
+		exit(faulting_process(2));
+
+	waitpid(pid, &err, 0);
+	if (err)
+		err("faulting process failed");
+	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+		err("pipe write");
+	if (pthread_join(uffd_mon, (void **)&userfaults))
+		return 1;
+
+	printf("done.\n");
+	if (userfaults)
+		err("Signal test failed, userfaults: %ld", userfaults);
+
+	return userfaults != 0;
+}
+
+void check_memory_contents(char *p)
+{
+	unsigned long i;
+	uint8_t expected_byte;
+	void *expected_page;
+
+	if (posix_memalign(&expected_page, page_size, page_size))
+		err("out of memory");
+
+	for (i = 0; i < nr_pages; ++i) {
+		expected_byte = ~((uint8_t)(i % ((uint8_t)-1)));
+		memset(expected_page, expected_byte, page_size);
+		if (my_bcmp(expected_page, p + (i * page_size), page_size))
+			err("unexpected page contents after minor fault");
+	}
+
+	free(expected_page);
+}
+
+static int userfaultfd_minor_test(void)
+{
+	unsigned long p;
+	struct uffdio_register uffdio_register;
+	pthread_t uffd_mon;
+	char c;
+	struct uffd_stats stats = { 0 };
+
+	if (!test_uffdio_minor)
+		return 0;
+
+	printf("testing minor faults: ");
+	fflush(stdout);
+
+	uffd_test_ctx_init(uffd_minor_feature());
+
+	uffdio_register.range.start = (unsigned long)area_dst_alias;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+		err("register failure");
+
+	assert_expected_ioctls_present(
+		uffdio_register.mode, uffdio_register.ioctls);
+
+	/*
+	 * After registering with UFFD, populate the non-UFFD-registered side of
+	 * the shared mapping. This should *not* trigger any UFFD minor faults.
+	 */
+	for (p = 0; p < nr_pages; ++p) {
+		memset(area_dst + (p * page_size), p % ((uint8_t)-1),
+		       page_size);
+	}
+
+	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
+		err("uffd_poll_thread create");
+
+	/*
+	 * Read each of the pages back using the UFFD-registered mapping. We
+	 * expect that the first time we touch a page, it will result in a minor
+	 * fault. uffd_poll_thread will resolve the fault by bit-flipping the
+	 * page's contents, and then issuing a CONTINUE ioctl.
+	 */
+	check_memory_contents(area_dst_alias);
+
+	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+		err("pipe write");
+	if (pthread_join(uffd_mon, NULL))
+		return 1;
+
+	uffd_stats_report(&stats, 1);
+
+	if (test_collapse) {
+		printf("testing collapse of uffd memory into PMD-mapped THPs:");
+		if (madvise(area_dst_alias, nr_pages * page_size,
+			    MADV_COLLAPSE))
+			err("madvise(MADV_COLLAPSE)");
+
+		uffd_test_ops->check_pmd_mapping(area_dst,
+						 nr_pages * page_size /
+						 hpage_size);
+		/*
+		 * This won't cause uffd-fault - it purely just makes sure there
+		 * was no corruption.
+		 */
+		check_memory_contents(area_dst_alias);
+		printf(" done.\n");
+	}
+
+	return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
+}
+
+#define BIT_ULL(nr)                   (1ULL << (nr))
+#define PM_SOFT_DIRTY                 BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE             BIT_ULL(56)
+#define PM_UFFD_WP                    BIT_ULL(57)
+#define PM_FILE                       BIT_ULL(61)
+#define PM_SWAP                       BIT_ULL(62)
+#define PM_PRESENT                    BIT_ULL(63)
+
+static int pagemap_open(void)
+{
+	int fd = open("/proc/self/pagemap", O_RDONLY);
+
+	if (fd < 0)
+		err("open pagemap");
+
+	return fd;
+}
+
+static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
+{
+	uint64_t value;
+	int ret;
+
+	ret = pread(fd, &value, sizeof(uint64_t),
+		    ((uint64_t)vaddr >> 12) * sizeof(uint64_t));
+	if (ret != sizeof(uint64_t))
+		err("pread() on pagemap failed");
+
+	return value;
+}
+
+/* This macro let __LINE__ works in err() */
+#define  pagemap_check_wp(value, wp) do {				\
+		if (!!(value & PM_UFFD_WP) != wp)			\
+			err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
+	} while (0)
+
+static int pagemap_test_fork(bool present)
+{
+	pid_t child = fork();
+	uint64_t value;
+	int fd, result;
+
+	if (!child) {
+		/* Open the pagemap fd of the child itself */
+		fd = pagemap_open();
+		value = pagemap_read_vaddr(fd, area_dst);
+		/*
+		 * After fork() uffd-wp bit should be gone as long as we're
+		 * without UFFD_FEATURE_EVENT_FORK
+		 */
+		pagemap_check_wp(value, false);
+		/* Succeed */
+		exit(0);
+	}
+	waitpid(child, &result, 0);
+	return result;
+}
+
+static void userfaultfd_pagemap_test(unsigned int test_pgsize)
+{
+	struct uffdio_register uffdio_register;
+	int pagemap_fd;
+	uint64_t value;
+
+	/* Pagemap tests uffd-wp only */
+	if (!test_uffdio_wp)
+		return;
+
+	/* Not enough memory to test this page size */
+	if (test_pgsize > nr_pages * page_size)
+		return;
+
+	printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
+	/* Flush so it doesn't flush twice in parent/child later */
+	fflush(stdout);
+
+	uffd_test_ctx_init(0);
+
+	if (test_pgsize > page_size) {
+		/* This is a thp test */
+		if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
+			err("madvise(MADV_HUGEPAGE) failed");
+	} else if (test_pgsize == page_size) {
+		/* This is normal page test; force no thp */
+		if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
+			err("madvise(MADV_NOHUGEPAGE) failed");
+	}
+
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+		err("register failed");
+
+	pagemap_fd = pagemap_open();
+
+	/* Touch the page */
+	*area_dst = 1;
+	wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
+	value = pagemap_read_vaddr(pagemap_fd, area_dst);
+	pagemap_check_wp(value, true);
+	/* Make sure uffd-wp bit dropped when fork */
+	if (pagemap_test_fork(true))
+		err("Detected stall uffd-wp bit in child");
+
+	/* Exclusive required or PAGEOUT won't work */
+	if (!(value & PM_MMAP_EXCLUSIVE))
+		err("multiple mapping detected: 0x%"PRIx64, value);
+
+	if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
+		err("madvise(MADV_PAGEOUT) failed");
+
+	/* Uffd-wp should persist even swapped out */
+	value = pagemap_read_vaddr(pagemap_fd, area_dst);
+	pagemap_check_wp(value, true);
+	/* Make sure uffd-wp bit dropped when fork */
+	if (pagemap_test_fork(false))
+		err("Detected stall uffd-wp bit in child");
+
+	/* Unprotect; this tests swap pte modifications */
+	wp_range(uffd, (uint64_t)area_dst, page_size, false);
+	value = pagemap_read_vaddr(pagemap_fd, area_dst);
+	pagemap_check_wp(value, false);
+
+	/* Fault in the page from disk */
+	*area_dst = 2;
+	value = pagemap_read_vaddr(pagemap_fd, area_dst);
+	pagemap_check_wp(value, false);
+
+	close(pagemap_fd);
+	printf("done\n");
+}
+
+static int userfaultfd_stress(void)
+{
+	void *area;
+	unsigned long nr;
+	struct uffdio_register uffdio_register;
+	struct uffd_stats uffd_stats[nr_cpus];
+
+	uffd_test_ctx_init(0);
+
+	if (posix_memalign(&area, page_size, page_size))
+		err("out of memory");
+	zeropage = area;
+	bzero(zeropage, page_size);
+
+	pthread_mutex_lock(&uffd_read_mutex);
+
+	pthread_attr_init(&attr);
+	pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+	while (bounces--) {
+		printf("bounces: %d, mode:", bounces);
+		if (bounces & BOUNCE_RANDOM)
+			printf(" rnd");
+		if (bounces & BOUNCE_RACINGFAULTS)
+			printf(" racing");
+		if (bounces & BOUNCE_VERIFY)
+			printf(" ver");
+		if (bounces & BOUNCE_POLL)
+			printf(" poll");
+		else
+			printf(" read");
+		printf(", ");
+		fflush(stdout);
+
+		if (bounces & BOUNCE_POLL)
+			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+		else
+			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+		/* register */
+		uffdio_register.range.start = (unsigned long) area_dst;
+		uffdio_register.range.len = nr_pages * page_size;
+		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+		if (test_uffdio_wp)
+			uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
+		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+			err("register failure");
+		assert_expected_ioctls_present(
+			uffdio_register.mode, uffdio_register.ioctls);
+
+		if (area_dst_alias) {
+			uffdio_register.range.start = (unsigned long)
+				area_dst_alias;
+			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+				err("register failure alias");
+		}
+
+		/*
+		 * The madvise done previously isn't enough: some
+		 * uffd_thread could have read userfaults (one of
+		 * those already resolved by the background thread)
+		 * and it may be in the process of calling
+		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+		 * area_src and it would map a zero page in it (of
+		 * course such a UFFDIO_COPY is perfectly safe as it'd
+		 * return -EEXIST). The problem comes at the next
+		 * bounce though: that racing UFFDIO_COPY would
+		 * generate zeropages in the area_src, so invalidating
+		 * the previous MADV_DONTNEED. Without this additional
+		 * MADV_DONTNEED those zeropages leftovers in the
+		 * area_src would lead to -EEXIST failure during the
+		 * next bounce, effectively leaving a zeropage in the
+		 * area_dst.
+		 *
+		 * Try to comment this out madvise to see the memory
+		 * corruption being caught pretty quick.
+		 *
+		 * khugepaged is also inhibited to collapse THP after
+		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
+		 * required to MADV_DONTNEED here.
+		 */
+		uffd_test_ops->release_pages(area_dst);
+
+		uffd_stats_reset(uffd_stats, nr_cpus);
+
+		/* bounce pass */
+		if (stress(uffd_stats))
+			return 1;
+
+		/* Clear all the write protections if there is any */
+		if (test_uffdio_wp)
+			wp_range(uffd, (unsigned long)area_dst,
+				 nr_pages * page_size, false);
+
+		/* unregister */
+		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
+			err("unregister failure");
+		if (area_dst_alias) {
+			uffdio_register.range.start = (unsigned long) area_dst;
+			if (ioctl(uffd, UFFDIO_UNREGISTER,
+				  &uffdio_register.range))
+				err("unregister failure alias");
+		}
+
+		/* verification */
+		if (bounces & BOUNCE_VERIFY)
+			for (nr = 0; nr < nr_pages; nr++)
+				if (*area_count(area_dst, nr) != count_verify[nr])
+					err("error area_count %llu %llu %lu\n",
+					    *area_count(area_src, nr),
+					    count_verify[nr], nr);
+
+		/* prepare next bounce */
+		swap(area_src, area_dst);
+
+		swap(area_src_alias, area_dst_alias);
+
+		uffd_stats_report(uffd_stats, nr_cpus);
+	}
+
+	if (test_type == TEST_ANON) {
+		/*
+		 * shmem/hugetlb won't be able to run since they have different
+		 * behavior on fork() (file-backed memory normally drops ptes
+		 * directly when fork), meanwhile the pagemap test will verify
+		 * pgtable entry of fork()ed child.
+		 */
+		userfaultfd_pagemap_test(page_size);
+		/*
+		 * Hard-code for x86_64 for now for 2M THP, as x86_64 is
+		 * currently the only one that supports uffd-wp
+		 */
+		userfaultfd_pagemap_test(page_size * 512);
+	}
+
+	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
+		|| userfaultfd_events_test() || userfaultfd_minor_test();
+}
+
+/*
+ * Copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(f);
+	return hps;
+}
+
+static void set_test_type(const char *type)
+{
+	if (!strcmp(type, "anon")) {
+		test_type = TEST_ANON;
+		uffd_test_ops = &anon_uffd_test_ops;
+	} else if (!strcmp(type, "hugetlb")) {
+		test_type = TEST_HUGETLB;
+		uffd_test_ops = &hugetlb_uffd_test_ops;
+	} else if (!strcmp(type, "hugetlb_shared")) {
+		map_shared = true;
+		test_type = TEST_HUGETLB;
+		uffd_test_ops = &hugetlb_uffd_test_ops;
+		/* Minor faults require shared hugetlb; only enable here. */
+		test_uffdio_minor = true;
+	} else if (!strcmp(type, "shmem")) {
+		map_shared = true;
+		test_type = TEST_SHMEM;
+		uffd_test_ops = &shmem_uffd_test_ops;
+		test_uffdio_minor = true;
+	}
+}
+
+static void parse_test_type_arg(const char *raw_type)
+{
+	char *buf = strdup(raw_type);
+	uint64_t features = UFFD_API_FEATURES;
+
+	while (buf) {
+		const char *token = strsep(&buf, ":");
+
+		if (!test_type)
+			set_test_type(token);
+		else if (!strcmp(token, "dev"))
+			test_dev_userfaultfd = true;
+		else if (!strcmp(token, "syscall"))
+			test_dev_userfaultfd = false;
+		else if (!strcmp(token, "collapse"))
+			test_collapse = true;
+		else
+			err("unrecognized test mod '%s'", token);
+	}
+
+	if (!test_type)
+		err("failed to parse test type argument: '%s'", raw_type);
+
+	if (test_collapse && test_type != TEST_SHMEM)
+		err("Unsupported test: %s", raw_type);
+
+	if (test_type == TEST_HUGETLB)
+		page_size = hpage_size;
+	else
+		page_size = sysconf(_SC_PAGE_SIZE);
+
+	if (!page_size)
+		err("Unable to determine page size");
+	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
+	    > page_size)
+		err("Impossible to run this test");
+
+	/*
+	 * Whether we can test certain features depends not just on test type,
+	 * but also on whether or not this particular kernel supports the
+	 * feature.
+	 */
+
+	userfaultfd_open(&features);
+
+	test_uffdio_wp = test_uffdio_wp &&
+		(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
+	test_uffdio_minor = test_uffdio_minor &&
+		(features & uffd_minor_feature());
+
+	close(uffd);
+	uffd = -1;
+}
+
+static void sigalrm(int sig)
+{
+	if (sig != SIGALRM)
+		abort();
+	test_uffdio_copy_eexist = true;
+	test_uffdio_zeropage_eexist = true;
+	alarm(ALARM_INTERVAL_SECS);
+}
+
+int main(int argc, char **argv)
+{
+	size_t bytes;
+
+	if (argc < 4)
+		usage();
+
+	if (signal(SIGALRM, sigalrm) == SIG_ERR)
+		err("failed to arm SIGALRM");
+	alarm(ALARM_INTERVAL_SECS);
+
+	hpage_size = default_huge_page_size();
+	parse_test_type_arg(argv[1]);
+	bytes = atol(argv[2]) * 1024 * 1024;
+
+	if (test_collapse && bytes & (hpage_size - 1))
+		err("MiB must be multiple of %lu if :collapse mod set",
+		    hpage_size >> 20);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+	if (test_collapse) {
+		/* nr_cpus must divide (bytes / page_size), otherwise,
+		 * area allocations of (nr_pages * paze_size) won't be a
+		 * multiple of hpage_size, even if bytes is a multiple of
+		 * hpage_size.
+		 *
+		 * This means that nr_cpus must divide (N * (2 << (H-P))
+		 * where:
+		 *	bytes = hpage_size * N
+		 *	hpage_size = 2 << H
+		 *	page_size = 2 << P
+		 *
+		 * And we want to chose nr_cpus to be the largest value
+		 * satisfying this constraint, not larger than the number
+		 * of online CPUs. Unfortunately, prime factorization of
+		 * N and nr_cpus may be arbitrary, so have to search for it.
+		 * Instead, just use the highest power of 2 dividing both
+		 * nr_cpus and (bytes / page_size).
+		 */
+		int x = factor_of_2(nr_cpus);
+		int y = factor_of_2(bytes / page_size);
+
+		nr_cpus = x < y ? x : y;
+	}
+	nr_pages_per_cpu = bytes / page_size / nr_cpus;
+	if (!nr_pages_per_cpu) {
+		_err("invalid MiB");
+		usage();
+	}
+
+	bounces = atoi(argv[3]);
+	if (bounces <= 0) {
+		_err("invalid bounces");
+		usage();
+	}
+	nr_pages = nr_pages_per_cpu * nr_cpus;
+
+	if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) {
+		unsigned int memfd_flags = 0;
+
+		if (test_type == TEST_HUGETLB)
+			memfd_flags = MFD_HUGETLB;
+		mem_fd = memfd_create(argv[0], memfd_flags);
+		if (mem_fd < 0)
+			err("memfd_create");
+		if (ftruncate(mem_fd, nr_pages * page_size * 2))
+			err("ftruncate");
+		if (fallocate(mem_fd,
+			      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
+			      nr_pages * page_size * 2))
+			err("fallocate");
+	}
+	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+	       nr_pages, nr_pages_per_cpu);
+	return userfaultfd_stress();
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
+	return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/mm/util.h b/tools/testing/selftests/mm/util.h
new file mode 100644
index 000000000000..b27d26199334
--- /dev/null
+++ b/tools/testing/selftests/mm/util.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KSELFTEST_VM_UTIL_H
+#define __KSELFTEST_VM_UTIL_H
+
+#include <stdint.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <string.h> /* ffsl() */
+#include <unistd.h> /* _SC_PAGESIZE */
+
+static unsigned int __page_size;
+static unsigned int __page_shift;
+
+static inline unsigned int page_size(void)
+{
+	if (!__page_size)
+		__page_size = sysconf(_SC_PAGESIZE);
+	return __page_size;
+}
+
+static inline unsigned int page_shift(void)
+{
+	if (!__page_shift)
+		__page_shift = (ffsl(page_size()) - 1);
+	return __page_shift;
+}
+
+#define PAGE_SHIFT	(page_shift())
+#define PAGE_SIZE	(page_size())
+/*
+ * On ppc64 this will only work with radix 2M hugepage size
+ */
+#define HPAGE_SHIFT 21
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent)	(((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent)	((ent) & ((1ull << 55) - 1))
+
+
+static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd)
+{
+	uint64_t ent[2];
+
+	/* drop pmd */
+	if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+		 MAP_FIXED | MAP_ANONYMOUS |
+		 MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
+		errx(2, "mmap transhuge");
+
+	if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
+		err(2, "MADV_HUGEPAGE");
+
+	/* allocate transparent huge page */
+	*(volatile void **)ptr = ptr;
+
+	if (pread(pagemap_fd, ent, sizeof(ent),
+		  (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
+		err(2, "read pagemap");
+
+	if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
+	    PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
+	    !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
+		return PAGEMAP_PFN(ent[0]);
+
+	return -1;
+}
+
+#endif
diff --git a/tools/testing/selftests/mm/va_128TBswitch.c b/tools/testing/selftests/mm/va_128TBswitch.c
new file mode 100644
index 000000000000..1d2068989883
--- /dev/null
+++ b/tools/testing/selftests/mm/va_128TBswitch.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+ * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ */
+
+#include <stdio.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+#ifdef __powerpc64__
+#define PAGE_SIZE	(64 << 10)
+/*
+ * This will work with 16M and 2M hugepage size
+ */
+#define HUGETLB_SIZE	(16 << 20)
+#else
+#define PAGE_SIZE	(4 << 10)
+#define HUGETLB_SIZE	(2 << 20)
+#endif
+
+/*
+ * >= 128TB is the hint addr value we used to select
+ * large address space.
+ */
+#define ADDR_SWITCH_HINT (1UL << 47)
+#define LOW_ADDR	((void *) (1UL << 30))
+#define HIGH_ADDR	((void *) (1UL << 48))
+
+struct testcase {
+	void *addr;
+	unsigned long size;
+	unsigned long flags;
+	const char *msg;
+	unsigned int low_addr_required:1;
+	unsigned int keep_mapped:1;
+};
+
+static struct testcase testcases[] = {
+	{
+		/*
+		 * If stack is moved, we could possibly allocate
+		 * this at the requested address.
+		 */
+		.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+		.size = PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+		.low_addr_required = 1,
+	},
+	{
+		/*
+		 * We should never allocate at the requested address or above it
+		 * The len cross the 128TB boundary. Without MAP_FIXED
+		 * we will always search in the lower address space.
+		 */
+		.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))",
+		.low_addr_required = 1,
+	},
+	{
+		/*
+		 * Exact mapping at 128TB, the area is free we should get that
+		 * even without MAP_FIXED.
+		 */
+		.addr = ((void *)(ADDR_SWITCH_HINT)),
+		.size = PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+	},
+	{
+		.addr = NULL,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(NULL)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = LOW_ADDR,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(LOW_ADDR)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(HIGH_ADDR)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(HIGH_ADDR) again",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(HIGH_ADDR, MAP_FIXED)",
+	},
+	{
+		.addr = (void *) -1,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(-1)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *) -1,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(-1) again",
+	},
+	{
+		.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+		.size = PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)",
+		.low_addr_required = 1,
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)",
+		.low_addr_required = 1,
+		.keep_mapped = 1,
+	},
+	{
+		.addr = ((void *)(ADDR_SWITCH_HINT)),
+		.size = PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+	},
+};
+
+static struct testcase hugetlb_testcases[] = {
+	{
+		.addr = NULL,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(NULL, MAP_HUGETLB)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = LOW_ADDR,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
+	},
+	{
+		.addr = (void *) -1,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(-1, MAP_HUGETLB)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *) -1,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(-1, MAP_HUGETLB) again",
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+		.size = 2 * HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)",
+		.low_addr_required = 1,
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT),
+		.size = 2 * HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)",
+	},
+};
+
+static int run_test(struct testcase *test, int count)
+{
+	void *p;
+	int i, ret = KSFT_PASS;
+
+	for (i = 0; i < count; i++) {
+		struct testcase *t = test + i;
+
+		p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0);
+
+		printf("%s: %p - ", t->msg, p);
+
+		if (p == MAP_FAILED) {
+			printf("FAILED\n");
+			ret = KSFT_FAIL;
+			continue;
+		}
+
+		if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) {
+			printf("FAILED\n");
+			ret = KSFT_FAIL;
+		} else {
+			/*
+			 * Do a dereference of the address returned so that we catch
+			 * bugs in page fault handling
+			 */
+			memset(p, 0, t->size);
+			printf("OK\n");
+		}
+		if (!t->keep_mapped)
+			munmap(p, t->size);
+	}
+
+	return ret;
+}
+
+static int supported_arch(void)
+{
+#if defined(__powerpc64__)
+	return 1;
+#elif defined(__x86_64__)
+	return 1;
+#else
+	return 0;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+
+	if (!supported_arch())
+		return KSFT_SKIP;
+
+	ret = run_test(testcases, ARRAY_SIZE(testcases));
+	if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
+		ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases));
+	return ret;
+}
diff --git a/tools/testing/selftests/mm/va_128TBswitch.sh b/tools/testing/selftests/mm/va_128TBswitch.sh
new file mode 100644
index 000000000000..41580751dc51
--- /dev/null
+++ b/tools/testing/selftests/mm/va_128TBswitch.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2022 Adam Sindelar (Meta) <adam@wowsignal.io>
+#
+# This is a test for mmap behavior with 5-level paging. This script wraps the
+# real test to check that the kernel is configured to support at least 5
+# pagetable levels.
+
+# 1 means the test failed
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+fail()
+{
+	echo "$1"
+	exit $exitcode
+}
+
+check_supported_x86_64()
+{
+	local config="/proc/config.gz"
+	[[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
+	[[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot"
+
+	# gzip -dcfq automatically handles both compressed and plaintext input.
+	# See man 1 gzip under '-f'.
+	local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
+
+	if [[ "${pg_table_levels}" -lt 5 ]]; then
+		echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
+		exit $ksft_skip
+	fi
+}
+
+check_test_requirements()
+{
+	# The test supports x86_64 and powerpc64. We currently have no useful
+	# eligibility check for powerpc64, and the test itself will reject other
+	# architectures.
+	case `uname -m` in
+		"x86_64")
+			check_supported_x86_64
+		;;
+		*)
+			return 0
+		;;
+	esac
+}
+
+check_test_requirements
+./va_128TBswitch
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
new file mode 100644
index 000000000000..c0592646ed93
--- /dev/null
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017, Anshuman Khandual, IBM Corp.
+ *
+ * Works on architectures which support 128TB virtual
+ * address range and beyond.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+/*
+ * Maximum address range mapped with a single mmap()
+ * call is little bit more than 16GB. Hence 16GB is
+ * chosen as the single chunk size for address space
+ * mapping.
+ */
+#define MAP_CHUNK_SIZE   17179869184UL /* 16GB */
+
+/*
+ * Address space till 128TB is mapped without any hint
+ * and is enabled by default. Address space beyond 128TB
+ * till 512TB is obtained by passing hint address as the
+ * first argument into mmap() system call.
+ *
+ * The process heap address space is divided into two
+ * different areas one below 128TB and one above 128TB
+ * till it reaches 512TB. One with size 128TB and the
+ * other being 384TB.
+ *
+ * On Arm64 the address space is 256TB and no high mappings
+ * are supported so far.
+ */
+
+#define NR_CHUNKS_128TB   8192UL /* Number of 16GB chunks for 128TB */
+#define NR_CHUNKS_256TB   (NR_CHUNKS_128TB * 2UL)
+#define NR_CHUNKS_384TB   (NR_CHUNKS_128TB * 3UL)
+
+#define ADDR_MARK_128TB  (1UL << 47) /* First address beyond 128TB */
+#define ADDR_MARK_256TB  (1UL << 48) /* First address beyond 256TB */
+
+#ifdef __aarch64__
+#define HIGH_ADDR_MARK  ADDR_MARK_256TB
+#define HIGH_ADDR_SHIFT 49
+#define NR_CHUNKS_LOW   NR_CHUNKS_256TB
+#define NR_CHUNKS_HIGH  0
+#else
+#define HIGH_ADDR_MARK  ADDR_MARK_128TB
+#define HIGH_ADDR_SHIFT 48
+#define NR_CHUNKS_LOW   NR_CHUNKS_128TB
+#define NR_CHUNKS_HIGH  NR_CHUNKS_384TB
+#endif
+
+static char *hind_addr(void)
+{
+	int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
+
+	return (char *) (1UL << bits);
+}
+
+static int validate_addr(char *ptr, int high_addr)
+{
+	unsigned long addr = (unsigned long) ptr;
+
+	if (high_addr) {
+		if (addr < HIGH_ADDR_MARK) {
+			printf("Bad address %lx\n", addr);
+			return 1;
+		}
+		return 0;
+	}
+
+	if (addr > HIGH_ADDR_MARK) {
+		printf("Bad address %lx\n", addr);
+		return 1;
+	}
+	return 0;
+}
+
+static int validate_lower_address_hint(void)
+{
+	char *ptr;
+
+	ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ |
+			PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	if (ptr == MAP_FAILED)
+		return 0;
+
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	char *ptr[NR_CHUNKS_LOW];
+	char *hptr[NR_CHUNKS_HIGH];
+	char *hint;
+	unsigned long i, lchunks, hchunks;
+
+	for (i = 0; i < NR_CHUNKS_LOW; i++) {
+		ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+		if (ptr[i] == MAP_FAILED) {
+			if (validate_lower_address_hint())
+				return 1;
+			break;
+		}
+
+		if (validate_addr(ptr[i], 0))
+			return 1;
+	}
+	lchunks = i;
+
+	for (i = 0; i < NR_CHUNKS_HIGH; i++) {
+		hint = hind_addr();
+		hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+		if (hptr[i] == MAP_FAILED)
+			break;
+
+		if (validate_addr(hptr[i], 1))
+			return 1;
+	}
+	hchunks = i;
+
+	for (i = 0; i < lchunks; i++)
+		munmap(ptr[i], MAP_CHUNK_SIZE);
+
+	for (i = 0; i < hchunks; i++)
+		munmap(hptr[i], MAP_CHUNK_SIZE);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
new file mode 100644
index 000000000000..40e795624ff3
--- /dev/null
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <fcntl.h>
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+#define SMAP_FILE_PATH "/proc/self/smaps"
+#define MAX_LINE_LENGTH 500
+
+uint64_t pagemap_get_entry(int fd, char *start)
+{
+	const unsigned long pfn = (unsigned long)start / getpagesize();
+	uint64_t entry;
+	int ret;
+
+	ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry));
+	if (ret != sizeof(entry))
+		ksft_exit_fail_msg("reading pagemap failed\n");
+	return entry;
+}
+
+bool pagemap_is_softdirty(int fd, char *start)
+{
+	uint64_t entry = pagemap_get_entry(fd, start);
+
+	// Check if dirty bit (55th bit) is set
+	return entry & 0x0080000000000000ull;
+}
+
+bool pagemap_is_swapped(int fd, char *start)
+{
+	uint64_t entry = pagemap_get_entry(fd, start);
+
+	return entry & 0x4000000000000000ull;
+}
+
+bool pagemap_is_populated(int fd, char *start)
+{
+	uint64_t entry = pagemap_get_entry(fd, start);
+
+	/* Present or swapped. */
+	return entry & 0xc000000000000000ull;
+}
+
+unsigned long pagemap_get_pfn(int fd, char *start)
+{
+	uint64_t entry = pagemap_get_entry(fd, start);
+
+	/* If present (63th bit), PFN is at bit 0 -- 54. */
+	if (entry & 0x8000000000000000ull)
+		return entry & 0x007fffffffffffffull;
+	return -1ul;
+}
+
+void clear_softdirty(void)
+{
+	int ret;
+	const char *ctrl = "4";
+	int fd = open("/proc/self/clear_refs", O_WRONLY);
+
+	if (fd < 0)
+		ksft_exit_fail_msg("opening clear_refs failed\n");
+	ret = write(fd, ctrl, strlen(ctrl));
+	close(fd);
+	if (ret != strlen(ctrl))
+		ksft_exit_fail_msg("writing clear_refs failed\n");
+}
+
+bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len)
+{
+	while (fgets(buf, len, fp)) {
+		if (!strncmp(buf, pattern, strlen(pattern)))
+			return true;
+	}
+	return false;
+}
+
+uint64_t read_pmd_pagesize(void)
+{
+	int fd;
+	char buf[20];
+	ssize_t num_read;
+
+	fd = open(PMD_SIZE_FILE_PATH, O_RDONLY);
+	if (fd == -1)
+		ksft_exit_fail_msg("Open hpage_pmd_size failed\n");
+
+	num_read = read(fd, buf, 19);
+	if (num_read < 1) {
+		close(fd);
+		ksft_exit_fail_msg("Read hpage_pmd_size failed\n");
+	}
+	buf[num_read] = '\0';
+	close(fd);
+
+	return strtoul(buf, NULL, 10);
+}
+
+bool __check_huge(void *addr, char *pattern, int nr_hpages,
+		  uint64_t hpage_size)
+{
+	uint64_t thp = -1;
+	int ret;
+	FILE *fp;
+	char buffer[MAX_LINE_LENGTH];
+	char addr_pattern[MAX_LINE_LENGTH];
+
+	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+		       (unsigned long) addr);
+	if (ret >= MAX_LINE_LENGTH)
+		ksft_exit_fail_msg("%s: Pattern is too long\n", __func__);
+
+	fp = fopen(SMAP_FILE_PATH, "r");
+	if (!fp)
+		ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH);
+
+	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
+		goto err_out;
+
+	/*
+	 * Fetch the pattern in the same block and check the number of
+	 * hugepages.
+	 */
+	if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer)))
+		goto err_out;
+
+	snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern);
+
+	if (sscanf(buffer, addr_pattern, &thp) != 1)
+		ksft_exit_fail_msg("Reading smap error\n");
+
+err_out:
+	fclose(fp);
+	return thp == (nr_hpages * (hpage_size >> 10));
+}
+
+bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size)
+{
+	return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size);
+}
+
+bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size)
+{
+	return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size);
+}
+
+bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size)
+{
+	return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size);
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
new file mode 100644
index 000000000000..1995ee911ef2
--- /dev/null
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdint.h>
+#include <stdbool.h>
+
+uint64_t pagemap_get_entry(int fd, char *start);
+bool pagemap_is_softdirty(int fd, char *start);
+bool pagemap_is_swapped(int fd, char *start);
+bool pagemap_is_populated(int fd, char *start);
+unsigned long pagemap_get_pfn(int fd, char *start);
+void clear_softdirty(void);
+bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
+uint64_t read_pmd_pagesize(void);
+bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
+bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
+bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
diff --git a/tools/testing/selftests/mm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh
new file mode 100644
index 000000000000..70a02301f4c2
--- /dev/null
+++ b/tools/testing/selftests/mm/write_hugetlb_memory.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+size=$1
+populate=$2
+write=$3
+cgroup=$4
+path=$5
+method=$6
+private=$7
+want_sleep=$8
+reserve=$9
+
+echo "Putting task in cgroup '$cgroup'"
+echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs
+
+echo "Method is $method"
+
+set +e
+./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \
+      "$private" "$want_sleep" "$reserve"
diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c
new file mode 100644
index 000000000000..6a2caba19ee1
--- /dev/null
+++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program reserves and uses hugetlb memory, supporting a bunch of
+ * scenarios needed by the charged_reserved_hugetlb.sh test.
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+/* Global definitions. */
+enum method {
+	HUGETLBFS,
+	MMAP_MAP_HUGETLB,
+	SHM,
+	MAX_METHOD
+};
+
+
+/* Global variables. */
+static const char *self;
+static char *shmaddr;
+static int shmid;
+
+/*
+ * Show usage and exit.
+ */
+static void exit_usage(void)
+{
+	printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> "
+	       "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] "
+	       "[-o] [-w] [-n]\n",
+	       self);
+	exit(EXIT_FAILURE);
+}
+
+void sig_handler(int signo)
+{
+	printf("Received %d.\n", signo);
+	if (signo == SIGINT) {
+		printf("Deleting the memory\n");
+		if (shmdt((const void *)shmaddr) != 0) {
+			perror("Detach failure");
+			shmctl(shmid, IPC_RMID, NULL);
+			exit(4);
+		}
+
+		shmctl(shmid, IPC_RMID, NULL);
+		printf("Done deleting the memory\n");
+	}
+	exit(2);
+}
+
+int main(int argc, char **argv)
+{
+	int fd = 0;
+	int key = 0;
+	int *ptr = NULL;
+	int c = 0;
+	int size = 0;
+	char path[256] = "";
+	enum method method = MAX_METHOD;
+	int want_sleep = 0, private = 0;
+	int populate = 0;
+	int write = 0;
+	int reserve = 1;
+
+	if (signal(SIGINT, sig_handler) == SIG_ERR)
+		err(1, "\ncan't catch SIGINT\n");
+
+	/* Parse command-line arguments. */
+	setvbuf(stdout, NULL, _IONBF, 0);
+	self = argv[0];
+
+	while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) {
+		switch (c) {
+		case 's':
+			size = atoi(optarg);
+			break;
+		case 'p':
+			strncpy(path, optarg, sizeof(path));
+			break;
+		case 'm':
+			if (atoi(optarg) >= MAX_METHOD) {
+				errno = EINVAL;
+				perror("Invalid -m.");
+				exit_usage();
+			}
+			method = atoi(optarg);
+			break;
+		case 'o':
+			populate = 1;
+			break;
+		case 'w':
+			write = 1;
+			break;
+		case 'l':
+			want_sleep = 1;
+			break;
+		case 'r':
+		    private
+			= 1;
+			break;
+		case 'n':
+			reserve = 0;
+			break;
+		default:
+			errno = EINVAL;
+			perror("Invalid arg");
+			exit_usage();
+		}
+	}
+
+	if (strncmp(path, "", sizeof(path)) != 0) {
+		printf("Writing to this path: %s\n", path);
+	} else {
+		errno = EINVAL;
+		perror("path not found");
+		exit_usage();
+	}
+
+	if (size != 0) {
+		printf("Writing this size: %d\n", size);
+	} else {
+		errno = EINVAL;
+		perror("size not found");
+		exit_usage();
+	}
+
+	if (!populate)
+		printf("Not populating.\n");
+	else
+		printf("Populating.\n");
+
+	if (!write)
+		printf("Not writing to memory.\n");
+
+	if (method == MAX_METHOD) {
+		errno = EINVAL;
+		perror("-m Invalid");
+		exit_usage();
+	} else
+		printf("Using method=%d\n", method);
+
+	if (!private)
+		printf("Shared mapping.\n");
+	else
+		printf("Private mapping.\n");
+
+	if (!reserve)
+		printf("NO_RESERVE mapping.\n");
+	else
+		printf("RESERVE mapping.\n");
+
+	switch (method) {
+	case HUGETLBFS:
+		printf("Allocating using HUGETLBFS.\n");
+		fd = open(path, O_CREAT | O_RDWR, 0777);
+		if (fd == -1)
+			err(1, "Failed to open file.");
+
+		ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   (private ? MAP_PRIVATE : MAP_SHARED) |
+				   (populate ? MAP_POPULATE : 0) |
+				   (reserve ? 0 : MAP_NORESERVE),
+			   fd, 0);
+
+		if (ptr == MAP_FAILED) {
+			close(fd);
+			err(1, "Error mapping the file");
+		}
+		break;
+	case MMAP_MAP_HUGETLB:
+		printf("Allocating using MAP_HUGETLB.\n");
+		ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   (private ? (MAP_PRIVATE | MAP_ANONYMOUS) :
+				      MAP_SHARED) |
+				   MAP_HUGETLB | (populate ? MAP_POPULATE : 0) |
+				   (reserve ? 0 : MAP_NORESERVE),
+			   -1, 0);
+
+		if (ptr == MAP_FAILED)
+			err(1, "mmap");
+
+		printf("Returned address is %p\n", ptr);
+		break;
+	case SHM:
+		printf("Allocating using SHM.\n");
+		shmid = shmget(key, size,
+			       SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+		if (shmid < 0) {
+			shmid = shmget(++key, size,
+				       SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+			if (shmid < 0)
+				err(1, "shmget");
+		}
+		printf("shmid: 0x%x, shmget key:%d\n", shmid, key);
+
+		ptr = shmat(shmid, NULL, 0);
+		if (ptr == (int *)-1) {
+			perror("Shared memory attach failure");
+			shmctl(shmid, IPC_RMID, NULL);
+			exit(2);
+		}
+		printf("shmaddr: %p\n", ptr);
+
+		break;
+	default:
+		errno = EINVAL;
+		err(1, "Invalid method.");
+	}
+
+	if (write) {
+		printf("Writing to memory.\n");
+		memset(ptr, 1, size);
+	}
+
+	if (want_sleep) {
+		/* Signal to caller that we're done. */
+		printf("DONE\n");
+
+		/* Hold memory until external kill signal is delivered. */
+		while (1)
+			sleep(100);
+	}
+
+	if (method == HUGETLBFS)
+		close(fd);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
deleted file mode 100644
index 1f8c36a9fa10..000000000000
--- a/tools/testing/selftests/vm/.gitignore
+++ /dev/null
@@ -1,38 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-cow
-hugepage-mmap
-hugepage-mremap
-hugepage-shm
-hugepage-vmemmap
-hugetlb-madvise
-khugepaged
-map_hugetlb
-map_populate
-thuge-gen
-compaction_test
-migration
-mlock2-tests
-mrelease_test
-mremap_dontunmap
-mremap_test
-on-fault-limit
-transhuge-stress
-protection_keys
-protection_keys_32
-protection_keys_64
-madv_populate
-userfaultfd
-mlock-intersect-test
-mlock-random-test
-virtual_address_range
-gup_test
-va_128TBswitch
-map_fixed_noreplace
-write_to_hugetlbfs
-hmm-tests
-memfd_secret
-soft-dirty
-split_huge_page_test
-ksm_tests
-local_config.h
-local_config.mk
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
deleted file mode 100644
index 89c14e41bd43..000000000000
--- a/tools/testing/selftests/vm/Makefile
+++ /dev/null
@@ -1,180 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Makefile for vm selftests
-
-LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h
-
-include local_config.mk
-
-uname_M := $(shell uname -m 2>/dev/null || echo not)
-MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
-
-# Without this, failed build products remain, with up-to-date timestamps,
-# thus tricking Make (and you!) into believing that All Is Well, in subsequent
-# make invocations:
-.DELETE_ON_ERROR:
-
-# Avoid accidental wrong builds, due to built-in rules working just a little
-# bit too well--but not quite as well as required for our situation here.
-#
-# In other words, "make userfaultfd" is supposed to fail to build at all,
-# because this Makefile only supports either "make" (all), or "make /full/path".
-# However,  the built-in rules, if not suppressed, will pick up CFLAGS and the
-# initial LDLIBS (but not the target-specific LDLIBS, because those are only
-# set for the full path target!). This causes it to get pretty far into building
-# things despite using incorrect values such as an *occasionally* incomplete
-# LDLIBS.
-MAKEFLAGS += --no-builtin-rules
-
-CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
-LDLIBS = -lrt -lpthread
-TEST_GEN_FILES = cow
-TEST_GEN_FILES += compaction_test
-TEST_GEN_FILES += gup_test
-TEST_GEN_FILES += hmm-tests
-TEST_GEN_FILES += hugetlb-madvise
-TEST_GEN_FILES += hugepage-mmap
-TEST_GEN_FILES += hugepage-mremap
-TEST_GEN_FILES += hugepage-shm
-TEST_GEN_FILES += hugepage-vmemmap
-TEST_GEN_FILES += khugepaged
-TEST_GEN_PROGS = madv_populate
-TEST_GEN_FILES += map_fixed_noreplace
-TEST_GEN_FILES += map_hugetlb
-TEST_GEN_FILES += map_populate
-TEST_GEN_FILES += memfd_secret
-TEST_GEN_FILES += migration
-TEST_GEN_FILES += mlock-random-test
-TEST_GEN_FILES += mlock2-tests
-TEST_GEN_FILES += mrelease_test
-TEST_GEN_FILES += mremap_dontunmap
-TEST_GEN_FILES += mremap_test
-TEST_GEN_FILES += on-fault-limit
-TEST_GEN_FILES += thuge-gen
-TEST_GEN_FILES += transhuge-stress
-TEST_GEN_FILES += userfaultfd
-TEST_GEN_PROGS += soft-dirty
-TEST_GEN_PROGS += split_huge_page_test
-TEST_GEN_FILES += ksm_tests
-TEST_GEN_PROGS += ksm_functional_tests
-
-ifeq ($(MACHINE),x86_64)
-CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
-CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c)
-CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie)
-
-VMTARGETS := protection_keys
-BINARIES_32 := $(VMTARGETS:%=%_32)
-BINARIES_64 := $(VMTARGETS:%=%_64)
-
-ifeq ($(CAN_BUILD_WITH_NOPIE),1)
-CFLAGS += -no-pie
-endif
-
-ifeq ($(CAN_BUILD_I386),1)
-TEST_GEN_FILES += $(BINARIES_32)
-endif
-
-ifeq ($(CAN_BUILD_X86_64),1)
-TEST_GEN_FILES += $(BINARIES_64)
-endif
-else
-
-ifneq (,$(findstring $(MACHINE),ppc64))
-TEST_GEN_FILES += protection_keys
-endif
-
-endif
-
-ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64))
-TEST_GEN_FILES += va_128TBswitch
-TEST_GEN_FILES += virtual_address_range
-TEST_GEN_FILES += write_to_hugetlbfs
-endif
-
-TEST_PROGS := run_vmtests.sh
-
-TEST_FILES := test_vmalloc.sh
-TEST_FILES += test_hmm.sh
-TEST_FILES += va_128TBswitch.sh
-
-include ../lib.mk
-
-$(OUTPUT)/cow: vm_util.c
-$(OUTPUT)/khugepaged: vm_util.c
-$(OUTPUT)/ksm_functional_tests: vm_util.c
-$(OUTPUT)/madv_populate: vm_util.c
-$(OUTPUT)/soft-dirty: vm_util.c
-$(OUTPUT)/split_huge_page_test: vm_util.c
-$(OUTPUT)/userfaultfd: vm_util.c
-
-ifeq ($(MACHINE),x86_64)
-BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
-BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
-
-define gen-target-rule-32
-$(1) $(1)_32: $(OUTPUT)/$(1)_32
-.PHONY: $(1) $(1)_32
-endef
-
-define gen-target-rule-64
-$(1) $(1)_64: $(OUTPUT)/$(1)_64
-.PHONY: $(1) $(1)_64
-endef
-
-ifeq ($(CAN_BUILD_I386),1)
-$(BINARIES_32): CFLAGS += -m32 -mxsave
-$(BINARIES_32): LDLIBS += -lrt -ldl -lm
-$(BINARIES_32): $(OUTPUT)/%_32: %.c
-	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
-$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t))))
-endif
-
-ifeq ($(CAN_BUILD_X86_64),1)
-$(BINARIES_64): CFLAGS += -m64 -mxsave
-$(BINARIES_64): LDLIBS += -lrt -ldl
-$(BINARIES_64): $(OUTPUT)/%_64: %.c
-	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
-$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t))))
-endif
-
-# x86_64 users should be encouraged to install 32-bit libraries
-ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
-all: warn_32bit_failure
-
-warn_32bit_failure:
-	@echo "Warning: you seem to have a broken 32-bit build" 2>&1;		\
-	echo  "environment. This will reduce test coverage of 64-bit" 2>&1;	\
-	echo  "kernels. If you are using a Debian-like distribution," 2>&1;	\
-	echo  "try:"; 2>&1;							\
-	echo  "";								\
-	echo  "  apt-get install gcc-multilib libc6-i386 libc6-dev-i386";	\
-	echo  "";								\
-	echo  "If you are using a Fedora-like distribution, try:";		\
-	echo  "";								\
-	echo  "  yum install glibc-devel.*i686";				\
-	exit 0;
-endif
-endif
-
-# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
-$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS)
-
-$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
-
-$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
-
-$(OUTPUT)/migration: LDLIBS += -lnuma
-
-local_config.mk local_config.h: check_config.sh
-	/bin/sh ./check_config.sh $(CC)
-
-EXTRA_CLEAN += local_config.mk local_config.h
-
-ifeq ($(COW_EXTRA_LIBS),)
-all: warn_missing_liburing
-
-warn_missing_liburing:
-	@echo ; \
-	echo "Warning: missing liburing support. Some COW tests will be skipped." ; \
-	echo
-endif
diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
deleted file mode 100644
index a5cb4b09a46c..000000000000
--- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
+++ /dev/null
@@ -1,584 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-set -e
-
-if [[ $(id -u) -ne 0 ]]; then
-  echo "This test must be run as root. Skipping..."
-  exit $ksft_skip
-fi
-
-fault_limit_file=limit_in_bytes
-reservation_limit_file=rsvd.limit_in_bytes
-fault_usage_file=usage_in_bytes
-reservation_usage_file=rsvd.usage_in_bytes
-
-if [[ "$1" == "-cgroup-v2" ]]; then
-  cgroup2=1
-  fault_limit_file=max
-  reservation_limit_file=rsvd.max
-  fault_usage_file=current
-  reservation_usage_file=rsvd.current
-fi
-
-if [[ $cgroup2 ]]; then
-  cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
-  if [[ -z "$cgroup_path" ]]; then
-    cgroup_path=/dev/cgroup/memory
-    mount -t cgroup2 none $cgroup_path
-    do_umount=1
-  fi
-  echo "+hugetlb" >$cgroup_path/cgroup.subtree_control
-else
-  cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
-  if [[ -z "$cgroup_path" ]]; then
-    cgroup_path=/dev/cgroup/memory
-    mount -t cgroup memory,hugetlb $cgroup_path
-    do_umount=1
-  fi
-fi
-export cgroup_path
-
-function cleanup() {
-  if [[ $cgroup2 ]]; then
-    echo $$ >$cgroup_path/cgroup.procs
-  else
-    echo $$ >$cgroup_path/tasks
-  fi
-
-  if [[ -e /mnt/huge ]]; then
-    rm -rf /mnt/huge/*
-    umount /mnt/huge || echo error
-    rmdir /mnt/huge
-  fi
-  if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then
-    rmdir $cgroup_path/hugetlb_cgroup_test
-  fi
-  if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then
-    rmdir $cgroup_path/hugetlb_cgroup_test1
-  fi
-  if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then
-    rmdir $cgroup_path/hugetlb_cgroup_test2
-  fi
-  echo 0 >/proc/sys/vm/nr_hugepages
-  echo CLEANUP DONE
-}
-
-function expect_equal() {
-  local expected="$1"
-  local actual="$2"
-  local error="$3"
-
-  if [[ "$expected" != "$actual" ]]; then
-    echo "expected ($expected) != actual ($actual): $3"
-    cleanup
-    exit 1
-  fi
-}
-
-function get_machine_hugepage_size() {
-  hpz=$(grep -i hugepagesize /proc/meminfo)
-  kb=${hpz:14:-3}
-  mb=$(($kb / 1024))
-  echo $mb
-}
-
-MB=$(get_machine_hugepage_size)
-
-function setup_cgroup() {
-  local name="$1"
-  local cgroup_limit="$2"
-  local reservation_limit="$3"
-
-  mkdir $cgroup_path/$name
-
-  echo writing cgroup limit: "$cgroup_limit"
-  echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file
-
-  echo writing reseravation limit: "$reservation_limit"
-  echo "$reservation_limit" > \
-    $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file
-
-  if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then
-    echo 0 >$cgroup_path/$name/cpuset.cpus
-  fi
-  if [ -e "$cgroup_path/$name/cpuset.mems" ]; then
-    echo 0 >$cgroup_path/$name/cpuset.mems
-  fi
-}
-
-function wait_for_hugetlb_memory_to_get_depleted() {
-  local cgroup="$1"
-  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
-  # Wait for hugetlbfs memory to get depleted.
-  while [ $(cat $path) != 0 ]; do
-    echo Waiting for hugetlb memory to get depleted.
-    cat $path
-    sleep 0.5
-  done
-}
-
-function wait_for_hugetlb_memory_to_get_reserved() {
-  local cgroup="$1"
-  local size="$2"
-
-  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
-  # Wait for hugetlbfs memory to get written.
-  while [ $(cat $path) != $size ]; do
-    echo Waiting for hugetlb memory reservation to reach size $size.
-    cat $path
-    sleep 0.5
-  done
-}
-
-function wait_for_hugetlb_memory_to_get_written() {
-  local cgroup="$1"
-  local size="$2"
-
-  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
-  # Wait for hugetlbfs memory to get written.
-  while [ $(cat $path) != $size ]; do
-    echo Waiting for hugetlb memory to reach size $size.
-    cat $path
-    sleep 0.5
-  done
-}
-
-function write_hugetlbfs_and_get_usage() {
-  local cgroup="$1"
-  local size="$2"
-  local populate="$3"
-  local write="$4"
-  local path="$5"
-  local method="$6"
-  local private="$7"
-  local expect_failure="$8"
-  local reserve="$9"
-
-  # Function return values.
-  reservation_failed=0
-  oom_killed=0
-  hugetlb_difference=0
-  reserved_difference=0
-
-  local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file
-  local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file
-
-  local hugetlb_before=$(cat $hugetlb_usage)
-  local reserved_before=$(cat $reserved_usage)
-
-  echo
-  echo Starting:
-  echo hugetlb_usage="$hugetlb_before"
-  echo reserved_usage="$reserved_before"
-  echo expect_failure is "$expect_failure"
-
-  output=$(mktemp)
-  set +e
-  if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] ||
-    [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then
-
-    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
-      "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output &
-
-    local write_result=$?
-    local write_pid=$!
-
-    until grep -q -i "DONE" $output; do
-      echo waiting for DONE signal.
-      if ! ps $write_pid > /dev/null
-      then
-        echo "FAIL: The write died"
-        cleanup
-        exit 1
-      fi
-      sleep 0.5
-    done
-
-    echo ================= write_hugetlb_memory.sh output is:
-    cat $output
-    echo ================= end output.
-
-    if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then
-      wait_for_hugetlb_memory_to_get_written "$cgroup" "$size"
-    elif [[ "$reserve" != "-n" ]]; then
-      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
-    else
-      # This case doesn't produce visible effects, but we still have
-      # to wait for the async process to start and execute...
-      sleep 0.5
-    fi
-
-    echo write_result is $write_result
-  else
-    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
-      "$cgroup" "$path" "$method" "$private" "$reserve"
-    local write_result=$?
-
-    if [[ "$reserve" != "-n" ]]; then
-      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
-    fi
-  fi
-  set -e
-
-  if [[ "$write_result" == 1 ]]; then
-    reservation_failed=1
-  fi
-
-  # On linus/master, the above process gets SIGBUS'd on oomkill, with
-  # return code 135. On earlier kernels, it gets actual oomkill, with return
-  # code 137, so just check for both conditions in case we're testing
-  # against an earlier kernel.
-  if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then
-    oom_killed=1
-  fi
-
-  local hugetlb_after=$(cat $hugetlb_usage)
-  local reserved_after=$(cat $reserved_usage)
-
-  echo After write:
-  echo hugetlb_usage="$hugetlb_after"
-  echo reserved_usage="$reserved_after"
-
-  hugetlb_difference=$(($hugetlb_after - $hugetlb_before))
-  reserved_difference=$(($reserved_after - $reserved_before))
-}
-
-function cleanup_hugetlb_memory() {
-  set +e
-  local cgroup="$1"
-  if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then
-    echo killing write_to_hugetlbfs
-    killall -2 write_to_hugetlbfs
-    wait_for_hugetlb_memory_to_get_depleted $cgroup
-  fi
-  set -e
-
-  if [[ -e /mnt/huge ]]; then
-    rm -rf /mnt/huge/*
-    umount /mnt/huge
-    rmdir /mnt/huge
-  fi
-}
-
-function run_test() {
-  local size=$(($1 * ${MB} * 1024 * 1024))
-  local populate="$2"
-  local write="$3"
-  local cgroup_limit=$(($4 * ${MB} * 1024 * 1024))
-  local reservation_limit=$(($5 * ${MB} * 1024 * 1024))
-  local nr_hugepages="$6"
-  local method="$7"
-  local private="$8"
-  local expect_failure="$9"
-  local reserve="${10}"
-
-  # Function return values.
-  hugetlb_difference=0
-  reserved_difference=0
-  reservation_failed=0
-  oom_killed=0
-
-  echo nr hugepages = "$nr_hugepages"
-  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
-
-  setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit"
-
-  mkdir -p /mnt/huge
-  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
-
-  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \
-    "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \
-    "$reserve"
-
-  cleanup_hugetlb_memory "hugetlb_cgroup_test"
-
-  local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file)
-  local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file)
-
-  echo $hugetlb_difference
-  echo $reserved_difference
-  expect_equal "0" "$final_hugetlb" "final hugetlb is not zero"
-  expect_equal "0" "$final_reservation" "final reservation is not zero"
-}
-
-function run_multiple_cgroup_test() {
-  local size1="$1"
-  local populate1="$2"
-  local write1="$3"
-  local cgroup_limit1="$4"
-  local reservation_limit1="$5"
-
-  local size2="$6"
-  local populate2="$7"
-  local write2="$8"
-  local cgroup_limit2="$9"
-  local reservation_limit2="${10}"
-
-  local nr_hugepages="${11}"
-  local method="${12}"
-  local private="${13}"
-  local expect_failure="${14}"
-  local reserve="${15}"
-
-  # Function return values.
-  hugetlb_difference1=0
-  reserved_difference1=0
-  reservation_failed1=0
-  oom_killed1=0
-
-  hugetlb_difference2=0
-  reserved_difference2=0
-  reservation_failed2=0
-  oom_killed2=0
-
-  echo nr hugepages = "$nr_hugepages"
-  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
-
-  setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1"
-  setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2"
-
-  mkdir -p /mnt/huge
-  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
-
-  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \
-    "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \
-    "$expect_failure" "$reserve"
-
-  hugetlb_difference1=$hugetlb_difference
-  reserved_difference1=$reserved_difference
-  reservation_failed1=$reservation_failed
-  oom_killed1=$oom_killed
-
-  local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file
-  local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file
-  local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file
-  local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file
-
-  local usage_before_second_write=$(cat $cgroup1_hugetlb_usage)
-  local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage)
-
-  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \
-    "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \
-    "$expect_failure" "$reserve"
-
-  hugetlb_difference2=$hugetlb_difference
-  reserved_difference2=$reserved_difference
-  reservation_failed2=$reservation_failed
-  oom_killed2=$oom_killed
-
-  expect_equal "$usage_before_second_write" \
-    "$(cat $cgroup1_hugetlb_usage)" "Usage changed."
-  expect_equal "$reservation_usage_before_second_write" \
-    "$(cat $cgroup1_reservation_usage)" "Reservation usage changed."
-
-  cleanup_hugetlb_memory
-
-  local final_hugetlb=$(cat $cgroup1_hugetlb_usage)
-  local final_reservation=$(cat $cgroup1_reservation_usage)
-
-  expect_equal "0" "$final_hugetlb" \
-    "hugetlbt_cgroup_test1 final hugetlb is not zero"
-  expect_equal "0" "$final_reservation" \
-    "hugetlbt_cgroup_test1 final reservation is not zero"
-
-  local final_hugetlb=$(cat $cgroup2_hugetlb_usage)
-  local final_reservation=$(cat $cgroup2_reservation_usage)
-
-  expect_equal "0" "$final_hugetlb" \
-    "hugetlb_cgroup_test2 final hugetlb is not zero"
-  expect_equal "0" "$final_reservation" \
-    "hugetlb_cgroup_test2 final reservation is not zero"
-}
-
-cleanup
-
-for populate in "" "-o"; do
-  for method in 0 1 2; do
-    for private in "" "-r"; do
-      for reserve in "" "-n"; do
-
-        # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported.
-        if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then
-          continue
-        fi
-
-        # Skip populated shmem tests. Doesn't seem to be supported.
-        if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then
-          continue
-        fi
-
-        if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then
-          continue
-        fi
-
-        cleanup
-        echo
-        echo
-        echo
-        echo Test normal case.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-        run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve"
-
-        echo Memory charged to hugtlb=$hugetlb_difference
-        echo Memory charged to reservation=$reserved_difference
-
-        if [[ "$populate" == "-o" ]]; then
-          expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
-            "Reserved memory charged to hugetlb cgroup."
-        else
-          expect_equal "0" "$hugetlb_difference" \
-            "Reserved memory charged to hugetlb cgroup."
-        fi
-
-        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
-          expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
-            "Reserved memory not charged to reservation usage."
-        else
-          expect_equal "0" "$reserved_difference" \
-            "Reserved memory not charged to reservation usage."
-        fi
-
-        echo 'PASS'
-
-        cleanup
-        echo
-        echo
-        echo
-        echo Test normal case with write.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-        run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve"
-
-        echo Memory charged to hugtlb=$hugetlb_difference
-        echo Memory charged to reservation=$reserved_difference
-
-        expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
-          "Reserved memory charged to hugetlb cgroup."
-
-        expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
-          "Reserved memory not charged to reservation usage."
-
-        echo 'PASS'
-
-        cleanup
-        continue
-        echo
-        echo
-        echo
-        echo Test more than reservation case.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-
-        if [ "$reserve" != "-n" ]; then
-          run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \
-            "$reserve"
-
-          expect_equal "1" "$reservation_failed" "Reservation succeeded."
-        fi
-
-        echo 'PASS'
-
-        cleanup
-
-        echo
-        echo
-        echo
-        echo Test more than cgroup limit case.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-
-        # Not sure if shm memory can be cleaned up when the process gets sigbus'd.
-        if [[ "$method" != 2 ]]; then
-          run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve"
-
-          expect_equal "1" "$oom_killed" "Not oom killed."
-        fi
-        echo 'PASS'
-
-        cleanup
-
-        echo
-        echo
-        echo
-        echo Test normal case, multiple cgroups.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-        run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \
-          "$populate" "" "10" "10" "10" \
-          "$method" "$private" "0" "$reserve"
-
-        echo Memory charged to hugtlb1=$hugetlb_difference1
-        echo Memory charged to reservation1=$reserved_difference1
-        echo Memory charged to hugtlb2=$hugetlb_difference2
-        echo Memory charged to reservation2=$reserved_difference2
-
-        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
-          expect_equal "3" "$reserved_difference1" \
-            "Incorrect reservations charged to cgroup 1."
-
-          expect_equal "5" "$reserved_difference2" \
-            "Incorrect reservation charged to cgroup 2."
-
-        else
-          expect_equal "0" "$reserved_difference1" \
-            "Incorrect reservations charged to cgroup 1."
-
-          expect_equal "0" "$reserved_difference2" \
-            "Incorrect reservation charged to cgroup 2."
-        fi
-
-        if [[ "$populate" == "-o" ]]; then
-          expect_equal "3" "$hugetlb_difference1" \
-            "Incorrect hugetlb charged to cgroup 1."
-
-          expect_equal "5" "$hugetlb_difference2" \
-            "Incorrect hugetlb charged to cgroup 2."
-
-        else
-          expect_equal "0" "$hugetlb_difference1" \
-            "Incorrect hugetlb charged to cgroup 1."
-
-          expect_equal "0" "$hugetlb_difference2" \
-            "Incorrect hugetlb charged to cgroup 2."
-        fi
-        echo 'PASS'
-
-        cleanup
-        echo
-        echo
-        echo
-        echo Test normal case with write, multiple cgroups.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-        run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \
-          "$populate" "-w" "10" "10" "10" \
-          "$method" "$private" "0" "$reserve"
-
-        echo Memory charged to hugtlb1=$hugetlb_difference1
-        echo Memory charged to reservation1=$reserved_difference1
-        echo Memory charged to hugtlb2=$hugetlb_difference2
-        echo Memory charged to reservation2=$reserved_difference2
-
-        expect_equal "3" "$hugetlb_difference1" \
-          "Incorrect hugetlb charged to cgroup 1."
-
-        expect_equal "3" "$reserved_difference1" \
-          "Incorrect reservation charged to cgroup 1."
-
-        expect_equal "5" "$hugetlb_difference2" \
-          "Incorrect hugetlb charged to cgroup 2."
-
-        expect_equal "5" "$reserved_difference2" \
-          "Incorrected reservation charged to cgroup 2."
-        echo 'PASS'
-
-        cleanup
-
-      done # reserve
-    done   # private
-  done     # populate
-done       # method
-
-if [[ $do_umount ]]; then
-  umount $cgroup_path
-  rmdir $cgroup_path
-fi
diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh
deleted file mode 100644
index bcba3af0acea..000000000000
--- a/tools/testing/selftests/vm/check_config.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# Probe for libraries and create header files to record the results. Both C
-# header files and Makefile include fragments are created.
-
-OUTPUT_H_FILE=local_config.h
-OUTPUT_MKFILE=local_config.mk
-
-tmpname=$(mktemp)
-tmpfile_c=${tmpname}.c
-tmpfile_o=${tmpname}.o
-
-# liburing
-echo "#include <sys/types.h>"        > $tmpfile_c
-echo "#include <liburing.h>"        >> $tmpfile_c
-echo "int func(void) { return 0; }" >> $tmpfile_c
-
-CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
-$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
-
-if [ -f $tmpfile_o ]; then
-    echo "#define LOCAL_CONFIG_HAVE_LIBURING 1"  > $OUTPUT_H_FILE
-    echo "COW_EXTRA_LIBS = -luring"              > $OUTPUT_MKFILE
-else
-    echo "// No liburing support found"          > $OUTPUT_H_FILE
-    echo "# No liburing support found, so:"      > $OUTPUT_MKFILE
-    echo "COW_EXTRA_LIBS = "                    >> $OUTPUT_MKFILE
-fi
-
-rm ${tmpname}.*
diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c
deleted file mode 100644
index 9b420140ba2b..000000000000
--- a/tools/testing/selftests/vm/compaction_test.c
+++ /dev/null
@@ -1,231 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *
- * A test for the patch "Allow compaction of unevictable pages".
- * With this patch we should be able to allocate at least 1/4
- * of RAM in huge pages. Without the patch much less is
- * allocated.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/resource.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <unistd.h>
-#include <string.h>
-
-#include "../kselftest.h"
-
-#define MAP_SIZE_MB	100
-#define MAP_SIZE	(MAP_SIZE_MB * 1024 * 1024)
-
-struct map_list {
-	void *map;
-	struct map_list *next;
-};
-
-int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize)
-{
-	char  buffer[256] = {0};
-	char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'";
-	FILE *cmdfile = popen(cmd, "r");
-
-	if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
-		perror("Failed to read meminfo\n");
-		return -1;
-	}
-
-	pclose(cmdfile);
-
-	*memfree = atoll(buffer);
-	cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'";
-	cmdfile = popen(cmd, "r");
-
-	if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
-		perror("Failed to read meminfo\n");
-		return -1;
-	}
-
-	pclose(cmdfile);
-	*hugepagesize = atoll(buffer);
-
-	return 0;
-}
-
-int prereq(void)
-{
-	char allowed;
-	int fd;
-
-	fd = open("/proc/sys/vm/compact_unevictable_allowed",
-		  O_RDONLY | O_NONBLOCK);
-	if (fd < 0) {
-		perror("Failed to open\n"
-		       "/proc/sys/vm/compact_unevictable_allowed\n");
-		return -1;
-	}
-
-	if (read(fd, &allowed, sizeof(char)) != sizeof(char)) {
-		perror("Failed to read from\n"
-		       "/proc/sys/vm/compact_unevictable_allowed\n");
-		close(fd);
-		return -1;
-	}
-
-	close(fd);
-	if (allowed == '1')
-		return 0;
-
-	return -1;
-}
-
-int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
-{
-	int fd;
-	int compaction_index = 0;
-	char initial_nr_hugepages[10] = {0};
-	char nr_hugepages[10] = {0};
-
-	/* We want to test with 80% of available memory. Else, OOM killer comes
-	   in to play */
-	mem_free = mem_free * 0.8;
-
-	fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
-	if (fd < 0) {
-		perror("Failed to open /proc/sys/vm/nr_hugepages");
-		return -1;
-	}
-
-	if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) {
-		perror("Failed to read from /proc/sys/vm/nr_hugepages");
-		goto close_fd;
-	}
-
-	/* Start with the initial condition of 0 huge pages*/
-	if (write(fd, "0", sizeof(char)) != sizeof(char)) {
-		perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n");
-		goto close_fd;
-	}
-
-	lseek(fd, 0, SEEK_SET);
-
-	/* Request a large number of huge pages. The Kernel will allocate
-	   as much as it can */
-	if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
-		perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n");
-		goto close_fd;
-	}
-
-	lseek(fd, 0, SEEK_SET);
-
-	if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
-		perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n");
-		goto close_fd;
-	}
-
-	/* We should have been able to request at least 1/3 rd of the memory in
-	   huge pages */
-	compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size);
-
-	if (compaction_index > 3) {
-		printf("No of huge pages allocated = %d\n",
-		       (atoi(nr_hugepages)));
-		fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n"
-			"as huge pages\n", compaction_index);
-		goto close_fd;
-	}
-
-	printf("No of huge pages allocated = %d\n",
-	       (atoi(nr_hugepages)));
-
-	lseek(fd, 0, SEEK_SET);
-
-	if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
-	    != strlen(initial_nr_hugepages)) {
-		perror("Failed to write value to /proc/sys/vm/nr_hugepages\n");
-		goto close_fd;
-	}
-
-	close(fd);
-	return 0;
-
- close_fd:
-	close(fd);
-	printf("Not OK. Compaction test failed.");
-	return -1;
-}
-
-
-int main(int argc, char **argv)
-{
-	struct rlimit lim;
-	struct map_list *list, *entry;
-	size_t page_size, i;
-	void *map = NULL;
-	unsigned long mem_free = 0;
-	unsigned long hugepage_size = 0;
-	long mem_fragmentable_MB = 0;
-
-	if (prereq() != 0) {
-		printf("Either the sysctl compact_unevictable_allowed is not\n"
-		       "set to 1 or couldn't read the proc file.\n"
-		       "Skipping the test\n");
-		return KSFT_SKIP;
-	}
-
-	lim.rlim_cur = RLIM_INFINITY;
-	lim.rlim_max = RLIM_INFINITY;
-	if (setrlimit(RLIMIT_MEMLOCK, &lim)) {
-		perror("Failed to set rlimit:\n");
-		return -1;
-	}
-
-	page_size = getpagesize();
-
-	list = NULL;
-
-	if (read_memory_info(&mem_free, &hugepage_size) != 0) {
-		printf("ERROR: Cannot read meminfo\n");
-		return -1;
-	}
-
-	mem_fragmentable_MB = mem_free * 0.8 / 1024;
-
-	while (mem_fragmentable_MB > 0) {
-		map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
-			   MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
-		if (map == MAP_FAILED)
-			break;
-
-		entry = malloc(sizeof(struct map_list));
-		if (!entry) {
-			munmap(map, MAP_SIZE);
-			break;
-		}
-		entry->map = map;
-		entry->next = list;
-		list = entry;
-
-		/* Write something (in this case the address of the map) to
-		 * ensure that KSM can't merge the mapped pages
-		 */
-		for (i = 0; i < MAP_SIZE; i += page_size)
-			*(unsigned long *)(map + i) = (unsigned long)map + i;
-
-		mem_fragmentable_MB -= MAP_SIZE_MB;
-	}
-
-	for (entry = list; entry != NULL; entry = entry->next) {
-		munmap(entry->map, MAP_SIZE);
-		if (!entry->next)
-			break;
-		entry = entry->next;
-	}
-
-	if (check_compaction(mem_free, hugepage_size) == 0)
-		return 0;
-
-	return -1;
-}
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config
deleted file mode 100644
index be087c4bc396..000000000000
--- a/tools/testing/selftests/vm/config
+++ /dev/null
@@ -1,8 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_USERFAULTFD=y
-CONFIG_TEST_VMALLOC=m
-CONFIG_DEVICE_PRIVATE=y
-CONFIG_TEST_HMM=m
-CONFIG_GUP_TEST=y
-CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_MEM_SOFT_DIRTY=y
diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c
deleted file mode 100644
index 16216d893d96..000000000000
--- a/tools/testing/selftests/vm/cow.c
+++ /dev/null
@@ -1,1764 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * COW (Copy On Write) tests.
- *
- * Copyright 2022, Red Hat, Inc.
- *
- * Author(s): David Hildenbrand <david@redhat.com>
- */
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <dirent.h>
-#include <assert.h>
-#include <sys/mman.h>
-#include <sys/ioctl.h>
-#include <sys/wait.h>
-#include <linux/memfd.h>
-
-#include "local_config.h"
-#ifdef LOCAL_CONFIG_HAVE_LIBURING
-#include <liburing.h>
-#endif /* LOCAL_CONFIG_HAVE_LIBURING */
-
-#include "../../../../mm/gup_test.h"
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#ifndef MADV_COLLAPSE
-#define MADV_COLLAPSE 25
-#endif
-
-static size_t pagesize;
-static int pagemap_fd;
-static size_t thpsize;
-static int nr_hugetlbsizes;
-static size_t hugetlbsizes[10];
-static int gup_fd;
-static bool has_huge_zeropage;
-
-static void detect_thpsize(void)
-{
-	int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size",
-		      O_RDONLY);
-	size_t size = 0;
-	char buf[15];
-	int ret;
-
-	if (fd < 0)
-		return;
-
-	ret = pread(fd, buf, sizeof(buf), 0);
-	if (ret > 0 && ret < sizeof(buf)) {
-		buf[ret] = 0;
-
-		size = strtoul(buf, NULL, 10);
-		if (size < pagesize)
-			size = 0;
-		if (size > 0) {
-			thpsize = size;
-			ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
-				       thpsize / 1024);
-		}
-	}
-
-	close(fd);
-}
-
-static void detect_huge_zeropage(void)
-{
-	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
-		      O_RDONLY);
-	size_t enabled = 0;
-	char buf[15];
-	int ret;
-
-	if (fd < 0)
-		return;
-
-	ret = pread(fd, buf, sizeof(buf), 0);
-	if (ret > 0 && ret < sizeof(buf)) {
-		buf[ret] = 0;
-
-		enabled = strtoul(buf, NULL, 10);
-		if (enabled == 1) {
-			has_huge_zeropage = true;
-			ksft_print_msg("[INFO] huge zeropage is enabled\n");
-		}
-	}
-
-	close(fd);
-}
-
-static void detect_hugetlbsizes(void)
-{
-	DIR *dir = opendir("/sys/kernel/mm/hugepages/");
-
-	if (!dir)
-		return;
-
-	while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) {
-		struct dirent *entry = readdir(dir);
-		size_t kb;
-
-		if (!entry)
-			break;
-		if (entry->d_type != DT_DIR)
-			continue;
-		if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1)
-			continue;
-		hugetlbsizes[nr_hugetlbsizes] = kb * 1024;
-		nr_hugetlbsizes++;
-		ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n",
-			       kb);
-	}
-	closedir(dir);
-}
-
-static bool range_is_swapped(void *addr, size_t size)
-{
-	for (; size; addr += pagesize, size -= pagesize)
-		if (!pagemap_is_swapped(pagemap_fd, addr))
-			return false;
-	return true;
-}
-
-struct comm_pipes {
-	int child_ready[2];
-	int parent_ready[2];
-};
-
-static int setup_comm_pipes(struct comm_pipes *comm_pipes)
-{
-	if (pipe(comm_pipes->child_ready) < 0)
-		return -errno;
-	if (pipe(comm_pipes->parent_ready) < 0) {
-		close(comm_pipes->child_ready[0]);
-		close(comm_pipes->child_ready[1]);
-		return -errno;
-	}
-
-	return 0;
-}
-
-static void close_comm_pipes(struct comm_pipes *comm_pipes)
-{
-	close(comm_pipes->child_ready[0]);
-	close(comm_pipes->child_ready[1]);
-	close(comm_pipes->parent_ready[0]);
-	close(comm_pipes->parent_ready[1]);
-}
-
-static int child_memcmp_fn(char *mem, size_t size,
-			   struct comm_pipes *comm_pipes)
-{
-	char *old = malloc(size);
-	char buf;
-
-	/* Backup the original content. */
-	memcpy(old, mem, size);
-
-	/* Wait until the parent modified the page. */
-	write(comm_pipes->child_ready[1], "0", 1);
-	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
-		;
-
-	/* See if we still read the old values. */
-	return memcmp(old, mem, size);
-}
-
-static int child_vmsplice_memcmp_fn(char *mem, size_t size,
-				    struct comm_pipes *comm_pipes)
-{
-	struct iovec iov = {
-		.iov_base = mem,
-		.iov_len = size,
-	};
-	ssize_t cur, total, transferred;
-	char *old, *new;
-	int fds[2];
-	char buf;
-
-	old = malloc(size);
-	new = malloc(size);
-
-	/* Backup the original content. */
-	memcpy(old, mem, size);
-
-	if (pipe(fds) < 0)
-		return -errno;
-
-	/* Trigger a read-only pin. */
-	transferred = vmsplice(fds[1], &iov, 1, 0);
-	if (transferred < 0)
-		return -errno;
-	if (transferred == 0)
-		return -EINVAL;
-
-	/* Unmap it from our page tables. */
-	if (munmap(mem, size) < 0)
-		return -errno;
-
-	/* Wait until the parent modified it. */
-	write(comm_pipes->child_ready[1], "0", 1);
-	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
-		;
-
-	/* See if we still read the old values via the pipe. */
-	for (total = 0; total < transferred; total += cur) {
-		cur = read(fds[0], new + total, transferred - total);
-		if (cur < 0)
-			return -errno;
-	}
-
-	return memcmp(old, new, transferred);
-}
-
-typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
-
-static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
-				  child_fn fn)
-{
-	struct comm_pipes comm_pipes;
-	char buf;
-	int ret;
-
-	ret = setup_comm_pipes(&comm_pipes);
-	if (ret) {
-		ksft_test_result_fail("pipe() failed\n");
-		return;
-	}
-
-	ret = fork();
-	if (ret < 0) {
-		ksft_test_result_fail("fork() failed\n");
-		goto close_comm_pipes;
-	} else if (!ret) {
-		exit(fn(mem, size, &comm_pipes));
-	}
-
-	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-		;
-
-	if (do_mprotect) {
-		/*
-		 * mprotect() optimizations might try avoiding
-		 * write-faults by directly mapping pages writable.
-		 */
-		ret = mprotect(mem, size, PROT_READ);
-		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
-		if (ret) {
-			ksft_test_result_fail("mprotect() failed\n");
-			write(comm_pipes.parent_ready[1], "0", 1);
-			wait(&ret);
-			goto close_comm_pipes;
-		}
-	}
-
-	/* Modify the page. */
-	memset(mem, 0xff, size);
-	write(comm_pipes.parent_ready[1], "0", 1);
-
-	wait(&ret);
-	if (WIFEXITED(ret))
-		ret = WEXITSTATUS(ret);
-	else
-		ret = -EINVAL;
-
-	ksft_test_result(!ret, "No leak from parent into child\n");
-close_comm_pipes:
-	close_comm_pipes(&comm_pipes);
-}
-
-static void test_cow_in_parent(char *mem, size_t size)
-{
-	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
-}
-
-static void test_cow_in_parent_mprotect(char *mem, size_t size)
-{
-	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
-}
-
-static void test_vmsplice_in_child(char *mem, size_t size)
-{
-	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
-}
-
-static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
-{
-	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
-}
-
-static void do_test_vmsplice_in_parent(char *mem, size_t size,
-				       bool before_fork)
-{
-	struct iovec iov = {
-		.iov_base = mem,
-		.iov_len = size,
-	};
-	ssize_t cur, total, transferred;
-	struct comm_pipes comm_pipes;
-	char *old, *new;
-	int ret, fds[2];
-	char buf;
-
-	old = malloc(size);
-	new = malloc(size);
-
-	memcpy(old, mem, size);
-
-	ret = setup_comm_pipes(&comm_pipes);
-	if (ret) {
-		ksft_test_result_fail("pipe() failed\n");
-		goto free;
-	}
-
-	if (pipe(fds) < 0) {
-		ksft_test_result_fail("pipe() failed\n");
-		goto close_comm_pipes;
-	}
-
-	if (before_fork) {
-		transferred = vmsplice(fds[1], &iov, 1, 0);
-		if (transferred <= 0) {
-			ksft_test_result_fail("vmsplice() failed\n");
-			goto close_pipe;
-		}
-	}
-
-	ret = fork();
-	if (ret < 0) {
-		ksft_test_result_fail("fork() failed\n");
-		goto close_pipe;
-	} else if (!ret) {
-		write(comm_pipes.child_ready[1], "0", 1);
-		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
-			;
-		/* Modify page content in the child. */
-		memset(mem, 0xff, size);
-		exit(0);
-	}
-
-	if (!before_fork) {
-		transferred = vmsplice(fds[1], &iov, 1, 0);
-		if (transferred <= 0) {
-			ksft_test_result_fail("vmsplice() failed\n");
-			wait(&ret);
-			goto close_pipe;
-		}
-	}
-
-	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-		;
-	if (munmap(mem, size) < 0) {
-		ksft_test_result_fail("munmap() failed\n");
-		goto close_pipe;
-	}
-	write(comm_pipes.parent_ready[1], "0", 1);
-
-	/* Wait until the child is done writing. */
-	wait(&ret);
-	if (!WIFEXITED(ret)) {
-		ksft_test_result_fail("wait() failed\n");
-		goto close_pipe;
-	}
-
-	/* See if we still read the old values. */
-	for (total = 0; total < transferred; total += cur) {
-		cur = read(fds[0], new + total, transferred - total);
-		if (cur < 0) {
-			ksft_test_result_fail("read() failed\n");
-			goto close_pipe;
-		}
-	}
-
-	ksft_test_result(!memcmp(old, new, transferred),
-			 "No leak from child into parent\n");
-close_pipe:
-	close(fds[0]);
-	close(fds[1]);
-close_comm_pipes:
-	close_comm_pipes(&comm_pipes);
-free:
-	free(old);
-	free(new);
-}
-
-static void test_vmsplice_before_fork(char *mem, size_t size)
-{
-	do_test_vmsplice_in_parent(mem, size, true);
-}
-
-static void test_vmsplice_after_fork(char *mem, size_t size)
-{
-	do_test_vmsplice_in_parent(mem, size, false);
-}
-
-#ifdef LOCAL_CONFIG_HAVE_LIBURING
-static void do_test_iouring(char *mem, size_t size, bool use_fork)
-{
-	struct comm_pipes comm_pipes;
-	struct io_uring_cqe *cqe;
-	struct io_uring_sqe *sqe;
-	struct io_uring ring;
-	ssize_t cur, total;
-	struct iovec iov;
-	char *buf, *tmp;
-	int ret, fd;
-	FILE *file;
-
-	ret = setup_comm_pipes(&comm_pipes);
-	if (ret) {
-		ksft_test_result_fail("pipe() failed\n");
-		return;
-	}
-
-	file = tmpfile();
-	if (!file) {
-		ksft_test_result_fail("tmpfile() failed\n");
-		goto close_comm_pipes;
-	}
-	fd = fileno(file);
-	assert(fd);
-
-	tmp = malloc(size);
-	if (!tmp) {
-		ksft_test_result_fail("malloc() failed\n");
-		goto close_file;
-	}
-
-	/* Skip on errors, as we might just lack kernel support. */
-	ret = io_uring_queue_init(1, &ring, 0);
-	if (ret < 0) {
-		ksft_test_result_skip("io_uring_queue_init() failed\n");
-		goto free_tmp;
-	}
-
-	/*
-	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
-	 * | FOLL_LONGTERM the range.
-	 *
-	 * Skip on errors, as we might just lack kernel support or might not
-	 * have sufficient MEMLOCK permissions.
-	 */
-	iov.iov_base = mem;
-	iov.iov_len = size;
-	ret = io_uring_register_buffers(&ring, &iov, 1);
-	if (ret) {
-		ksft_test_result_skip("io_uring_register_buffers() failed\n");
-		goto queue_exit;
-	}
-
-	if (use_fork) {
-		/*
-		 * fork() and keep the child alive until we're done. Note that
-		 * we expect the pinned page to not get shared with the child.
-		 */
-		ret = fork();
-		if (ret < 0) {
-			ksft_test_result_fail("fork() failed\n");
-			goto unregister_buffers;
-		} else if (!ret) {
-			write(comm_pipes.child_ready[1], "0", 1);
-			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
-				;
-			exit(0);
-		}
-
-		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-			;
-	} else {
-		/*
-		 * Map the page R/O into the page table. Enable softdirty
-		 * tracking to stop the page from getting mapped R/W immediately
-		 * again by mprotect() optimizations. Note that we don't have an
-		 * easy way to test if that worked (the pagemap does not export
-		 * if the page is mapped R/O vs. R/W).
-		 */
-		ret = mprotect(mem, size, PROT_READ);
-		clear_softdirty();
-		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
-		if (ret) {
-			ksft_test_result_fail("mprotect() failed\n");
-			goto unregister_buffers;
-		}
-	}
-
-	/*
-	 * Modify the page and write page content as observed by the fixed
-	 * buffer pin to the file so we can verify it.
-	 */
-	memset(mem, 0xff, size);
-	sqe = io_uring_get_sqe(&ring);
-	if (!sqe) {
-		ksft_test_result_fail("io_uring_get_sqe() failed\n");
-		goto quit_child;
-	}
-	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
-
-	ret = io_uring_submit(&ring);
-	if (ret < 0) {
-		ksft_test_result_fail("io_uring_submit() failed\n");
-		goto quit_child;
-	}
-
-	ret = io_uring_wait_cqe(&ring, &cqe);
-	if (ret < 0) {
-		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
-		goto quit_child;
-	}
-
-	if (cqe->res != size) {
-		ksft_test_result_fail("write_fixed failed\n");
-		goto quit_child;
-	}
-	io_uring_cqe_seen(&ring, cqe);
-
-	/* Read back the file content to the temporary buffer. */
-	total = 0;
-	while (total < size) {
-		cur = pread(fd, tmp + total, size - total, total);
-		if (cur < 0) {
-			ksft_test_result_fail("pread() failed\n");
-			goto quit_child;
-		}
-		total += cur;
-	}
-
-	/* Finally, check if we read what we expected. */
-	ksft_test_result(!memcmp(mem, tmp, size),
-			 "Longterm R/W pin is reliable\n");
-
-quit_child:
-	if (use_fork) {
-		write(comm_pipes.parent_ready[1], "0", 1);
-		wait(&ret);
-	}
-unregister_buffers:
-	io_uring_unregister_buffers(&ring);
-queue_exit:
-	io_uring_queue_exit(&ring);
-free_tmp:
-	free(tmp);
-close_file:
-	fclose(file);
-close_comm_pipes:
-	close_comm_pipes(&comm_pipes);
-}
-
-static void test_iouring_ro(char *mem, size_t size)
-{
-	do_test_iouring(mem, size, false);
-}
-
-static void test_iouring_fork(char *mem, size_t size)
-{
-	do_test_iouring(mem, size, true);
-}
-
-#endif /* LOCAL_CONFIG_HAVE_LIBURING */
-
-enum ro_pin_test {
-	RO_PIN_TEST,
-	RO_PIN_TEST_SHARED,
-	RO_PIN_TEST_PREVIOUSLY_SHARED,
-	RO_PIN_TEST_RO_EXCLUSIVE,
-};
-
-static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
-			   bool fast)
-{
-	struct pin_longterm_test args;
-	struct comm_pipes comm_pipes;
-	char *tmp, buf;
-	__u64 tmp_val;
-	int ret;
-
-	if (gup_fd < 0) {
-		ksft_test_result_skip("gup_test not available\n");
-		return;
-	}
-
-	tmp = malloc(size);
-	if (!tmp) {
-		ksft_test_result_fail("malloc() failed\n");
-		return;
-	}
-
-	ret = setup_comm_pipes(&comm_pipes);
-	if (ret) {
-		ksft_test_result_fail("pipe() failed\n");
-		goto free_tmp;
-	}
-
-	switch (test) {
-	case RO_PIN_TEST:
-		break;
-	case RO_PIN_TEST_SHARED:
-	case RO_PIN_TEST_PREVIOUSLY_SHARED:
-		/*
-		 * Share the pages with our child. As the pages are not pinned,
-		 * this should just work.
-		 */
-		ret = fork();
-		if (ret < 0) {
-			ksft_test_result_fail("fork() failed\n");
-			goto close_comm_pipes;
-		} else if (!ret) {
-			write(comm_pipes.child_ready[1], "0", 1);
-			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
-				;
-			exit(0);
-		}
-
-		/* Wait until our child is ready. */
-		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-			;
-
-		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
-			/*
-			 * Tell the child to quit now and wait until it quit.
-			 * The pages should now be mapped R/O into our page
-			 * tables, but they are no longer shared.
-			 */
-			write(comm_pipes.parent_ready[1], "0", 1);
-			wait(&ret);
-			if (!WIFEXITED(ret))
-				ksft_print_msg("[INFO] wait() failed\n");
-		}
-		break;
-	case RO_PIN_TEST_RO_EXCLUSIVE:
-		/*
-		 * Map the page R/O into the page table. Enable softdirty
-		 * tracking to stop the page from getting mapped R/W immediately
-		 * again by mprotect() optimizations. Note that we don't have an
-		 * easy way to test if that worked (the pagemap does not export
-		 * if the page is mapped R/O vs. R/W).
-		 */
-		ret = mprotect(mem, size, PROT_READ);
-		clear_softdirty();
-		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
-		if (ret) {
-			ksft_test_result_fail("mprotect() failed\n");
-			goto close_comm_pipes;
-		}
-		break;
-	default:
-		assert(false);
-	}
-
-	/* Take a R/O pin. This should trigger unsharing. */
-	args.addr = (__u64)(uintptr_t)mem;
-	args.size = size;
-	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
-	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
-	if (ret) {
-		if (errno == EINVAL)
-			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
-		else
-			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
-		goto wait;
-	}
-
-	/* Modify the page. */
-	memset(mem, 0xff, size);
-
-	/*
-	 * Read back the content via the pin to the temporary buffer and
-	 * test if we observed the modification.
-	 */
-	tmp_val = (__u64)(uintptr_t)tmp;
-	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
-	if (ret)
-		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
-	else
-		ksft_test_result(!memcmp(mem, tmp, size),
-				 "Longterm R/O pin is reliable\n");
-
-	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
-	if (ret)
-		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
-wait:
-	switch (test) {
-	case RO_PIN_TEST_SHARED:
-		write(comm_pipes.parent_ready[1], "0", 1);
-		wait(&ret);
-		if (!WIFEXITED(ret))
-			ksft_print_msg("[INFO] wait() failed\n");
-		break;
-	default:
-		break;
-	}
-close_comm_pipes:
-	close_comm_pipes(&comm_pipes);
-free_tmp:
-	free(tmp);
-}
-
-static void test_ro_pin_on_shared(char *mem, size_t size)
-{
-	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
-}
-
-static void test_ro_fast_pin_on_shared(char *mem, size_t size)
-{
-	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
-}
-
-static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
-{
-	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
-}
-
-static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
-{
-	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
-}
-
-static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
-{
-	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
-}
-
-static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
-{
-	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
-}
-
-typedef void (*test_fn)(char *mem, size_t size);
-
-static void do_run_with_base_page(test_fn fn, bool swapout)
-{
-	char *mem;
-	int ret;
-
-	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
-		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		return;
-	}
-
-	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
-	/* Ignore if not around on a kernel. */
-	if (ret && errno != EINVAL) {
-		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
-		goto munmap;
-	}
-
-	/* Populate a base page. */
-	memset(mem, 0, pagesize);
-
-	if (swapout) {
-		madvise(mem, pagesize, MADV_PAGEOUT);
-		if (!pagemap_is_swapped(pagemap_fd, mem)) {
-			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
-			goto munmap;
-		}
-	}
-
-	fn(mem, pagesize);
-munmap:
-	munmap(mem, pagesize);
-}
-
-static void run_with_base_page(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with base page\n", desc);
-	do_run_with_base_page(fn, false);
-}
-
-static void run_with_base_page_swap(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
-	do_run_with_base_page(fn, true);
-}
-
-enum thp_run {
-	THP_RUN_PMD,
-	THP_RUN_PMD_SWAPOUT,
-	THP_RUN_PTE,
-	THP_RUN_PTE_SWAPOUT,
-	THP_RUN_SINGLE_PTE,
-	THP_RUN_SINGLE_PTE_SWAPOUT,
-	THP_RUN_PARTIAL_MREMAP,
-	THP_RUN_PARTIAL_SHARED,
-};
-
-static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
-{
-	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
-	size_t size, mmap_size, mremap_size;
-	int ret;
-
-	/* For alignment purposes, we need twice the thp size. */
-	mmap_size = 2 * thpsize;
-	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
-			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (mmap_mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		return;
-	}
-
-	/* We need a THP-aligned memory area. */
-	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
-
-	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
-	if (ret) {
-		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
-		goto munmap;
-	}
-
-	/*
-	 * Try to populate a THP. Touch the first sub-page and test if we get
-	 * another sub-page populated automatically.
-	 */
-	mem[0] = 0;
-	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
-		ksft_test_result_skip("Did not get a THP populated\n");
-		goto munmap;
-	}
-	memset(mem, 0, thpsize);
-
-	size = thpsize;
-	switch (thp_run) {
-	case THP_RUN_PMD:
-	case THP_RUN_PMD_SWAPOUT:
-		break;
-	case THP_RUN_PTE:
-	case THP_RUN_PTE_SWAPOUT:
-		/*
-		 * Trigger PTE-mapping the THP by temporarily mapping a single
-		 * subpage R/O.
-		 */
-		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
-		if (ret) {
-			ksft_test_result_fail("mprotect() failed\n");
-			goto munmap;
-		}
-		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
-		if (ret) {
-			ksft_test_result_fail("mprotect() failed\n");
-			goto munmap;
-		}
-		break;
-	case THP_RUN_SINGLE_PTE:
-	case THP_RUN_SINGLE_PTE_SWAPOUT:
-		/*
-		 * Discard all but a single subpage of that PTE-mapped THP. What
-		 * remains is a single PTE mapping a single subpage.
-		 */
-		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
-		if (ret) {
-			ksft_test_result_fail("MADV_DONTNEED failed\n");
-			goto munmap;
-		}
-		size = pagesize;
-		break;
-	case THP_RUN_PARTIAL_MREMAP:
-		/*
-		 * Remap half of the THP. We need some new memory location
-		 * for that.
-		 */
-		mremap_size = thpsize / 2;
-		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
-				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-		if (mem == MAP_FAILED) {
-			ksft_test_result_fail("mmap() failed\n");
-			goto munmap;
-		}
-		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
-			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
-		if (tmp != mremap_mem) {
-			ksft_test_result_fail("mremap() failed\n");
-			goto munmap;
-		}
-		size = mremap_size;
-		break;
-	case THP_RUN_PARTIAL_SHARED:
-		/*
-		 * Share the first page of the THP with a child and quit the
-		 * child. This will result in some parts of the THP never
-		 * have been shared.
-		 */
-		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
-		if (ret) {
-			ksft_test_result_fail("MADV_DONTFORK failed\n");
-			goto munmap;
-		}
-		ret = fork();
-		if (ret < 0) {
-			ksft_test_result_fail("fork() failed\n");
-			goto munmap;
-		} else if (!ret) {
-			exit(0);
-		}
-		wait(&ret);
-		/* Allow for sharing all pages again. */
-		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
-		if (ret) {
-			ksft_test_result_fail("MADV_DOFORK failed\n");
-			goto munmap;
-		}
-		break;
-	default:
-		assert(false);
-	}
-
-	switch (thp_run) {
-	case THP_RUN_PMD_SWAPOUT:
-	case THP_RUN_PTE_SWAPOUT:
-	case THP_RUN_SINGLE_PTE_SWAPOUT:
-		madvise(mem, size, MADV_PAGEOUT);
-		if (!range_is_swapped(mem, size)) {
-			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
-			goto munmap;
-		}
-		break;
-	default:
-		break;
-	}
-
-	fn(mem, size);
-munmap:
-	munmap(mmap_mem, mmap_size);
-	if (mremap_mem != MAP_FAILED)
-		munmap(mremap_mem, mremap_size);
-}
-
-static void run_with_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PMD);
-}
-
-static void run_with_thp_swap(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
-}
-
-static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PTE);
-}
-
-static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
-}
-
-static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
-}
-
-static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
-}
-
-static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
-}
-
-static void run_with_partial_shared_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
-}
-
-static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
-{
-	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
-	char *mem, *dummy;
-
-	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
-		       hugetlbsize / 1024);
-
-	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
-
-	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_skip("need more free huge pages\n");
-		return;
-	}
-
-	/* Populate an huge page. */
-	memset(mem, 0, hugetlbsize);
-
-	/*
-	 * We need a total of two hugetlb pages to handle COW/unsharing
-	 * properly, otherwise we might get zapped by a SIGBUS.
-	 */
-	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
-	if (dummy == MAP_FAILED) {
-		ksft_test_result_skip("need more free huge pages\n");
-		goto munmap;
-	}
-	munmap(dummy, hugetlbsize);
-
-	fn(mem, hugetlbsize);
-munmap:
-	munmap(mem, hugetlbsize);
-}
-
-struct test_case {
-	const char *desc;
-	test_fn fn;
-};
-
-/*
- * Test cases that are specific to anonymous pages: pages in private mappings
- * that may get shared via COW during fork().
- */
-static const struct test_case anon_test_cases[] = {
-	/*
-	 * Basic COW tests for fork() without any GUP. If we miss to break COW,
-	 * either the child can observe modifications by the parent or the
-	 * other way around.
-	 */
-	{
-		"Basic COW after fork()",
-		test_cow_in_parent,
-	},
-	/*
-	 * Basic test, but do an additional mprotect(PROT_READ)+
-	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
-	 */
-	{
-		"Basic COW after fork() with mprotect() optimization",
-		test_cow_in_parent_mprotect,
-	},
-	/*
-	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
-	 * we miss to break COW, the child observes modifications by the parent.
-	 * This is CVE-2020-29374 reported by Jann Horn.
-	 */
-	{
-		"vmsplice() + unmap in child",
-		test_vmsplice_in_child
-	},
-	/*
-	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
-	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
-	 */
-	{
-		"vmsplice() + unmap in child with mprotect() optimization",
-		test_vmsplice_in_child_mprotect
-	},
-	/*
-	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
-	 * fork(); modify in the child. If we miss to break COW, the parent
-	 * observes modifications by the child.
-	 */
-	{
-		"vmsplice() before fork(), unmap in parent after fork()",
-		test_vmsplice_before_fork,
-	},
-	/*
-	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
-	 * child. If we miss to break COW, the parent observes modifications by
-	 * the child.
-	 */
-	{
-		"vmsplice() + unmap in parent after fork()",
-		test_vmsplice_after_fork,
-	},
-#ifdef LOCAL_CONFIG_HAVE_LIBURING
-	/*
-	 * Take a R/W longterm pin and then map the page R/O into the page
-	 * table to trigger a write fault on next access. When modifying the
-	 * page, the page content must be visible via the pin.
-	 */
-	{
-		"R/O-mapping a page registered as iouring fixed buffer",
-		test_iouring_ro,
-	},
-	/*
-	 * Take a R/W longterm pin and then fork() a child. When modifying the
-	 * page, the page content must be visible via the pin. We expect the
-	 * pinned page to not get shared with the child.
-	 */
-	{
-		"fork() with an iouring fixed buffer",
-		test_iouring_fork,
-	},
-
-#endif /* LOCAL_CONFIG_HAVE_LIBURING */
-	/*
-	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
-	 * When modifying the page via the page table, the page content change
-	 * must be visible via the pin.
-	 */
-	{
-		"R/O GUP pin on R/O-mapped shared page",
-		test_ro_pin_on_shared,
-	},
-	/* Same as above, but using GUP-fast. */
-	{
-		"R/O GUP-fast pin on R/O-mapped shared page",
-		test_ro_fast_pin_on_shared,
-	},
-	/*
-	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
-	 * was previously shared. When modifying the page via the page table,
-	 * the page content change must be visible via the pin.
-	 */
-	{
-		"R/O GUP pin on R/O-mapped previously-shared page",
-		test_ro_pin_on_ro_previously_shared,
-	},
-	/* Same as above, but using GUP-fast. */
-	{
-		"R/O GUP-fast pin on R/O-mapped previously-shared page",
-		test_ro_fast_pin_on_ro_previously_shared,
-	},
-	/*
-	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
-	 * When modifying the page via the page table, the page content change
-	 * must be visible via the pin.
-	 */
-	{
-		"R/O GUP pin on R/O-mapped exclusive page",
-		test_ro_pin_on_ro_exclusive,
-	},
-	/* Same as above, but using GUP-fast. */
-	{
-		"R/O GUP-fast pin on R/O-mapped exclusive page",
-		test_ro_fast_pin_on_ro_exclusive,
-	},
-};
-
-static void run_anon_test_case(struct test_case const *test_case)
-{
-	int i;
-
-	run_with_base_page(test_case->fn, test_case->desc);
-	run_with_base_page_swap(test_case->fn, test_case->desc);
-	if (thpsize) {
-		run_with_thp(test_case->fn, test_case->desc);
-		run_with_thp_swap(test_case->fn, test_case->desc);
-		run_with_pte_mapped_thp(test_case->fn, test_case->desc);
-		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
-		run_with_single_pte_of_thp(test_case->fn, test_case->desc);
-		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
-		run_with_partial_mremap_thp(test_case->fn, test_case->desc);
-		run_with_partial_shared_thp(test_case->fn, test_case->desc);
-	}
-	for (i = 0; i < nr_hugetlbsizes; i++)
-		run_with_hugetlb(test_case->fn, test_case->desc,
-				 hugetlbsizes[i]);
-}
-
-static void run_anon_test_cases(void)
-{
-	int i;
-
-	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
-
-	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
-		run_anon_test_case(&anon_test_cases[i]);
-}
-
-static int tests_per_anon_test_case(void)
-{
-	int tests = 2 + nr_hugetlbsizes;
-
-	if (thpsize)
-		tests += 8;
-	return tests;
-}
-
-enum anon_thp_collapse_test {
-	ANON_THP_COLLAPSE_UNSHARED,
-	ANON_THP_COLLAPSE_FULLY_SHARED,
-	ANON_THP_COLLAPSE_LOWER_SHARED,
-	ANON_THP_COLLAPSE_UPPER_SHARED,
-};
-
-static void do_test_anon_thp_collapse(char *mem, size_t size,
-				      enum anon_thp_collapse_test test)
-{
-	struct comm_pipes comm_pipes;
-	char buf;
-	int ret;
-
-	ret = setup_comm_pipes(&comm_pipes);
-	if (ret) {
-		ksft_test_result_fail("pipe() failed\n");
-		return;
-	}
-
-	/*
-	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
-	 * R/O, such that we can try collapsing it later.
-	 */
-	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
-	if (ret) {
-		ksft_test_result_fail("mprotect() failed\n");
-		goto close_comm_pipes;
-	}
-	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
-	if (ret) {
-		ksft_test_result_fail("mprotect() failed\n");
-		goto close_comm_pipes;
-	}
-
-	switch (test) {
-	case ANON_THP_COLLAPSE_UNSHARED:
-		/* Collapse before actually COW-sharing the page. */
-		ret = madvise(mem, size, MADV_COLLAPSE);
-		if (ret) {
-			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
-					      strerror(errno));
-			goto close_comm_pipes;
-		}
-		break;
-	case ANON_THP_COLLAPSE_FULLY_SHARED:
-		/* COW-share the full PTE-mapped THP. */
-		break;
-	case ANON_THP_COLLAPSE_LOWER_SHARED:
-		/* Don't COW-share the upper part of the THP. */
-		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
-		if (ret) {
-			ksft_test_result_fail("MADV_DONTFORK failed\n");
-			goto close_comm_pipes;
-		}
-		break;
-	case ANON_THP_COLLAPSE_UPPER_SHARED:
-		/* Don't COW-share the lower part of the THP. */
-		ret = madvise(mem, size / 2, MADV_DONTFORK);
-		if (ret) {
-			ksft_test_result_fail("MADV_DONTFORK failed\n");
-			goto close_comm_pipes;
-		}
-		break;
-	default:
-		assert(false);
-	}
-
-	ret = fork();
-	if (ret < 0) {
-		ksft_test_result_fail("fork() failed\n");
-		goto close_comm_pipes;
-	} else if (!ret) {
-		switch (test) {
-		case ANON_THP_COLLAPSE_UNSHARED:
-		case ANON_THP_COLLAPSE_FULLY_SHARED:
-			exit(child_memcmp_fn(mem, size, &comm_pipes));
-			break;
-		case ANON_THP_COLLAPSE_LOWER_SHARED:
-			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
-			break;
-		case ANON_THP_COLLAPSE_UPPER_SHARED:
-			exit(child_memcmp_fn(mem + size / 2, size / 2,
-					     &comm_pipes));
-			break;
-		default:
-			assert(false);
-		}
-	}
-
-	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-		;
-
-	switch (test) {
-	case ANON_THP_COLLAPSE_UNSHARED:
-		break;
-	case ANON_THP_COLLAPSE_UPPER_SHARED:
-	case ANON_THP_COLLAPSE_LOWER_SHARED:
-		/*
-		 * Revert MADV_DONTFORK such that we merge the VMAs and are
-		 * able to actually collapse.
-		 */
-		ret = madvise(mem, size, MADV_DOFORK);
-		if (ret) {
-			ksft_test_result_fail("MADV_DOFORK failed\n");
-			write(comm_pipes.parent_ready[1], "0", 1);
-			wait(&ret);
-			goto close_comm_pipes;
-		}
-		/* FALLTHROUGH */
-	case ANON_THP_COLLAPSE_FULLY_SHARED:
-		/* Collapse before anyone modified the COW-shared page. */
-		ret = madvise(mem, size, MADV_COLLAPSE);
-		if (ret) {
-			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
-					      strerror(errno));
-			write(comm_pipes.parent_ready[1], "0", 1);
-			wait(&ret);
-			goto close_comm_pipes;
-		}
-		break;
-	default:
-		assert(false);
-	}
-
-	/* Modify the page. */
-	memset(mem, 0xff, size);
-	write(comm_pipes.parent_ready[1], "0", 1);
-
-	wait(&ret);
-	if (WIFEXITED(ret))
-		ret = WEXITSTATUS(ret);
-	else
-		ret = -EINVAL;
-
-	ksft_test_result(!ret, "No leak from parent into child\n");
-close_comm_pipes:
-	close_comm_pipes(&comm_pipes);
-}
-
-static void test_anon_thp_collapse_unshared(char *mem, size_t size)
-{
-	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
-}
-
-static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
-{
-	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
-}
-
-static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
-{
-	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
-}
-
-static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
-{
-	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
-}
-
-/*
- * Test cases that are specific to anonymous THP: pages in private mappings
- * that may get shared via COW during fork().
- */
-static const struct test_case anon_thp_test_cases[] = {
-	/*
-	 * Basic COW test for fork() without any GUP when collapsing a THP
-	 * before fork().
-	 *
-	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
-	 * collapse") might easily get COW handling wrong when not collapsing
-	 * exclusivity information properly.
-	 */
-	{
-		"Basic COW after fork() when collapsing before fork()",
-		test_anon_thp_collapse_unshared,
-	},
-	/* Basic COW test, but collapse after COW-sharing a full THP. */
-	{
-		"Basic COW after fork() when collapsing after fork() (fully shared)",
-		test_anon_thp_collapse_fully_shared,
-	},
-	/*
-	 * Basic COW test, but collapse after COW-sharing the lower half of a
-	 * THP.
-	 */
-	{
-		"Basic COW after fork() when collapsing after fork() (lower shared)",
-		test_anon_thp_collapse_lower_shared,
-	},
-	/*
-	 * Basic COW test, but collapse after COW-sharing the upper half of a
-	 * THP.
-	 */
-	{
-		"Basic COW after fork() when collapsing after fork() (upper shared)",
-		test_anon_thp_collapse_upper_shared,
-	},
-};
-
-static void run_anon_thp_test_cases(void)
-{
-	int i;
-
-	if (!thpsize)
-		return;
-
-	ksft_print_msg("[INFO] Anonymous THP tests\n");
-
-	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
-		struct test_case const *test_case = &anon_thp_test_cases[i];
-
-		ksft_print_msg("[RUN] %s\n", test_case->desc);
-		do_run_with_thp(test_case->fn, THP_RUN_PMD);
-	}
-}
-
-static int tests_per_anon_thp_test_case(void)
-{
-	return thpsize ? 1 : 0;
-}
-
-typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
-
-static void test_cow(char *mem, const char *smem, size_t size)
-{
-	char *old = malloc(size);
-
-	/* Backup the original content. */
-	memcpy(old, smem, size);
-
-	/* Modify the page. */
-	memset(mem, 0xff, size);
-
-	/* See if we still read the old values via the other mapping. */
-	ksft_test_result(!memcmp(smem, old, size),
-			 "Other mapping not modified\n");
-	free(old);
-}
-
-static void test_ro_pin(char *mem, const char *smem, size_t size)
-{
-	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
-}
-
-static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
-{
-	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
-}
-
-static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
-{
-	char *mem, *smem, tmp;
-
-	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
-
-	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
-		   MAP_PRIVATE | MAP_ANON, -1, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		return;
-	}
-
-	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto munmap;
-	}
-
-	/* Read from the page to populate the shared zeropage. */
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-
-	fn(mem, smem, pagesize);
-munmap:
-	munmap(mem, pagesize);
-	if (smem != MAP_FAILED)
-		munmap(smem, pagesize);
-}
-
-static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
-{
-	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
-	size_t mmap_size;
-	int ret;
-
-	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
-
-	if (!has_huge_zeropage) {
-		ksft_test_result_skip("Huge zeropage not enabled\n");
-		return;
-	}
-
-	/* For alignment purposes, we need twice the thp size. */
-	mmap_size = 2 * thpsize;
-	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
-			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (mmap_mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		return;
-	}
-	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
-			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (mmap_smem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto munmap;
-	}
-
-	/* We need a THP-aligned memory area. */
-	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
-	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));
-
-	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
-	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
-	if (ret) {
-		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
-		goto munmap;
-	}
-
-	/*
-	 * Read from the memory to populate the huge shared zeropage. Read from
-	 * the first sub-page and test if we get another sub-page populated
-	 * automatically.
-	 */
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
-	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
-		ksft_test_result_skip("Did not get THPs populated\n");
-		goto munmap;
-	}
-
-	fn(mem, smem, thpsize);
-munmap:
-	munmap(mmap_mem, mmap_size);
-	if (mmap_smem != MAP_FAILED)
-		munmap(mmap_smem, mmap_size);
-}
-
-static void run_with_memfd(non_anon_test_fn fn, const char *desc)
-{
-	char *mem, *smem, tmp;
-	int fd;
-
-	ksft_print_msg("[RUN] %s ... with memfd\n", desc);
-
-	fd = memfd_create("test", 0);
-	if (fd < 0) {
-		ksft_test_result_fail("memfd_create() failed\n");
-		return;
-	}
-
-	/* File consists of a single page filled with zeroes. */
-	if (fallocate(fd, 0, 0, pagesize)) {
-		ksft_test_result_fail("fallocate() failed\n");
-		goto close;
-	}
-
-	/* Create a private mapping of the memfd. */
-	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto close;
-	}
-	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto munmap;
-	}
-
-	/* Fault the page in. */
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-
-	fn(mem, smem, pagesize);
-munmap:
-	munmap(mem, pagesize);
-	if (smem != MAP_FAILED)
-		munmap(smem, pagesize);
-close:
-	close(fd);
-}
-
-static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
-{
-	char *mem, *smem, tmp;
-	FILE *file;
-	int fd;
-
-	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
-
-	file = tmpfile();
-	if (!file) {
-		ksft_test_result_fail("tmpfile() failed\n");
-		return;
-	}
-
-	fd = fileno(file);
-	if (fd < 0) {
-		ksft_test_result_skip("fileno() failed\n");
-		return;
-	}
-
-	/* File consists of a single page filled with zeroes. */
-	if (fallocate(fd, 0, 0, pagesize)) {
-		ksft_test_result_fail("fallocate() failed\n");
-		goto close;
-	}
-
-	/* Create a private mapping of the memfd. */
-	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto close;
-	}
-	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto munmap;
-	}
-
-	/* Fault the page in. */
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-
-	fn(mem, smem, pagesize);
-munmap:
-	munmap(mem, pagesize);
-	if (smem != MAP_FAILED)
-		munmap(smem, pagesize);
-close:
-	fclose(file);
-}
-
-static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
-				   size_t hugetlbsize)
-{
-	int flags = MFD_HUGETLB;
-	char *mem, *smem, tmp;
-	int fd;
-
-	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
-		       hugetlbsize / 1024);
-
-	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
-
-	fd = memfd_create("test", flags);
-	if (fd < 0) {
-		ksft_test_result_skip("memfd_create() failed\n");
-		return;
-	}
-
-	/* File consists of a single page filled with zeroes. */
-	if (fallocate(fd, 0, 0, hugetlbsize)) {
-		ksft_test_result_skip("need more free huge pages\n");
-		goto close;
-	}
-
-	/* Create a private mapping of the memfd. */
-	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
-		   0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_skip("need more free huge pages\n");
-		goto close;
-	}
-	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto munmap;
-	}
-
-	/* Fault the page in. */
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-
-	fn(mem, smem, hugetlbsize);
-munmap:
-	munmap(mem, hugetlbsize);
-	if (mem != MAP_FAILED)
-		munmap(smem, hugetlbsize);
-close:
-	close(fd);
-}
-
-struct non_anon_test_case {
-	const char *desc;
-	non_anon_test_fn fn;
-};
-
-/*
- * Test cases that target any pages in private mappings that are not anonymous:
- * pages that may get shared via COW ndependent of fork(). This includes
- * the shared zeropage(s), pagecache pages, ...
- */
-static const struct non_anon_test_case non_anon_test_cases[] = {
-	/*
-	 * Basic COW test without any GUP. If we miss to break COW, changes are
-	 * visible via other private/shared mappings.
-	 */
-	{
-		"Basic COW",
-		test_cow,
-	},
-	/*
-	 * Take a R/O longterm pin. When modifying the page via the page table,
-	 * the page content change must be visible via the pin.
-	 */
-	{
-		"R/O longterm GUP pin",
-		test_ro_pin,
-	},
-	/* Same as above, but using GUP-fast. */
-	{
-		"R/O longterm GUP-fast pin",
-		test_ro_fast_pin,
-	},
-};
-
-static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
-{
-	int i;
-
-	run_with_zeropage(test_case->fn, test_case->desc);
-	run_with_memfd(test_case->fn, test_case->desc);
-	run_with_tmpfile(test_case->fn, test_case->desc);
-	if (thpsize)
-		run_with_huge_zeropage(test_case->fn, test_case->desc);
-	for (i = 0; i < nr_hugetlbsizes; i++)
-		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
-				       hugetlbsizes[i]);
-}
-
-static void run_non_anon_test_cases(void)
-{
-	int i;
-
-	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
-
-	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
-		run_non_anon_test_case(&non_anon_test_cases[i]);
-}
-
-static int tests_per_non_anon_test_case(void)
-{
-	int tests = 3 + nr_hugetlbsizes;
-
-	if (thpsize)
-		tests += 1;
-	return tests;
-}
-
-int main(int argc, char **argv)
-{
-	int err;
-
-	pagesize = getpagesize();
-	detect_thpsize();
-	detect_hugetlbsizes();
-	detect_huge_zeropage();
-
-	ksft_print_header();
-	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
-		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
-		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
-
-	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
-	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-	if (pagemap_fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-
-	run_anon_test_cases();
-	run_anon_thp_test_cases();
-	run_non_anon_test_cases();
-
-	err = ksft_get_fail_cnt();
-	if (err)
-		ksft_exit_fail_msg("%d out of %d tests failed\n",
-				   err, ksft_test_num());
-	return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c
deleted file mode 100644
index e43879291dac..000000000000
--- a/tools/testing/selftests/vm/gup_test.c
+++ /dev/null
@@ -1,271 +0,0 @@
-#include <fcntl.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <dirent.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <pthread.h>
-#include <assert.h>
-#include <mm/gup_test.h>
-#include "../kselftest.h"
-
-#include "util.h"
-
-#define MB (1UL << 20)
-
-/* Just the flags we need, copied from mm.h: */
-#define FOLL_WRITE	0x01	/* check pte is writable */
-#define FOLL_TOUCH	0x02	/* mark page accessed */
-
-#define GUP_TEST_FILE "/sys/kernel/debug/gup_test"
-
-static unsigned long cmd = GUP_FAST_BENCHMARK;
-static int gup_fd, repeats = 1;
-static unsigned long size = 128 * MB;
-/* Serialize prints */
-static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-static char *cmd_to_str(unsigned long cmd)
-{
-	switch (cmd) {
-	case GUP_FAST_BENCHMARK:
-		return "GUP_FAST_BENCHMARK";
-	case PIN_FAST_BENCHMARK:
-		return "PIN_FAST_BENCHMARK";
-	case PIN_LONGTERM_BENCHMARK:
-		return "PIN_LONGTERM_BENCHMARK";
-	case GUP_BASIC_TEST:
-		return "GUP_BASIC_TEST";
-	case PIN_BASIC_TEST:
-		return "PIN_BASIC_TEST";
-	case DUMP_USER_PAGES_TEST:
-		return "DUMP_USER_PAGES_TEST";
-	}
-	return "Unknown command";
-}
-
-void *gup_thread(void *data)
-{
-	struct gup_test gup = *(struct gup_test *)data;
-	int i;
-
-	/* Only report timing information on the *_BENCHMARK commands: */
-	if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) ||
-	     (cmd == PIN_LONGTERM_BENCHMARK)) {
-		for (i = 0; i < repeats; i++) {
-			gup.size = size;
-			if (ioctl(gup_fd, cmd, &gup))
-				perror("ioctl"), exit(1);
-
-			pthread_mutex_lock(&print_mutex);
-			printf("%s: Time: get:%lld put:%lld us",
-			       cmd_to_str(cmd), gup.get_delta_usec,
-			       gup.put_delta_usec);
-			if (gup.size != size)
-				printf(", truncated (size: %lld)", gup.size);
-			printf("\n");
-			pthread_mutex_unlock(&print_mutex);
-		}
-	} else {
-		gup.size = size;
-		if (ioctl(gup_fd, cmd, &gup)) {
-			perror("ioctl");
-			exit(1);
-		}
-
-		pthread_mutex_lock(&print_mutex);
-		printf("%s: done\n", cmd_to_str(cmd));
-		if (gup.size != size)
-			printf("Truncated (size: %lld)\n", gup.size);
-		pthread_mutex_unlock(&print_mutex);
-	}
-
-	return NULL;
-}
-
-int main(int argc, char **argv)
-{
-	struct gup_test gup = { 0 };
-	int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret;
-	int flags = MAP_PRIVATE, touch = 0;
-	char *file = "/dev/zero";
-	pthread_t *tid;
-	char *p;
-
-	while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) {
-		switch (opt) {
-		case 'a':
-			cmd = PIN_FAST_BENCHMARK;
-			break;
-		case 'b':
-			cmd = PIN_BASIC_TEST;
-			break;
-		case 'L':
-			cmd = PIN_LONGTERM_BENCHMARK;
-			break;
-		case 'c':
-			cmd = DUMP_USER_PAGES_TEST;
-			/*
-			 * Dump page 0 (index 1). May be overridden later, by
-			 * user's non-option arguments.
-			 *
-			 * .which_pages is zero-based, so that zero can mean "do
-			 * nothing".
-			 */
-			gup.which_pages[0] = 1;
-			break;
-		case 'p':
-			/* works only with DUMP_USER_PAGES_TEST */
-			gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN;
-			break;
-		case 'F':
-			/* strtol, so you can pass flags in hex form */
-			gup.gup_flags = strtol(optarg, 0, 0);
-			break;
-		case 'j':
-			nthreads = atoi(optarg);
-			break;
-		case 'm':
-			size = atoi(optarg) * MB;
-			break;
-		case 'r':
-			repeats = atoi(optarg);
-			break;
-		case 'n':
-			nr_pages = atoi(optarg);
-			break;
-		case 't':
-			thp = 1;
-			break;
-		case 'T':
-			thp = 0;
-			break;
-		case 'U':
-			cmd = GUP_BASIC_TEST;
-			break;
-		case 'u':
-			cmd = GUP_FAST_BENCHMARK;
-			break;
-		case 'w':
-			write = 1;
-			break;
-		case 'W':
-			write = 0;
-			break;
-		case 'f':
-			file = optarg;
-			break;
-		case 'S':
-			flags &= ~MAP_PRIVATE;
-			flags |= MAP_SHARED;
-			break;
-		case 'H':
-			flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
-			break;
-		case 'z':
-			/* fault pages in gup, do not fault in userland */
-			touch = 1;
-			break;
-		default:
-			return -1;
-		}
-	}
-
-	if (optind < argc) {
-		int extra_arg_count = 0;
-		/*
-		 * For example:
-		 *
-		 *   ./gup_test -c 0 1 0x1001
-		 *
-		 * ...to dump pages 0, 1, and 4097
-		 */
-
-		while ((optind < argc) &&
-		       (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) {
-			/*
-			 * Do the 1-based indexing here, so that the user can
-			 * use normal 0-based indexing on the command line.
-			 */
-			long page_index = strtol(argv[optind], 0, 0) + 1;
-
-			gup.which_pages[extra_arg_count] = page_index;
-			extra_arg_count++;
-			optind++;
-		}
-	}
-
-	filed = open(file, O_RDWR|O_CREAT);
-	if (filed < 0) {
-		perror("open");
-		exit(filed);
-	}
-
-	gup.nr_pages_per_call = nr_pages;
-	if (write)
-		gup.gup_flags |= FOLL_WRITE;
-
-	gup_fd = open(GUP_TEST_FILE, O_RDWR);
-	if (gup_fd == -1) {
-		switch (errno) {
-		case EACCES:
-			if (getuid())
-				printf("Please run this test as root\n");
-			break;
-		case ENOENT:
-			if (opendir("/sys/kernel/debug") == NULL) {
-				printf("mount debugfs at /sys/kernel/debug\n");
-				break;
-			}
-			printf("check if CONFIG_GUP_TEST is enabled in kernel config\n");
-			break;
-		default:
-			perror("failed to open " GUP_TEST_FILE);
-			break;
-		}
-		exit(KSFT_SKIP);
-	}
-
-	p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
-	if (p == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-	gup.addr = (unsigned long)p;
-
-	if (thp == 1)
-		madvise(p, size, MADV_HUGEPAGE);
-	else if (thp == 0)
-		madvise(p, size, MADV_NOHUGEPAGE);
-
-	/*
-	 * FOLL_TOUCH, in gup_test, is used as an either/or case: either
-	 * fault pages in from the kernel via FOLL_TOUCH, or fault them
-	 * in here, from user space. This allows comparison of performance
-	 * between those two cases.
-	 */
-	if (touch) {
-		gup.gup_flags |= FOLL_TOUCH;
-	} else {
-		for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
-			p[0] = 0;
-	}
-
-	tid = malloc(sizeof(pthread_t) * nthreads);
-	assert(tid);
-	for (i = 0; i < nthreads; i++) {
-		ret = pthread_create(&tid[i], NULL, gup_thread, &gup);
-		assert(ret == 0);
-	}
-	for (i = 0; i < nthreads; i++) {
-		ret = pthread_join(tid[i], NULL);
-		assert(ret == 0);
-	}
-	free(tid);
-
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
deleted file mode 100644
index 4adaad1b822f..000000000000
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ /dev/null
@@ -1,2054 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * HMM stands for Heterogeneous Memory Management, it is a helper layer inside
- * the linux kernel to help device drivers mirror a process address space in
- * the device. This allows the device to use the same address space which
- * makes communication and data exchange a lot easier.
- *
- * This framework's sole purpose is to exercise various code paths inside
- * the kernel to make sure that HMM performs as expected and to flush out any
- * bugs.
- */
-
-#include "../kselftest_harness.h"
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <strings.h>
-#include <time.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <sys/ioctl.h>
-
-
-/*
- * This is a private UAPI to the kernel test module so it isn't exported
- * in the usual include/uapi/... directory.
- */
-#include <lib/test_hmm_uapi.h>
-#include <mm/gup_test.h>
-
-struct hmm_buffer {
-	void		*ptr;
-	void		*mirror;
-	unsigned long	size;
-	int		fd;
-	uint64_t	cpages;
-	uint64_t	faults;
-};
-
-enum {
-	HMM_PRIVATE_DEVICE_ONE,
-	HMM_PRIVATE_DEVICE_TWO,
-	HMM_COHERENCE_DEVICE_ONE,
-	HMM_COHERENCE_DEVICE_TWO,
-};
-
-#define TWOMEG		(1 << 21)
-#define HMM_BUFFER_SIZE (1024 << 12)
-#define HMM_PATH_MAX    64
-#define NTIMES		10
-
-#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
-/* Just the flags we need, copied from mm.h: */
-#define FOLL_WRITE	0x01	/* check pte is writable */
-#define FOLL_LONGTERM   0x10000 /* mapping lifetime is indefinite */
-
-FIXTURE(hmm)
-{
-	int		fd;
-	unsigned int	page_size;
-	unsigned int	page_shift;
-};
-
-FIXTURE_VARIANT(hmm)
-{
-	int     device_number;
-};
-
-FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
-{
-	.device_number = HMM_PRIVATE_DEVICE_ONE,
-};
-
-FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
-{
-	.device_number = HMM_COHERENCE_DEVICE_ONE,
-};
-
-FIXTURE(hmm2)
-{
-	int		fd0;
-	int		fd1;
-	unsigned int	page_size;
-	unsigned int	page_shift;
-};
-
-FIXTURE_VARIANT(hmm2)
-{
-	int     device_number0;
-	int     device_number1;
-};
-
-FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
-{
-	.device_number0 = HMM_PRIVATE_DEVICE_ONE,
-	.device_number1 = HMM_PRIVATE_DEVICE_TWO,
-};
-
-FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
-{
-	.device_number0 = HMM_COHERENCE_DEVICE_ONE,
-	.device_number1 = HMM_COHERENCE_DEVICE_TWO,
-};
-
-static int hmm_open(int unit)
-{
-	char pathname[HMM_PATH_MAX];
-	int fd;
-
-	snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit);
-	fd = open(pathname, O_RDWR, 0);
-	if (fd < 0)
-		fprintf(stderr, "could not open hmm dmirror driver (%s)\n",
-			pathname);
-	return fd;
-}
-
-static bool hmm_is_coherent_type(int dev_num)
-{
-	return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
-}
-
-FIXTURE_SETUP(hmm)
-{
-	self->page_size = sysconf(_SC_PAGE_SIZE);
-	self->page_shift = ffs(self->page_size) - 1;
-
-	self->fd = hmm_open(variant->device_number);
-	if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
-		SKIP(exit(0), "DEVICE_COHERENT not available");
-	ASSERT_GE(self->fd, 0);
-}
-
-FIXTURE_SETUP(hmm2)
-{
-	self->page_size = sysconf(_SC_PAGE_SIZE);
-	self->page_shift = ffs(self->page_size) - 1;
-
-	self->fd0 = hmm_open(variant->device_number0);
-	if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
-		SKIP(exit(0), "DEVICE_COHERENT not available");
-	ASSERT_GE(self->fd0, 0);
-	self->fd1 = hmm_open(variant->device_number1);
-	ASSERT_GE(self->fd1, 0);
-}
-
-FIXTURE_TEARDOWN(hmm)
-{
-	int ret = close(self->fd);
-
-	ASSERT_EQ(ret, 0);
-	self->fd = -1;
-}
-
-FIXTURE_TEARDOWN(hmm2)
-{
-	int ret = close(self->fd0);
-
-	ASSERT_EQ(ret, 0);
-	self->fd0 = -1;
-
-	ret = close(self->fd1);
-	ASSERT_EQ(ret, 0);
-	self->fd1 = -1;
-}
-
-static int hmm_dmirror_cmd(int fd,
-			   unsigned long request,
-			   struct hmm_buffer *buffer,
-			   unsigned long npages)
-{
-	struct hmm_dmirror_cmd cmd;
-	int ret;
-
-	/* Simulate a device reading system memory. */
-	cmd.addr = (__u64)buffer->ptr;
-	cmd.ptr = (__u64)buffer->mirror;
-	cmd.npages = npages;
-
-	for (;;) {
-		ret = ioctl(fd, request, &cmd);
-		if (ret == 0)
-			break;
-		if (errno == EINTR)
-			continue;
-		return -errno;
-	}
-	buffer->cpages = cmd.cpages;
-	buffer->faults = cmd.faults;
-
-	return 0;
-}
-
-static void hmm_buffer_free(struct hmm_buffer *buffer)
-{
-	if (buffer == NULL)
-		return;
-
-	if (buffer->ptr)
-		munmap(buffer->ptr, buffer->size);
-	free(buffer->mirror);
-	free(buffer);
-}
-
-/*
- * Create a temporary file that will be deleted on close.
- */
-static int hmm_create_file(unsigned long size)
-{
-	char path[HMM_PATH_MAX];
-	int fd;
-
-	strcpy(path, "/tmp");
-	fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600);
-	if (fd >= 0) {
-		int r;
-
-		do {
-			r = ftruncate(fd, size);
-		} while (r == -1 && errno == EINTR);
-		if (!r)
-			return fd;
-		close(fd);
-	}
-	return -1;
-}
-
-/*
- * Return a random unsigned number.
- */
-static unsigned int hmm_random(void)
-{
-	static int fd = -1;
-	unsigned int r;
-
-	if (fd < 0) {
-		fd = open("/dev/urandom", O_RDONLY);
-		if (fd < 0) {
-			fprintf(stderr, "%s:%d failed to open /dev/urandom\n",
-					__FILE__, __LINE__);
-			return ~0U;
-		}
-	}
-	read(fd, &r, sizeof(r));
-	return r;
-}
-
-static void hmm_nanosleep(unsigned int n)
-{
-	struct timespec t;
-
-	t.tv_sec = 0;
-	t.tv_nsec = n;
-	nanosleep(&t, NULL);
-}
-
-static int hmm_migrate_sys_to_dev(int fd,
-				   struct hmm_buffer *buffer,
-				   unsigned long npages)
-{
-	return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
-}
-
-static int hmm_migrate_dev_to_sys(int fd,
-				   struct hmm_buffer *buffer,
-				   unsigned long npages)
-{
-	return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
-}
-
-/*
- * Simple NULL test of device open/close.
- */
-TEST_F(hmm, open_close)
-{
-}
-
-/*
- * Read private anonymous memory.
- */
-TEST_F(hmm, anon_read)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-	int val;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/*
-	 * Initialize buffer in system memory but leave the first two pages
-	 * zero (pte_none and pfn_zero).
-	 */
-	i = 2 * self->page_size / sizeof(*ptr);
-	for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Set buffer permission to read-only. */
-	ret = mprotect(buffer->ptr, size, PROT_READ);
-	ASSERT_EQ(ret, 0);
-
-	/* Populate the CPU page table with a special zero page. */
-	val = *(int *)(buffer->ptr + self->page_size);
-	ASSERT_EQ(val, 0);
-
-	/* Simulate a device reading system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device read. */
-	ptr = buffer->mirror;
-	for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], 0);
-	for (; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Read private anonymous memory which has been protected with
- * mprotect() PROT_NONE.
- */
-TEST_F(hmm, anon_read_prot)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Initialize mirror buffer so we can verify it isn't written. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = -i;
-
-	/* Protect buffer from reading. */
-	ret = mprotect(buffer->ptr, size, PROT_NONE);
-	ASSERT_EQ(ret, 0);
-
-	/* Simulate a device reading system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
-	ASSERT_EQ(ret, -EFAULT);
-
-	/* Allow CPU to read the buffer so we can check it. */
-	ret = mprotect(buffer->ptr, size, PROT_READ);
-	ASSERT_EQ(ret, 0);
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], -i);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Write private anonymous memory.
- */
-TEST_F(hmm, anon_write)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize data that the device will write to buffer->ptr. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device wrote. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Write private anonymous memory which has been protected with
- * mprotect() PROT_READ.
- */
-TEST_F(hmm, anon_write_prot)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Simulate a device reading a zero page of memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, 1);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Initialize data that the device will write to buffer->ptr. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, -EPERM);
-
-	/* Check what the device wrote. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], 0);
-
-	/* Now allow writing and see that the zero page is replaced. */
-	ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ);
-	ASSERT_EQ(ret, 0);
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device wrote. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Check that a device writing an anonymous private mapping
- * will copy-on-write if a child process inherits the mapping.
- */
-TEST_F(hmm, anon_write_child)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	pid_t pid;
-	int child_fd;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer->ptr so we can tell if it is written. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Initialize data that the device will write to buffer->ptr. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = -i;
-
-	pid = fork();
-	if (pid == -1)
-		ASSERT_EQ(pid, 0);
-	if (pid != 0) {
-		waitpid(pid, &ret, 0);
-		ASSERT_EQ(WIFEXITED(ret), 1);
-
-		/* Check that the parent's buffer did not change. */
-		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-			ASSERT_EQ(ptr[i], i);
-		return;
-	}
-
-	/* Check that we see the parent's values. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], -i);
-
-	/* The child process needs its own mirror to its own mm. */
-	child_fd = hmm_open(0);
-	ASSERT_GE(child_fd, 0);
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device wrote. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], -i);
-
-	close(child_fd);
-	exit(0);
-}
-
-/*
- * Check that a device writing an anonymous shared mapping
- * will not copy-on-write if a child process inherits the mapping.
- */
-TEST_F(hmm, anon_write_child_shared)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	pid_t pid;
-	int child_fd;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_SHARED | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer->ptr so we can tell if it is written. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Initialize data that the device will write to buffer->ptr. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = -i;
-
-	pid = fork();
-	if (pid == -1)
-		ASSERT_EQ(pid, 0);
-	if (pid != 0) {
-		waitpid(pid, &ret, 0);
-		ASSERT_EQ(WIFEXITED(ret), 1);
-
-		/* Check that the parent's buffer did change. */
-		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-			ASSERT_EQ(ptr[i], -i);
-		return;
-	}
-
-	/* Check that we see the parent's values. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], -i);
-
-	/* The child process needs its own mirror to its own mm. */
-	child_fd = hmm_open(0);
-	ASSERT_GE(child_fd, 0);
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device wrote. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], -i);
-
-	close(child_fd);
-	exit(0);
-}
-
-/*
- * Write private anonymous huge page.
- */
-TEST_F(hmm, anon_write_huge)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	void *old_ptr;
-	void *map;
-	int *ptr;
-	int ret;
-
-	size = 2 * TWOMEG;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	size = TWOMEG;
-	npages = size >> self->page_shift;
-	map = (void *)ALIGN((uintptr_t)buffer->ptr, size);
-	ret = madvise(map, size, MADV_HUGEPAGE);
-	ASSERT_EQ(ret, 0);
-	old_ptr = buffer->ptr;
-	buffer->ptr = map;
-
-	/* Initialize data that the device will write to buffer->ptr. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device wrote. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	buffer->ptr = old_ptr;
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Read numeric data from raw and tagged kernel status files.  Used to read
- * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag).
- */
-static long file_read_ulong(char *file, const char *tag)
-{
-	int fd;
-	char buf[2048];
-	int len;
-	char *p, *q;
-	long val;
-
-	fd = open(file, O_RDONLY);
-	if (fd < 0) {
-		/* Error opening the file */
-		return -1;
-	}
-
-	len = read(fd, buf, sizeof(buf));
-	close(fd);
-	if (len < 0) {
-		/* Error in reading the file */
-		return -1;
-	}
-	if (len == sizeof(buf)) {
-		/* Error file is too large */
-		return -1;
-	}
-	buf[len] = '\0';
-
-	/* Search for a tag if provided */
-	if (tag) {
-		p = strstr(buf, tag);
-		if (!p)
-			return -1; /* looks like the line we want isn't there */
-		p += strlen(tag);
-	} else
-		p = buf;
-
-	val = strtol(p, &q, 0);
-	if (*q != ' ') {
-		/* Error parsing the file */
-		return -1;
-	}
-
-	return val;
-}
-
-/*
- * Write huge TLBFS page.
- */
-TEST_F(hmm, anon_write_hugetlbfs)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long default_hsize;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
-	if (default_hsize < 0 || default_hsize*1024 < default_hsize)
-		SKIP(return, "Huge page size could not be determined");
-	default_hsize = default_hsize*1024; /* KB to B */
-
-	size = ALIGN(TWOMEG, default_hsize);
-	npages = size >> self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-				   PROT_READ | PROT_WRITE,
-				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-				   -1, 0);
-	if (buffer->ptr == MAP_FAILED) {
-		free(buffer);
-		SKIP(return, "Huge page could not be allocated");
-	}
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Initialize data that the device will write to buffer->ptr. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device wrote. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	munmap(buffer->ptr, buffer->size);
-	buffer->ptr = NULL;
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Read mmap'ed file memory.
- */
-TEST_F(hmm, file_read)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-	int fd;
-	ssize_t len;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	fd = hmm_create_file(size);
-	ASSERT_GE(fd, 0);
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = fd;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Write initial contents of the file. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-	len = pwrite(fd, buffer->mirror, size, 0);
-	ASSERT_EQ(len, size);
-	memset(buffer->mirror, 0, size);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ,
-			   MAP_SHARED,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Simulate a device reading system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Write mmap'ed file memory.
- */
-TEST_F(hmm, file_write)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-	int fd;
-	ssize_t len;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	fd = hmm_create_file(size);
-	ASSERT_GE(fd, 0);
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = fd;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_SHARED,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize data that the device will write to buffer->ptr. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device wrote. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	/* Check that the device also wrote the file. */
-	len = pread(fd, buffer->mirror, size, 0);
-	ASSERT_EQ(len, size);
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Migrate anonymous memory to device private memory.
- */
-TEST_F(hmm, migrate)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Migrate memory to device. */
-	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Migrate anonymous memory to device private memory and fault some of it back
- * to system memory, then try migrating the resulting mix of system and device
- * private memory to the device.
- */
-TEST_F(hmm, migrate_fault)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Migrate memory to device. */
-	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	/* Fault half the pages back to system memory and check them. */
-	for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	/* Migrate memory to the device again. */
-	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	hmm_buffer_free(buffer);
-}
-
-TEST_F(hmm, migrate_release)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Migrate memory to device. */
-	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	/* Release device memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-
-	/* Fault pages back to system memory and check them. */
-	for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Migrate anonymous shared memory to device private memory.
- */
-TEST_F(hmm, migrate_shared)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_SHARED | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Migrate memory to device. */
-	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-	ASSERT_EQ(ret, -ENOENT);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Try to migrate various memory types to device private memory.
- */
-TEST_F(hmm2, migrate_mixed)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	int *ptr;
-	unsigned char *p;
-	int ret;
-	int val;
-
-	npages = 6;
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Reserve a range of addresses. */
-	buffer->ptr = mmap(NULL, size,
-			   PROT_NONE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-	p = buffer->ptr;
-
-	/* Migrating a protected area should be an error. */
-	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
-	ASSERT_EQ(ret, -EINVAL);
-
-	/* Punch a hole after the first page address. */
-	ret = munmap(buffer->ptr + self->page_size, self->page_size);
-	ASSERT_EQ(ret, 0);
-
-	/* We expect an error if the vma doesn't cover the range. */
-	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3);
-	ASSERT_EQ(ret, -EINVAL);
-
-	/* Page 2 will be a read-only zero page. */
-	ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
-				PROT_READ);
-	ASSERT_EQ(ret, 0);
-	ptr = (int *)(buffer->ptr + 2 * self->page_size);
-	val = *ptr + 3;
-	ASSERT_EQ(val, 3);
-
-	/* Page 3 will be read-only. */
-	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
-				PROT_READ | PROT_WRITE);
-	ASSERT_EQ(ret, 0);
-	ptr = (int *)(buffer->ptr + 3 * self->page_size);
-	*ptr = val;
-	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
-				PROT_READ);
-	ASSERT_EQ(ret, 0);
-
-	/* Page 4-5 will be read-write. */
-	ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size,
-				PROT_READ | PROT_WRITE);
-	ASSERT_EQ(ret, 0);
-	ptr = (int *)(buffer->ptr + 4 * self->page_size);
-	*ptr = val;
-	ptr = (int *)(buffer->ptr + 5 * self->page_size);
-	*ptr = val;
-
-	/* Now try to migrate pages 2-5 to device 1. */
-	buffer->ptr = p + 2 * self->page_size;
-	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, 4);
-
-	/* Page 5 won't be migrated to device 0 because it's on device 1. */
-	buffer->ptr = p + 5 * self->page_size;
-	ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
-	ASSERT_EQ(ret, -ENOENT);
-	buffer->ptr = p;
-
-	buffer->ptr = p;
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Migrate anonymous memory to device memory and back to system memory
- * multiple times. In case of private zone configuration, this is done
- * through fault pages accessed by CPU. In case of coherent zone configuration,
- * the pages from the device should be explicitly migrated back to system memory.
- * The reason is Coherent device zone has coherent access by CPU, therefore
- * it will not generate any page fault.
- */
-TEST_F(hmm, migrate_multiple)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	unsigned long c;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	for (c = 0; c < NTIMES; c++) {
-		buffer = malloc(sizeof(*buffer));
-		ASSERT_NE(buffer, NULL);
-
-		buffer->fd = -1;
-		buffer->size = size;
-		buffer->mirror = malloc(size);
-		ASSERT_NE(buffer->mirror, NULL);
-
-		buffer->ptr = mmap(NULL, size,
-				   PROT_READ | PROT_WRITE,
-				   MAP_PRIVATE | MAP_ANONYMOUS,
-				   buffer->fd, 0);
-		ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-		/* Initialize buffer in system memory. */
-		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-			ptr[i] = i;
-
-		/* Migrate memory to device. */
-		ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-		ASSERT_EQ(ret, 0);
-		ASSERT_EQ(buffer->cpages, npages);
-
-		/* Check what the device read. */
-		for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-			ASSERT_EQ(ptr[i], i);
-
-		/* Migrate back to system memory and check them. */
-		if (hmm_is_coherent_type(variant->device_number)) {
-			ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages);
-			ASSERT_EQ(ret, 0);
-			ASSERT_EQ(buffer->cpages, npages);
-		}
-
-		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-			ASSERT_EQ(ptr[i], i);
-
-		hmm_buffer_free(buffer);
-	}
-}
-
-/*
- * Read anonymous memory multiple times.
- */
-TEST_F(hmm, anon_read_multiple)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	unsigned long c;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	for (c = 0; c < NTIMES; c++) {
-		buffer = malloc(sizeof(*buffer));
-		ASSERT_NE(buffer, NULL);
-
-		buffer->fd = -1;
-		buffer->size = size;
-		buffer->mirror = malloc(size);
-		ASSERT_NE(buffer->mirror, NULL);
-
-		buffer->ptr = mmap(NULL, size,
-				   PROT_READ | PROT_WRITE,
-				   MAP_PRIVATE | MAP_ANONYMOUS,
-				   buffer->fd, 0);
-		ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-		/* Initialize buffer in system memory. */
-		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-			ptr[i] = i + c;
-
-		/* Simulate a device reading system memory. */
-		ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
-				      npages);
-		ASSERT_EQ(ret, 0);
-		ASSERT_EQ(buffer->cpages, npages);
-		ASSERT_EQ(buffer->faults, 1);
-
-		/* Check what the device read. */
-		for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-			ASSERT_EQ(ptr[i], i + c);
-
-		hmm_buffer_free(buffer);
-	}
-}
-
-void *unmap_buffer(void *p)
-{
-	struct hmm_buffer *buffer = p;
-
-	/* Delay for a bit and then unmap buffer while it is being read. */
-	hmm_nanosleep(hmm_random() % 32000);
-	munmap(buffer->ptr + buffer->size / 2, buffer->size / 2);
-	buffer->ptr = NULL;
-
-	return NULL;
-}
-
-/*
- * Try reading anonymous memory while it is being unmapped.
- */
-TEST_F(hmm, anon_teardown)
-{
-	unsigned long npages;
-	unsigned long size;
-	unsigned long c;
-	void *ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	for (c = 0; c < NTIMES; ++c) {
-		pthread_t thread;
-		struct hmm_buffer *buffer;
-		unsigned long i;
-		int *ptr;
-		int rc;
-
-		buffer = malloc(sizeof(*buffer));
-		ASSERT_NE(buffer, NULL);
-
-		buffer->fd = -1;
-		buffer->size = size;
-		buffer->mirror = malloc(size);
-		ASSERT_NE(buffer->mirror, NULL);
-
-		buffer->ptr = mmap(NULL, size,
-				   PROT_READ | PROT_WRITE,
-				   MAP_PRIVATE | MAP_ANONYMOUS,
-				   buffer->fd, 0);
-		ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-		/* Initialize buffer in system memory. */
-		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-			ptr[i] = i + c;
-
-		rc = pthread_create(&thread, NULL, unmap_buffer, buffer);
-		ASSERT_EQ(rc, 0);
-
-		/* Simulate a device reading system memory. */
-		rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
-				     npages);
-		if (rc == 0) {
-			ASSERT_EQ(buffer->cpages, npages);
-			ASSERT_EQ(buffer->faults, 1);
-
-			/* Check what the device read. */
-			for (i = 0, ptr = buffer->mirror;
-			     i < size / sizeof(*ptr);
-			     ++i)
-				ASSERT_EQ(ptr[i], i + c);
-		}
-
-		pthread_join(thread, &ret);
-		hmm_buffer_free(buffer);
-	}
-}
-
-/*
- * Test memory snapshot without faulting in pages accessed by the device.
- */
-TEST_F(hmm, mixedmap)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned char *m;
-	int ret;
-
-	npages = 1;
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(npages);
-	ASSERT_NE(buffer->mirror, NULL);
-
-
-	/* Reserve a range of addresses. */
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE,
-			   self->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Simulate a device snapshotting CPU pagetables. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device saw. */
-	m = buffer->mirror;
-	ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Test memory snapshot without faulting in pages accessed by the device.
- */
-TEST_F(hmm2, snapshot)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	int *ptr;
-	unsigned char *p;
-	unsigned char *m;
-	int ret;
-	int val;
-
-	npages = 7;
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(npages);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Reserve a range of addresses. */
-	buffer->ptr = mmap(NULL, size,
-			   PROT_NONE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-	p = buffer->ptr;
-
-	/* Punch a hole after the first page address. */
-	ret = munmap(buffer->ptr + self->page_size, self->page_size);
-	ASSERT_EQ(ret, 0);
-
-	/* Page 2 will be read-only zero page. */
-	ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
-				PROT_READ);
-	ASSERT_EQ(ret, 0);
-	ptr = (int *)(buffer->ptr + 2 * self->page_size);
-	val = *ptr + 3;
-	ASSERT_EQ(val, 3);
-
-	/* Page 3 will be read-only. */
-	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
-				PROT_READ | PROT_WRITE);
-	ASSERT_EQ(ret, 0);
-	ptr = (int *)(buffer->ptr + 3 * self->page_size);
-	*ptr = val;
-	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
-				PROT_READ);
-	ASSERT_EQ(ret, 0);
-
-	/* Page 4-6 will be read-write. */
-	ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size,
-				PROT_READ | PROT_WRITE);
-	ASSERT_EQ(ret, 0);
-	ptr = (int *)(buffer->ptr + 4 * self->page_size);
-	*ptr = val;
-
-	/* Page 5 will be migrated to device 0. */
-	buffer->ptr = p + 5 * self->page_size;
-	ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, 1);
-
-	/* Page 6 will be migrated to device 1. */
-	buffer->ptr = p + 6 * self->page_size;
-	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, 1);
-
-	/* Simulate a device snapshotting CPU pagetables. */
-	buffer->ptr = p;
-	ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device saw. */
-	m = buffer->mirror;
-	ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR);
-	ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR);
-	ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ);
-	ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ);
-	ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE);
-	if (!hmm_is_coherent_type(variant->device_number0)) {
-		ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL |
-				HMM_DMIRROR_PROT_WRITE);
-		ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE);
-	} else {
-		ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL |
-				HMM_DMIRROR_PROT_WRITE);
-		ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE |
-				HMM_DMIRROR_PROT_WRITE);
-	}
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that
- * should be mapped by a large page table entry.
- */
-TEST_F(hmm, compound)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long default_hsize;
-	int *ptr;
-	unsigned char *m;
-	int ret;
-	unsigned long i;
-
-	/* Skip test if we can't allocate a hugetlbfs page. */
-
-	default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
-	if (default_hsize < 0 || default_hsize*1024 < default_hsize)
-		SKIP(return, "Huge page size could not be determined");
-	default_hsize = default_hsize*1024; /* KB to B */
-
-	size = ALIGN(TWOMEG, default_hsize);
-	npages = size >> self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-				   PROT_READ | PROT_WRITE,
-				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-				   -1, 0);
-	if (buffer->ptr == MAP_FAILED) {
-		free(buffer);
-		return;
-	}
-
-	buffer->size = size;
-	buffer->mirror = malloc(npages);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Initialize the pages the device will snapshot in buffer->ptr. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Simulate a device snapshotting CPU pagetables. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device saw. */
-	m = buffer->mirror;
-	for (i = 0; i < npages; ++i)
-		ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE |
-				HMM_DMIRROR_PROT_PMD);
-
-	/* Make the region read-only. */
-	ret = mprotect(buffer->ptr, size, PROT_READ);
-	ASSERT_EQ(ret, 0);
-
-	/* Simulate a device snapshotting CPU pagetables. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device saw. */
-	m = buffer->mirror;
-	for (i = 0; i < npages; ++i)
-		ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ |
-				HMM_DMIRROR_PROT_PMD);
-
-	munmap(buffer->ptr, buffer->size);
-	buffer->ptr = NULL;
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Test two devices reading the same memory (double mapped).
- */
-TEST_F(hmm2, double_map)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = 6;
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(npages);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	/* Reserve a range of addresses. */
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Make region read-only. */
-	ret = mprotect(buffer->ptr, size, PROT_READ);
-	ASSERT_EQ(ret, 0);
-
-	/* Simulate device 0 reading system memory. */
-	ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	/* Simulate device 1 reading system memory. */
-	ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	/* Migrate pages to device 1 and try to read from device 0. */
-	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	ASSERT_EQ(buffer->faults, 1);
-
-	/* Check what device 0 read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Basic check of exclusive faulting.
- */
-TEST_F(hmm, exclusive)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Map memory exclusively for device access. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	/* Fault pages back to system memory and check them. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i]++, i);
-
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i+1);
-
-	/* Check atomic access revoked */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-
-	hmm_buffer_free(buffer);
-}
-
-TEST_F(hmm, exclusive_mprotect)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Map memory exclusively for device access. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	ret = mprotect(buffer->ptr, size, PROT_READ);
-	ASSERT_EQ(ret, 0);
-
-	/* Simulate a device writing system memory. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-	ASSERT_EQ(ret, -EPERM);
-
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Check copy-on-write works.
- */
-TEST_F(hmm, exclusive_cow)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-
-	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-	ASSERT_NE(npages, 0);
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Map memory exclusively for device access. */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	fork();
-
-	/* Fault pages back to system memory and check them. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i]++, i);
-
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i+1);
-
-	hmm_buffer_free(buffer);
-}
-
-static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
-			 int npages, int size, int flags)
-{
-	struct gup_test gup = {
-		.nr_pages_per_call	= npages,
-		.addr			= addr,
-		.gup_flags		= FOLL_WRITE | flags,
-		.size			= size,
-	};
-
-	if (ioctl(gup_fd, cmd, &gup)) {
-		perror("ioctl on error\n");
-		return errno;
-	}
-
-	return 0;
-}
-
-/*
- * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
- * This should trigger a migration back to system memory for both, private
- * and coherent type pages.
- * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
- * to your configuration before you run it.
- */
-TEST_F(hmm, hmm_gup_test)
-{
-	struct hmm_buffer *buffer;
-	int gup_fd;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-	unsigned char *m;
-
-	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
-	if (gup_fd == -1)
-		SKIP(return, "Skipping test, could not find gup_test driver");
-
-	npages = 4;
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Migrate memory to device. */
-	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	/* Check what the device read. */
-	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	ASSERT_EQ(gup_test_exec(gup_fd,
-				(unsigned long)buffer->ptr,
-				GUP_BASIC_TEST, 1, self->page_size, 0), 0);
-	ASSERT_EQ(gup_test_exec(gup_fd,
-				(unsigned long)buffer->ptr + 1 * self->page_size,
-				GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
-	ASSERT_EQ(gup_test_exec(gup_fd,
-				(unsigned long)buffer->ptr + 2 * self->page_size,
-				PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0);
-	ASSERT_EQ(gup_test_exec(gup_fd,
-				(unsigned long)buffer->ptr + 3 * self->page_size,
-				PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0);
-
-	/* Take snapshot to CPU pagetables */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	m = buffer->mirror;
-	if (hmm_is_coherent_type(variant->device_number)) {
-		ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]);
-		ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]);
-	} else {
-		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
-		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
-	}
-	ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]);
-	ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]);
-	/*
-	 * Check again the content on the pages. Make sure there's no
-	 * corrupted data.
-	 */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ASSERT_EQ(ptr[i], i);
-
-	close(gup_fd);
-	hmm_buffer_free(buffer);
-}
-
-/*
- * Test copy-on-write in device pages.
- * In case of writing to COW private page(s), a page fault will migrate pages
- * back to system memory first. Then, these pages will be duplicated. In case
- * of COW device coherent type, pages are duplicated directly from device
- * memory.
- */
-TEST_F(hmm, hmm_cow_in_device)
-{
-	struct hmm_buffer *buffer;
-	unsigned long npages;
-	unsigned long size;
-	unsigned long i;
-	int *ptr;
-	int ret;
-	unsigned char *m;
-	pid_t pid;
-	int status;
-
-	npages = 4;
-	size = npages << self->page_shift;
-
-	buffer = malloc(sizeof(*buffer));
-	ASSERT_NE(buffer, NULL);
-
-	buffer->fd = -1;
-	buffer->size = size;
-	buffer->mirror = malloc(size);
-	ASSERT_NE(buffer->mirror, NULL);
-
-	buffer->ptr = mmap(NULL, size,
-			   PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS,
-			   buffer->fd, 0);
-	ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-	/* Initialize buffer in system memory. */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Migrate memory to device. */
-
-	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-
-	pid = fork();
-	if (pid == -1)
-		ASSERT_EQ(pid, 0);
-	if (!pid) {
-		/* Child process waitd for SIGTERM from the parent. */
-		while (1) {
-		}
-		perror("Should not reach this\n");
-		exit(0);
-	}
-	/* Parent process writes to COW pages(s) and gets a
-	 * new copy in system. In case of device private pages,
-	 * this write causes a migration to system mem first.
-	 */
-	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-		ptr[i] = i;
-
-	/* Terminate child and wait */
-	EXPECT_EQ(0, kill(pid, SIGTERM));
-	EXPECT_EQ(pid, waitpid(pid, &status, 0));
-	EXPECT_NE(0, WIFSIGNALED(status));
-	EXPECT_EQ(SIGTERM, WTERMSIG(status));
-
-	/* Take snapshot to CPU pagetables */
-	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-	ASSERT_EQ(ret, 0);
-	ASSERT_EQ(buffer->cpages, npages);
-	m = buffer->mirror;
-	for (i = 0; i < npages; i++)
-		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
-
-	hmm_buffer_free(buffer);
-}
-TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c
deleted file mode 100644
index 955ef87f382c..000000000000
--- a/tools/testing/selftests/vm/hugepage-mmap.c
+++ /dev/null
@@ -1,91 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * hugepage-mmap:
- *
- * Example of using huge page memory in a user application using the mmap
- * system call.  Before running this application, make sure that the
- * administrator has mounted the hugetlbfs filesystem (on some directory
- * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
- * example, the app is requesting memory of size 256MB that is backed by
- * huge pages.
- *
- * For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * huge pages.  That means that if one requires a fixed address, a huge page
- * aligned address starting with 0x800000... will be required.  If a fixed
- * address is not required, the kernel will select an address in the proper
- * range.
- * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
- */
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-
-#define LENGTH (256UL*1024*1024)
-#define PROTECTION (PROT_READ | PROT_WRITE)
-
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define FLAGS (MAP_SHARED | MAP_FIXED)
-#else
-#define ADDR (void *)(0x0UL)
-#define FLAGS (MAP_SHARED)
-#endif
-
-static void check_bytes(char *addr)
-{
-	printf("First hex is %x\n", *((unsigned int *)addr));
-}
-
-static void write_bytes(char *addr)
-{
-	unsigned long i;
-
-	for (i = 0; i < LENGTH; i++)
-		*(addr + i) = (char)i;
-}
-
-static int read_bytes(char *addr)
-{
-	unsigned long i;
-
-	check_bytes(addr);
-	for (i = 0; i < LENGTH; i++)
-		if (*(addr + i) != (char)i) {
-			printf("Mismatch at %lu\n", i);
-			return 1;
-		}
-	return 0;
-}
-
-int main(void)
-{
-	void *addr;
-	int fd, ret;
-
-	fd = memfd_create("hugepage-mmap", MFD_HUGETLB);
-	if (fd < 0) {
-		perror("memfd_create() failed");
-		exit(1);
-	}
-
-	addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		close(fd);
-		exit(1);
-	}
-
-	printf("Returned address is %p\n", addr);
-	check_bytes(addr);
-	write_bytes(addr);
-	ret = read_bytes(addr);
-
-	munmap(addr, LENGTH);
-	close(fd);
-
-	return ret;
-}
diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c
deleted file mode 100644
index e53b5eaa8fce..000000000000
--- a/tools/testing/selftests/vm/hugepage-mremap.c
+++ /dev/null
@@ -1,188 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * hugepage-mremap:
- *
- * Example of remapping huge page memory in a user application using the
- * mremap system call.  The path to a file in a hugetlbfs filesystem must
- * be passed as the last argument to this test.  The amount of memory used
- * by this test in MBs can optionally be passed as an argument.  If no memory
- * amount is passed, the default amount is 10MB.
- *
- * To make sure the test triggers pmd sharing and goes through the 'unshare'
- * path in the mremap code use 1GB (1024) or more.
- */
-
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <errno.h>
-#include <fcntl.h> /* Definition of O_* constants */
-#include <sys/syscall.h> /* Definition of SYS_* constants */
-#include <linux/userfaultfd.h>
-#include <sys/ioctl.h>
-#include <string.h>
-
-#define DEFAULT_LENGTH_MB 10UL
-#define MB_TO_BYTES(x) (x * 1024 * 1024)
-
-#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
-#define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
-
-static void check_bytes(char *addr)
-{
-	printf("First hex is %x\n", *((unsigned int *)addr));
-}
-
-static void write_bytes(char *addr, size_t len)
-{
-	unsigned long i;
-
-	for (i = 0; i < len; i++)
-		*(addr + i) = (char)i;
-}
-
-static int read_bytes(char *addr, size_t len)
-{
-	unsigned long i;
-
-	check_bytes(addr);
-	for (i = 0; i < len; i++)
-		if (*(addr + i) != (char)i) {
-			printf("Mismatch at %lu\n", i);
-			return 1;
-		}
-	return 0;
-}
-
-static void register_region_with_uffd(char *addr, size_t len)
-{
-	long uffd; /* userfaultfd file descriptor */
-	struct uffdio_api uffdio_api;
-	struct uffdio_register uffdio_register;
-
-	/* Create and enable userfaultfd object. */
-
-	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
-	if (uffd == -1) {
-		perror("userfaultfd");
-		exit(1);
-	}
-
-	uffdio_api.api = UFFD_API;
-	uffdio_api.features = 0;
-	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
-		perror("ioctl-UFFDIO_API");
-		exit(1);
-	}
-
-	/* Create a private anonymous mapping. The memory will be
-	 * demand-zero paged--that is, not yet allocated. When we
-	 * actually touch the memory, it will be allocated via
-	 * the userfaultfd.
-	 */
-
-	addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
-		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-
-	printf("Address returned by mmap() = %p\n", addr);
-
-	/* Register the memory range of the mapping we just created for
-	 * handling by the userfaultfd object. In mode, we request to track
-	 * missing pages (i.e., pages that have not yet been faulted in).
-	 */
-
-	uffdio_register.range.start = (unsigned long)addr;
-	uffdio_register.range.len = len;
-	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
-		perror("ioctl-UFFDIO_REGISTER");
-		exit(1);
-	}
-}
-
-int main(int argc, char *argv[])
-{
-	size_t length = 0;
-	int ret = 0, fd;
-
-	if (argc >= 2 && !strcmp(argv[1], "-h")) {
-		printf("Usage: %s [length_in_MB]\n", argv[0]);
-		exit(1);
-	}
-
-	/* Read memory length as the first arg if valid, otherwise fallback to
-	 * the default length.
-	 */
-	if (argc >= 2)
-		length = (size_t)atoi(argv[1]);
-	else
-		length = DEFAULT_LENGTH_MB;
-
-	length = MB_TO_BYTES(length);
-	fd = memfd_create(argv[0], MFD_HUGETLB);
-	if (fd < 0) {
-		perror("Open failed");
-		exit(1);
-	}
-
-	/* mmap to a PUD aligned address to hopefully trigger pmd sharing. */
-	unsigned long suggested_addr = 0x7eaa40000000;
-	void *haddr = mmap((void *)suggested_addr, length, PROTECTION,
-			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
-	printf("Map haddr: Returned address is %p\n", haddr);
-	if (haddr == MAP_FAILED) {
-		perror("mmap1");
-		exit(1);
-	}
-
-	/* mmap again to a dummy address to hopefully trigger pmd sharing. */
-	suggested_addr = 0x7daa40000000;
-	void *daddr = mmap((void *)suggested_addr, length, PROTECTION,
-			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
-	printf("Map daddr: Returned address is %p\n", daddr);
-	if (daddr == MAP_FAILED) {
-		perror("mmap3");
-		exit(1);
-	}
-
-	suggested_addr = 0x7faa40000000;
-	void *vaddr =
-		mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0);
-	printf("Map vaddr: Returned address is %p\n", vaddr);
-	if (vaddr == MAP_FAILED) {
-		perror("mmap2");
-		exit(1);
-	}
-
-	register_region_with_uffd(haddr, length);
-
-	void *addr = mremap(haddr, length, length,
-			    MREMAP_MAYMOVE | MREMAP_FIXED, vaddr);
-	if (addr == MAP_FAILED) {
-		perror("mremap");
-		exit(1);
-	}
-
-	printf("Mremap: Returned address is %p\n", addr);
-	check_bytes(addr);
-	write_bytes(addr, length);
-	ret = read_bytes(addr, length);
-
-	munmap(addr, length);
-
-	addr = mremap(addr, length, length, 0);
-	if (addr != MAP_FAILED) {
-		printf("mremap: Expected failure, but call succeeded\n");
-		exit(1);
-	}
-
-	close(fd);
-
-	return ret;
-}
diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c
deleted file mode 100644
index e2527f32005b..000000000000
--- a/tools/testing/selftests/vm/hugepage-shm.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * hugepage-shm:
- *
- * Example of using huge page memory in a user application using Sys V shared
- * memory system calls.  In this example the app is requesting 256MB of
- * memory that is backed by huge pages.  The application uses the flag
- * SHM_HUGETLB in the shmget system call to inform the kernel that it is
- * requesting huge pages.
- *
- * For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * huge pages.  That means that if one requires a fixed address, a huge page
- * aligned address starting with 0x800000... will be required.  If a fixed
- * address is not required, the kernel will select an address in the proper
- * range.
- * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
- *
- * Note: The default shared memory limit is quite low on many kernels,
- * you may need to increase it via:
- *
- * echo 268435456 > /proc/sys/kernel/shmmax
- *
- * This will increase the maximum size per shared memory segment to 256MB.
- * The other limit that you will hit eventually is shmall which is the
- * total amount of shared memory in pages. To set it to 16GB on a system
- * with a 4kB pagesize do:
- *
- * echo 4194304 > /proc/sys/kernel/shmall
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <sys/mman.h>
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-#define LENGTH (256UL*1024*1024)
-
-#define dprintf(x)  printf(x)
-
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define SHMAT_FLAGS (SHM_RND)
-#else
-#define ADDR (void *)(0x0UL)
-#define SHMAT_FLAGS (0)
-#endif
-
-int main(void)
-{
-	int shmid;
-	unsigned long i;
-	char *shmaddr;
-
-	shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
-	if (shmid < 0) {
-		perror("shmget");
-		exit(1);
-	}
-	printf("shmid: 0x%x\n", shmid);
-
-	shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
-	if (shmaddr == (char *)-1) {
-		perror("Shared memory attach failure");
-		shmctl(shmid, IPC_RMID, NULL);
-		exit(2);
-	}
-	printf("shmaddr: %p\n", shmaddr);
-
-	dprintf("Starting the writes:\n");
-	for (i = 0; i < LENGTH; i++) {
-		shmaddr[i] = (char)(i);
-		if (!(i % (1024 * 1024)))
-			dprintf(".");
-	}
-	dprintf("\n");
-
-	dprintf("Starting the Check...");
-	for (i = 0; i < LENGTH; i++)
-		if (shmaddr[i] != (char)i) {
-			printf("\nIndex %lu mismatched\n", i);
-			exit(3);
-		}
-	dprintf("Done.\n");
-
-	if (shmdt((const void *)shmaddr) != 0) {
-		perror("Detach failure");
-		shmctl(shmid, IPC_RMID, NULL);
-		exit(4);
-	}
-
-	shmctl(shmid, IPC_RMID, NULL);
-
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c
deleted file mode 100644
index 557bdbd4f87e..000000000000
--- a/tools/testing/selftests/vm/hugepage-vmemmap.c
+++ /dev/null
@@ -1,144 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * A test case of using hugepage memory in a user application using the
- * mmap system call with MAP_HUGETLB flag.  Before running this program
- * make sure the administrator has allocated enough default sized huge
- * pages to cover the 2 MB allocation.
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-
-#define MAP_LENGTH		(2UL * 1024 * 1024)
-
-#ifndef MAP_HUGETLB
-#define MAP_HUGETLB		0x40000	/* arch specific */
-#endif
-
-#define PAGE_SIZE		4096
-
-#define PAGE_COMPOUND_HEAD	(1UL << 15)
-#define PAGE_COMPOUND_TAIL	(1UL << 16)
-#define PAGE_HUGE		(1UL << 17)
-
-#define HEAD_PAGE_FLAGS		(PAGE_COMPOUND_HEAD | PAGE_HUGE)
-#define TAIL_PAGE_FLAGS		(PAGE_COMPOUND_TAIL | PAGE_HUGE)
-
-#define PM_PFRAME_BITS		55
-#define PM_PFRAME_MASK		~((1UL << PM_PFRAME_BITS) - 1)
-
-/*
- * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
- * That means the addresses starting with 0x800000... will need to be
- * specified.  Specifying a fixed address is not required on ppc64, i386
- * or x86_64.
- */
-#ifdef __ia64__
-#define MAP_ADDR		(void *)(0x8000000000000000UL)
-#define MAP_FLAGS		(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
-#else
-#define MAP_ADDR		NULL
-#define MAP_FLAGS		(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
-#endif
-
-static void write_bytes(char *addr, size_t length)
-{
-	unsigned long i;
-
-	for (i = 0; i < length; i++)
-		*(addr + i) = (char)i;
-}
-
-static unsigned long virt_to_pfn(void *addr)
-{
-	int fd;
-	unsigned long pagemap;
-
-	fd = open("/proc/self/pagemap", O_RDONLY);
-	if (fd < 0)
-		return -1UL;
-
-	lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET);
-	read(fd, &pagemap, sizeof(pagemap));
-	close(fd);
-
-	return pagemap & ~PM_PFRAME_MASK;
-}
-
-static int check_page_flags(unsigned long pfn)
-{
-	int fd, i;
-	unsigned long pageflags;
-
-	fd = open("/proc/kpageflags", O_RDONLY);
-	if (fd < 0)
-		return -1;
-
-	lseek(fd, pfn * sizeof(pageflags), SEEK_SET);
-
-	read(fd, &pageflags, sizeof(pageflags));
-	if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
-		close(fd);
-		printf("Head page flags (%lx) is invalid\n", pageflags);
-		return -1;
-	}
-
-	/*
-	 * pages other than the first page must be tail and shouldn't be head;
-	 * this also verifies kernel has correctly set the fake page_head to tail
-	 * while hugetlb_free_vmemmap is enabled.
-	 */
-	for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) {
-		read(fd, &pageflags, sizeof(pageflags));
-		if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
-		    (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
-			close(fd);
-			printf("Tail page flags (%lx) is invalid\n", pageflags);
-			return -1;
-		}
-	}
-
-	close(fd);
-
-	return 0;
-}
-
-int main(int argc, char **argv)
-{
-	void *addr;
-	unsigned long pfn;
-
-	addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-
-	/* Trigger allocation of HugeTLB page. */
-	write_bytes(addr, MAP_LENGTH);
-
-	pfn = virt_to_pfn(addr);
-	if (pfn == -1UL) {
-		munmap(addr, MAP_LENGTH);
-		perror("virt_to_pfn");
-		exit(1);
-	}
-
-	printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
-
-	if (check_page_flags(pfn) < 0) {
-		munmap(addr, MAP_LENGTH);
-		perror("check_page_flags");
-		exit(1);
-	}
-
-	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
-	if (munmap(addr, MAP_LENGTH)) {
-		perror("munmap");
-		exit(1);
-	}
-
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c
deleted file mode 100644
index a634f47d1e56..000000000000
--- a/tools/testing/selftests/vm/hugetlb-madvise.c
+++ /dev/null
@@ -1,406 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * hugepage-madvise:
- *
- * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE
- * on hugetlb mappings.
- *
- * Before running this test, make sure the administrator has pre-allocated
- * at least MIN_FREE_PAGES hugetlb pages and they are free.  In addition,
- * the test takes an argument that is the path to a file in a hugetlbfs
- * filesystem.  Therefore, a hugetlbfs filesystem must be mounted on some
- * directory.
- */
-
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#define __USE_GNU
-#include <fcntl.h>
-
-#define MIN_FREE_PAGES	20
-#define NR_HUGE_PAGES	10	/* common number of pages to map/allocate */
-
-#define validate_free_pages(exp_free)					\
-	do {								\
-		int fhp = get_free_hugepages();				\
-		if (fhp != (exp_free)) {				\
-			printf("Unexpected number of free huge "	\
-				"pages line %d\n", __LINE__);		\
-			exit(1);					\
-		}							\
-	} while (0)
-
-unsigned long huge_page_size;
-unsigned long base_page_size;
-
-/*
- * default_huge_page_size copied from mlock2-tests.c
- */
-unsigned long default_huge_page_size(void)
-{
-	unsigned long hps = 0;
-	char *line = NULL;
-	size_t linelen = 0;
-	FILE *f = fopen("/proc/meminfo", "r");
-
-	if (!f)
-		return 0;
-	while (getline(&line, &linelen, f) > 0) {
-		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
-			hps <<= 10;
-			break;
-		}
-	}
-
-	free(line);
-	fclose(f);
-	return hps;
-}
-
-unsigned long get_free_hugepages(void)
-{
-	unsigned long fhp = 0;
-	char *line = NULL;
-	size_t linelen = 0;
-	FILE *f = fopen("/proc/meminfo", "r");
-
-	if (!f)
-		return fhp;
-	while (getline(&line, &linelen, f) > 0) {
-		if (sscanf(line, "HugePages_Free:      %lu", &fhp) == 1)
-			break;
-	}
-
-	free(line);
-	fclose(f);
-	return fhp;
-}
-
-void write_fault_pages(void *addr, unsigned long nr_pages)
-{
-	unsigned long i;
-
-	for (i = 0; i < nr_pages; i++)
-		*((unsigned long *)(addr + (i * huge_page_size))) = i;
-}
-
-void read_fault_pages(void *addr, unsigned long nr_pages)
-{
-	unsigned long dummy = 0;
-	unsigned long i;
-
-	for (i = 0; i < nr_pages; i++)
-		dummy += *((unsigned long *)(addr + (i * huge_page_size)));
-}
-
-int main(int argc, char **argv)
-{
-	unsigned long free_hugepages;
-	void *addr, *addr2;
-	int fd;
-	int ret;
-
-	huge_page_size = default_huge_page_size();
-	if (!huge_page_size) {
-		printf("Unable to determine huge page size, exiting!\n");
-		exit(1);
-	}
-	base_page_size = sysconf(_SC_PAGE_SIZE);
-	if (!huge_page_size) {
-		printf("Unable to determine base page size, exiting!\n");
-		exit(1);
-	}
-
-	free_hugepages = get_free_hugepages();
-	if (free_hugepages < MIN_FREE_PAGES) {
-		printf("Not enough free huge pages to test, exiting!\n");
-		exit(1);
-	}
-
-	fd = memfd_create(argv[0], MFD_HUGETLB);
-	if (fd < 0) {
-		perror("memfd_create() failed");
-		exit(1);
-	}
-
-	/*
-	 * Test validity of MADV_DONTNEED addr and length arguments.  mmap
-	 * size is NR_HUGE_PAGES + 2.  One page at the beginning and end of
-	 * the mapping will be unmapped so we KNOW there is nothing mapped
-	 * there.
-	 */
-	addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size,
-			PROT_READ | PROT_WRITE,
-			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-			-1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-	if (munmap(addr, huge_page_size) ||
-			munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size,
-				huge_page_size)) {
-		perror("munmap");
-		exit(1);
-	}
-	addr = addr + huge_page_size;
-
-	write_fault_pages(addr, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	/* addr before mapping should fail */
-	ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size,
-		MADV_DONTNEED);
-	if (!ret) {
-		printf("Unexpected success of madvise call with invalid addr line %d\n",
-				__LINE__);
-			exit(1);
-	}
-
-	/* addr + length after mapping should fail */
-	ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size,
-		MADV_DONTNEED);
-	if (!ret) {
-		printf("Unexpected success of madvise call with invalid length line %d\n",
-				__LINE__);
-			exit(1);
-	}
-
-	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-
-	/*
-	 * Test alignment of MADV_DONTNEED addr and length arguments
-	 */
-	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-			PROT_READ | PROT_WRITE,
-			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-			-1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-	write_fault_pages(addr, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	/* addr is not huge page size aligned and should fail */
-	ret = madvise(addr + base_page_size,
-			NR_HUGE_PAGES * huge_page_size - base_page_size,
-			MADV_DONTNEED);
-	if (!ret) {
-		printf("Unexpected success of madvise call with unaligned start address %d\n",
-				__LINE__);
-			exit(1);
-	}
-
-	/* addr + length should be aligned down to huge page size */
-	if (madvise(addr,
-			((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size,
-			MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
-
-	/* should free all but last page in mapping */
-	validate_free_pages(free_hugepages - 1);
-
-	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-	validate_free_pages(free_hugepages);
-
-	/*
-	 * Test MADV_DONTNEED on anonymous private mapping
-	 */
-	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-			PROT_READ | PROT_WRITE,
-			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-			-1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-	write_fault_pages(addr, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
-
-	/* should free all pages in mapping */
-	validate_free_pages(free_hugepages);
-
-	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-
-	/*
-	 * Test MADV_DONTNEED on private mapping of hugetlb file
-	 */
-	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
-		perror("fallocate");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-			PROT_READ | PROT_WRITE,
-			MAP_PRIVATE, fd, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-
-	/* read should not consume any pages */
-	read_fault_pages(addr, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	/* madvise should not free any pages */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	/* writes should allocate private pages */
-	write_fault_pages(addr, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-	/* madvise should free private pages */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	/* writes should allocate private pages */
-	write_fault_pages(addr, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-	/*
-	 * The fallocate below certainly should free the pages associated
-	 * with the file.  However, pages in the private mapping are also
-	 * freed.  This is not the 'correct' behavior, but is expected
-	 * because this is how it has worked since the initial hugetlb
-	 * implementation.
-	 */
-	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
-					0, NR_HUGE_PAGES * huge_page_size)) {
-		perror("fallocate");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages);
-
-	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-
-	/*
-	 * Test MADV_DONTNEED on shared mapping of hugetlb file
-	 */
-	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
-		perror("fallocate");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-			PROT_READ | PROT_WRITE,
-			MAP_SHARED, fd, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-
-	/* write should not consume any pages */
-	write_fault_pages(addr, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	/* madvise should not free any pages */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	/*
-	 * Test MADV_REMOVE on shared mapping of hugetlb file
-	 *
-	 * madvise is same as hole punch and should free all pages.
-	 */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
-		perror("madvise");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages);
-	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-
-	/*
-	 * Test MADV_REMOVE on shared and private mapping of hugetlb file
-	 */
-	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
-		perror("fallocate");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-			PROT_READ | PROT_WRITE,
-			MAP_SHARED, fd, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-
-	/* shared write should not consume any additional pages */
-	write_fault_pages(addr, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-			PROT_READ | PROT_WRITE,
-			MAP_PRIVATE, fd, 0);
-	if (addr2 == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-
-	/* private read should not consume any pages */
-	read_fault_pages(addr2, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	/* private write should consume additional pages */
-	write_fault_pages(addr2, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-	/* madvise of shared mapping should not free any pages */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-	/* madvise of private mapping should free private pages */
-	if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-	/* private write should consume additional pages again */
-	write_fault_pages(addr2, NR_HUGE_PAGES);
-	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-	/*
-	 * madvise should free both file and private pages although this is
-	 * not correct.  private pages should not be freed, but this is
-	 * expected.  See comment associated with FALLOC_FL_PUNCH_HOLE call.
-	 */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
-		perror("madvise");
-		exit(1);
-	}
-	validate_free_pages(free_hugepages);
-
-	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-	(void)munmap(addr2, NR_HUGE_PAGES * huge_page_size);
-
-	close(fd);
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
deleted file mode 100644
index bf2d2a684edf..000000000000
--- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
+++ /dev/null
@@ -1,252 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-set -e
-
-if [[ $(id -u) -ne 0 ]]; then
-  echo "This test must be run as root. Skipping..."
-  exit $ksft_skip
-fi
-
-usage_file=usage_in_bytes
-
-if [[ "$1" == "-cgroup-v2" ]]; then
-  cgroup2=1
-  usage_file=current
-fi
-
-
-if [[ $cgroup2 ]]; then
-  CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
-  if [[ -z "$CGROUP_ROOT" ]]; then
-    CGROUP_ROOT=/dev/cgroup/memory
-    mount -t cgroup2 none $CGROUP_ROOT
-    do_umount=1
-  fi
-  echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control
-else
-  CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
-  if [[ -z "$CGROUP_ROOT" ]]; then
-    CGROUP_ROOT=/dev/cgroup/memory
-    mount -t cgroup memory,hugetlb $CGROUP_ROOT
-    do_umount=1
-  fi
-fi
-MNT='/mnt/huge/'
-
-function get_machine_hugepage_size() {
-  hpz=$(grep -i hugepagesize /proc/meminfo)
-  kb=${hpz:14:-3}
-  mb=$(($kb / 1024))
-  echo $mb
-}
-
-MB=$(get_machine_hugepage_size)
-
-function cleanup() {
-  echo cleanup
-  set +e
-  rm -rf "$MNT"/* 2>/dev/null
-  umount "$MNT" 2>/dev/null
-  rmdir "$MNT" 2>/dev/null
-  rmdir "$CGROUP_ROOT"/a/b 2>/dev/null
-  rmdir "$CGROUP_ROOT"/a 2>/dev/null
-  rmdir "$CGROUP_ROOT"/test1 2>/dev/null
-  echo 0 >/proc/sys/vm/nr_hugepages
-  set -e
-}
-
-function assert_state() {
-  local expected_a="$1"
-  local expected_a_hugetlb="$2"
-  local expected_b=""
-  local expected_b_hugetlb=""
-
-  if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then
-    expected_b="$3"
-    expected_b_hugetlb="$4"
-  fi
-  local tolerance=$((5 * 1024 * 1024))
-
-  local actual_a
-  actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)"
-  if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] ||
-    [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then
-    echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB
-    echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-
-  local actual_a_hugetlb
-  actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)"
-  if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] ||
-    [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then
-    echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB
-    echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-
-  if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then
-    return
-  fi
-
-  local actual_b
-  actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)"
-  if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] ||
-    [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then
-    echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB
-    echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-
-  local actual_b_hugetlb
-  actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)"
-  if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] ||
-    [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then
-    echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB
-    echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-}
-
-function setup() {
-  echo 100 >/proc/sys/vm/nr_hugepages
-  mkdir "$CGROUP_ROOT"/a
-  sleep 1
-  if [[ $cgroup2 ]]; then
-    echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control
-  else
-    echo 0 >$CGROUP_ROOT/a/cpuset.mems
-    echo 0 >$CGROUP_ROOT/a/cpuset.cpus
-  fi
-
-  mkdir "$CGROUP_ROOT"/a/b
-
-  if [[ ! $cgroup2 ]]; then
-    echo 0 >$CGROUP_ROOT/a/b/cpuset.mems
-    echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus
-  fi
-
-  mkdir -p "$MNT"
-  mount -t hugetlbfs none "$MNT"
-}
-
-write_hugetlbfs() {
-  local cgroup="$1"
-  local path="$2"
-  local size="$3"
-
-  if [[ $cgroup2 ]]; then
-    echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs
-  else
-    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems
-    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus
-    echo $$ >"$CGROUP_ROOT/$cgroup/tasks"
-  fi
-  ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o
-  if [[ $cgroup2 ]]; then
-    echo $$ >$CGROUP_ROOT/cgroup.procs
-  else
-    echo $$ >"$CGROUP_ROOT/tasks"
-  fi
-  echo
-}
-
-set -e
-
-size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages.
-
-cleanup
-
-echo
-echo
-echo Test charge, rmdir, uncharge
-setup
-echo mkdir
-mkdir $CGROUP_ROOT/test1
-
-echo write
-write_hugetlbfs test1 "$MNT"/test $size
-
-echo rmdir
-rmdir $CGROUP_ROOT/test1
-mkdir $CGROUP_ROOT/test1
-
-echo uncharge
-rm -rf /mnt/huge/*
-
-cleanup
-
-echo done
-echo
-echo
-if [[ ! $cgroup2 ]]; then
-  echo "Test parent and child hugetlb usage"
-  setup
-
-  echo write
-  write_hugetlbfs a "$MNT"/test $size
-
-  echo Assert memory charged correctly for parent use.
-  assert_state 0 $size 0 0
-
-  write_hugetlbfs a/b "$MNT"/test2 $size
-
-  echo Assert memory charged correctly for child use.
-  assert_state 0 $(($size * 2)) 0 $size
-
-  rmdir "$CGROUP_ROOT"/a/b
-  sleep 5
-  echo Assert memory reparent correctly.
-  assert_state 0 $(($size * 2))
-
-  rm -rf "$MNT"/*
-  umount "$MNT"
-  echo Assert memory uncharged correctly.
-  assert_state 0 0
-
-  cleanup
-fi
-
-echo
-echo
-echo "Test child only hugetlb usage"
-echo setup
-setup
-
-echo write
-write_hugetlbfs a/b "$MNT"/test2 $size
-
-echo Assert memory charged correctly for child only use.
-assert_state 0 $(($size)) 0 $size
-
-rmdir "$CGROUP_ROOT"/a/b
-echo Assert memory reparent correctly.
-assert_state 0 $size
-
-rm -rf "$MNT"/*
-umount "$MNT"
-echo Assert memory uncharged correctly.
-assert_state 0 0
-
-cleanup
-
-echo ALL PASS
-
-umount $CGROUP_ROOT
-rm -rf $CGROUP_ROOT
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
deleted file mode 100644
index 64126c8cd561..000000000000
--- a/tools/testing/selftests/vm/khugepaged.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-#define _GNU_SOURCE
-#include <ctype.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <dirent.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <sys/mman.h>
-#include <sys/wait.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/sysmacros.h>
-#include <sys/vfs.h>
-
-#include "linux/magic.h"
-
-#include "vm_util.h"
-
-#ifndef MADV_PAGEOUT
-#define MADV_PAGEOUT 21
-#endif
-#ifndef MADV_POPULATE_READ
-#define MADV_POPULATE_READ 22
-#endif
-#ifndef MADV_COLLAPSE
-#define MADV_COLLAPSE 25
-#endif
-
-#define BASE_ADDR ((void *)(1UL << 30))
-static unsigned long hpage_pmd_size;
-static unsigned long page_size;
-static int hpage_pmd_nr;
-
-#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
-#define PID_SMAPS "/proc/self/smaps"
-#define TEST_FILE "collapse_test_file"
-
-#define MAX_LINE_LENGTH 500
-
-enum vma_type {
-	VMA_ANON,
-	VMA_FILE,
-	VMA_SHMEM,
-};
-
-struct mem_ops {
-	void *(*setup_area)(int nr_hpages);
-	void (*cleanup_area)(void *p, unsigned long size);
-	void (*fault)(void *p, unsigned long start, unsigned long end);
-	bool (*check_huge)(void *addr, int nr_hpages);
-	const char *name;
-};
-
-static struct mem_ops *file_ops;
-static struct mem_ops *anon_ops;
-static struct mem_ops *shmem_ops;
-
-struct collapse_context {
-	void (*collapse)(const char *msg, char *p, int nr_hpages,
-			 struct mem_ops *ops, bool expect);
-	bool enforce_pte_scan_limits;
-	const char *name;
-};
-
-static struct collapse_context *khugepaged_context;
-static struct collapse_context *madvise_context;
-
-struct file_info {
-	const char *dir;
-	char path[PATH_MAX];
-	enum vma_type type;
-	int fd;
-	char dev_queue_read_ahead_path[PATH_MAX];
-};
-
-static struct file_info finfo;
-
-enum thp_enabled {
-	THP_ALWAYS,
-	THP_MADVISE,
-	THP_NEVER,
-};
-
-static const char *thp_enabled_strings[] = {
-	"always",
-	"madvise",
-	"never",
-	NULL
-};
-
-enum thp_defrag {
-	THP_DEFRAG_ALWAYS,
-	THP_DEFRAG_DEFER,
-	THP_DEFRAG_DEFER_MADVISE,
-	THP_DEFRAG_MADVISE,
-	THP_DEFRAG_NEVER,
-};
-
-static const char *thp_defrag_strings[] = {
-	"always",
-	"defer",
-	"defer+madvise",
-	"madvise",
-	"never",
-	NULL
-};
-
-enum shmem_enabled {
-	SHMEM_ALWAYS,
-	SHMEM_WITHIN_SIZE,
-	SHMEM_ADVISE,
-	SHMEM_NEVER,
-	SHMEM_DENY,
-	SHMEM_FORCE,
-};
-
-static const char *shmem_enabled_strings[] = {
-	"always",
-	"within_size",
-	"advise",
-	"never",
-	"deny",
-	"force",
-	NULL
-};
-
-struct khugepaged_settings {
-	bool defrag;
-	unsigned int alloc_sleep_millisecs;
-	unsigned int scan_sleep_millisecs;
-	unsigned int max_ptes_none;
-	unsigned int max_ptes_swap;
-	unsigned int max_ptes_shared;
-	unsigned long pages_to_scan;
-};
-
-struct settings {
-	enum thp_enabled thp_enabled;
-	enum thp_defrag thp_defrag;
-	enum shmem_enabled shmem_enabled;
-	bool use_zero_page;
-	struct khugepaged_settings khugepaged;
-	unsigned long read_ahead_kb;
-};
-
-static struct settings saved_settings;
-static bool skip_settings_restore;
-
-static int exit_status;
-
-static void success(const char *msg)
-{
-	printf(" \e[32m%s\e[0m\n", msg);
-}
-
-static void fail(const char *msg)
-{
-	printf(" \e[31m%s\e[0m\n", msg);
-	exit_status++;
-}
-
-static void skip(const char *msg)
-{
-	printf(" \e[33m%s\e[0m\n", msg);
-}
-
-static int read_file(const char *path, char *buf, size_t buflen)
-{
-	int fd;
-	ssize_t numread;
-
-	fd = open(path, O_RDONLY);
-	if (fd == -1)
-		return 0;
-
-	numread = read(fd, buf, buflen - 1);
-	if (numread < 1) {
-		close(fd);
-		return 0;
-	}
-
-	buf[numread] = '\0';
-	close(fd);
-
-	return (unsigned int) numread;
-}
-
-static int write_file(const char *path, const char *buf, size_t buflen)
-{
-	int fd;
-	ssize_t numwritten;
-
-	fd = open(path, O_WRONLY);
-	if (fd == -1) {
-		printf("open(%s)\n", path);
-		exit(EXIT_FAILURE);
-		return 0;
-	}
-
-	numwritten = write(fd, buf, buflen - 1);
-	close(fd);
-	if (numwritten < 1) {
-		printf("write(%s)\n", buf);
-		exit(EXIT_FAILURE);
-		return 0;
-	}
-
-	return (unsigned int) numwritten;
-}
-
-static int read_string(const char *name, const char *strings[])
-{
-	char path[PATH_MAX];
-	char buf[256];
-	char *c;
-	int ret;
-
-	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
-	if (ret >= PATH_MAX) {
-		printf("%s: Pathname is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-
-	if (!read_file(path, buf, sizeof(buf))) {
-		perror(path);
-		exit(EXIT_FAILURE);
-	}
-
-	c = strchr(buf, '[');
-	if (!c) {
-		printf("%s: Parse failure\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-
-	c++;
-	memmove(buf, c, sizeof(buf) - (c - buf));
-
-	c = strchr(buf, ']');
-	if (!c) {
-		printf("%s: Parse failure\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	*c = '\0';
-
-	ret = 0;
-	while (strings[ret]) {
-		if (!strcmp(strings[ret], buf))
-			return ret;
-		ret++;
-	}
-
-	printf("Failed to parse %s\n", name);
-	exit(EXIT_FAILURE);
-}
-
-static void write_string(const char *name, const char *val)
-{
-	char path[PATH_MAX];
-	int ret;
-
-	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
-	if (ret >= PATH_MAX) {
-		printf("%s: Pathname is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-
-	if (!write_file(path, val, strlen(val) + 1)) {
-		perror(path);
-		exit(EXIT_FAILURE);
-	}
-}
-
-static const unsigned long _read_num(const char *path)
-{
-	char buf[21];
-
-	if (read_file(path, buf, sizeof(buf)) < 0) {
-		perror("read_file(read_num)");
-		exit(EXIT_FAILURE);
-	}
-
-	return strtoul(buf, NULL, 10);
-}
-
-static const unsigned long read_num(const char *name)
-{
-	char path[PATH_MAX];
-	int ret;
-
-	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
-	if (ret >= PATH_MAX) {
-		printf("%s: Pathname is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	return _read_num(path);
-}
-
-static void _write_num(const char *path, unsigned long num)
-{
-	char buf[21];
-
-	sprintf(buf, "%ld", num);
-	if (!write_file(path, buf, strlen(buf) + 1)) {
-		perror(path);
-		exit(EXIT_FAILURE);
-	}
-}
-
-static void write_num(const char *name, unsigned long num)
-{
-	char path[PATH_MAX];
-	int ret;
-
-	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
-	if (ret >= PATH_MAX) {
-		printf("%s: Pathname is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	_write_num(path, num);
-}
-
-static void write_settings(struct settings *settings)
-{
-	struct khugepaged_settings *khugepaged = &settings->khugepaged;
-
-	write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
-	write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
-	write_string("shmem_enabled",
-			shmem_enabled_strings[settings->shmem_enabled]);
-	write_num("use_zero_page", settings->use_zero_page);
-
-	write_num("khugepaged/defrag", khugepaged->defrag);
-	write_num("khugepaged/alloc_sleep_millisecs",
-			khugepaged->alloc_sleep_millisecs);
-	write_num("khugepaged/scan_sleep_millisecs",
-			khugepaged->scan_sleep_millisecs);
-	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
-	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
-	write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
-	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
-
-	if (file_ops && finfo.type == VMA_FILE)
-		_write_num(finfo.dev_queue_read_ahead_path,
-			   settings->read_ahead_kb);
-}
-
-#define MAX_SETTINGS_DEPTH 4
-static struct settings settings_stack[MAX_SETTINGS_DEPTH];
-static int settings_index;
-
-static struct settings *current_settings(void)
-{
-	if (!settings_index) {
-		printf("Fail: No settings set");
-		exit(EXIT_FAILURE);
-	}
-	return settings_stack + settings_index - 1;
-}
-
-static void push_settings(struct settings *settings)
-{
-	if (settings_index >= MAX_SETTINGS_DEPTH) {
-		printf("Fail: Settings stack exceeded");
-		exit(EXIT_FAILURE);
-	}
-	settings_stack[settings_index++] = *settings;
-	write_settings(current_settings());
-}
-
-static void pop_settings(void)
-{
-	if (settings_index <= 0) {
-		printf("Fail: Settings stack empty");
-		exit(EXIT_FAILURE);
-	}
-	--settings_index;
-	write_settings(current_settings());
-}
-
-static void restore_settings(int sig)
-{
-	if (skip_settings_restore)
-		goto out;
-
-	printf("Restore THP and khugepaged settings...");
-	write_settings(&saved_settings);
-	success("OK");
-	if (sig)
-		exit(EXIT_FAILURE);
-out:
-	exit(exit_status);
-}
-
-static void save_settings(void)
-{
-	printf("Save THP and khugepaged settings...");
-	saved_settings = (struct settings) {
-		.thp_enabled = read_string("enabled", thp_enabled_strings),
-		.thp_defrag = read_string("defrag", thp_defrag_strings),
-		.shmem_enabled =
-			read_string("shmem_enabled", shmem_enabled_strings),
-		.use_zero_page = read_num("use_zero_page"),
-	};
-	saved_settings.khugepaged = (struct khugepaged_settings) {
-		.defrag = read_num("khugepaged/defrag"),
-		.alloc_sleep_millisecs =
-			read_num("khugepaged/alloc_sleep_millisecs"),
-		.scan_sleep_millisecs =
-			read_num("khugepaged/scan_sleep_millisecs"),
-		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
-		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
-		.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
-		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
-	};
-	if (file_ops && finfo.type == VMA_FILE)
-		saved_settings.read_ahead_kb =
-				_read_num(finfo.dev_queue_read_ahead_path);
-
-	success("OK");
-
-	signal(SIGTERM, restore_settings);
-	signal(SIGINT, restore_settings);
-	signal(SIGHUP, restore_settings);
-	signal(SIGQUIT, restore_settings);
-}
-
-static void get_finfo(const char *dir)
-{
-	struct stat path_stat;
-	struct statfs fs;
-	char buf[1 << 10];
-	char path[PATH_MAX];
-	char *str, *end;
-
-	finfo.dir = dir;
-	stat(finfo.dir, &path_stat);
-	if (!S_ISDIR(path_stat.st_mode)) {
-		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
-		exit(EXIT_FAILURE);
-	}
-	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
-		     finfo.dir) >= sizeof(finfo.path)) {
-		printf("%s: Pathname is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	if (statfs(finfo.dir, &fs)) {
-		perror("statfs()");
-		exit(EXIT_FAILURE);
-	}
-	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
-	if (finfo.type == VMA_SHMEM)
-		return;
-
-	/* Find owning device's queue/read_ahead_kb control */
-	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
-		     major(path_stat.st_dev), minor(path_stat.st_dev))
-	    >= sizeof(path)) {
-		printf("%s: Pathname is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	if (read_file(path, buf, sizeof(buf)) < 0) {
-		perror("read_file(read_num)");
-		exit(EXIT_FAILURE);
-	}
-	if (strstr(buf, "DEVTYPE=disk")) {
-		/* Found it */
-		if (snprintf(finfo.dev_queue_read_ahead_path,
-			     sizeof(finfo.dev_queue_read_ahead_path),
-			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
-			     major(path_stat.st_dev), minor(path_stat.st_dev))
-		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
-			printf("%s: Pathname is too long\n", __func__);
-			exit(EXIT_FAILURE);
-		}
-		return;
-	}
-	if (!strstr(buf, "DEVTYPE=partition")) {
-		printf("%s: Unknown device type: %s\n", __func__, path);
-		exit(EXIT_FAILURE);
-	}
-	/*
-	 * Partition of block device - need to find actual device.
-	 * Using naming convention that devnameN is partition of
-	 * device devname.
-	 */
-	str = strstr(buf, "DEVNAME=");
-	if (!str) {
-		printf("%s: Could not read: %s", __func__, path);
-		exit(EXIT_FAILURE);
-	}
-	str += 8;
-	end = str;
-	while (*end) {
-		if (isdigit(*end)) {
-			*end = '\0';
-			if (snprintf(finfo.dev_queue_read_ahead_path,
-				     sizeof(finfo.dev_queue_read_ahead_path),
-				     "/sys/block/%s/queue/read_ahead_kb",
-				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
-				printf("%s: Pathname is too long\n", __func__);
-				exit(EXIT_FAILURE);
-			}
-			return;
-		}
-		++end;
-	}
-	printf("%s: Could not read: %s\n", __func__, path);
-	exit(EXIT_FAILURE);
-}
-
-static bool check_swap(void *addr, unsigned long size)
-{
-	bool swap = false;
-	int ret;
-	FILE *fp;
-	char buffer[MAX_LINE_LENGTH];
-	char addr_pattern[MAX_LINE_LENGTH];
-
-	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
-		       (unsigned long) addr);
-	if (ret >= MAX_LINE_LENGTH) {
-		printf("%s: Pattern is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-
-
-	fp = fopen(PID_SMAPS, "r");
-	if (!fp) {
-		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
-		exit(EXIT_FAILURE);
-	}
-	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
-		goto err_out;
-
-	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
-		       size >> 10);
-	if (ret >= MAX_LINE_LENGTH) {
-		printf("%s: Pattern is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	/*
-	 * Fetch the Swap: in the same block and check whether it got
-	 * the expected number of hugeepages next.
-	 */
-	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
-		goto err_out;
-
-	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
-		goto err_out;
-
-	swap = true;
-err_out:
-	fclose(fp);
-	return swap;
-}
-
-static void *alloc_mapping(int nr)
-{
-	void *p;
-
-	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
-		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (p != BASE_ADDR) {
-		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
-		exit(EXIT_FAILURE);
-	}
-
-	return p;
-}
-
-static void fill_memory(int *p, unsigned long start, unsigned long end)
-{
-	int i;
-
-	for (i = start / page_size; i < end / page_size; i++)
-		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
-}
-
-/*
- * MADV_COLLAPSE is a best-effort request and may fail if an internal
- * resource is temporarily unavailable, in which case it will set errno to
- * EAGAIN.  In such a case, immediately reattempt the operation one more
- * time.
- */
-static int madvise_collapse_retry(void *p, unsigned long size)
-{
-	bool retry = true;
-	int ret;
-
-retry:
-	ret = madvise(p, size, MADV_COLLAPSE);
-	if (ret && errno == EAGAIN && retry) {
-		retry = false;
-		goto retry;
-	}
-	return ret;
-}
-
-/*
- * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
- * validate_memory()'able contents.
- */
-static void *alloc_hpage(struct mem_ops *ops)
-{
-	void *p = ops->setup_area(1);
-
-	ops->fault(p, 0, hpage_pmd_size);
-
-	/*
-	 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
-	 * The latter is ineligible for collapse by MADV_COLLAPSE
-	 * while the former might cause MADV_COLLAPSE to race with
-	 * khugepaged on low-load system (like a test machine), which
-	 * would cause MADV_COLLAPSE to fail with EAGAIN.
-	 */
-	printf("Allocate huge page...");
-	if (madvise_collapse_retry(p, hpage_pmd_size)) {
-		perror("madvise(MADV_COLLAPSE)");
-		exit(EXIT_FAILURE);
-	}
-	if (!ops->check_huge(p, 1)) {
-		perror("madvise(MADV_COLLAPSE)");
-		exit(EXIT_FAILURE);
-	}
-	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
-		perror("madvise(MADV_HUGEPAGE)");
-		exit(EXIT_FAILURE);
-	}
-	success("OK");
-	return p;
-}
-
-static void validate_memory(int *p, unsigned long start, unsigned long end)
-{
-	int i;
-
-	for (i = start / page_size; i < end / page_size; i++) {
-		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
-			printf("Page %d is corrupted: %#x\n",
-					i, p[i * page_size / sizeof(*p)]);
-			exit(EXIT_FAILURE);
-		}
-	}
-}
-
-static void *anon_setup_area(int nr_hpages)
-{
-	return alloc_mapping(nr_hpages);
-}
-
-static void anon_cleanup_area(void *p, unsigned long size)
-{
-	munmap(p, size);
-}
-
-static void anon_fault(void *p, unsigned long start, unsigned long end)
-{
-	fill_memory(p, start, end);
-}
-
-static bool anon_check_huge(void *addr, int nr_hpages)
-{
-	return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
-}
-
-static void *file_setup_area(int nr_hpages)
-{
-	int fd;
-	void *p;
-	unsigned long size;
-
-	unlink(finfo.path);  /* Cleanup from previous failed tests */
-	printf("Creating %s for collapse%s...", finfo.path,
-	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
-	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
-		  777);
-	if (fd < 0) {
-		perror("open()");
-		exit(EXIT_FAILURE);
-	}
-
-	size = nr_hpages * hpage_pmd_size;
-	p = alloc_mapping(nr_hpages);
-	fill_memory(p, 0, size);
-	write(fd, p, size);
-	close(fd);
-	munmap(p, size);
-	success("OK");
-
-	printf("Opening %s read only for collapse...", finfo.path);
-	finfo.fd = open(finfo.path, O_RDONLY, 777);
-	if (finfo.fd < 0) {
-		perror("open()");
-		exit(EXIT_FAILURE);
-	}
-	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
-		 MAP_PRIVATE, finfo.fd, 0);
-	if (p == MAP_FAILED || p != BASE_ADDR) {
-		perror("mmap()");
-		exit(EXIT_FAILURE);
-	}
-
-	/* Drop page cache */
-	write_file("/proc/sys/vm/drop_caches", "3", 2);
-	success("OK");
-	return p;
-}
-
-static void file_cleanup_area(void *p, unsigned long size)
-{
-	munmap(p, size);
-	close(finfo.fd);
-	unlink(finfo.path);
-}
-
-static void file_fault(void *p, unsigned long start, unsigned long end)
-{
-	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
-		perror("madvise(MADV_POPULATE_READ");
-		exit(EXIT_FAILURE);
-	}
-}
-
-static bool file_check_huge(void *addr, int nr_hpages)
-{
-	switch (finfo.type) {
-	case VMA_FILE:
-		return check_huge_file(addr, nr_hpages, hpage_pmd_size);
-	case VMA_SHMEM:
-		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
-	default:
-		exit(EXIT_FAILURE);
-		return false;
-	}
-}
-
-static void *shmem_setup_area(int nr_hpages)
-{
-	void *p;
-	unsigned long size = nr_hpages * hpage_pmd_size;
-
-	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
-	if (finfo.fd < 0)  {
-		perror("memfd_create()");
-		exit(EXIT_FAILURE);
-	}
-	if (ftruncate(finfo.fd, size)) {
-		perror("ftruncate()");
-		exit(EXIT_FAILURE);
-	}
-	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
-		 0);
-	if (p != BASE_ADDR) {
-		perror("mmap()");
-		exit(EXIT_FAILURE);
-	}
-	return p;
-}
-
-static void shmem_cleanup_area(void *p, unsigned long size)
-{
-	munmap(p, size);
-	close(finfo.fd);
-}
-
-static bool shmem_check_huge(void *addr, int nr_hpages)
-{
-	return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
-}
-
-static struct mem_ops __anon_ops = {
-	.setup_area = &anon_setup_area,
-	.cleanup_area = &anon_cleanup_area,
-	.fault = &anon_fault,
-	.check_huge = &anon_check_huge,
-	.name = "anon",
-};
-
-static struct mem_ops __file_ops = {
-	.setup_area = &file_setup_area,
-	.cleanup_area = &file_cleanup_area,
-	.fault = &file_fault,
-	.check_huge = &file_check_huge,
-	.name = "file",
-};
-
-static struct mem_ops __shmem_ops = {
-	.setup_area = &shmem_setup_area,
-	.cleanup_area = &shmem_cleanup_area,
-	.fault = &anon_fault,
-	.check_huge = &shmem_check_huge,
-	.name = "shmem",
-};
-
-static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
-			       struct mem_ops *ops, bool expect)
-{
-	int ret;
-	struct settings settings = *current_settings();
-
-	printf("%s...", msg);
-
-	/*
-	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
-	 * ignores /sys/kernel/mm/transparent_hugepage/enabled
-	 */
-	settings.thp_enabled = THP_NEVER;
-	settings.shmem_enabled = SHMEM_NEVER;
-	push_settings(&settings);
-
-	/* Clear VM_NOHUGEPAGE */
-	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
-	ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
-	if (((bool)ret) == expect)
-		fail("Fail: Bad return value");
-	else if (!ops->check_huge(p, expect ? nr_hpages : 0))
-		fail("Fail: check_huge()");
-	else
-		success("OK");
-
-	pop_settings();
-}
-
-static void madvise_collapse(const char *msg, char *p, int nr_hpages,
-			     struct mem_ops *ops, bool expect)
-{
-	/* Sanity check */
-	if (!ops->check_huge(p, 0)) {
-		printf("Unexpected huge page\n");
-		exit(EXIT_FAILURE);
-	}
-	__madvise_collapse(msg, p, nr_hpages, ops, expect);
-}
-
-#define TICK 500000
-static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
-			  struct mem_ops *ops)
-{
-	int full_scans;
-	int timeout = 6; /* 3 seconds */
-
-	/* Sanity check */
-	if (!ops->check_huge(p, 0)) {
-		printf("Unexpected huge page\n");
-		exit(EXIT_FAILURE);
-	}
-
-	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
-
-	/* Wait until the second full_scan completed */
-	full_scans = read_num("khugepaged/full_scans") + 2;
-
-	printf("%s...", msg);
-	while (timeout--) {
-		if (ops->check_huge(p, nr_hpages))
-			break;
-		if (read_num("khugepaged/full_scans") >= full_scans)
-			break;
-		printf(".");
-		usleep(TICK);
-	}
-
-	madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
-
-	return timeout == -1;
-}
-
-static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
-				struct mem_ops *ops, bool expect)
-{
-	if (wait_for_scan(msg, p, nr_hpages, ops)) {
-		if (expect)
-			fail("Timeout");
-		else
-			success("OK");
-		return;
-	}
-
-	/*
-	 * For file and shmem memory, khugepaged only retracts pte entries after
-	 * putting the new hugepage in the page cache. The hugepage must be
-	 * subsequently refaulted to install the pmd mapping for the mm.
-	 */
-	if (ops != &__anon_ops)
-		ops->fault(p, 0, nr_hpages * hpage_pmd_size);
-
-	if (ops->check_huge(p, expect ? nr_hpages : 0))
-		success("OK");
-	else
-		fail("Fail");
-}
-
-static struct collapse_context __khugepaged_context = {
-	.collapse = &khugepaged_collapse,
-	.enforce_pte_scan_limits = true,
-	.name = "khugepaged",
-};
-
-static struct collapse_context __madvise_context = {
-	.collapse = &madvise_collapse,
-	.enforce_pte_scan_limits = false,
-	.name = "madvise",
-};
-
-static bool is_tmpfs(struct mem_ops *ops)
-{
-	return ops == &__file_ops && finfo.type == VMA_SHMEM;
-}
-
-static void alloc_at_fault(void)
-{
-	struct settings settings = *current_settings();
-	char *p;
-
-	settings.thp_enabled = THP_ALWAYS;
-	push_settings(&settings);
-
-	p = alloc_mapping(1);
-	*p = 1;
-	printf("Allocate huge page on fault...");
-	if (check_huge_anon(p, 1, hpage_pmd_size))
-		success("OK");
-	else
-		fail("Fail");
-
-	pop_settings();
-
-	madvise(p, page_size, MADV_DONTNEED);
-	printf("Split huge PMD on MADV_DONTNEED...");
-	if (check_huge_anon(p, 0, hpage_pmd_size))
-		success("OK");
-	else
-		fail("Fail");
-	munmap(p, hpage_pmd_size);
-}
-
-static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-	int nr_hpages = 4;
-	unsigned long size = nr_hpages * hpage_pmd_size;
-
-	p = ops->setup_area(nr_hpages);
-	ops->fault(p, 0, size);
-	c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
-		    ops, true);
-	validate_memory(p, 0, size);
-	ops->cleanup_area(p, size);
-}
-
-static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = ops->setup_area(1);
-	c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = ops->setup_area(1);
-	ops->fault(p, 0, page_size);
-	c->collapse("Collapse PTE table with single PTE entry present", p,
-		    1, ops, true);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
-{
-	int max_ptes_none = hpage_pmd_nr / 2;
-	struct settings settings = *current_settings();
-	void *p;
-
-	settings.khugepaged.max_ptes_none = max_ptes_none;
-	push_settings(&settings);
-
-	p = ops->setup_area(1);
-
-	if (is_tmpfs(ops)) {
-		/* shmem pages always in the page cache */
-		printf("tmpfs...");
-		skip("Skip");
-		goto skip;
-	}
-
-	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
-	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
-		    ops, !c->enforce_pte_scan_limits);
-	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
-
-	if (c->enforce_pte_scan_limits) {
-		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
-		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
-			    true);
-		validate_memory(p, 0,
-				(hpage_pmd_nr - max_ptes_none) * page_size);
-	}
-skip:
-	ops->cleanup_area(p, hpage_pmd_size);
-	pop_settings();
-}
-
-static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = ops->setup_area(1);
-	ops->fault(p, 0, hpage_pmd_size);
-
-	printf("Swapout one page...");
-	if (madvise(p, page_size, MADV_PAGEOUT)) {
-		perror("madvise(MADV_PAGEOUT)");
-		exit(EXIT_FAILURE);
-	}
-	if (check_swap(p, page_size)) {
-		success("OK");
-	} else {
-		fail("Fail");
-		goto out;
-	}
-
-	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
-		    true);
-	validate_memory(p, 0, hpage_pmd_size);
-out:
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
-{
-	int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
-	void *p;
-
-	p = ops->setup_area(1);
-	ops->fault(p, 0, hpage_pmd_size);
-
-	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
-	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
-		perror("madvise(MADV_PAGEOUT)");
-		exit(EXIT_FAILURE);
-	}
-	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
-		success("OK");
-	} else {
-		fail("Fail");
-		goto out;
-	}
-
-	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
-		    !c->enforce_pte_scan_limits);
-	validate_memory(p, 0, hpage_pmd_size);
-
-	if (c->enforce_pte_scan_limits) {
-		ops->fault(p, 0, hpage_pmd_size);
-		printf("Swapout %d of %d pages...", max_ptes_swap,
-		       hpage_pmd_nr);
-		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
-			perror("madvise(MADV_PAGEOUT)");
-			exit(EXIT_FAILURE);
-		}
-		if (check_swap(p, max_ptes_swap * page_size)) {
-			success("OK");
-		} else {
-			fail("Fail");
-			goto out;
-		}
-
-		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
-			    1, ops, true);
-		validate_memory(p, 0, hpage_pmd_size);
-	}
-out:
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = alloc_hpage(ops);
-
-	if (is_tmpfs(ops)) {
-		/* MADV_DONTNEED won't evict tmpfs pages */
-		printf("tmpfs...");
-		skip("Skip");
-		goto skip;
-	}
-
-	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-	printf("Split huge page leaving single PTE mapping compound page...");
-	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
-	if (ops->check_huge(p, 0))
-		success("OK");
-	else
-		fail("Fail");
-
-	c->collapse("Collapse PTE table with single PTE mapping compound page",
-		    p, 1, ops, true);
-	validate_memory(p, 0, page_size);
-skip:
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-
-	p = alloc_hpage(ops);
-	printf("Split huge page leaving single PTE page table full of compound pages...");
-	madvise(p, page_size, MADV_NOHUGEPAGE);
-	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-	if (ops->check_huge(p, 0))
-		success("OK");
-	else
-		fail("Fail");
-
-	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
-		    true);
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
-{
-	void *p;
-	int i;
-
-	p = ops->setup_area(1);
-	for (i = 0; i < hpage_pmd_nr; i++) {
-		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
-				i + 1, hpage_pmd_nr);
-
-		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
-		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
-		if (!ops->check_huge(BASE_ADDR, 1)) {
-			printf("Failed to allocate huge page\n");
-			exit(EXIT_FAILURE);
-		}
-		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
-
-		p = mremap(BASE_ADDR - i * page_size,
-				i * page_size + hpage_pmd_size,
-				(i + 1) * page_size,
-				MREMAP_MAYMOVE | MREMAP_FIXED,
-				BASE_ADDR + 2 * hpage_pmd_size);
-		if (p == MAP_FAILED) {
-			perror("mremap+unmap");
-			exit(EXIT_FAILURE);
-		}
-
-		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
-				(i + 1) * page_size,
-				(i + 1) * page_size + hpage_pmd_size,
-				MREMAP_MAYMOVE | MREMAP_FIXED,
-				BASE_ADDR - (i + 1) * page_size);
-		if (p == MAP_FAILED) {
-			perror("mremap+alloc");
-			exit(EXIT_FAILURE);
-		}
-	}
-
-	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
-	ops->fault(p, 0, hpage_pmd_size);
-	if (!ops->check_huge(p, 1))
-		success("OK");
-	else
-		fail("Fail");
-
-	c->collapse("Collapse PTE table full of different compound pages", p, 1,
-		    ops, true);
-
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
-{
-	int wstatus;
-	void *p;
-
-	p = ops->setup_area(1);
-
-	printf("Allocate small page...");
-	ops->fault(p, 0, page_size);
-	if (ops->check_huge(p, 0))
-		success("OK");
-	else
-		fail("Fail");
-
-	printf("Share small page over fork()...");
-	if (!fork()) {
-		/* Do not touch settings on child exit */
-		skip_settings_restore = true;
-		exit_status = 0;
-
-		if (ops->check_huge(p, 0))
-			success("OK");
-		else
-			fail("Fail");
-
-		ops->fault(p, page_size, 2 * page_size);
-		c->collapse("Collapse PTE table with single page shared with parent process",
-			    p, 1, ops, true);
-
-		validate_memory(p, 0, page_size);
-		ops->cleanup_area(p, hpage_pmd_size);
-		exit(exit_status);
-	}
-
-	wait(&wstatus);
-	exit_status += WEXITSTATUS(wstatus);
-
-	printf("Check if parent still has small page...");
-	if (ops->check_huge(p, 0))
-		success("OK");
-	else
-		fail("Fail");
-	validate_memory(p, 0, page_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
-{
-	int wstatus;
-	void *p;
-
-	p = alloc_hpage(ops);
-	printf("Share huge page over fork()...");
-	if (!fork()) {
-		/* Do not touch settings on child exit */
-		skip_settings_restore = true;
-		exit_status = 0;
-
-		if (ops->check_huge(p, 1))
-			success("OK");
-		else
-			fail("Fail");
-
-		printf("Split huge page PMD in child process...");
-		madvise(p, page_size, MADV_NOHUGEPAGE);
-		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-		if (ops->check_huge(p, 0))
-			success("OK");
-		else
-			fail("Fail");
-		ops->fault(p, 0, page_size);
-
-		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
-		c->collapse("Collapse PTE table full of compound pages in child",
-			    p, 1, ops, true);
-		write_num("khugepaged/max_ptes_shared",
-			  current_settings()->khugepaged.max_ptes_shared);
-
-		validate_memory(p, 0, hpage_pmd_size);
-		ops->cleanup_area(p, hpage_pmd_size);
-		exit(exit_status);
-	}
-
-	wait(&wstatus);
-	exit_status += WEXITSTATUS(wstatus);
-
-	printf("Check if parent still has huge page...");
-	if (ops->check_huge(p, 1))
-		success("OK");
-	else
-		fail("Fail");
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
-{
-	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
-	int wstatus;
-	void *p;
-
-	p = alloc_hpage(ops);
-	printf("Share huge page over fork()...");
-	if (!fork()) {
-		/* Do not touch settings on child exit */
-		skip_settings_restore = true;
-		exit_status = 0;
-
-		if (ops->check_huge(p, 1))
-			success("OK");
-		else
-			fail("Fail");
-
-		printf("Trigger CoW on page %d of %d...",
-				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
-		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
-		if (ops->check_huge(p, 0))
-			success("OK");
-		else
-			fail("Fail");
-
-		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
-			    1, ops, !c->enforce_pte_scan_limits);
-
-		if (c->enforce_pte_scan_limits) {
-			printf("Trigger CoW on page %d of %d...",
-			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
-			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
-				    page_size);
-			if (ops->check_huge(p, 0))
-				success("OK");
-			else
-				fail("Fail");
-
-			c->collapse("Collapse with max_ptes_shared PTEs shared",
-				    p, 1, ops, true);
-		}
-
-		validate_memory(p, 0, hpage_pmd_size);
-		ops->cleanup_area(p, hpage_pmd_size);
-		exit(exit_status);
-	}
-
-	wait(&wstatus);
-	exit_status += WEXITSTATUS(wstatus);
-
-	printf("Check if parent still has huge page...");
-	if (ops->check_huge(p, 1))
-		success("OK");
-	else
-		fail("Fail");
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void madvise_collapse_existing_thps(struct collapse_context *c,
-					   struct mem_ops *ops)
-{
-	void *p;
-
-	p = ops->setup_area(1);
-	ops->fault(p, 0, hpage_pmd_size);
-	c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
-	validate_memory(p, 0, hpage_pmd_size);
-
-	/* c->collapse() will find a hugepage and complain - call directly. */
-	__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
-	validate_memory(p, 0, hpage_pmd_size);
-	ops->cleanup_area(p, hpage_pmd_size);
-}
-
-/*
- * Test race with khugepaged where page tables have been retracted and
- * pmd cleared.
- */
-static void madvise_retracted_page_tables(struct collapse_context *c,
-					  struct mem_ops *ops)
-{
-	void *p;
-	int nr_hpages = 1;
-	unsigned long size = nr_hpages * hpage_pmd_size;
-
-	p = ops->setup_area(nr_hpages);
-	ops->fault(p, 0, size);
-
-	/* Let khugepaged collapse and leave pmd cleared */
-	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
-			  ops)) {
-		fail("Timeout");
-		return;
-	}
-	success("OK");
-	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
-		    true);
-	validate_memory(p, 0, size);
-	ops->cleanup_area(p, size);
-}
-
-static void usage(void)
-{
-	fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n");
-	fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
-	fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
-	fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
-	fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n");
-	fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n");
-	fprintf(stderr,	"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
-	fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
-	fprintf(stderr,	"\tmounted with huge=madvise option for khugepaged tests to work\n");
-	exit(1);
-}
-
-static void parse_test_type(int argc, const char **argv)
-{
-	char *buf;
-	const char *token;
-
-	if (argc == 1) {
-		/* Backwards compatibility */
-		khugepaged_context =  &__khugepaged_context;
-		madvise_context =  &__madvise_context;
-		anon_ops = &__anon_ops;
-		return;
-	}
-
-	buf = strdup(argv[1]);
-	token = strsep(&buf, ":");
-
-	if (!strcmp(token, "all")) {
-		khugepaged_context =  &__khugepaged_context;
-		madvise_context =  &__madvise_context;
-	} else if (!strcmp(token, "khugepaged")) {
-		khugepaged_context =  &__khugepaged_context;
-	} else if (!strcmp(token, "madvise")) {
-		madvise_context =  &__madvise_context;
-	} else {
-		usage();
-	}
-
-	if (!buf)
-		usage();
-
-	if (!strcmp(buf, "all")) {
-		file_ops =  &__file_ops;
-		anon_ops = &__anon_ops;
-		shmem_ops = &__shmem_ops;
-	} else if (!strcmp(buf, "anon")) {
-		anon_ops = &__anon_ops;
-	} else if (!strcmp(buf, "file")) {
-		file_ops =  &__file_ops;
-	} else if (!strcmp(buf, "shmem")) {
-		shmem_ops = &__shmem_ops;
-	} else {
-		usage();
-	}
-
-	if (!file_ops)
-		return;
-
-	if (argc != 3)
-		usage();
-}
-
-int main(int argc, const char **argv)
-{
-	struct settings default_settings = {
-		.thp_enabled = THP_MADVISE,
-		.thp_defrag = THP_DEFRAG_ALWAYS,
-		.shmem_enabled = SHMEM_ADVISE,
-		.use_zero_page = 0,
-		.khugepaged = {
-			.defrag = 1,
-			.alloc_sleep_millisecs = 10,
-			.scan_sleep_millisecs = 10,
-		},
-		/*
-		 * When testing file-backed memory, the collapse path
-		 * looks at how many pages are found in the page cache, not
-		 * what pages are mapped. Disable read ahead optimization so
-		 * pages don't find their way into the page cache unless
-		 * we mem_ops->fault() them in.
-		 */
-		.read_ahead_kb = 0,
-	};
-
-	parse_test_type(argc, argv);
-
-	if (file_ops)
-		get_finfo(argv[2]);
-
-	setbuf(stdout, NULL);
-
-	page_size = getpagesize();
-	hpage_pmd_size = read_pmd_pagesize();
-	hpage_pmd_nr = hpage_pmd_size / page_size;
-
-	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
-	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
-	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
-	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
-
-	save_settings();
-	push_settings(&default_settings);
-
-	alloc_at_fault();
-
-#define TEST(t, c, o) do { \
-	if (c && o) { \
-		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
-		t(c, o); \
-	} \
-	} while (0)
-
-	TEST(collapse_full, khugepaged_context, anon_ops);
-	TEST(collapse_full, khugepaged_context, file_ops);
-	TEST(collapse_full, khugepaged_context, shmem_ops);
-	TEST(collapse_full, madvise_context, anon_ops);
-	TEST(collapse_full, madvise_context, file_ops);
-	TEST(collapse_full, madvise_context, shmem_ops);
-
-	TEST(collapse_empty, khugepaged_context, anon_ops);
-	TEST(collapse_empty, madvise_context, anon_ops);
-
-	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
-	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
-	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
-	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
-	TEST(collapse_single_pte_entry, madvise_context, file_ops);
-	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
-
-	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
-	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
-	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
-	TEST(collapse_max_ptes_none, madvise_context, file_ops);
-
-	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
-	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
-	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
-	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
-
-	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
-	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
-	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
-	TEST(collapse_full_of_compound, madvise_context, anon_ops);
-	TEST(collapse_full_of_compound, madvise_context, file_ops);
-	TEST(collapse_full_of_compound, madvise_context, shmem_ops);
-
-	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
-	TEST(collapse_compound_extreme, madvise_context, anon_ops);
-
-	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
-	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
-
-	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
-	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
-
-	TEST(collapse_fork, khugepaged_context, anon_ops);
-	TEST(collapse_fork, madvise_context, anon_ops);
-
-	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
-	TEST(collapse_fork_compound, madvise_context, anon_ops);
-
-	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
-	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
-
-	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
-	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
-	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
-
-	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
-	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
-
-	restore_settings(0);
-}
diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c
deleted file mode 100644
index d8b5b4930412..000000000000
--- a/tools/testing/selftests/vm/ksm_functional_tests.c
+++ /dev/null
@@ -1,279 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * KSM functional tests
- *
- * Copyright 2022, Red Hat, Inc.
- *
- * Author(s): David Hildenbrand <david@redhat.com>
- */
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <linux/userfaultfd.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#define KiB 1024u
-#define MiB (1024 * KiB)
-
-static int ksm_fd;
-static int ksm_full_scans_fd;
-static int pagemap_fd;
-static size_t pagesize;
-
-static bool range_maps_duplicates(char *addr, unsigned long size)
-{
-	unsigned long offs_a, offs_b, pfn_a, pfn_b;
-
-	/*
-	 * There is no easy way to check if there are KSM pages mapped into
-	 * this range. We only check that the range does not map the same PFN
-	 * twice by comparing each pair of mapped pages.
-	 */
-	for (offs_a = 0; offs_a < size; offs_a += pagesize) {
-		pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
-		/* Page not present or PFN not exposed by the kernel. */
-		if (pfn_a == -1ul || !pfn_a)
-			continue;
-
-		for (offs_b = offs_a + pagesize; offs_b < size;
-		     offs_b += pagesize) {
-			pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b);
-			if (pfn_b == -1ul || !pfn_b)
-				continue;
-			if (pfn_a == pfn_b)
-				return true;
-		}
-	}
-	return false;
-}
-
-static long ksm_get_full_scans(void)
-{
-	char buf[10];
-	ssize_t ret;
-
-	ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
-	if (ret <= 0)
-		return -errno;
-	buf[ret] = 0;
-
-	return strtol(buf, NULL, 10);
-}
-
-static int ksm_merge(void)
-{
-	long start_scans, end_scans;
-
-	/* Wait for two full scans such that any possible merging happened. */
-	start_scans = ksm_get_full_scans();
-	if (start_scans < 0)
-		return start_scans;
-	if (write(ksm_fd, "1", 1) != 1)
-		return -errno;
-	do {
-		end_scans = ksm_get_full_scans();
-		if (end_scans < 0)
-			return end_scans;
-	} while (end_scans < start_scans + 2);
-
-	return 0;
-}
-
-static char *mmap_and_merge_range(char val, unsigned long size)
-{
-	char *map;
-
-	map = mmap(NULL, size, PROT_READ|PROT_WRITE,
-		   MAP_PRIVATE|MAP_ANON, -1, 0);
-	if (map == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		return MAP_FAILED;
-	}
-
-	/* Don't use THP. Ignore if THP are not around on a kernel. */
-	if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
-		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
-		goto unmap;
-	}
-
-	/* Make sure each page contains the same values to merge them. */
-	memset(map, val, size);
-	if (madvise(map, size, MADV_MERGEABLE)) {
-		ksft_test_result_fail("MADV_MERGEABLE failed\n");
-		goto unmap;
-	}
-
-	/* Run KSM to trigger merging and wait. */
-	if (ksm_merge()) {
-		ksft_test_result_fail("Running KSM failed\n");
-		goto unmap;
-	}
-	return map;
-unmap:
-	munmap(map, size);
-	return MAP_FAILED;
-}
-
-static void test_unmerge(void)
-{
-	const unsigned int size = 2 * MiB;
-	char *map;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	map = mmap_and_merge_range(0xcf, size);
-	if (map == MAP_FAILED)
-		return;
-
-	if (madvise(map, size, MADV_UNMERGEABLE)) {
-		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
-		goto unmap;
-	}
-
-	ksft_test_result(!range_maps_duplicates(map, size),
-			 "Pages were unmerged\n");
-unmap:
-	munmap(map, size);
-}
-
-static void test_unmerge_discarded(void)
-{
-	const unsigned int size = 2 * MiB;
-	char *map;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	map = mmap_and_merge_range(0xcf, size);
-	if (map == MAP_FAILED)
-		return;
-
-	/* Discard half of all mapped pages so we have pte_none() entries. */
-	if (madvise(map, size / 2, MADV_DONTNEED)) {
-		ksft_test_result_fail("MADV_DONTNEED failed\n");
-		goto unmap;
-	}
-
-	if (madvise(map, size, MADV_UNMERGEABLE)) {
-		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
-		goto unmap;
-	}
-
-	ksft_test_result(!range_maps_duplicates(map, size),
-			 "Pages were unmerged\n");
-unmap:
-	munmap(map, size);
-}
-
-#ifdef __NR_userfaultfd
-static void test_unmerge_uffd_wp(void)
-{
-	struct uffdio_writeprotect uffd_writeprotect;
-	struct uffdio_register uffdio_register;
-	const unsigned int size = 2 * MiB;
-	struct uffdio_api uffdio_api;
-	char *map;
-	int uffd;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	map = mmap_and_merge_range(0xcf, size);
-	if (map == MAP_FAILED)
-		return;
-
-	/* See if UFFD is around. */
-	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
-	if (uffd < 0) {
-		ksft_test_result_skip("__NR_userfaultfd failed\n");
-		goto unmap;
-	}
-
-	/* See if UFFD-WP is around. */
-	uffdio_api.api = UFFD_API;
-	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
-	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
-		ksft_test_result_fail("UFFDIO_API failed\n");
-		goto close_uffd;
-	}
-	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
-		ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n");
-		goto close_uffd;
-	}
-
-	/* Register UFFD-WP, no need for an actual handler. */
-	uffdio_register.range.start = (unsigned long) map;
-	uffdio_register.range.len = size;
-	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
-	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
-		ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
-		goto close_uffd;
-	}
-
-	/* Write-protect the range using UFFD-WP. */
-	uffd_writeprotect.range.start = (unsigned long) map;
-	uffd_writeprotect.range.len = size;
-	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
-	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
-		ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n");
-		goto close_uffd;
-	}
-
-	if (madvise(map, size, MADV_UNMERGEABLE)) {
-		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
-		goto close_uffd;
-	}
-
-	ksft_test_result(!range_maps_duplicates(map, size),
-			 "Pages were unmerged\n");
-close_uffd:
-	close(uffd);
-unmap:
-	munmap(map, size);
-}
-#endif
-
-int main(int argc, char **argv)
-{
-	unsigned int tests = 2;
-	int err;
-
-#ifdef __NR_userfaultfd
-	tests++;
-#endif
-
-	ksft_print_header();
-	ksft_set_plan(tests);
-
-	pagesize = getpagesize();
-
-	ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
-	if (ksm_fd < 0)
-		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
-	ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
-	if (ksm_full_scans_fd < 0)
-		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
-	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-	if (pagemap_fd < 0)
-		ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
-
-	test_unmerge();
-	test_unmerge_discarded();
-#ifdef __NR_userfaultfd
-	test_unmerge_uffd_wp();
-#endif
-
-	err = ksft_get_fail_cnt();
-	if (err)
-		ksft_exit_fail_msg("%d out of %d tests failed\n",
-				   err, ksft_test_num());
-	return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c
deleted file mode 100644
index f9eb4d67e0dd..000000000000
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ /dev/null
@@ -1,849 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <sys/mman.h>
-#include <stdbool.h>
-#include <time.h>
-#include <string.h>
-#include <numa.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <stdint.h>
-#include <err.h>
-
-#include "../kselftest.h"
-#include <include/vdso/time64.h>
-#include "util.h"
-
-#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
-#define KSM_FP(s) (KSM_SYSFS_PATH s)
-#define KSM_SCAN_LIMIT_SEC_DEFAULT 120
-#define KSM_PAGE_COUNT_DEFAULT 10l
-#define KSM_PROT_STR_DEFAULT "rw"
-#define KSM_USE_ZERO_PAGES_DEFAULT false
-#define KSM_MERGE_ACROSS_NODES_DEFAULT true
-#define MB (1ul << 20)
-
-struct ksm_sysfs {
-	unsigned long max_page_sharing;
-	unsigned long merge_across_nodes;
-	unsigned long pages_to_scan;
-	unsigned long run;
-	unsigned long sleep_millisecs;
-	unsigned long stable_node_chains_prune_millisecs;
-	unsigned long use_zero_pages;
-};
-
-enum ksm_test_name {
-	CHECK_KSM_MERGE,
-	CHECK_KSM_UNMERGE,
-	CHECK_KSM_ZERO_PAGE_MERGE,
-	CHECK_KSM_NUMA_MERGE,
-	KSM_MERGE_TIME,
-	KSM_MERGE_TIME_HUGE_PAGES,
-	KSM_UNMERGE_TIME,
-	KSM_COW_TIME
-};
-
-static int ksm_write_sysfs(const char *file_path, unsigned long val)
-{
-	FILE *f = fopen(file_path, "w");
-
-	if (!f) {
-		fprintf(stderr, "f %s\n", file_path);
-		perror("fopen");
-		return 1;
-	}
-	if (fprintf(f, "%lu", val) < 0) {
-		perror("fprintf");
-		fclose(f);
-		return 1;
-	}
-	fclose(f);
-
-	return 0;
-}
-
-static int ksm_read_sysfs(const char *file_path, unsigned long *val)
-{
-	FILE *f = fopen(file_path, "r");
-
-	if (!f) {
-		fprintf(stderr, "f %s\n", file_path);
-		perror("fopen");
-		return 1;
-	}
-	if (fscanf(f, "%lu", val) != 1) {
-		perror("fscanf");
-		fclose(f);
-		return 1;
-	}
-	fclose(f);
-
-	return 0;
-}
-
-static int str_to_prot(char *prot_str)
-{
-	int prot = 0;
-
-	if ((strchr(prot_str, 'r')) != NULL)
-		prot |= PROT_READ;
-	if ((strchr(prot_str, 'w')) != NULL)
-		prot |= PROT_WRITE;
-	if ((strchr(prot_str, 'x')) != NULL)
-		prot |= PROT_EXEC;
-
-	return prot;
-}
-
-static void print_help(void)
-{
-	printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n"
-	       "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n");
-
-	printf("Supported <test type>:\n"
-	       " -M (page merging)\n"
-	       " -Z (zero pages merging)\n"
-	       " -N (merging of pages in different NUMA nodes)\n"
-	       " -U (page unmerging)\n"
-	       " -P evaluate merging time and speed.\n"
-	       "    For this test, the size of duplicated memory area (in MiB)\n"
-	       "    must be provided using -s option\n"
-	       " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
-	       "    For this test, the size of duplicated memory area (in MiB)\n"
-	       "    must be provided using -s option\n"
-	       " -D evaluate unmerging time and speed when disabling KSM.\n"
-	       "    For this test, the size of duplicated memory area (in MiB)\n"
-	       "    must be provided using -s option\n"
-	       " -C evaluate the time required to break COW of merged pages.\n\n");
-
-	printf(" -a: specify the access protections of pages.\n"
-	       "     <prot> must be of the form [rwx].\n"
-	       "     Default: %s\n", KSM_PROT_STR_DEFAULT);
-	printf(" -p: specify the number of pages to test.\n"
-	       "     Default: %ld\n", KSM_PAGE_COUNT_DEFAULT);
-	printf(" -l: limit the maximum running time (in seconds) for a test.\n"
-	       "     Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT);
-	printf(" -z: change use_zero_pages tunable\n"
-	       "     Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
-	printf(" -m: change merge_across_nodes tunable\n"
-	       "     Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
-	printf(" -s: the size of duplicated memory area (in MiB)\n");
-
-	exit(0);
-}
-
-static void  *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size)
-{
-	void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0);
-
-	if (!map_ptr) {
-		perror("mmap");
-		return NULL;
-	}
-	memset(map_ptr, data, map_size);
-	if (mprotect(map_ptr, map_size, prot)) {
-		perror("mprotect");
-		munmap(map_ptr, map_size);
-		return NULL;
-	}
-
-	return map_ptr;
-}
-
-static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
-{
-	struct timespec cur_time;
-	unsigned long cur_scan, init_scan;
-
-	if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan))
-		return 1;
-	cur_scan = init_scan;
-
-	while (cur_scan < init_scan + scan_count) {
-		if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan))
-			return 1;
-		if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) {
-			perror("clock_gettime");
-			return 1;
-		}
-		if ((cur_time.tv_sec - start_time.tv_sec) > timeout) {
-			printf("Scan time limit exceeded\n");
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout)
-{
-	if (madvise(addr, size, MADV_MERGEABLE)) {
-		perror("madvise");
-		return 1;
-	}
-	if (ksm_write_sysfs(KSM_FP("run"), 1))
-		return 1;
-
-	/* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */
-	if (ksm_do_scan(2, start_time, timeout))
-		return 1;
-
-	return 0;
-}
-
-static int ksm_unmerge_pages(void *addr, size_t size,
-			     struct timespec start_time, int timeout)
-{
-	if (madvise(addr, size, MADV_UNMERGEABLE)) {
-		perror("madvise");
-		return 1;
-	}
-	return 0;
-}
-
-static bool assert_ksm_pages_count(long dupl_page_count)
-{
-	unsigned long max_page_sharing, pages_sharing, pages_shared;
-
-	if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
-	    ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
-	    ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
-		return false;
-
-	/*
-	 * Since there must be at least 2 pages for merging and 1 page can be
-	 * shared with the limited number of pages (max_page_sharing), sometimes
-	 * there are 'leftover' pages that cannot be merged. For example, if there
-	 * are 11 pages and max_page_sharing = 10, then only 10 pages will be
-	 * merged and the 11th page won't be affected. As a result, when the number
-	 * of duplicate pages is divided by max_page_sharing and the remainder is 1,
-	 * pages_shared and pages_sharing values will be equal between dupl_page_count
-	 * and dupl_page_count - 1.
-	 */
-	if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) {
-		if (pages_shared == dupl_page_count / max_page_sharing &&
-		    pages_sharing == pages_shared * (max_page_sharing - 1))
-			return true;
-	} else {
-		if (pages_shared == (dupl_page_count / max_page_sharing + 1) &&
-		    pages_sharing == dupl_page_count - pages_shared)
-			return true;
-	}
-
-	return false;
-}
-
-static int ksm_save_def(struct ksm_sysfs *ksm_sysfs)
-{
-	if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) ||
-	    numa_available() ? 0 :
-		ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) ||
-	    ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) ||
-	    ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) ||
-	    ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) ||
-	    ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
-			   &ksm_sysfs->stable_node_chains_prune_millisecs) ||
-	    ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages))
-		return 1;
-
-	return 0;
-}
-
-static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
-{
-	if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) ||
-	    numa_available() ? 0 :
-		ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) ||
-	    ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) ||
-	    ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) ||
-	    ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) ||
-	    ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
-			    ksm_sysfs->stable_node_chains_prune_millisecs) ||
-	    ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages))
-		return 1;
-
-	return 0;
-}
-
-static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size)
-{
-	void *map_ptr;
-	struct timespec start_time;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	/* fill pages with the same data and merge them */
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-		goto err_out;
-
-	/* verify that the right number of pages are merged */
-	if (assert_ksm_pages_count(page_count)) {
-		printf("OK\n");
-		munmap(map_ptr, page_size * page_count);
-		return KSFT_PASS;
-	}
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_FAIL;
-}
-
-static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size)
-{
-	void *map_ptr;
-	struct timespec start_time;
-	int page_count = 2;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	/* fill pages with the same data and merge them */
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-		goto err_out;
-
-	/* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */
-	memset(map_ptr, '-', 1);
-	memset(map_ptr + page_size, '+', 1);
-
-	/* get at least 1 scan, so KSM can detect that the pages were modified */
-	if (ksm_do_scan(1, start_time, timeout))
-		goto err_out;
-
-	/* check that unmerging was successful and 0 pages are currently merged */
-	if (assert_ksm_pages_count(0)) {
-		printf("OK\n");
-		munmap(map_ptr, page_size * page_count);
-		return KSFT_PASS;
-	}
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_FAIL;
-}
-
-static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout,
-				     bool use_zero_pages, size_t page_size)
-{
-	void *map_ptr;
-	struct timespec start_time;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages))
-		return KSFT_FAIL;
-
-	/* fill pages with zero and try to merge them */
-	map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-		goto err_out;
-
-       /*
-	* verify that the right number of pages are merged:
-	* 1) if use_zero_pages is set to 1, empty pages are merged
-	*    with the kernel zero page instead of with each other;
-	* 2) if use_zero_pages is set to 0, empty pages are not treated specially
-	*    and merged as usual.
-	*/
-	if (use_zero_pages && !assert_ksm_pages_count(0))
-		goto err_out;
-	else if (!use_zero_pages && !assert_ksm_pages_count(page_count))
-		goto err_out;
-
-	printf("OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_FAIL;
-}
-
-static int get_next_mem_node(int node)
-{
-
-	long node_size;
-	int mem_node = 0;
-	int i, max_node = numa_max_node();
-
-	for (i = node + 1; i <= max_node + node; i++) {
-		mem_node = i % (max_node + 1);
-		node_size = numa_node_size(mem_node, NULL);
-		if (node_size > 0)
-			break;
-	}
-	return mem_node;
-}
-
-static int get_first_mem_node(void)
-{
-	return get_next_mem_node(numa_max_node());
-}
-
-static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
-				size_t page_size)
-{
-	void *numa1_map_ptr, *numa2_map_ptr;
-	struct timespec start_time;
-	int page_count = 2;
-	int first_node;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	if (numa_available() < 0) {
-		perror("NUMA support not enabled");
-		return KSFT_SKIP;
-	}
-	if (numa_num_configured_nodes() <= 1) {
-		printf("At least 2 NUMA nodes must be available\n");
-		return KSFT_SKIP;
-	}
-	if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes))
-		return KSFT_FAIL;
-
-	/* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
-	first_node = get_first_mem_node();
-	numa1_map_ptr = numa_alloc_onnode(page_size, first_node);
-	numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node));
-	if (!numa1_map_ptr || !numa2_map_ptr) {
-		perror("numa_alloc_onnode");
-		return KSFT_FAIL;
-	}
-
-	memset(numa1_map_ptr, '*', page_size);
-	memset(numa2_map_ptr, '*', page_size);
-
-	/* try to merge the pages */
-	if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) ||
-	    ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout))
-		goto err_out;
-
-       /*
-	* verify that the right number of pages are merged:
-	* 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged;
-	* 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is
-	*    only 1 unique page in each node and they can't be shared.
-	*/
-	if (merge_across_nodes && !assert_ksm_pages_count(page_count))
-		goto err_out;
-	else if (!merge_across_nodes && !assert_ksm_pages_count(0))
-		goto err_out;
-
-	numa_free(numa1_map_ptr, page_size);
-	numa_free(numa2_map_ptr, page_size);
-	printf("OK\n");
-	return KSFT_PASS;
-
-err_out:
-	numa_free(numa1_map_ptr, page_size);
-	numa_free(numa2_map_ptr, page_size);
-	printf("Not OK\n");
-	return KSFT_FAIL;
-}
-
-static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size)
-{
-	void *map_ptr, *map_ptr_orig;
-	struct timespec start_time, end_time;
-	unsigned long scan_time_ns;
-	int pagemap_fd, n_normal_pages, n_huge_pages;
-
-	map_size *= MB;
-	size_t len = map_size;
-
-	len -= len % HPAGE_SIZE;
-	map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
-			MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
-	map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE;
-
-	if (map_ptr_orig == MAP_FAILED)
-		err(2, "initial mmap");
-
-	if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE))
-		err(2, "MADV_HUGEPAGE");
-
-	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-	if (pagemap_fd < 0)
-		err(2, "open pagemap");
-
-	n_normal_pages = 0;
-	n_huge_pages = 0;
-	for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) {
-		if (allocate_transhuge(p, pagemap_fd) < 0)
-			n_normal_pages++;
-		else
-			n_huge_pages++;
-	}
-	printf("Number of normal pages:    %d\n", n_normal_pages);
-	printf("Number of huge pages:    %d\n", n_huge_pages);
-
-	memset(map_ptr, '*', len);
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
-		goto err_out;
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-
-	scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		       (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Total size:    %lu MiB\n", map_size / MB);
-	printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
-	       scan_time_ns % NSEC_PER_SEC);
-	printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
-					       ((double)scan_time_ns / NSEC_PER_SEC));
-
-	munmap(map_ptr_orig, len + HPAGE_SIZE);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr_orig, len + HPAGE_SIZE);
-	return KSFT_FAIL;
-}
-
-static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
-{
-	void *map_ptr;
-	struct timespec start_time, end_time;
-	unsigned long scan_time_ns;
-
-	map_size *= MB;
-
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
-		goto err_out;
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-
-	scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		       (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Total size:    %lu MiB\n", map_size / MB);
-	printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
-	       scan_time_ns % NSEC_PER_SEC);
-	printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
-					       ((double)scan_time_ns / NSEC_PER_SEC));
-
-	munmap(map_ptr, map_size);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, map_size);
-	return KSFT_FAIL;
-}
-
-static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size)
-{
-	void *map_ptr;
-	struct timespec start_time, end_time;
-	unsigned long scan_time_ns;
-
-	map_size *= MB;
-
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
-	if (!map_ptr)
-		return KSFT_FAIL;
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
-		goto err_out;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout))
-		goto err_out;
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-
-	scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		       (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Total size:    %lu MiB\n", map_size / MB);
-	printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
-	       scan_time_ns % NSEC_PER_SEC);
-	printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
-					       ((double)scan_time_ns / NSEC_PER_SEC));
-
-	munmap(map_ptr, map_size);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, map_size);
-	return KSFT_FAIL;
-}
-
-static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
-{
-	void *map_ptr;
-	struct timespec start_time, end_time;
-	unsigned long cow_time_ns;
-
-	/* page_count must be less than 2*page_size */
-	size_t page_count = 4000;
-
-	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
-	if (!map_ptr)
-		return KSFT_FAIL;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-	for (size_t i = 0; i < page_count - 1; i = i + 2)
-		memset(map_ptr + page_size * i, '-', 1);
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		return KSFT_FAIL;
-	}
-
-	cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		       (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Total size:    %lu MiB\n\n", (page_size * page_count) / MB);
-	printf("Not merged pages:\n");
-	printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
-	       cow_time_ns % NSEC_PER_SEC);
-	printf("Average speed:  %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) /
-					       ((double)cow_time_ns / NSEC_PER_SEC));
-
-	/* Create 2000 pairs of duplicate pages */
-	for (size_t i = 0; i < page_count - 1; i = i + 2) {
-		memset(map_ptr + page_size * i, '+', i / 2 + 1);
-		memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1);
-	}
-	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-		goto err_out;
-
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-	for (size_t i = 0; i < page_count - 1; i = i + 2)
-		memset(map_ptr + page_size * i, '-', 1);
-	if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-		perror("clock_gettime");
-		goto err_out;
-	}
-
-	cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-		       (end_time.tv_nsec - start_time.tv_nsec);
-
-	printf("Merged pages:\n");
-	printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
-	       cow_time_ns % NSEC_PER_SEC);
-	printf("Average speed:  %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) /
-					       ((double)cow_time_ns / NSEC_PER_SEC));
-
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_PASS;
-
-err_out:
-	printf("Not OK\n");
-	munmap(map_ptr, page_size * page_count);
-	return KSFT_FAIL;
-}
-
-int main(int argc, char *argv[])
-{
-	int ret, opt;
-	int prot = 0;
-	int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
-	long page_count = KSM_PAGE_COUNT_DEFAULT;
-	size_t page_size = sysconf(_SC_PAGESIZE);
-	struct ksm_sysfs ksm_sysfs_old;
-	int test_name = CHECK_KSM_MERGE;
-	bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT;
-	bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
-	long size_MB = 0;
-
-	while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) {
-		switch (opt) {
-		case 'a':
-			prot = str_to_prot(optarg);
-			break;
-		case 'p':
-			page_count = atol(optarg);
-			if (page_count <= 0) {
-				printf("The number of pages must be greater than 0\n");
-				return KSFT_FAIL;
-			}
-			break;
-		case 'l':
-			ksm_scan_limit_sec = atoi(optarg);
-			if (ksm_scan_limit_sec <= 0) {
-				printf("Timeout value must be greater than 0\n");
-				return KSFT_FAIL;
-			}
-			break;
-		case 'h':
-			print_help();
-			break;
-		case 'z':
-			if (strcmp(optarg, "0") == 0)
-				use_zero_pages = 0;
-			else
-				use_zero_pages = 1;
-			break;
-		case 'm':
-			if (strcmp(optarg, "0") == 0)
-				merge_across_nodes = 0;
-			else
-				merge_across_nodes = 1;
-			break;
-		case 's':
-			size_MB = atoi(optarg);
-			if (size_MB <= 0) {
-				printf("Size must be greater than 0\n");
-				return KSFT_FAIL;
-			}
-		case 'M':
-			break;
-		case 'U':
-			test_name = CHECK_KSM_UNMERGE;
-			break;
-		case 'Z':
-			test_name = CHECK_KSM_ZERO_PAGE_MERGE;
-			break;
-		case 'N':
-			test_name = CHECK_KSM_NUMA_MERGE;
-			break;
-		case 'P':
-			test_name = KSM_MERGE_TIME;
-			break;
-		case 'H':
-			test_name = KSM_MERGE_TIME_HUGE_PAGES;
-			break;
-		case 'D':
-			test_name = KSM_UNMERGE_TIME;
-			break;
-		case 'C':
-			test_name = KSM_COW_TIME;
-			break;
-		default:
-			return KSFT_FAIL;
-		}
-	}
-
-	if (prot == 0)
-		prot = str_to_prot(KSM_PROT_STR_DEFAULT);
-
-	if (access(KSM_SYSFS_PATH, F_OK)) {
-		printf("Config KSM not enabled\n");
-		return KSFT_SKIP;
-	}
-
-	if (ksm_save_def(&ksm_sysfs_old)) {
-		printf("Cannot save default tunables\n");
-		return KSFT_FAIL;
-	}
-
-	if (ksm_write_sysfs(KSM_FP("run"), 2) ||
-	    ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) ||
-	    numa_available() ? 0 :
-		ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) ||
-	    ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count))
-		return KSFT_FAIL;
-
-	switch (test_name) {
-	case CHECK_KSM_MERGE:
-		ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
-				      ksm_scan_limit_sec, page_size);
-		break;
-	case CHECK_KSM_UNMERGE:
-		ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-					page_size);
-		break;
-	case CHECK_KSM_ZERO_PAGE_MERGE:
-		ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
-						ksm_scan_limit_sec, use_zero_pages, page_size);
-		break;
-	case CHECK_KSM_NUMA_MERGE:
-		ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-					   merge_across_nodes, page_size);
-		break;
-	case KSM_MERGE_TIME:
-		if (size_MB == 0) {
-			printf("Option '-s' is required.\n");
-			return KSFT_FAIL;
-		}
-		ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-				     size_MB);
-		break;
-	case KSM_MERGE_TIME_HUGE_PAGES:
-		if (size_MB == 0) {
-			printf("Option '-s' is required.\n");
-			return KSFT_FAIL;
-		}
-		ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
-				ksm_scan_limit_sec, size_MB);
-		break;
-	case KSM_UNMERGE_TIME:
-		if (size_MB == 0) {
-			printf("Option '-s' is required.\n");
-			return KSFT_FAIL;
-		}
-		ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
-				       ksm_scan_limit_sec, size_MB);
-		break;
-	case KSM_COW_TIME:
-		ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-				   page_size);
-		break;
-	}
-
-	if (ksm_restore(&ksm_sysfs_old)) {
-		printf("Cannot restore default tunables\n");
-		return KSFT_FAIL;
-	}
-
-	return ret;
-}
diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c
deleted file mode 100644
index 262eae6b58f2..000000000000
--- a/tools/testing/selftests/vm/madv_populate.c
+++ /dev/null
@@ -1,296 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
- *
- * Copyright 2021, Red Hat, Inc.
- *
- * Author(s): David Hildenbrand <david@redhat.com>
- */
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <linux/mman.h>
-#include <sys/mman.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#ifndef MADV_POPULATE_READ
-#define MADV_POPULATE_READ	22
-#endif /* MADV_POPULATE_READ */
-#ifndef MADV_POPULATE_WRITE
-#define MADV_POPULATE_WRITE	23
-#endif /* MADV_POPULATE_WRITE */
-
-/*
- * For now, we're using 2 MiB of private anonymous memory for all tests.
- */
-#define SIZE (2 * 1024 * 1024)
-
-static size_t pagesize;
-
-static void sense_support(void)
-{
-	char *addr;
-	int ret;
-
-	addr = mmap(0, pagesize, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (!addr)
-		ksft_exit_fail_msg("mmap failed\n");
-
-	ret = madvise(addr, pagesize, MADV_POPULATE_READ);
-	if (ret)
-		ksft_exit_skip("MADV_POPULATE_READ is not available\n");
-
-	ret = madvise(addr, pagesize, MADV_POPULATE_WRITE);
-	if (ret)
-		ksft_exit_skip("MADV_POPULATE_WRITE is not available\n");
-
-	munmap(addr, pagesize);
-}
-
-static void test_prot_read(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(ret == -1 && errno == EINVAL,
-			 "MADV_POPULATE_WRITE with PROT_READ\n");
-
-	munmap(addr, SIZE);
-}
-
-static void test_prot_write(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(ret == -1 && errno == EINVAL,
-			 "MADV_POPULATE_READ with PROT_WRITE\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n");
-
-	munmap(addr, SIZE);
-}
-
-static void test_holes(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-	ret = munmap(addr + pagesize, pagesize);
-	if (ret)
-		ksft_exit_fail_msg("munmap failed\n");
-
-	/* Hole in the middle */
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_READ with holes in the middle\n");
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_WRITE with holes in the middle\n");
-
-	/* Hole at end */
-	ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_READ with holes at the end\n");
-	ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_WRITE with holes at the end\n");
-
-	/* Hole at beginning */
-	ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_READ with holes at the beginning\n");
-	ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE);
-	ksft_test_result(ret == -1 && errno == ENOMEM,
-			 "MADV_POPULATE_WRITE with holes at the beginning\n");
-
-	munmap(addr, SIZE);
-}
-
-static bool range_is_populated(char *start, ssize_t size)
-{
-	int fd = open("/proc/self/pagemap", O_RDONLY);
-	bool ret = true;
-
-	if (fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-	for (; size > 0 && ret; size -= pagesize, start += pagesize)
-		if (!pagemap_is_populated(fd, start))
-			ret = false;
-	close(fd);
-	return ret;
-}
-
-static bool range_is_not_populated(char *start, ssize_t size)
-{
-	int fd = open("/proc/self/pagemap", O_RDONLY);
-	bool ret = true;
-
-	if (fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-	for (; size > 0 && ret; size -= pagesize, start += pagesize)
-		if (pagemap_is_populated(fd, start))
-			ret = false;
-	close(fd);
-	return ret;
-}
-
-static void test_populate_read(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-	ksft_test_result(range_is_not_populated(addr, SIZE),
-			 "range initially not populated\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(!ret, "MADV_POPULATE_READ\n");
-	ksft_test_result(range_is_populated(addr, SIZE),
-			 "range is populated\n");
-
-	munmap(addr, SIZE);
-}
-
-static void test_populate_write(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-	ksft_test_result(range_is_not_populated(addr, SIZE),
-			 "range initially not populated\n");
-
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
-	ksft_test_result(range_is_populated(addr, SIZE),
-			 "range is populated\n");
-
-	munmap(addr, SIZE);
-}
-
-static bool range_is_softdirty(char *start, ssize_t size)
-{
-	int fd = open("/proc/self/pagemap", O_RDONLY);
-	bool ret = true;
-
-	if (fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-	for (; size > 0 && ret; size -= pagesize, start += pagesize)
-		if (!pagemap_is_softdirty(fd, start))
-			ret = false;
-	close(fd);
-	return ret;
-}
-
-static bool range_is_not_softdirty(char *start, ssize_t size)
-{
-	int fd = open("/proc/self/pagemap", O_RDONLY);
-	bool ret = true;
-
-	if (fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-	for (; size > 0 && ret; size -= pagesize, start += pagesize)
-		if (pagemap_is_softdirty(fd, start))
-			ret = false;
-	close(fd);
-	return ret;
-}
-
-static void test_softdirty(void)
-{
-	char *addr;
-	int ret;
-
-	ksft_print_msg("[RUN] %s\n", __func__);
-
-	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-	if (addr == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed\n");
-
-	/* Clear any softdirty bits. */
-	clear_softdirty();
-	ksft_test_result(range_is_not_softdirty(addr, SIZE),
-			 "range is not softdirty\n");
-
-	/* Populating READ should set softdirty. */
-	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-	ksft_test_result(!ret, "MADV_POPULATE_READ\n");
-	ksft_test_result(range_is_not_softdirty(addr, SIZE),
-			 "range is not softdirty\n");
-
-	/* Populating WRITE should set softdirty. */
-	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-	ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
-	ksft_test_result(range_is_softdirty(addr, SIZE),
-			 "range is softdirty\n");
-
-	munmap(addr, SIZE);
-}
-
-int main(int argc, char **argv)
-{
-	int err;
-
-	pagesize = getpagesize();
-
-	ksft_print_header();
-	ksft_set_plan(21);
-
-	sense_support();
-	test_prot_read();
-	test_prot_write();
-	test_holes();
-	test_populate_read();
-	test_populate_write();
-	test_softdirty();
-
-	err = ksft_get_fail_cnt();
-	if (err)
-		ksft_exit_fail_msg("%d out of %d tests failed\n",
-				   err, ksft_test_num());
-	return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c
deleted file mode 100644
index eed44322d1a6..000000000000
--- a/tools/testing/selftests/vm/map_fixed_noreplace.c
+++ /dev/null
@@ -1,231 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Test that MAP_FIXED_NOREPLACE works.
- *
- * Copyright 2018, Jann Horn <jannh@google.com>
- * Copyright 2018, Michael Ellerman, IBM Corporation.
- */
-
-#include <sys/mman.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#ifndef MAP_FIXED_NOREPLACE
-#define MAP_FIXED_NOREPLACE 0x100000
-#endif
-
-static void dump_maps(void)
-{
-	char cmd[32];
-
-	snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
-	system(cmd);
-}
-
-static unsigned long find_base_addr(unsigned long size)
-{
-	void *addr;
-	unsigned long flags;
-
-	flags = MAP_PRIVATE | MAP_ANONYMOUS;
-	addr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
-	if (addr == MAP_FAILED) {
-		printf("Error: couldn't map the space we need for the test\n");
-		return 0;
-	}
-
-	if (munmap(addr, size) != 0) {
-		printf("Error: couldn't map the space we need for the test\n");
-		return 0;
-	}
-	return (unsigned long)addr;
-}
-
-int main(void)
-{
-	unsigned long base_addr;
-	unsigned long flags, addr, size, page_size;
-	char *p;
-
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	//let's find a base addr that is free before we start the tests
-	size = 5 * page_size;
-	base_addr = find_base_addr(size);
-	if (!base_addr) {
-		printf("Error: couldn't map the space we need for the test\n");
-		return 1;
-	}
-
-	flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
-
-	// Check we can map all the areas we need below
-	errno = 0;
-	addr = base_addr;
-	size = 5 * page_size;
-	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-	if (p == MAP_FAILED) {
-		dump_maps();
-		printf("Error: couldn't map the space we need for the test\n");
-		return 1;
-	}
-
-	errno = 0;
-	if (munmap((void *)addr, 5 * page_size) != 0) {
-		dump_maps();
-		printf("Error: munmap failed!?\n");
-		return 1;
-	}
-	printf("unmap() successful\n");
-
-	errno = 0;
-	addr = base_addr + page_size;
-	size = 3 * page_size;
-	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-	if (p == MAP_FAILED) {
-		dump_maps();
-		printf("Error: first mmap() failed unexpectedly\n");
-		return 1;
-	}
-
-	/*
-	 * Exact same mapping again:
-	 *   base |  free  | new
-	 *     +1 | mapped | new
-	 *     +2 | mapped | new
-	 *     +3 | mapped | new
-	 *     +4 |  free  | new
-	 */
-	errno = 0;
-	addr = base_addr;
-	size = 5 * page_size;
-	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-	if (p != MAP_FAILED) {
-		dump_maps();
-		printf("Error:1: mmap() succeeded when it shouldn't have\n");
-		return 1;
-	}
-
-	/*
-	 * Second mapping contained within first:
-	 *
-	 *   base |  free  |
-	 *     +1 | mapped |
-	 *     +2 | mapped | new
-	 *     +3 | mapped |
-	 *     +4 |  free  |
-	 */
-	errno = 0;
-	addr = base_addr + (2 * page_size);
-	size = page_size;
-	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-	if (p != MAP_FAILED) {
-		dump_maps();
-		printf("Error:2: mmap() succeeded when it shouldn't have\n");
-		return 1;
-	}
-
-	/*
-	 * Overlap end of existing mapping:
-	 *   base |  free  |
-	 *     +1 | mapped |
-	 *     +2 | mapped |
-	 *     +3 | mapped | new
-	 *     +4 |  free  | new
-	 */
-	errno = 0;
-	addr = base_addr + (3 * page_size);
-	size = 2 * page_size;
-	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-	if (p != MAP_FAILED) {
-		dump_maps();
-		printf("Error:3: mmap() succeeded when it shouldn't have\n");
-		return 1;
-	}
-
-	/*
-	 * Overlap start of existing mapping:
-	 *   base |  free  | new
-	 *     +1 | mapped | new
-	 *     +2 | mapped |
-	 *     +3 | mapped |
-	 *     +4 |  free  |
-	 */
-	errno = 0;
-	addr = base_addr;
-	size = 2 * page_size;
-	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-	if (p != MAP_FAILED) {
-		dump_maps();
-		printf("Error:4: mmap() succeeded when it shouldn't have\n");
-		return 1;
-	}
-
-	/*
-	 * Adjacent to start of existing mapping:
-	 *   base |  free  | new
-	 *     +1 | mapped |
-	 *     +2 | mapped |
-	 *     +3 | mapped |
-	 *     +4 |  free  |
-	 */
-	errno = 0;
-	addr = base_addr;
-	size = page_size;
-	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-	if (p == MAP_FAILED) {
-		dump_maps();
-		printf("Error:5: mmap() failed when it shouldn't have\n");
-		return 1;
-	}
-
-	/*
-	 * Adjacent to end of existing mapping:
-	 *   base |  free  |
-	 *     +1 | mapped |
-	 *     +2 | mapped |
-	 *     +3 | mapped |
-	 *     +4 |  free  |  new
-	 */
-	errno = 0;
-	addr = base_addr + (4 * page_size);
-	size = page_size;
-	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-	if (p == MAP_FAILED) {
-		dump_maps();
-		printf("Error:6: mmap() failed when it shouldn't have\n");
-		return 1;
-	}
-
-	addr = base_addr;
-	size = 5 * page_size;
-	if (munmap((void *)addr, size) != 0) {
-		dump_maps();
-		printf("Error: munmap failed!?\n");
-		return 1;
-	}
-	printf("unmap() successful\n");
-
-	printf("OK\n");
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c
deleted file mode 100644
index 312889edb84a..000000000000
--- a/tools/testing/selftests/vm/map_hugetlb.c
+++ /dev/null
@@ -1,109 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Example of using hugepage memory in a user application using the mmap
- * system call with MAP_HUGETLB flag.  Before running this program make
- * sure the administrator has allocated enough default sized huge pages
- * to cover the 256 MB allocation.
- *
- * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
- * That means the addresses starting with 0x800000... will need to be
- * specified.  Specifying a fixed address is not required on ppc64, i386
- * or x86_64.
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-
-#define LENGTH (256UL*1024*1024)
-#define PROTECTION (PROT_READ | PROT_WRITE)
-
-#ifndef MAP_HUGETLB
-#define MAP_HUGETLB 0x40000 /* arch specific */
-#endif
-
-#ifndef MAP_HUGE_SHIFT
-#define MAP_HUGE_SHIFT 26
-#endif
-
-#ifndef MAP_HUGE_MASK
-#define MAP_HUGE_MASK 0x3f
-#endif
-
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
-#else
-#define ADDR (void *)(0x0UL)
-#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
-#endif
-
-static void check_bytes(char *addr)
-{
-	printf("First hex is %x\n", *((unsigned int *)addr));
-}
-
-static void write_bytes(char *addr, size_t length)
-{
-	unsigned long i;
-
-	for (i = 0; i < length; i++)
-		*(addr + i) = (char)i;
-}
-
-static int read_bytes(char *addr, size_t length)
-{
-	unsigned long i;
-
-	check_bytes(addr);
-	for (i = 0; i < length; i++)
-		if (*(addr + i) != (char)i) {
-			printf("Mismatch at %lu\n", i);
-			return 1;
-		}
-	return 0;
-}
-
-int main(int argc, char **argv)
-{
-	void *addr;
-	int ret;
-	size_t length = LENGTH;
-	int flags = FLAGS;
-	int shift = 0;
-
-	if (argc > 1)
-		length = atol(argv[1]) << 20;
-	if (argc > 2) {
-		shift = atoi(argv[2]);
-		if (shift)
-			flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
-	}
-
-	if (shift)
-		printf("%u kB hugepages\n", 1 << (shift - 10));
-	else
-		printf("Default size hugepages\n");
-	printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
-
-	addr = mmap(ADDR, length, PROTECTION, flags, -1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-
-	printf("Returned address is %p\n", addr);
-	check_bytes(addr);
-	write_bytes(addr, length);
-	ret = read_bytes(addr, length);
-
-	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
-	if (munmap(addr, length)) {
-		perror("munmap");
-		exit(1);
-	}
-
-	return ret;
-}
diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c
deleted file mode 100644
index 6b8aeaa0bf7a..000000000000
--- a/tools/testing/selftests/vm/map_populate.c
+++ /dev/null
@@ -1,113 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2018 Dmitry Safonov, Arista Networks
- *
- * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
- */
-
-#define _GNU_SOURCE
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#ifndef MMAP_SZ
-#define MMAP_SZ		4096
-#endif
-
-#define BUG_ON(condition, description)					\
-	do {								\
-		if (condition) {					\
-			fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
-				__LINE__, (description), strerror(errno)); \
-			exit(1);					\
-		}							\
-	} while (0)
-
-static int parent_f(int sock, unsigned long *smap, int child)
-{
-	int status, ret;
-
-	ret = read(sock, &status, sizeof(int));
-	BUG_ON(ret <= 0, "read(sock)");
-
-	*smap = 0x22222BAD;
-	ret = msync(smap, MMAP_SZ, MS_SYNC);
-	BUG_ON(ret, "msync()");
-
-	ret = write(sock, &status, sizeof(int));
-	BUG_ON(ret <= 0, "write(sock)");
-
-	waitpid(child, &status, 0);
-	BUG_ON(!WIFEXITED(status), "child in unexpected state");
-
-	return WEXITSTATUS(status);
-}
-
-static int child_f(int sock, unsigned long *smap, int fd)
-{
-	int ret, buf = 0;
-
-	smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
-			MAP_PRIVATE | MAP_POPULATE, fd, 0);
-	BUG_ON(smap == MAP_FAILED, "mmap()");
-
-	BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
-
-	ret = write(sock, &buf, sizeof(int));
-	BUG_ON(ret <= 0, "write(sock)");
-
-	ret = read(sock, &buf, sizeof(int));
-	BUG_ON(ret <= 0, "read(sock)");
-
-	BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
-	BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
-
-	return 0;
-}
-
-int main(int argc, char **argv)
-{
-	int sock[2], child, ret;
-	FILE *ftmp;
-	unsigned long *smap;
-
-	ftmp = tmpfile();
-	BUG_ON(ftmp == 0, "tmpfile()");
-
-	ret = ftruncate(fileno(ftmp), MMAP_SZ);
-	BUG_ON(ret, "ftruncate()");
-
-	smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
-			MAP_SHARED, fileno(ftmp), 0);
-	BUG_ON(smap == MAP_FAILED, "mmap()");
-
-	*smap = 0xdeadbabe;
-	/* Probably unnecessary, but let it be. */
-	ret = msync(smap, MMAP_SZ, MS_SYNC);
-	BUG_ON(ret, "msync()");
-
-	ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
-	BUG_ON(ret, "socketpair()");
-
-	child = fork();
-	BUG_ON(child == -1, "fork()");
-
-	if (child) {
-		ret = close(sock[0]);
-		BUG_ON(ret, "close()");
-
-		return parent_f(sock[1], smap, child);
-	}
-
-	ret = close(sock[1]);
-	BUG_ON(ret, "close()");
-
-	return child_f(sock[0], smap, fileno(ftmp));
-}
diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c
deleted file mode 100644
index 957b9e18c729..000000000000
--- a/tools/testing/selftests/vm/memfd_secret.c
+++ /dev/null
@@ -1,296 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright IBM Corporation, 2021
- *
- * Author: Mike Rapoport <rppt@linux.ibm.com>
- */
-
-#define _GNU_SOURCE
-#include <sys/uio.h>
-#include <sys/mman.h>
-#include <sys/wait.h>
-#include <sys/types.h>
-#include <sys/ptrace.h>
-#include <sys/syscall.h>
-#include <sys/resource.h>
-#include <sys/capability.h>
-
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdio.h>
-
-#include "../kselftest.h"
-
-#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__)
-#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__)
-#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__)
-
-#ifdef __NR_memfd_secret
-
-#define PATTERN	0x55
-
-static const int prot = PROT_READ | PROT_WRITE;
-static const int mode = MAP_SHARED;
-
-static unsigned long page_size;
-static unsigned long mlock_limit_cur;
-static unsigned long mlock_limit_max;
-
-static int memfd_secret(unsigned int flags)
-{
-	return syscall(__NR_memfd_secret, flags);
-}
-
-static void test_file_apis(int fd)
-{
-	char buf[64];
-
-	if ((read(fd, buf, sizeof(buf)) >= 0) ||
-	    (write(fd, buf, sizeof(buf)) >= 0) ||
-	    (pread(fd, buf, sizeof(buf), 0) >= 0) ||
-	    (pwrite(fd, buf, sizeof(buf), 0) >= 0))
-		fail("unexpected file IO\n");
-	else
-		pass("file IO is blocked as expected\n");
-}
-
-static void test_mlock_limit(int fd)
-{
-	size_t len;
-	char *mem;
-
-	len = mlock_limit_cur;
-	mem = mmap(NULL, len, prot, mode, fd, 0);
-	if (mem == MAP_FAILED) {
-		fail("unable to mmap secret memory\n");
-		return;
-	}
-	munmap(mem, len);
-
-	len = mlock_limit_max * 2;
-	mem = mmap(NULL, len, prot, mode, fd, 0);
-	if (mem != MAP_FAILED) {
-		fail("unexpected mlock limit violation\n");
-		munmap(mem, len);
-		return;
-	}
-
-	pass("mlock limit is respected\n");
-}
-
-static void try_process_vm_read(int fd, int pipefd[2])
-{
-	struct iovec liov, riov;
-	char buf[64];
-	char *mem;
-
-	if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
-		fail("pipe write: %s\n", strerror(errno));
-		exit(KSFT_FAIL);
-	}
-
-	liov.iov_len = riov.iov_len = sizeof(buf);
-	liov.iov_base = buf;
-	riov.iov_base = mem;
-
-	if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) {
-		if (errno == ENOSYS)
-			exit(KSFT_SKIP);
-		exit(KSFT_PASS);
-	}
-
-	exit(KSFT_FAIL);
-}
-
-static void try_ptrace(int fd, int pipefd[2])
-{
-	pid_t ppid = getppid();
-	int status;
-	char *mem;
-	long ret;
-
-	if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
-		perror("pipe write");
-		exit(KSFT_FAIL);
-	}
-
-	ret = ptrace(PTRACE_ATTACH, ppid, 0, 0);
-	if (ret) {
-		perror("ptrace_attach");
-		exit(KSFT_FAIL);
-	}
-
-	ret = waitpid(ppid, &status, WUNTRACED);
-	if ((ret != ppid) || !(WIFSTOPPED(status))) {
-		fprintf(stderr, "weird waitppid result %ld stat %x\n",
-			ret, status);
-		exit(KSFT_FAIL);
-	}
-
-	if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0))
-		exit(KSFT_PASS);
-
-	exit(KSFT_FAIL);
-}
-
-static void check_child_status(pid_t pid, const char *name)
-{
-	int status;
-
-	waitpid(pid, &status, 0);
-
-	if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) {
-		skip("%s is not supported\n", name);
-		return;
-	}
-
-	if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) ||
-	    WIFSIGNALED(status)) {
-		pass("%s is blocked as expected\n", name);
-		return;
-	}
-
-	fail("%s: unexpected memory access\n", name);
-}
-
-static void test_remote_access(int fd, const char *name,
-			       void (*func)(int fd, int pipefd[2]))
-{
-	int pipefd[2];
-	pid_t pid;
-	char *mem;
-
-	if (pipe(pipefd)) {
-		fail("pipe failed: %s\n", strerror(errno));
-		return;
-	}
-
-	pid = fork();
-	if (pid < 0) {
-		fail("fork failed: %s\n", strerror(errno));
-		return;
-	}
-
-	if (pid == 0) {
-		func(fd, pipefd);
-		return;
-	}
-
-	mem = mmap(NULL, page_size, prot, mode, fd, 0);
-	if (mem == MAP_FAILED) {
-		fail("Unable to mmap secret memory\n");
-		return;
-	}
-
-	ftruncate(fd, page_size);
-	memset(mem, PATTERN, page_size);
-
-	if (write(pipefd[1], &mem, sizeof(mem)) < 0) {
-		fail("pipe write: %s\n", strerror(errno));
-		return;
-	}
-
-	check_child_status(pid, name);
-}
-
-static void test_process_vm_read(int fd)
-{
-	test_remote_access(fd, "process_vm_read", try_process_vm_read);
-}
-
-static void test_ptrace(int fd)
-{
-	test_remote_access(fd, "ptrace", try_ptrace);
-}
-
-static int set_cap_limits(rlim_t max)
-{
-	struct rlimit new;
-	cap_t cap = cap_init();
-
-	new.rlim_cur = max;
-	new.rlim_max = max;
-	if (setrlimit(RLIMIT_MEMLOCK, &new)) {
-		perror("setrlimit() returns error");
-		return -1;
-	}
-
-	/* drop capabilities including CAP_IPC_LOCK */
-	if (cap_set_proc(cap)) {
-		perror("cap_set_proc() returns error");
-		return -2;
-	}
-
-	return 0;
-}
-
-static void prepare(void)
-{
-	struct rlimit rlim;
-
-	page_size = sysconf(_SC_PAGE_SIZE);
-	if (!page_size)
-		ksft_exit_fail_msg("Failed to get page size %s\n",
-				   strerror(errno));
-
-	if (getrlimit(RLIMIT_MEMLOCK, &rlim))
-		ksft_exit_fail_msg("Unable to detect mlock limit: %s\n",
-				   strerror(errno));
-
-	mlock_limit_cur = rlim.rlim_cur;
-	mlock_limit_max = rlim.rlim_max;
-
-	printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n",
-	       page_size, mlock_limit_cur, mlock_limit_max);
-
-	if (page_size > mlock_limit_cur)
-		mlock_limit_cur = page_size;
-	if (page_size > mlock_limit_max)
-		mlock_limit_max = page_size;
-
-	if (set_cap_limits(mlock_limit_max))
-		ksft_exit_fail_msg("Unable to set mlock limit: %s\n",
-				   strerror(errno));
-}
-
-#define NUM_TESTS 4
-
-int main(int argc, char *argv[])
-{
-	int fd;
-
-	prepare();
-
-	ksft_print_header();
-	ksft_set_plan(NUM_TESTS);
-
-	fd = memfd_secret(0);
-	if (fd < 0) {
-		if (errno == ENOSYS)
-			ksft_exit_skip("memfd_secret is not supported\n");
-		else
-			ksft_exit_fail_msg("memfd_secret failed: %s\n",
-					   strerror(errno));
-	}
-
-	test_mlock_limit(fd);
-	test_file_apis(fd);
-	test_process_vm_read(fd);
-	test_ptrace(fd);
-
-	close(fd);
-
-	ksft_finished();
-}
-
-#else /* __NR_memfd_secret */
-
-int main(int argc, char *argv[])
-{
-	printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n");
-	return KSFT_SKIP;
-}
-
-#endif /* __NR_memfd_secret */
diff --git a/tools/testing/selftests/vm/migration.c b/tools/testing/selftests/vm/migration.c
deleted file mode 100644
index 1cec8425e3ca..000000000000
--- a/tools/testing/selftests/vm/migration.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * The main purpose of the tests here is to exercise the migration entry code
- * paths in the kernel.
- */
-
-#include "../kselftest_harness.h"
-#include <strings.h>
-#include <pthread.h>
-#include <numa.h>
-#include <numaif.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <signal.h>
-#include <time.h>
-
-#define TWOMEG (2<<20)
-#define RUNTIME (60)
-
-#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
-
-FIXTURE(migration)
-{
-	pthread_t *threads;
-	pid_t *pids;
-	int nthreads;
-	int n1;
-	int n2;
-};
-
-FIXTURE_SETUP(migration)
-{
-	int n;
-
-	ASSERT_EQ(numa_available(), 0);
-	self->nthreads = numa_num_task_cpus() - 1;
-	self->n1 = -1;
-	self->n2 = -1;
-
-	for (n = 0; n < numa_max_possible_node(); n++)
-		if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) {
-			if (self->n1 == -1) {
-				self->n1 = n;
-			} else {
-				self->n2 = n;
-				break;
-			}
-		}
-
-	self->threads = malloc(self->nthreads * sizeof(*self->threads));
-	ASSERT_NE(self->threads, NULL);
-	self->pids = malloc(self->nthreads * sizeof(*self->pids));
-	ASSERT_NE(self->pids, NULL);
-};
-
-FIXTURE_TEARDOWN(migration)
-{
-	free(self->threads);
-	free(self->pids);
-}
-
-int migrate(uint64_t *ptr, int n1, int n2)
-{
-	int ret, tmp;
-	int status = 0;
-	struct timespec ts1, ts2;
-
-	if (clock_gettime(CLOCK_MONOTONIC, &ts1))
-		return -1;
-
-	while (1) {
-		if (clock_gettime(CLOCK_MONOTONIC, &ts2))
-			return -1;
-
-		if (ts2.tv_sec - ts1.tv_sec >= RUNTIME)
-			return 0;
-
-		ret = move_pages(0, 1, (void **) &ptr, &n2, &status,
-				MPOL_MF_MOVE_ALL);
-		if (ret) {
-			if (ret > 0)
-				printf("Didn't migrate %d pages\n", ret);
-			else
-				perror("Couldn't migrate pages");
-			return -2;
-		}
-
-		tmp = n2;
-		n2 = n1;
-		n1 = tmp;
-	}
-
-	return 0;
-}
-
-void *access_mem(void *ptr)
-{
-	uint64_t y = 0;
-	volatile uint64_t *x = ptr;
-
-	while (1) {
-		pthread_testcancel();
-		y += *x;
-	}
-
-	return NULL;
-}
-
-/*
- * Basic migration entry testing. One thread will move pages back and forth
- * between nodes whilst other threads try and access them triggering the
- * migration entry wait paths in the kernel.
- */
-TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME)
-{
-	uint64_t *ptr;
-	int i;
-
-	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
-		SKIP(return, "Not enough threads or NUMA nodes available");
-
-	ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
-		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	ASSERT_NE(ptr, MAP_FAILED);
-
-	memset(ptr, 0xde, TWOMEG);
-	for (i = 0; i < self->nthreads - 1; i++)
-		if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
-			perror("Couldn't create thread");
-
-	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
-	for (i = 0; i < self->nthreads - 1; i++)
-		ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
-}
-
-/*
- * Same as the previous test but with shared memory.
- */
-TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME)
-{
-	pid_t pid;
-	uint64_t *ptr;
-	int i;
-
-	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
-		SKIP(return, "Not enough threads or NUMA nodes available");
-
-	ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
-		MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-	ASSERT_NE(ptr, MAP_FAILED);
-
-	memset(ptr, 0xde, TWOMEG);
-	for (i = 0; i < self->nthreads - 1; i++) {
-		pid = fork();
-		if (!pid)
-			access_mem(ptr);
-		else
-			self->pids[i] = pid;
-	}
-
-	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
-	for (i = 0; i < self->nthreads - 1; i++)
-		ASSERT_EQ(kill(self->pids[i], SIGTERM), 0);
-}
-
-/*
- * Tests the pmd migration entry paths.
- */
-TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME)
-{
-	uint64_t *ptr;
-	int i;
-
-	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
-		SKIP(return, "Not enough threads or NUMA nodes available");
-
-	ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE,
-		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	ASSERT_NE(ptr, MAP_FAILED);
-
-	ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG);
-	ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0);
-	memset(ptr, 0xde, TWOMEG);
-	for (i = 0; i < self->nthreads - 1; i++)
-		if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
-			perror("Couldn't create thread");
-
-	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
-	for (i = 0; i < self->nthreads - 1; i++)
-		ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
-}
-
-TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c
deleted file mode 100644
index 782ea94dee2f..000000000000
--- a/tools/testing/selftests/vm/mlock-random-test.c
+++ /dev/null
@@ -1,294 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * It tests the mlock/mlock2() when they are invoked
- * on randomly memory region.
- */
-#include <unistd.h>
-#include <sys/resource.h>
-#include <sys/capability.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <time.h>
-#include "mlock2.h"
-
-#define CHUNK_UNIT (128 * 1024)
-#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
-#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
-#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
-
-#define TEST_LOOP 100
-#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
-
-int set_cap_limits(rlim_t max)
-{
-	struct rlimit new;
-	cap_t cap = cap_init();
-
-	new.rlim_cur = max;
-	new.rlim_max = max;
-	if (setrlimit(RLIMIT_MEMLOCK, &new)) {
-		perror("setrlimit() returns error\n");
-		return -1;
-	}
-
-	/* drop capabilities including CAP_IPC_LOCK */
-	if (cap_set_proc(cap)) {
-		perror("cap_set_proc() returns error\n");
-		return -2;
-	}
-
-	return 0;
-}
-
-int get_proc_locked_vm_size(void)
-{
-	FILE *f;
-	int ret = -1;
-	char line[1024] = {0};
-	unsigned long lock_size = 0;
-
-	f = fopen("/proc/self/status", "r");
-	if (!f) {
-		perror("fopen");
-		return -1;
-	}
-
-	while (fgets(line, 1024, f)) {
-		if (strstr(line, "VmLck")) {
-			ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
-			if (ret <= 0) {
-				printf("sscanf() on VmLck error: %s: %d\n",
-						line, ret);
-				fclose(f);
-				return -1;
-			}
-			fclose(f);
-			return (int)(lock_size << 10);
-		}
-	}
-
-	perror("cannot parse VmLck in /proc/self/status\n");
-	fclose(f);
-	return -1;
-}
-
-/*
- * Get the MMUPageSize of the memory region including input
- * address from proc file.
- *
- * return value: on error case, 0 will be returned.
- * Otherwise the page size(in bytes) is returned.
- */
-int get_proc_page_size(unsigned long addr)
-{
-	FILE *smaps;
-	char *line;
-	unsigned long mmupage_size = 0;
-	size_t size;
-
-	smaps = seek_to_smaps_entry(addr);
-	if (!smaps) {
-		printf("Unable to parse /proc/self/smaps\n");
-		return 0;
-	}
-
-	while (getline(&line, &size, smaps) > 0) {
-		if (!strstr(line, "MMUPageSize")) {
-			free(line);
-			line = NULL;
-			size = 0;
-			continue;
-		}
-
-		/* found the MMUPageSize of this section */
-		if (sscanf(line, "MMUPageSize:    %8lu kB",
-					&mmupage_size) < 1) {
-			printf("Unable to parse smaps entry for Size:%s\n",
-					line);
-			break;
-		}
-
-	}
-	free(line);
-	if (smaps)
-		fclose(smaps);
-	return mmupage_size << 10;
-}
-
-/*
- * Test mlock/mlock2() on provided memory chunk.
- * It expects the mlock/mlock2() to be successful (within rlimit)
- *
- * With allocated memory chunk [p, p + alloc_size), this
- * test will choose start/len randomly to perform mlock/mlock2
- * [start, start +  len] memory range. The range is within range
- * of the allocated chunk.
- *
- * The memory region size alloc_size is within the rlimit.
- * So we always expect a success of mlock/mlock2.
- *
- * VmLck is assumed to be 0 before this test.
- *
- *    return value: 0 - success
- *    else: failure
- */
-int test_mlock_within_limit(char *p, int alloc_size)
-{
-	int i;
-	int ret = 0;
-	int locked_vm_size = 0;
-	struct rlimit cur;
-	int page_size = 0;
-
-	getrlimit(RLIMIT_MEMLOCK, &cur);
-	if (cur.rlim_cur < alloc_size) {
-		printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
-				alloc_size, (unsigned int)cur.rlim_cur);
-		return -1;
-	}
-
-	srand(time(NULL));
-	for (i = 0; i < TEST_LOOP; i++) {
-		/*
-		 * - choose mlock/mlock2 randomly
-		 * - choose lock_size randomly but lock_size < alloc_size
-		 * - choose start_offset randomly but p+start_offset+lock_size
-		 *   < p+alloc_size
-		 */
-		int is_mlock = !!(rand() % 2);
-		int lock_size = rand() % alloc_size;
-		int start_offset = rand() % (alloc_size - lock_size);
-
-		if (is_mlock)
-			ret = mlock(p + start_offset, lock_size);
-		else
-			ret = mlock2_(p + start_offset, lock_size,
-				       MLOCK_ONFAULT);
-
-		if (ret) {
-			printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
-					is_mlock ? "mlock" : "mlock2",
-					p, alloc_size,
-					p + start_offset, lock_size);
-			return ret;
-		}
-	}
-
-	/*
-	 * Check VmLck left by the tests.
-	 */
-	locked_vm_size = get_proc_locked_vm_size();
-	page_size = get_proc_page_size((unsigned long)p);
-	if (page_size == 0) {
-		printf("cannot get proc MMUPageSize\n");
-		return -1;
-	}
-
-	if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
-		printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
-				locked_vm_size, alloc_size);
-		return -1;
-	}
-
-	return 0;
-}
-
-
-/*
- * We expect the mlock/mlock2() to be fail (outof limitation)
- *
- * With allocated memory chunk [p, p + alloc_size), this
- * test will randomly choose start/len and perform mlock/mlock2
- * on [start, start+len] range.
- *
- * The memory region size alloc_size is above the rlimit.
- * And the len to be locked is higher than rlimit.
- * So we always expect a failure of mlock/mlock2.
- * No locked page number should be increased as a side effect.
- *
- *    return value: 0 - success
- *    else: failure
- */
-int test_mlock_outof_limit(char *p, int alloc_size)
-{
-	int i;
-	int ret = 0;
-	int locked_vm_size = 0, old_locked_vm_size = 0;
-	struct rlimit cur;
-
-	getrlimit(RLIMIT_MEMLOCK, &cur);
-	if (cur.rlim_cur >= alloc_size) {
-		printf("alloc_size[%d] >%u rlimit, violates test condition\n",
-				alloc_size, (unsigned int)cur.rlim_cur);
-		return -1;
-	}
-
-	old_locked_vm_size = get_proc_locked_vm_size();
-	srand(time(NULL));
-	for (i = 0; i < TEST_LOOP; i++) {
-		int is_mlock = !!(rand() % 2);
-		int lock_size = (rand() % (alloc_size - cur.rlim_cur))
-			+ cur.rlim_cur;
-		int start_offset = rand() % (alloc_size - lock_size);
-
-		if (is_mlock)
-			ret = mlock(p + start_offset, lock_size);
-		else
-			ret = mlock2_(p + start_offset, lock_size,
-					MLOCK_ONFAULT);
-		if (ret == 0) {
-			printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
-					is_mlock ? "mlock" : "mlock2",
-					p, alloc_size,
-					p + start_offset, lock_size);
-			return -1;
-		}
-	}
-
-	locked_vm_size = get_proc_locked_vm_size();
-	if (locked_vm_size != old_locked_vm_size) {
-		printf("tests leads to new mlocked page: old[%d], new[%d]\n",
-				old_locked_vm_size,
-				locked_vm_size);
-		return -1;
-	}
-
-	return 0;
-}
-
-int main(int argc, char **argv)
-{
-	char *p = NULL;
-	int ret = 0;
-
-	if (set_cap_limits(MLOCK_RLIMIT_SIZE))
-		return -1;
-
-	p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
-	if (p == NULL) {
-		perror("malloc() failure\n");
-		return -1;
-	}
-	ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
-	if (ret)
-		return ret;
-	munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
-	free(p);
-
-
-	p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
-	if (p == NULL) {
-		perror("malloc() failure\n");
-		return -1;
-	}
-	ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
-	if (ret)
-		return ret;
-	munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
-	free(p);
-
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
deleted file mode 100644
index 11b2301f3aa3..000000000000
--- a/tools/testing/selftests/vm/mlock2-tests.c
+++ /dev/null
@@ -1,520 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-#include <sys/mman.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <stdbool.h>
-#include "mlock2.h"
-
-#include "../kselftest.h"
-
-struct vm_boundaries {
-	unsigned long start;
-	unsigned long end;
-};
-
-static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
-{
-	FILE *file;
-	int ret = 1;
-	char line[1024] = {0};
-	char *end_addr;
-	char *stop;
-	unsigned long start;
-	unsigned long end;
-
-	if (!area)
-		return ret;
-
-	file = fopen("/proc/self/maps", "r");
-	if (!file) {
-		perror("fopen");
-		return ret;
-	}
-
-	memset(area, 0, sizeof(struct vm_boundaries));
-
-	while(fgets(line, 1024, file)) {
-		end_addr = strchr(line, '-');
-		if (!end_addr) {
-			printf("cannot parse /proc/self/maps\n");
-			goto out;
-		}
-		*end_addr = '\0';
-		end_addr++;
-		stop = strchr(end_addr, ' ');
-		if (!stop) {
-			printf("cannot parse /proc/self/maps\n");
-			goto out;
-		}
-		stop = '\0';
-
-		sscanf(line, "%lx", &start);
-		sscanf(end_addr, "%lx", &end);
-
-		if (start <= addr && end > addr) {
-			area->start = start;
-			area->end = end;
-			ret = 0;
-			goto out;
-		}
-	}
-out:
-	fclose(file);
-	return ret;
-}
-
-#define VMFLAGS "VmFlags:"
-
-static bool is_vmflag_set(unsigned long addr, const char *vmflag)
-{
-	char *line = NULL;
-	char *flags;
-	size_t size = 0;
-	bool ret = false;
-	FILE *smaps;
-
-	smaps = seek_to_smaps_entry(addr);
-	if (!smaps) {
-		printf("Unable to parse /proc/self/smaps\n");
-		goto out;
-	}
-
-	while (getline(&line, &size, smaps) > 0) {
-		if (!strstr(line, VMFLAGS)) {
-			free(line);
-			line = NULL;
-			size = 0;
-			continue;
-		}
-
-		flags = line + strlen(VMFLAGS);
-		ret = (strstr(flags, vmflag) != NULL);
-		goto out;
-	}
-
-out:
-	free(line);
-	fclose(smaps);
-	return ret;
-}
-
-#define SIZE "Size:"
-#define RSS  "Rss:"
-#define LOCKED "lo"
-
-static unsigned long get_value_for_name(unsigned long addr, const char *name)
-{
-	char *line = NULL;
-	size_t size = 0;
-	char *value_ptr;
-	FILE *smaps = NULL;
-	unsigned long value = -1UL;
-
-	smaps = seek_to_smaps_entry(addr);
-	if (!smaps) {
-		printf("Unable to parse /proc/self/smaps\n");
-		goto out;
-	}
-
-	while (getline(&line, &size, smaps) > 0) {
-		if (!strstr(line, name)) {
-			free(line);
-			line = NULL;
-			size = 0;
-			continue;
-		}
-
-		value_ptr = line + strlen(name);
-		if (sscanf(value_ptr, "%lu kB", &value) < 1) {
-			printf("Unable to parse smaps entry for Size\n");
-			goto out;
-		}
-		break;
-	}
-
-out:
-	if (smaps)
-		fclose(smaps);
-	free(line);
-	return value;
-}
-
-static bool is_vma_lock_on_fault(unsigned long addr)
-{
-	bool locked;
-	unsigned long vma_size, vma_rss;
-
-	locked = is_vmflag_set(addr, LOCKED);
-	if (!locked)
-		return false;
-
-	vma_size = get_value_for_name(addr, SIZE);
-	vma_rss = get_value_for_name(addr, RSS);
-
-	/* only one page is faulted in */
-	return (vma_rss < vma_size);
-}
-
-#define PRESENT_BIT     0x8000000000000000ULL
-#define PFN_MASK        0x007FFFFFFFFFFFFFULL
-#define UNEVICTABLE_BIT (1UL << 18)
-
-static int lock_check(unsigned long addr)
-{
-	bool locked;
-	unsigned long vma_size, vma_rss;
-
-	locked = is_vmflag_set(addr, LOCKED);
-	if (!locked)
-		return false;
-
-	vma_size = get_value_for_name(addr, SIZE);
-	vma_rss = get_value_for_name(addr, RSS);
-
-	return (vma_rss == vma_size);
-}
-
-static int unlock_lock_check(char *map)
-{
-	if (is_vmflag_set((unsigned long)map, LOCKED)) {
-		printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
-		return 1;
-	}
-
-	return 0;
-}
-
-static int test_mlock_lock()
-{
-	char *map;
-	int ret = 1;
-	unsigned long page_size = getpagesize();
-
-	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (map == MAP_FAILED) {
-		perror("test_mlock_locked mmap");
-		goto out;
-	}
-
-	if (mlock2_(map, 2 * page_size, 0)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock2(0)");
-		goto unmap;
-	}
-
-	if (!lock_check((unsigned long)map))
-		goto unmap;
-
-	/* Now unlock and recheck attributes */
-	if (munlock(map, 2 * page_size)) {
-		perror("munlock()");
-		goto unmap;
-	}
-
-	ret = unlock_lock_check(map);
-
-unmap:
-	munmap(map, 2 * page_size);
-out:
-	return ret;
-}
-
-static int onfault_check(char *map)
-{
-	*map = 'a';
-	if (!is_vma_lock_on_fault((unsigned long)map)) {
-		printf("VMA is not marked for lock on fault\n");
-		return 1;
-	}
-
-	return 0;
-}
-
-static int unlock_onfault_check(char *map)
-{
-	unsigned long page_size = getpagesize();
-
-	if (is_vma_lock_on_fault((unsigned long)map) ||
-	    is_vma_lock_on_fault((unsigned long)map + page_size)) {
-		printf("VMA is still lock on fault after unlock\n");
-		return 1;
-	}
-
-	return 0;
-}
-
-static int test_mlock_onfault()
-{
-	char *map;
-	int ret = 1;
-	unsigned long page_size = getpagesize();
-
-	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (map == MAP_FAILED) {
-		perror("test_mlock_locked mmap");
-		goto out;
-	}
-
-	if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock2(MLOCK_ONFAULT)");
-		goto unmap;
-	}
-
-	if (onfault_check(map))
-		goto unmap;
-
-	/* Now unlock and recheck attributes */
-	if (munlock(map, 2 * page_size)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("munlock()");
-		goto unmap;
-	}
-
-	ret = unlock_onfault_check(map);
-unmap:
-	munmap(map, 2 * page_size);
-out:
-	return ret;
-}
-
-static int test_lock_onfault_of_present()
-{
-	char *map;
-	int ret = 1;
-	unsigned long page_size = getpagesize();
-
-	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (map == MAP_FAILED) {
-		perror("test_mlock_locked mmap");
-		goto out;
-	}
-
-	*map = 'a';
-
-	if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock2(MLOCK_ONFAULT)");
-		goto unmap;
-	}
-
-	if (!is_vma_lock_on_fault((unsigned long)map) ||
-	    !is_vma_lock_on_fault((unsigned long)map + page_size)) {
-		printf("VMA with present pages is not marked lock on fault\n");
-		goto unmap;
-	}
-	ret = 0;
-unmap:
-	munmap(map, 2 * page_size);
-out:
-	return ret;
-}
-
-static int test_munlockall()
-{
-	char *map;
-	int ret = 1;
-	unsigned long page_size = getpagesize();
-
-	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-	if (map == MAP_FAILED) {
-		perror("test_munlockall mmap");
-		goto out;
-	}
-
-	if (mlockall(MCL_CURRENT)) {
-		perror("mlockall(MCL_CURRENT)");
-		goto out;
-	}
-
-	if (!lock_check((unsigned long)map))
-		goto unmap;
-
-	if (munlockall()) {
-		perror("munlockall()");
-		goto unmap;
-	}
-
-	if (unlock_lock_check(map))
-		goto unmap;
-
-	munmap(map, 2 * page_size);
-
-	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-	if (map == MAP_FAILED) {
-		perror("test_munlockall second mmap");
-		goto out;
-	}
-
-	if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
-		perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
-		goto unmap;
-	}
-
-	if (onfault_check(map))
-		goto unmap;
-
-	if (munlockall()) {
-		perror("munlockall()");
-		goto unmap;
-	}
-
-	if (unlock_onfault_check(map))
-		goto unmap;
-
-	if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
-		perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
-		goto out;
-	}
-
-	if (!lock_check((unsigned long)map))
-		goto unmap;
-
-	if (munlockall()) {
-		perror("munlockall()");
-		goto unmap;
-	}
-
-	ret = unlock_lock_check(map);
-
-unmap:
-	munmap(map, 2 * page_size);
-out:
-	munlockall();
-	return ret;
-}
-
-static int test_vma_management(bool call_mlock)
-{
-	int ret = 1;
-	void *map;
-	unsigned long page_size = getpagesize();
-	struct vm_boundaries page1;
-	struct vm_boundaries page2;
-	struct vm_boundaries page3;
-
-	map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (map == MAP_FAILED) {
-		perror("mmap()");
-		return ret;
-	}
-
-	if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock(ONFAULT)\n");
-		goto out;
-	}
-
-	if (get_vm_area((unsigned long)map, &page1) ||
-	    get_vm_area((unsigned long)map + page_size, &page2) ||
-	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-		printf("couldn't find mapping in /proc/self/maps\n");
-		goto out;
-	}
-
-	/*
-	 * Before we unlock a portion, we need to that all three pages are in
-	 * the same VMA.  If they are not we abort this test (Note that this is
-	 * not a failure)
-	 */
-	if (page1.start != page2.start || page2.start != page3.start) {
-		printf("VMAs are not merged to start, aborting test\n");
-		ret = 0;
-		goto out;
-	}
-
-	if (munlock(map + page_size, page_size)) {
-		perror("munlock()");
-		goto out;
-	}
-
-	if (get_vm_area((unsigned long)map, &page1) ||
-	    get_vm_area((unsigned long)map + page_size, &page2) ||
-	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-		printf("couldn't find mapping in /proc/self/maps\n");
-		goto out;
-	}
-
-	/* All three VMAs should be different */
-	if (page1.start == page2.start || page2.start == page3.start) {
-		printf("failed to split VMA for munlock\n");
-		goto out;
-	}
-
-	/* Now unlock the first and third page and check the VMAs again */
-	if (munlock(map, page_size * 3)) {
-		perror("munlock()");
-		goto out;
-	}
-
-	if (get_vm_area((unsigned long)map, &page1) ||
-	    get_vm_area((unsigned long)map + page_size, &page2) ||
-	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-		printf("couldn't find mapping in /proc/self/maps\n");
-		goto out;
-	}
-
-	/* Now all three VMAs should be the same */
-	if (page1.start != page2.start || page2.start != page3.start) {
-		printf("failed to merge VMAs after munlock\n");
-		goto out;
-	}
-
-	ret = 0;
-out:
-	munmap(map, 3 * page_size);
-	return ret;
-}
-
-static int test_mlockall(int (test_function)(bool call_mlock))
-{
-	int ret = 1;
-
-	if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
-		perror("mlockall");
-		return ret;
-	}
-
-	ret = test_function(false);
-	munlockall();
-	return ret;
-}
-
-int main(int argc, char **argv)
-{
-	int ret = 0;
-	ret += test_mlock_lock();
-	ret += test_mlock_onfault();
-	ret += test_munlockall();
-	ret += test_lock_onfault_of_present();
-	ret += test_vma_management(true);
-	ret += test_mlockall(test_vma_management);
-	return ret;
-}
diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h
deleted file mode 100644
index 2a6e76c226bc..000000000000
--- a/tools/testing/selftests/vm/mlock2.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <syscall.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#ifndef MLOCK_ONFAULT
-#define MLOCK_ONFAULT 1
-#endif
-
-#ifndef MCL_ONFAULT
-#define MCL_ONFAULT (MCL_FUTURE << 1)
-#endif
-
-static int mlock2_(void *start, size_t len, int flags)
-{
-#ifdef __NR_mlock2
-	return syscall(__NR_mlock2, start, len, flags);
-#else
-	errno = ENOSYS;
-	return -1;
-#endif
-}
-
-static FILE *seek_to_smaps_entry(unsigned long addr)
-{
-	FILE *file;
-	char *line = NULL;
-	size_t size = 0;
-	unsigned long start, end;
-	char perms[5];
-	unsigned long offset;
-	char dev[32];
-	unsigned long inode;
-	char path[BUFSIZ];
-
-	file = fopen("/proc/self/smaps", "r");
-	if (!file) {
-		perror("fopen smaps");
-		_exit(1);
-	}
-
-	while (getline(&line, &size, file) > 0) {
-		if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
-			   &start, &end, perms, &offset, dev, &inode, path) < 6)
-			goto next;
-
-		if (start <= addr && addr < end)
-			goto out;
-
-next:
-		free(line);
-		line = NULL;
-		size = 0;
-	}
-
-	fclose(file);
-	file = NULL;
-
-out:
-	free(line);
-	return file;
-}
diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c
deleted file mode 100644
index 6c62966ab5db..000000000000
--- a/tools/testing/selftests/vm/mrelease_test.c
+++ /dev/null
@@ -1,206 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2022 Google LLC
- */
-#define _GNU_SOURCE
-#include <errno.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-#include "util.h"
-
-#include "../kselftest.h"
-
-#ifndef __NR_pidfd_open
-#define __NR_pidfd_open -1
-#endif
-
-#ifndef __NR_process_mrelease
-#define __NR_process_mrelease -1
-#endif
-
-#define MB(x) (x << 20)
-#define MAX_SIZE_MB 1024
-
-static int alloc_noexit(unsigned long nr_pages, int pipefd)
-{
-	int ppid = getppid();
-	int timeout = 10; /* 10sec timeout to get killed */
-	unsigned long i;
-	char *buf;
-
-	buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANON, 0, 0);
-	if (buf == MAP_FAILED) {
-		perror("mmap failed, halting the test");
-		return KSFT_FAIL;
-	}
-
-	for (i = 0; i < nr_pages; i++)
-		*((unsigned long *)(buf + (i * PAGE_SIZE))) = i;
-
-	/* Signal the parent that the child is ready */
-	if (write(pipefd, "", 1) < 0) {
-		perror("write");
-		return KSFT_FAIL;
-	}
-
-	/* Wait to be killed (when reparenting happens) */
-	while (getppid() == ppid && timeout > 0) {
-		sleep(1);
-		timeout--;
-	}
-
-	munmap(buf, nr_pages * PAGE_SIZE);
-
-	return (timeout > 0) ? KSFT_PASS : KSFT_FAIL;
-}
-
-/* The process_mrelease calls in this test are expected to fail */
-static void run_negative_tests(int pidfd)
-{
-	int res;
-	/* Test invalid flags. Expect to fail with EINVAL error code. */
-	if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) ||
-			errno != EINVAL) {
-		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-		perror("process_mrelease with wrong flags");
-		exit(res);
-	}
-	/*
-	 * Test reaping while process is alive with no pending SIGKILL.
-	 * Expect to fail with EINVAL error code.
-	 */
-	if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) {
-		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-		perror("process_mrelease on a live process");
-		exit(res);
-	}
-}
-
-static int child_main(int pipefd[], size_t size)
-{
-	int res;
-
-	/* Allocate and fault-in memory and wait to be killed */
-	close(pipefd[0]);
-	res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]);
-	close(pipefd[1]);
-	return res;
-}
-
-int main(void)
-{
-	int pipefd[2], pidfd;
-	bool success, retry;
-	size_t size;
-	pid_t pid;
-	char byte;
-	int res;
-
-	/* Test a wrong pidfd */
-	if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) {
-		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-		perror("process_mrelease with wrong pidfd");
-		exit(res);
-	}
-
-	/* Start the test with 1MB child memory allocation */
-	size = 1;
-retry:
-	/*
-	 * Pipe for the child to signal when it's done allocating
-	 * memory
-	 */
-	if (pipe(pipefd)) {
-		perror("pipe");
-		exit(KSFT_FAIL);
-	}
-	pid = fork();
-	if (pid < 0) {
-		perror("fork");
-		close(pipefd[0]);
-		close(pipefd[1]);
-		exit(KSFT_FAIL);
-	}
-
-	if (pid == 0) {
-		/* Child main routine */
-		res = child_main(pipefd, size);
-		exit(res);
-	}
-
-	/*
-	 * Parent main routine:
-	 * Wait for the child to finish allocations, then kill and reap
-	 */
-	close(pipefd[1]);
-	/* Block until the child is ready */
-	res = read(pipefd[0], &byte, 1);
-	close(pipefd[0]);
-	if (res < 0) {
-		perror("read");
-		if (!kill(pid, SIGKILL))
-			waitpid(pid, NULL, 0);
-		exit(KSFT_FAIL);
-	}
-
-	pidfd = syscall(__NR_pidfd_open, pid, 0);
-	if (pidfd < 0) {
-		perror("pidfd_open");
-		if (!kill(pid, SIGKILL))
-			waitpid(pid, NULL, 0);
-		exit(KSFT_FAIL);
-	}
-
-	/* Run negative tests which require a live child */
-	run_negative_tests(pidfd);
-
-	if (kill(pid, SIGKILL)) {
-		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-		perror("kill");
-		exit(res);
-	}
-
-	success = (syscall(__NR_process_mrelease, pidfd, 0) == 0);
-	if (!success) {
-		/*
-		 * If we failed to reap because the child exited too soon,
-		 * before we could call process_mrelease. Double child's memory
-		 * which causes it to spend more time on cleanup and increases
-		 * our chances of reaping its memory before it exits.
-		 * Retry until we succeed or reach MAX_SIZE_MB.
-		 */
-		if (errno == ESRCH) {
-			retry = (size <= MAX_SIZE_MB);
-		} else {
-			res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-			perror("process_mrelease");
-			waitpid(pid, NULL, 0);
-			exit(res);
-		}
-	}
-
-	/* Cleanup to prevent zombies */
-	if (waitpid(pid, NULL, 0) < 0) {
-		perror("waitpid");
-		exit(KSFT_FAIL);
-	}
-	close(pidfd);
-
-	if (!success) {
-		if (retry) {
-			size *= 2;
-			goto retry;
-		}
-		printf("All process_mrelease attempts failed!\n");
-		exit(KSFT_FAIL);
-	}
-
-	printf("Success reaping a child with %zuMB of memory allocations\n",
-	       size);
-	return KSFT_PASS;
-}
diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c
deleted file mode 100644
index f01dc4a85b0b..000000000000
--- a/tools/testing/selftests/vm/mremap_dontunmap.c
+++ /dev/null
@@ -1,364 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Tests for mremap w/ MREMAP_DONTUNMAP.
- *
- * Copyright 2020, Brian Geffon <bgeffon@google.com>
- */
-#define _GNU_SOURCE
-#include <sys/mman.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "../kselftest.h"
-
-#ifndef MREMAP_DONTUNMAP
-#define MREMAP_DONTUNMAP 4
-#endif
-
-unsigned long page_size;
-char *page_buffer;
-
-static void dump_maps(void)
-{
-	char cmd[32];
-
-	snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
-	system(cmd);
-}
-
-#define BUG_ON(condition, description)					      \
-	do {								      \
-		if (condition) {					      \
-			fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
-				__LINE__, (description), strerror(errno));    \
-			dump_maps();					  \
-			exit(1);					      \
-		} 							      \
-	} while (0)
-
-// Try a simple operation for to "test" for kernel support this prevents
-// reporting tests as failed when it's run on an older kernel.
-static int kernel_support_for_mremap_dontunmap()
-{
-	int ret = 0;
-	unsigned long num_pages = 1;
-	void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
-				    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	BUG_ON(source_mapping == MAP_FAILED, "mmap");
-
-	// This simple remap should only fail if MREMAP_DONTUNMAP isn't
-	// supported.
-	void *dest_mapping =
-	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
-		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0);
-	if (dest_mapping == MAP_FAILED) {
-		ret = errno;
-	} else {
-		BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
-		       "unable to unmap destination mapping");
-	}
-
-	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-	       "unable to unmap source mapping");
-	return ret;
-}
-
-// This helper will just validate that an entire mapping contains the expected
-// byte.
-static int check_region_contains_byte(void *addr, unsigned long size, char byte)
-{
-	BUG_ON(size & (page_size - 1),
-	       "check_region_contains_byte expects page multiples");
-	BUG_ON((unsigned long)addr & (page_size - 1),
-	       "check_region_contains_byte expects page alignment");
-
-	memset(page_buffer, byte, page_size);
-
-	unsigned long num_pages = size / page_size;
-	unsigned long i;
-
-	// Compare each page checking that it contains our expected byte.
-	for (i = 0; i < num_pages; ++i) {
-		int ret =
-		    memcmp(addr + (i * page_size), page_buffer, page_size);
-		if (ret) {
-			return ret;
-		}
-	}
-
-	return 0;
-}
-
-// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving
-// the source mapping mapped.
-static void mremap_dontunmap_simple()
-{
-	unsigned long num_pages = 5;
-
-	void *source_mapping =
-	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	BUG_ON(source_mapping == MAP_FAILED, "mmap");
-
-	memset(source_mapping, 'a', num_pages * page_size);
-
-	// Try to just move the whole mapping anywhere (not fixed).
-	void *dest_mapping =
-	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
-		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
-	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
-
-	// Validate that the pages have been moved, we know they were moved if
-	// the dest_mapping contains a's.
-	BUG_ON(check_region_contains_byte
-	       (dest_mapping, num_pages * page_size, 'a') != 0,
-	       "pages did not migrate");
-	BUG_ON(check_region_contains_byte
-	       (source_mapping, num_pages * page_size, 0) != 0,
-	       "source should have no ptes");
-
-	BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
-	       "unable to unmap destination mapping");
-	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-	       "unable to unmap source mapping");
-}
-
-// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected.
-static void mremap_dontunmap_simple_shmem()
-{
-	unsigned long num_pages = 5;
-
-	int mem_fd = memfd_create("memfd", MFD_CLOEXEC);
-	BUG_ON(mem_fd < 0, "memfd_create");
-
-	BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0,
-			"ftruncate");
-
-	void *source_mapping =
-	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-		 MAP_FILE | MAP_SHARED, mem_fd, 0);
-	BUG_ON(source_mapping == MAP_FAILED, "mmap");
-
-	BUG_ON(close(mem_fd) < 0, "close");
-
-	memset(source_mapping, 'a', num_pages * page_size);
-
-	// Try to just move the whole mapping anywhere (not fixed).
-	void *dest_mapping =
-	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
-		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
-	if (dest_mapping == MAP_FAILED && errno == EINVAL) {
-		// Old kernel which doesn't support MREMAP_DONTUNMAP on shmem.
-		BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-			"unable to unmap source mapping");
-		return;
-	}
-
-	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
-
-	// Validate that the pages have been moved, we know they were moved if
-	// the dest_mapping contains a's.
-	BUG_ON(check_region_contains_byte
-	       (dest_mapping, num_pages * page_size, 'a') != 0,
-	       "pages did not migrate");
-
-	// Because the region is backed by shmem, we will actually see the same
-	// memory at the source location still.
-	BUG_ON(check_region_contains_byte
-	       (source_mapping, num_pages * page_size, 'a') != 0,
-	       "source should have no ptes");
-
-	BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
-	       "unable to unmap destination mapping");
-	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-	       "unable to unmap source mapping");
-}
-
-// This test validates MREMAP_DONTUNMAP will move page tables to a specific
-// destination using MREMAP_FIXED, also while validating that the source
-// remains intact.
-static void mremap_dontunmap_simple_fixed()
-{
-	unsigned long num_pages = 5;
-
-	// Since we want to guarantee that we can remap to a point, we will
-	// create a mapping up front.
-	void *dest_mapping =
-	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	BUG_ON(dest_mapping == MAP_FAILED, "mmap");
-	memset(dest_mapping, 'X', num_pages * page_size);
-
-	void *source_mapping =
-	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	BUG_ON(source_mapping == MAP_FAILED, "mmap");
-	memset(source_mapping, 'a', num_pages * page_size);
-
-	void *remapped_mapping =
-	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
-		   MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
-		   dest_mapping);
-	BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
-	BUG_ON(remapped_mapping != dest_mapping,
-	       "mremap should have placed the remapped mapping at dest_mapping");
-
-	// The dest mapping will have been unmap by mremap so we expect the Xs
-	// to be gone and replaced with a's.
-	BUG_ON(check_region_contains_byte
-	       (dest_mapping, num_pages * page_size, 'a') != 0,
-	       "pages did not migrate");
-
-	// And the source mapping will have had its ptes dropped.
-	BUG_ON(check_region_contains_byte
-	       (source_mapping, num_pages * page_size, 0) != 0,
-	       "source should have no ptes");
-
-	BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
-	       "unable to unmap destination mapping");
-	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-	       "unable to unmap source mapping");
-}
-
-// This test validates that we can MREMAP_DONTUNMAP for a portion of an
-// existing mapping.
-static void mremap_dontunmap_partial_mapping()
-{
-	/*
-	 *  source mapping:
-	 *  --------------
-	 *  | aaaaaaaaaa |
-	 *  --------------
-	 *  to become:
-	 *  --------------
-	 *  | aaaaa00000 |
-	 *  --------------
-	 *  With the destination mapping containing 5 pages of As.
-	 *  ---------
-	 *  | aaaaa |
-	 *  ---------
-	 */
-	unsigned long num_pages = 10;
-	void *source_mapping =
-	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	BUG_ON(source_mapping == MAP_FAILED, "mmap");
-	memset(source_mapping, 'a', num_pages * page_size);
-
-	// We will grab the last 5 pages of the source and move them.
-	void *dest_mapping =
-	    mremap(source_mapping + (5 * page_size), 5 * page_size,
-		   5 * page_size,
-		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
-	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
-
-	// We expect the first 5 pages of the source to contain a's and the
-	// final 5 pages to contain zeros.
-	BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') !=
-	       0, "first 5 pages of source should have original pages");
-	BUG_ON(check_region_contains_byte
-	       (source_mapping + (5 * page_size), 5 * page_size, 0) != 0,
-	       "final 5 pages of source should have no ptes");
-
-	// Finally we expect the destination to have 5 pages worth of a's.
-	BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') !=
-	       0, "dest mapping should contain ptes from the source");
-
-	BUG_ON(munmap(dest_mapping, 5 * page_size) == -1,
-	       "unable to unmap destination mapping");
-	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-	       "unable to unmap source mapping");
-}
-
-// This test validates that we can remap over only a portion of a mapping.
-static void mremap_dontunmap_partial_mapping_overwrite(void)
-{
-	/*
-	 *  source mapping:
-	 *  ---------
-	 *  |aaaaa|
-	 *  ---------
-	 *  dest mapping initially:
-	 *  -----------
-	 *  |XXXXXXXXXX|
-	 *  ------------
-	 *  Source to become:
-	 *  ---------
-	 *  |00000|
-	 *  ---------
-	 *  With the destination mapping containing 5 pages of As.
-	 *  ------------
-	 *  |aaaaaXXXXX|
-	 *  ------------
-	 */
-	void *source_mapping =
-	    mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
-		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	BUG_ON(source_mapping == MAP_FAILED, "mmap");
-	memset(source_mapping, 'a', 5 * page_size);
-
-	void *dest_mapping =
-	    mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
-		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	BUG_ON(dest_mapping == MAP_FAILED, "mmap");
-	memset(dest_mapping, 'X', 10 * page_size);
-
-	// We will grab the last 5 pages of the source and move them.
-	void *remapped_mapping =
-	    mremap(source_mapping, 5 * page_size,
-		   5 * page_size,
-		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping);
-	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
-	BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping");
-
-	BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) !=
-	       0, "first 5 pages of source should have no ptes");
-
-	// Finally we expect the destination to have 5 pages worth of a's.
-	BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0,
-			"dest mapping should contain ptes from the source");
-
-	// Finally the last 5 pages shouldn't have been touched.
-	BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size),
-				5 * page_size, 'X') != 0,
-			"dest mapping should have retained the last 5 pages");
-
-	BUG_ON(munmap(dest_mapping, 10 * page_size) == -1,
-	       "unable to unmap destination mapping");
-	BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
-	       "unable to unmap source mapping");
-}
-
-int main(void)
-{
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	// test for kernel support for MREMAP_DONTUNMAP skipping the test if
-	// not.
-	if (kernel_support_for_mremap_dontunmap() != 0) {
-		printf("No kernel support for MREMAP_DONTUNMAP\n");
-		return KSFT_SKIP;
-	}
-
-	// Keep a page sized buffer around for when we need it.
-	page_buffer =
-	    mmap(NULL, page_size, PROT_READ | PROT_WRITE,
-		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page.");
-
-	mremap_dontunmap_simple();
-	mremap_dontunmap_simple_shmem();
-	mremap_dontunmap_simple_fixed();
-	mremap_dontunmap_partial_mapping();
-	mremap_dontunmap_partial_mapping_overwrite();
-
-	BUG_ON(munmap(page_buffer, page_size) == -1,
-	       "unable to unmap page buffer");
-
-	printf("OK\n");
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
deleted file mode 100644
index 9496346973d4..000000000000
--- a/tools/testing/selftests/vm/mremap_test.c
+++ /dev/null
@@ -1,475 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2020 Google LLC
- */
-#define _GNU_SOURCE
-
-#include <errno.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <time.h>
-#include <stdbool.h>
-
-#include "../kselftest.h"
-
-#define EXPECT_SUCCESS 0
-#define EXPECT_FAILURE 1
-#define NON_OVERLAPPING 0
-#define OVERLAPPING 1
-#define NS_PER_SEC 1000000000ULL
-#define VALIDATION_DEFAULT_THRESHOLD 4	/* 4MB */
-#define VALIDATION_NO_THRESHOLD 0	/* Verify the entire region */
-
-#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
-
-struct config {
-	unsigned long long src_alignment;
-	unsigned long long dest_alignment;
-	unsigned long long region_size;
-	int overlapping;
-};
-
-struct test {
-	const char *name;
-	struct config config;
-	int expect_failure;
-};
-
-enum {
-	_1KB = 1ULL << 10,	/* 1KB -> not page aligned */
-	_4KB = 4ULL << 10,
-	_8KB = 8ULL << 10,
-	_1MB = 1ULL << 20,
-	_2MB = 2ULL << 20,
-	_4MB = 4ULL << 20,
-	_1GB = 1ULL << 30,
-	_2GB = 2ULL << 30,
-	PMD = _2MB,
-	PUD = _1GB,
-};
-
-#define PTE page_size
-
-#define MAKE_TEST(source_align, destination_align, size,	\
-		  overlaps, should_fail, test_name)		\
-(struct test){							\
-	.name = test_name,					\
-	.config = {						\
-		.src_alignment = source_align,			\
-		.dest_alignment = destination_align,		\
-		.region_size = size,				\
-		.overlapping = overlaps,			\
-	},							\
-	.expect_failure = should_fail				\
-}
-
-/*
- * Returns false if the requested remap region overlaps with an
- * existing mapping (e.g text, stack) else returns true.
- */
-static bool is_remap_region_valid(void *addr, unsigned long long size)
-{
-	void *remap_addr = NULL;
-	bool ret = true;
-
-	/* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */
-	remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE,
-					 MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
-					 -1, 0);
-
-	if (remap_addr == MAP_FAILED) {
-		if (errno == EEXIST)
-			ret = false;
-	} else {
-		munmap(remap_addr, size);
-	}
-
-	return ret;
-}
-
-/* Returns mmap_min_addr sysctl tunable from procfs */
-static unsigned long long get_mmap_min_addr(void)
-{
-	FILE *fp;
-	int n_matched;
-	static unsigned long long addr;
-
-	if (addr)
-		return addr;
-
-	fp = fopen("/proc/sys/vm/mmap_min_addr", "r");
-	if (fp == NULL) {
-		ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n",
-			strerror(errno));
-		exit(KSFT_SKIP);
-	}
-
-	n_matched = fscanf(fp, "%llu", &addr);
-	if (n_matched != 1) {
-		ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n",
-			strerror(errno));
-		fclose(fp);
-		exit(KSFT_SKIP);
-	}
-
-	fclose(fp);
-	return addr;
-}
-
-/*
- * This test validates that merge is called when expanding a mapping.
- * Mapping containing three pages is created, middle page is unmapped
- * and then the mapping containing the first page is expanded so that
- * it fills the created hole. The two parts should merge creating
- * single mapping with three pages.
- */
-static void mremap_expand_merge(unsigned long page_size)
-{
-	char *test_name = "mremap expand merge";
-	FILE *fp;
-	char *line = NULL;
-	size_t len = 0;
-	bool success = false;
-	char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-	munmap(start + page_size, page_size);
-	mremap(start, page_size, 2 * page_size, 0);
-
-	fp = fopen("/proc/self/maps", "r");
-	if (fp == NULL) {
-		ksft_test_result_fail("%s\n", test_name);
-		return;
-	}
-
-	while (getline(&line, &len, fp) != -1) {
-		char *first = strtok(line, "- ");
-		void *first_val = (void *)strtol(first, NULL, 16);
-		char *second = strtok(NULL, "- ");
-		void *second_val = (void *) strtol(second, NULL, 16);
-
-		if (first_val == start && second_val == start + 3 * page_size) {
-			success = true;
-			break;
-		}
-	}
-	if (success)
-		ksft_test_result_pass("%s\n", test_name);
-	else
-		ksft_test_result_fail("%s\n", test_name);
-	fclose(fp);
-}
-
-/*
- * Returns the start address of the mapping on success, else returns
- * NULL on failure.
- */
-static void *get_source_mapping(struct config c)
-{
-	unsigned long long addr = 0ULL;
-	void *src_addr = NULL;
-	unsigned long long mmap_min_addr;
-
-	mmap_min_addr = get_mmap_min_addr();
-
-retry:
-	addr += c.src_alignment;
-	if (addr < mmap_min_addr)
-		goto retry;
-
-	src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
-					MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
-					-1, 0);
-	if (src_addr == MAP_FAILED) {
-		if (errno == EPERM || errno == EEXIST)
-			goto retry;
-		goto error;
-	}
-	/*
-	 * Check that the address is aligned to the specified alignment.
-	 * Addresses which have alignments that are multiples of that
-	 * specified are not considered valid. For instance, 1GB address is
-	 * 2MB-aligned, however it will not be considered valid for a
-	 * requested alignment of 2MB. This is done to reduce coincidental
-	 * alignment in the tests.
-	 */
-	if (((unsigned long long) src_addr & (c.src_alignment - 1)) ||
-			!((unsigned long long) src_addr & c.src_alignment)) {
-		munmap(src_addr, c.region_size);
-		goto retry;
-	}
-
-	if (!src_addr)
-		goto error;
-
-	return src_addr;
-error:
-	ksft_print_msg("Failed to map source region: %s\n",
-			strerror(errno));
-	return NULL;
-}
-
-/* Returns the time taken for the remap on success else returns -1. */
-static long long remap_region(struct config c, unsigned int threshold_mb,
-			      char pattern_seed)
-{
-	void *addr, *src_addr, *dest_addr;
-	unsigned long long i;
-	struct timespec t_start = {0, 0}, t_end = {0, 0};
-	long long  start_ns, end_ns, align_mask, ret, offset;
-	unsigned long long threshold;
-
-	if (threshold_mb == VALIDATION_NO_THRESHOLD)
-		threshold = c.region_size;
-	else
-		threshold = MIN(threshold_mb * _1MB, c.region_size);
-
-	src_addr = get_source_mapping(c);
-	if (!src_addr) {
-		ret = -1;
-		goto out;
-	}
-
-	/* Set byte pattern */
-	srand(pattern_seed);
-	for (i = 0; i < threshold; i++)
-		memset((char *) src_addr + i, (char) rand(), 1);
-
-	/* Mask to zero out lower bits of address for alignment */
-	align_mask = ~(c.dest_alignment - 1);
-	/* Offset of destination address from the end of the source region */
-	offset = (c.overlapping) ? -c.dest_alignment : c.dest_alignment;
-	addr = (void *) (((unsigned long long) src_addr + c.region_size
-			  + offset) & align_mask);
-
-	/* See comment in get_source_mapping() */
-	if (!((unsigned long long) addr & c.dest_alignment))
-		addr = (void *) ((unsigned long long) addr | c.dest_alignment);
-
-	/* Don't destroy existing mappings unless expected to overlap */
-	while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
-		/* Check for unsigned overflow */
-		if (addr + c.dest_alignment < addr) {
-			ksft_print_msg("Couldn't find a valid region to remap to\n");
-			ret = -1;
-			goto out;
-		}
-		addr += c.dest_alignment;
-	}
-
-	clock_gettime(CLOCK_MONOTONIC, &t_start);
-	dest_addr = mremap(src_addr, c.region_size, c.region_size,
-					  MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
-	clock_gettime(CLOCK_MONOTONIC, &t_end);
-
-	if (dest_addr == MAP_FAILED) {
-		ksft_print_msg("mremap failed: %s\n", strerror(errno));
-		ret = -1;
-		goto clean_up_src;
-	}
-
-	/* Verify byte pattern after remapping */
-	srand(pattern_seed);
-	for (i = 0; i < threshold; i++) {
-		char c = (char) rand();
-
-		if (((char *) dest_addr)[i] != c) {
-			ksft_print_msg("Data after remap doesn't match at offset %d\n",
-				       i);
-			ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
-					((char *) dest_addr)[i] & 0xff);
-			ret = -1;
-			goto clean_up_dest;
-		}
-	}
-
-	start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec;
-	end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec;
-	ret = end_ns - start_ns;
-
-/*
- * Since the destination address is specified using MREMAP_FIXED, subsequent
- * mremap will unmap any previous mapping at the address range specified by
- * dest_addr and region_size. This significantly affects the remap time of
- * subsequent tests. So we clean up mappings after each test.
- */
-clean_up_dest:
-	munmap(dest_addr, c.region_size);
-clean_up_src:
-	munmap(src_addr, c.region_size);
-out:
-	return ret;
-}
-
-static void run_mremap_test_case(struct test test_case, int *failures,
-				 unsigned int threshold_mb,
-				 unsigned int pattern_seed)
-{
-	long long remap_time = remap_region(test_case.config, threshold_mb,
-					    pattern_seed);
-
-	if (remap_time < 0) {
-		if (test_case.expect_failure)
-			ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
-					      test_case.name);
-		else {
-			ksft_test_result_fail("%s\n", test_case.name);
-			*failures += 1;
-		}
-	} else {
-		/*
-		 * Comparing mremap time is only applicable if entire region
-		 * was faulted in.
-		 */
-		if (threshold_mb == VALIDATION_NO_THRESHOLD ||
-		    test_case.config.region_size <= threshold_mb * _1MB)
-			ksft_test_result_pass("%s\n\tmremap time: %12lldns\n",
-					      test_case.name, remap_time);
-		else
-			ksft_test_result_pass("%s\n", test_case.name);
-	}
-}
-
-static void usage(const char *cmd)
-{
-	fprintf(stderr,
-		"Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n"
-		"-t\t only validate threshold_mb of the remapped region\n"
-		"  \t if 0 is supplied no threshold is used; all tests\n"
-		"  \t are run and remapped regions validated fully.\n"
-		"  \t The default threshold used is 4MB.\n"
-		"-p\t provide a seed to generate the random pattern for\n"
-		"  \t validating the remapped region.\n", cmd);
-}
-
-static int parse_args(int argc, char **argv, unsigned int *threshold_mb,
-		      unsigned int *pattern_seed)
-{
-	const char *optstr = "t:p:";
-	int opt;
-
-	while ((opt = getopt(argc, argv, optstr)) != -1) {
-		switch (opt) {
-		case 't':
-			*threshold_mb = atoi(optarg);
-			break;
-		case 'p':
-			*pattern_seed = atoi(optarg);
-			break;
-		default:
-			usage(argv[0]);
-			return -1;
-		}
-	}
-
-	if (optind < argc) {
-		usage(argv[0]);
-		return -1;
-	}
-
-	return 0;
-}
-
-#define MAX_TEST 13
-#define MAX_PERF_TEST 3
-int main(int argc, char **argv)
-{
-	int failures = 0;
-	int i, run_perf_tests;
-	unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
-	unsigned int pattern_seed;
-	int num_expand_tests = 1;
-	struct test test_cases[MAX_TEST];
-	struct test perf_test_cases[MAX_PERF_TEST];
-	int page_size;
-	time_t t;
-
-	pattern_seed = (unsigned int) time(&t);
-
-	if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0)
-		exit(EXIT_FAILURE);
-
-	ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
-		       threshold_mb, pattern_seed);
-
-	page_size = sysconf(_SC_PAGESIZE);
-
-	/* Expected mremap failures */
-	test_cases[0] =	MAKE_TEST(page_size, page_size, page_size,
-				  OVERLAPPING, EXPECT_FAILURE,
-				  "mremap - Source and Destination Regions Overlapping");
-
-	test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size,
-				  NON_OVERLAPPING, EXPECT_FAILURE,
-				  "mremap - Destination Address Misaligned (1KB-aligned)");
-	test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size,
-				  NON_OVERLAPPING, EXPECT_FAILURE,
-				  "mremap - Source Address Misaligned (1KB-aligned)");
-
-	/* Src addr PTE aligned */
-	test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2,
-				  NON_OVERLAPPING, EXPECT_SUCCESS,
-				  "8KB mremap - Source PTE-aligned, Destination PTE-aligned");
-
-	/* Src addr 1MB aligned */
-	test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				  "2MB mremap - Source 1MB-aligned, Destination PTE-aligned");
-	test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				  "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
-
-	/* Src addr PMD aligned */
-	test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				  "4MB mremap - Source PMD-aligned, Destination PTE-aligned");
-	test_cases[7] =	MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				  "4MB mremap - Source PMD-aligned, Destination 1MB-aligned");
-	test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				  "4MB mremap - Source PMD-aligned, Destination PMD-aligned");
-
-	/* Src addr PUD aligned */
-	test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				  "2GB mremap - Source PUD-aligned, Destination PTE-aligned");
-	test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				   "2GB mremap - Source PUD-aligned, Destination 1MB-aligned");
-	test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				   "2GB mremap - Source PUD-aligned, Destination PMD-aligned");
-	test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				   "2GB mremap - Source PUD-aligned, Destination PUD-aligned");
-
-	perf_test_cases[0] =  MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-					"1GB mremap - Source PTE-aligned, Destination PTE-aligned");
-	/*
-	 * mremap 1GB region - Page table level aligned time
-	 * comparison.
-	 */
-	perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				       "1GB mremap - Source PMD-aligned, Destination PMD-aligned");
-	perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-				       "1GB mremap - Source PUD-aligned, Destination PUD-aligned");
-
-	run_perf_tests =  (threshold_mb == VALIDATION_NO_THRESHOLD) ||
-				(threshold_mb * _1MB >= _1GB);
-
-	ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ?
-		      ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests);
-
-	for (i = 0; i < ARRAY_SIZE(test_cases); i++)
-		run_mremap_test_case(test_cases[i], &failures, threshold_mb,
-				     pattern_seed);
-
-	mremap_expand_merge(page_size);
-
-	if (run_perf_tests) {
-		ksft_print_msg("\n%s\n",
-		 "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:");
-		for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++)
-			run_mremap_test_case(perf_test_cases[i], &failures,
-					     threshold_mb, pattern_seed);
-	}
-
-	if (failures > 0)
-		ksft_exit_fail();
-	else
-		ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c
deleted file mode 100644
index 634d87dfb2a4..000000000000
--- a/tools/testing/selftests/vm/on-fault-limit.c
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <sys/mman.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#ifndef MCL_ONFAULT
-#define MCL_ONFAULT (MCL_FUTURE << 1)
-#endif
-
-static int test_limit(void)
-{
-	int ret = 1;
-	struct rlimit lims;
-	void *map;
-
-	if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
-		perror("getrlimit");
-		return ret;
-	}
-
-	if (mlockall(MCL_ONFAULT | MCL_FUTURE)) {
-		perror("mlockall");
-		return ret;
-	}
-
-	map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
-		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
-	if (map != MAP_FAILED)
-		printf("mmap should have failed, but didn't\n");
-	else {
-		ret = 0;
-		munmap(map, 2 * lims.rlim_max);
-	}
-
-	munlockall();
-	return ret;
-}
-
-int main(int argc, char **argv)
-{
-	int ret = 0;
-
-	ret += test_limit();
-	return ret;
-}
diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
deleted file mode 100644
index 92f3be3dd8e5..000000000000
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ /dev/null
@@ -1,226 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _PKEYS_HELPER_H
-#define _PKEYS_HELPER_H
-#define _GNU_SOURCE
-#include <string.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <signal.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <ucontext.h>
-#include <sys/mman.h>
-
-#include "../kselftest.h"
-
-/* Define some kernel-like types */
-#define  u8 __u8
-#define u16 __u16
-#define u32 __u32
-#define u64 __u64
-
-#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
-
-#ifndef DEBUG_LEVEL
-#define DEBUG_LEVEL 0
-#endif
-#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
-extern int dprint_in_signal;
-extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
-
-extern int test_nr;
-extern int iteration_nr;
-
-#ifdef __GNUC__
-__attribute__((format(printf, 1, 2)))
-#endif
-static inline void sigsafe_printf(const char *format, ...)
-{
-	va_list ap;
-
-	if (!dprint_in_signal) {
-		va_start(ap, format);
-		vprintf(format, ap);
-		va_end(ap);
-	} else {
-		int ret;
-		/*
-		 * No printf() functions are signal-safe.
-		 * They deadlock easily. Write the format
-		 * string to get some output, even if
-		 * incomplete.
-		 */
-		ret = write(1, format, strlen(format));
-		if (ret < 0)
-			exit(1);
-	}
-}
-#define dprintf_level(level, args...) do {	\
-	if (level <= DEBUG_LEVEL)		\
-		sigsafe_printf(args);		\
-} while (0)
-#define dprintf0(args...) dprintf_level(0, args)
-#define dprintf1(args...) dprintf_level(1, args)
-#define dprintf2(args...) dprintf_level(2, args)
-#define dprintf3(args...) dprintf_level(3, args)
-#define dprintf4(args...) dprintf_level(4, args)
-
-extern void abort_hooks(void);
-#define pkey_assert(condition) do {		\
-	if (!(condition)) {			\
-		dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
-				__FILE__, __LINE__,	\
-				test_nr, iteration_nr);	\
-		dprintf0("errno at assert: %d", errno);	\
-		abort_hooks();			\
-		exit(__LINE__);			\
-	}					\
-} while (0)
-
-__attribute__((noinline)) int read_ptr(int *ptr);
-void expected_pkey_fault(int pkey);
-int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
-int sys_pkey_free(unsigned long pkey);
-int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
-		unsigned long pkey);
-void record_pkey_malloc(void *ptr, long size, int prot);
-
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-#include "pkey-x86.h"
-#elif defined(__powerpc64__) /* arch */
-#include "pkey-powerpc.h"
-#else /* arch */
-#error Architecture not supported
-#endif /* arch */
-
-#define PKEY_MASK	(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
-
-static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
-{
-	u32 shift = pkey_bit_position(pkey);
-	/* mask out bits from pkey in old value */
-	reg &= ~((u64)PKEY_MASK << shift);
-	/* OR in new bits for pkey */
-	reg |= (flags & PKEY_MASK) << shift;
-	return reg;
-}
-
-static inline u64 get_pkey_bits(u64 reg, int pkey)
-{
-	u32 shift = pkey_bit_position(pkey);
-	/*
-	 * shift down the relevant bits to the lowest two, then
-	 * mask off all the other higher bits
-	 */
-	return ((reg >> shift) & PKEY_MASK);
-}
-
-extern u64 shadow_pkey_reg;
-
-static inline u64 _read_pkey_reg(int line)
-{
-	u64 pkey_reg = __read_pkey_reg();
-
-	dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx"
-			" shadow: %016llx\n",
-			line, pkey_reg, shadow_pkey_reg);
-	assert(pkey_reg == shadow_pkey_reg);
-
-	return pkey_reg;
-}
-
-#define read_pkey_reg() _read_pkey_reg(__LINE__)
-
-static inline void write_pkey_reg(u64 pkey_reg)
-{
-	dprintf4("%s() changing %016llx to %016llx\n", __func__,
-			__read_pkey_reg(), pkey_reg);
-	/* will do the shadow check for us: */
-	read_pkey_reg();
-	__write_pkey_reg(pkey_reg);
-	shadow_pkey_reg = pkey_reg;
-	dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__,
-			pkey_reg, __read_pkey_reg());
-}
-
-/*
- * These are technically racy. since something could
- * change PKEY register between the read and the write.
- */
-static inline void __pkey_access_allow(int pkey, int do_allow)
-{
-	u64 pkey_reg = read_pkey_reg();
-	int bit = pkey * 2;
-
-	if (do_allow)
-		pkey_reg &= (1<<bit);
-	else
-		pkey_reg |= (1<<bit);
-
-	dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
-	write_pkey_reg(pkey_reg);
-}
-
-static inline void __pkey_write_allow(int pkey, int do_allow_write)
-{
-	u64 pkey_reg = read_pkey_reg();
-	int bit = pkey * 2 + 1;
-
-	if (do_allow_write)
-		pkey_reg &= (1<<bit);
-	else
-		pkey_reg |= (1<<bit);
-
-	write_pkey_reg(pkey_reg);
-	dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
-}
-
-#define ALIGN_UP(x, align_to)	(((x) + ((align_to)-1)) & ~((align_to)-1))
-#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
-#define ALIGN_PTR_UP(p, ptr_align_to)	\
-	((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
-#define ALIGN_PTR_DOWN(p, ptr_align_to)	\
-	((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
-#define __stringify_1(x...)     #x
-#define __stringify(x...)       __stringify_1(x)
-
-static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
-{
-#ifdef si_pkey
-	return &si->si_pkey;
-#else
-	return (u32 *)(((u8 *)si) + si_pkey_offset);
-#endif
-}
-
-static inline int kernel_has_pkeys(void)
-{
-	/* try allocating a key and see if it succeeds */
-	int ret = sys_pkey_alloc(0, 0);
-	if (ret <= 0) {
-		return 0;
-	}
-	sys_pkey_free(ret);
-	return 1;
-}
-
-static inline int is_pkeys_supported(void)
-{
-	/* check if the cpu supports pkeys */
-	if (!cpu_has_pkeys()) {
-		dprintf1("SKIP: %s: no CPU support\n", __func__);
-		return 0;
-	}
-
-	/* check if the kernel supports pkeys */
-	if (!kernel_has_pkeys()) {
-		dprintf1("SKIP: %s: no kernel support\n", __func__);
-		return 0;
-	}
-
-	return 1;
-}
-
-#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h
deleted file mode 100644
index 1ebb586b2fbc..000000000000
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _PKEYS_POWERPC_H
-#define _PKEYS_POWERPC_H
-
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key	386
-#endif
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc		384
-# define SYS_pkey_free		385
-#endif
-#define REG_IP_IDX		PT_NIP
-#define REG_TRAPNO		PT_TRAP
-#define gregs			gp_regs
-#define fpregs			fp_regs
-#define si_pkey_offset		0x20
-
-#undef PKEY_DISABLE_ACCESS
-#define PKEY_DISABLE_ACCESS	0x3  /* disable read and write */
-
-#undef PKEY_DISABLE_WRITE
-#define PKEY_DISABLE_WRITE	0x2
-
-#define NR_PKEYS		32
-#define NR_RESERVED_PKEYS_4K	27 /* pkey-0, pkey-1, exec-only-pkey
-				      and 24 other keys that cannot be
-				      represented in the PTE */
-#define NR_RESERVED_PKEYS_64K_3KEYS	3 /* PowerNV and KVM: pkey-0,
-					     pkey-1 and exec-only key */
-#define NR_RESERVED_PKEYS_64K_4KEYS	4 /* PowerVM: pkey-0, pkey-1,
-					     pkey-31 and exec-only key */
-#define PKEY_BITS_PER_PKEY	2
-#define HPAGE_SIZE		(1UL << 24)
-#define PAGE_SIZE		sysconf(_SC_PAGESIZE)
-
-static inline u32 pkey_bit_position(int pkey)
-{
-	return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
-}
-
-static inline u64 __read_pkey_reg(void)
-{
-	u64 pkey_reg;
-
-	asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
-
-	return pkey_reg;
-}
-
-static inline void __write_pkey_reg(u64 pkey_reg)
-{
-	u64 amr = pkey_reg;
-
-	dprintf4("%s() changing %016llx to %016llx\n",
-			 __func__, __read_pkey_reg(), pkey_reg);
-
-	asm volatile("isync; mtspr 0xd, %0; isync"
-		     : : "r" ((unsigned long)(amr)) : "memory");
-
-	dprintf4("%s() pkey register after changing %016llx to %016llx\n",
-			__func__, __read_pkey_reg(), pkey_reg);
-}
-
-static inline int cpu_has_pkeys(void)
-{
-	/* No simple way to determine this */
-	return 1;
-}
-
-static inline bool arch_is_powervm()
-{
-	struct stat buf;
-
-	if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) &&
-	    (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) &&
-	    (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) )
-		return true;
-
-	return false;
-}
-
-static inline int get_arch_reserved_keys(void)
-{
-	if (sysconf(_SC_PAGESIZE) == 4096)
-		return NR_RESERVED_PKEYS_4K;
-	else
-		if (arch_is_powervm())
-			return NR_RESERVED_PKEYS_64K_4KEYS;
-		else
-			return NR_RESERVED_PKEYS_64K_3KEYS;
-}
-
-void expect_fault_on_read_execonly_key(void *p1, int pkey)
-{
-	/*
-	 * powerpc does not allow userspace to change permissions of exec-only
-	 * keys since those keys are not allocated by userspace. The signal
-	 * handler wont be able to reset the permissions, which means the code
-	 * will infinitely continue to segfault here.
-	 */
-	return;
-}
-
-/* 4-byte instructions * 16384 = 64K page */
-#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
-
-void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
-{
-	void *ptr;
-	int ret;
-
-	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
-			size, prot, pkey);
-	pkey_assert(pkey < NR_PKEYS);
-	ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-	pkey_assert(ptr != (void *)-1);
-
-	ret = syscall(__NR_subpage_prot, ptr, size, NULL);
-	if (ret) {
-		perror("subpage_perm");
-		return PTR_ERR_ENOTSUP;
-	}
-
-	ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
-	pkey_assert(!ret);
-	record_pkey_malloc(ptr, size, prot);
-
-	dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
-	return ptr;
-}
-
-#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h
deleted file mode 100644
index 72c14cd3ddc7..000000000000
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _PKEYS_X86_H
-#define _PKEYS_X86_H
-
-#ifdef __i386__
-
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key	380
-#endif
-
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc		381
-# define SYS_pkey_free		382
-#endif
-
-#define REG_IP_IDX		REG_EIP
-#define si_pkey_offset		0x14
-
-#else
-
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key	329
-#endif
-
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc		330
-# define SYS_pkey_free		331
-#endif
-
-#define REG_IP_IDX		REG_RIP
-#define si_pkey_offset		0x20
-
-#endif
-
-#ifndef PKEY_DISABLE_ACCESS
-# define PKEY_DISABLE_ACCESS	0x1
-#endif
-
-#ifndef PKEY_DISABLE_WRITE
-# define PKEY_DISABLE_WRITE	0x2
-#endif
-
-#define NR_PKEYS		16
-#define NR_RESERVED_PKEYS	2 /* pkey-0 and exec-only-pkey */
-#define PKEY_BITS_PER_PKEY	2
-#define HPAGE_SIZE		(1UL<<21)
-#define PAGE_SIZE		4096
-#define MB			(1<<20)
-
-static inline void __page_o_noops(void)
-{
-	/* 8-bytes of instruction * 512 bytes = 1 page */
-	asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
-}
-
-static inline u64 __read_pkey_reg(void)
-{
-	unsigned int eax, edx;
-	unsigned int ecx = 0;
-	unsigned pkey_reg;
-
-	asm volatile(".byte 0x0f,0x01,0xee\n\t"
-		     : "=a" (eax), "=d" (edx)
-		     : "c" (ecx));
-	pkey_reg = eax;
-	return pkey_reg;
-}
-
-static inline void __write_pkey_reg(u64 pkey_reg)
-{
-	unsigned int eax = pkey_reg;
-	unsigned int ecx = 0;
-	unsigned int edx = 0;
-
-	dprintf4("%s() changing %016llx to %016llx\n", __func__,
-			__read_pkey_reg(), pkey_reg);
-	asm volatile(".byte 0x0f,0x01,0xef\n\t"
-		     : : "a" (eax), "c" (ecx), "d" (edx));
-	assert(pkey_reg == __read_pkey_reg());
-}
-
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
-#define X86_FEATURE_PKU        (1<<3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE      (1<<4) /* OS Protection Keys Enable */
-
-static inline int cpu_has_pkeys(void)
-{
-	unsigned int eax;
-	unsigned int ebx;
-	unsigned int ecx;
-	unsigned int edx;
-
-	__cpuid_count(0x7, 0x0, eax, ebx, ecx, edx);
-
-	if (!(ecx & X86_FEATURE_PKU)) {
-		dprintf2("cpu does not have PKU\n");
-		return 0;
-	}
-	if (!(ecx & X86_FEATURE_OSPKE)) {
-		dprintf2("cpu does not have OSPKE\n");
-		return 0;
-	}
-	return 1;
-}
-
-static inline int cpu_max_xsave_size(void)
-{
-	unsigned long XSTATE_CPUID = 0xd;
-	unsigned int eax;
-	unsigned int ebx;
-	unsigned int ecx;
-	unsigned int edx;
-
-	__cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx);
-	return ecx;
-}
-
-static inline u32 pkey_bit_position(int pkey)
-{
-	return pkey * PKEY_BITS_PER_PKEY;
-}
-
-#define XSTATE_PKEY_BIT	(9)
-#define XSTATE_PKEY	0x200
-#define XSTATE_BV_OFFSET	512
-
-int pkey_reg_xstate_offset(void)
-{
-	unsigned int eax;
-	unsigned int ebx;
-	unsigned int ecx;
-	unsigned int edx;
-	int xstate_offset;
-	int xstate_size;
-	unsigned long XSTATE_CPUID = 0xd;
-	int leaf;
-
-	/* assume that XSTATE_PKEY is set in XCR0 */
-	leaf = XSTATE_PKEY_BIT;
-	{
-		__cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx);
-
-		if (leaf == XSTATE_PKEY_BIT) {
-			xstate_offset = ebx;
-			xstate_size = eax;
-		}
-	}
-
-	if (xstate_size == 0) {
-		printf("could not find size/offset of PKEY in xsave state\n");
-		return 0;
-	}
-
-	return xstate_offset;
-}
-
-static inline int get_arch_reserved_keys(void)
-{
-	return NR_RESERVED_PKEYS;
-}
-
-void expect_fault_on_read_execonly_key(void *p1, int pkey)
-{
-	int ptr_contents;
-
-	ptr_contents = read_ptr(p1);
-	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
-	expected_pkey_fault(pkey);
-}
-
-void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
-{
-	return PTR_ERR_ENOTSUP;
-}
-
-#endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c
deleted file mode 100644
index 95f403a0c46d..000000000000
--- a/tools/testing/selftests/vm/protection_keys.c
+++ /dev/null
@@ -1,1788 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
- *
- * There are examples in here of:
- *  * how to set protection keys on memory
- *  * how to set/clear bits in pkey registers (the rights register)
- *  * how to handle SEGV_PKUERR signals and extract pkey-relevant
- *    information from the siginfo
- *
- * Things to add:
- *	make sure KSM and KSM COW breaking works
- *	prefault pages in at malloc, or not
- *	protect MPX bounds tables with protection keys?
- *	make sure VMA splitting/merging is working correctly
- *	OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
- *	look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
- *	do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
- *
- * Compile like this:
- *	gcc -mxsave      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
- *	gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
- */
-#define _GNU_SOURCE
-#define __SANE_USERSPACE_TYPES__
-#include <errno.h>
-#include <linux/elf.h>
-#include <linux/futex.h>
-#include <time.h>
-#include <sys/time.h>
-#include <sys/syscall.h>
-#include <string.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <signal.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <ucontext.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/ptrace.h>
-#include <setjmp.h>
-
-#include "pkey-helpers.h"
-
-int iteration_nr = 1;
-int test_nr;
-
-u64 shadow_pkey_reg;
-int dprint_in_signal;
-char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
-
-void cat_into_file(char *str, char *file)
-{
-	int fd = open(file, O_RDWR);
-	int ret;
-
-	dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
-	/*
-	 * these need to be raw because they are called under
-	 * pkey_assert()
-	 */
-	if (fd < 0) {
-		fprintf(stderr, "error opening '%s'\n", str);
-		perror("error: ");
-		exit(__LINE__);
-	}
-
-	ret = write(fd, str, strlen(str));
-	if (ret != strlen(str)) {
-		perror("write to file failed");
-		fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
-		exit(__LINE__);
-	}
-	close(fd);
-}
-
-#if CONTROL_TRACING > 0
-static int warned_tracing;
-int tracing_root_ok(void)
-{
-	if (geteuid() != 0) {
-		if (!warned_tracing)
-			fprintf(stderr, "WARNING: not run as root, "
-					"can not do tracing control\n");
-		warned_tracing = 1;
-		return 0;
-	}
-	return 1;
-}
-#endif
-
-void tracing_on(void)
-{
-#if CONTROL_TRACING > 0
-#define TRACEDIR "/sys/kernel/debug/tracing"
-	char pidstr[32];
-
-	if (!tracing_root_ok())
-		return;
-
-	sprintf(pidstr, "%d", getpid());
-	cat_into_file("0", TRACEDIR "/tracing_on");
-	cat_into_file("\n", TRACEDIR "/trace");
-	if (1) {
-		cat_into_file("function_graph", TRACEDIR "/current_tracer");
-		cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
-	} else {
-		cat_into_file("nop", TRACEDIR "/current_tracer");
-	}
-	cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
-	cat_into_file("1", TRACEDIR "/tracing_on");
-	dprintf1("enabled tracing\n");
-#endif
-}
-
-void tracing_off(void)
-{
-#if CONTROL_TRACING > 0
-	if (!tracing_root_ok())
-		return;
-	cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
-#endif
-}
-
-void abort_hooks(void)
-{
-	fprintf(stderr, "running %s()...\n", __func__);
-	tracing_off();
-#ifdef SLEEP_ON_ABORT
-	sleep(SLEEP_ON_ABORT);
-#endif
-}
-
-/*
- * This attempts to have roughly a page of instructions followed by a few
- * instructions that do a write, and another page of instructions.  That
- * way, we are pretty sure that the write is in the second page of
- * instructions and has at least a page of padding behind it.
- *
- * *That* lets us be sure to madvise() away the write instruction, which
- * will then fault, which makes sure that the fault code handles
- * execute-only memory properly.
- */
-#ifdef __powerpc64__
-/* This way, both 4K and 64K alignment are maintained */
-__attribute__((__aligned__(65536)))
-#else
-__attribute__((__aligned__(PAGE_SIZE)))
-#endif
-void lots_o_noops_around_write(int *write_to_me)
-{
-	dprintf3("running %s()\n", __func__);
-	__page_o_noops();
-	/* Assume this happens in the second page of instructions: */
-	*write_to_me = __LINE__;
-	/* pad out by another page: */
-	__page_o_noops();
-	dprintf3("%s() done\n", __func__);
-}
-
-void dump_mem(void *dumpme, int len_bytes)
-{
-	char *c = (void *)dumpme;
-	int i;
-
-	for (i = 0; i < len_bytes; i += sizeof(u64)) {
-		u64 *ptr = (u64 *)(c + i);
-		dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr);
-	}
-}
-
-static u32 hw_pkey_get(int pkey, unsigned long flags)
-{
-	u64 pkey_reg = __read_pkey_reg();
-
-	dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
-			__func__, pkey, flags, 0, 0);
-	dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg);
-
-	return (u32) get_pkey_bits(pkey_reg, pkey);
-}
-
-static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
-{
-	u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
-	u64 old_pkey_reg = __read_pkey_reg();
-	u64 new_pkey_reg;
-
-	/* make sure that 'rights' only contains the bits we expect: */
-	assert(!(rights & ~mask));
-
-	/* modify bits accordingly in old pkey_reg and assign it */
-	new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
-
-	__write_pkey_reg(new_pkey_reg);
-
-	dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x"
-		" pkey_reg now: %016llx old_pkey_reg: %016llx\n",
-		__func__, pkey, rights, flags, 0, __read_pkey_reg(),
-		old_pkey_reg);
-	return 0;
-}
-
-void pkey_disable_set(int pkey, int flags)
-{
-	unsigned long syscall_flags = 0;
-	int ret;
-	int pkey_rights;
-	u64 orig_pkey_reg = read_pkey_reg();
-
-	dprintf1("START->%s(%d, 0x%x)\n", __func__,
-		pkey, flags);
-	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
-
-	pkey_rights = hw_pkey_get(pkey, syscall_flags);
-
-	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
-			pkey, pkey, pkey_rights);
-
-	pkey_assert(pkey_rights >= 0);
-
-	pkey_rights |= flags;
-
-	ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
-	assert(!ret);
-	/* pkey_reg and flags have the same format */
-	shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
-	dprintf1("%s(%d) shadow: 0x%016llx\n",
-		__func__, pkey, shadow_pkey_reg);
-
-	pkey_assert(ret >= 0);
-
-	pkey_rights = hw_pkey_get(pkey, syscall_flags);
-	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
-			pkey, pkey, pkey_rights);
-
-	dprintf1("%s(%d) pkey_reg: 0x%016llx\n",
-		__func__, pkey, read_pkey_reg());
-	if (flags)
-		pkey_assert(read_pkey_reg() >= orig_pkey_reg);
-	dprintf1("END<---%s(%d, 0x%x)\n", __func__,
-		pkey, flags);
-}
-
-void pkey_disable_clear(int pkey, int flags)
-{
-	unsigned long syscall_flags = 0;
-	int ret;
-	int pkey_rights = hw_pkey_get(pkey, syscall_flags);
-	u64 orig_pkey_reg = read_pkey_reg();
-
-	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
-
-	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
-			pkey, pkey, pkey_rights);
-	pkey_assert(pkey_rights >= 0);
-
-	pkey_rights &= ~flags;
-
-	ret = hw_pkey_set(pkey, pkey_rights, 0);
-	shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
-	pkey_assert(ret >= 0);
-
-	pkey_rights = hw_pkey_get(pkey, syscall_flags);
-	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
-			pkey, pkey, pkey_rights);
-
-	dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__,
-			pkey, read_pkey_reg());
-	if (flags)
-		assert(read_pkey_reg() <= orig_pkey_reg);
-}
-
-void pkey_write_allow(int pkey)
-{
-	pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
-}
-void pkey_write_deny(int pkey)
-{
-	pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
-}
-void pkey_access_allow(int pkey)
-{
-	pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
-}
-void pkey_access_deny(int pkey)
-{
-	pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
-}
-
-/* Failed address bound checks: */
-#ifndef SEGV_BNDERR
-# define SEGV_BNDERR		3
-#endif
-
-#ifndef SEGV_PKUERR
-# define SEGV_PKUERR		4
-#endif
-
-static char *si_code_str(int si_code)
-{
-	if (si_code == SEGV_MAPERR)
-		return "SEGV_MAPERR";
-	if (si_code == SEGV_ACCERR)
-		return "SEGV_ACCERR";
-	if (si_code == SEGV_BNDERR)
-		return "SEGV_BNDERR";
-	if (si_code == SEGV_PKUERR)
-		return "SEGV_PKUERR";
-	return "UNKNOWN";
-}
-
-int pkey_faults;
-int last_si_pkey = -1;
-void signal_handler(int signum, siginfo_t *si, void *vucontext)
-{
-	ucontext_t *uctxt = vucontext;
-	int trapno;
-	unsigned long ip;
-	char *fpregs;
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-	u32 *pkey_reg_ptr;
-	int pkey_reg_offset;
-#endif /* arch */
-	u64 siginfo_pkey;
-	u32 *si_pkey_ptr;
-
-	dprint_in_signal = 1;
-	dprintf1(">>>>===============SIGSEGV============================\n");
-	dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
-			__func__, __LINE__,
-			__read_pkey_reg(), shadow_pkey_reg);
-
-	trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
-	ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
-	fpregs = (char *) uctxt->uc_mcontext.fpregs;
-
-	dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
-			__func__, trapno, ip, si_code_str(si->si_code),
-			si->si_code);
-
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-#ifdef __i386__
-	/*
-	 * 32-bit has some extra padding so that userspace can tell whether
-	 * the XSTATE header is present in addition to the "legacy" FPU
-	 * state.  We just assume that it is here.
-	 */
-	fpregs += 0x70;
-#endif /* i386 */
-	pkey_reg_offset = pkey_reg_xstate_offset();
-	pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);
-
-	/*
-	 * If we got a PKEY fault, we *HAVE* to have at least one bit set in
-	 * here.
-	 */
-	dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset());
-	if (DEBUG_LEVEL > 4)
-		dump_mem(pkey_reg_ptr - 128, 256);
-	pkey_assert(*pkey_reg_ptr);
-#endif /* arch */
-
-	dprintf1("siginfo: %p\n", si);
-	dprintf1(" fpregs: %p\n", fpregs);
-
-	if ((si->si_code == SEGV_MAPERR) ||
-	    (si->si_code == SEGV_ACCERR) ||
-	    (si->si_code == SEGV_BNDERR)) {
-		printf("non-PK si_code, exiting...\n");
-		exit(4);
-	}
-
-	si_pkey_ptr = siginfo_get_pkey_ptr(si);
-	dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
-	dump_mem((u8 *)si_pkey_ptr - 8, 24);
-	siginfo_pkey = *si_pkey_ptr;
-	pkey_assert(siginfo_pkey < NR_PKEYS);
-	last_si_pkey = siginfo_pkey;
-
-	/*
-	 * need __read_pkey_reg() version so we do not do shadow_pkey_reg
-	 * checking
-	 */
-	dprintf1("signal pkey_reg from  pkey_reg: %016llx\n",
-			__read_pkey_reg());
-	dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey);
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-	dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
-	*(u64 *)pkey_reg_ptr = 0x00000000;
-	dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n");
-#elif defined(__powerpc64__) /* arch */
-	/* restore access and let the faulting instruction continue */
-	pkey_access_allow(siginfo_pkey);
-#endif /* arch */
-	pkey_faults++;
-	dprintf1("<<<<==================================================\n");
-	dprint_in_signal = 0;
-}
-
-int wait_all_children(void)
-{
-	int status;
-	return waitpid(-1, &status, 0);
-}
-
-void sig_chld(int x)
-{
-	dprint_in_signal = 1;
-	dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
-	dprint_in_signal = 0;
-}
-
-void setup_sigsegv_handler(void)
-{
-	int r, rs;
-	struct sigaction newact;
-	struct sigaction oldact;
-
-	/* #PF is mapped to sigsegv */
-	int signum  = SIGSEGV;
-
-	newact.sa_handler = 0;
-	newact.sa_sigaction = signal_handler;
-
-	/*sigset_t - signals to block while in the handler */
-	/* get the old signal mask. */
-	rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
-	pkey_assert(rs == 0);
-
-	/* call sa_sigaction, not sa_handler*/
-	newact.sa_flags = SA_SIGINFO;
-
-	newact.sa_restorer = 0;  /* void(*)(), obsolete */
-	r = sigaction(signum, &newact, &oldact);
-	r = sigaction(SIGALRM, &newact, &oldact);
-	pkey_assert(r == 0);
-}
-
-void setup_handlers(void)
-{
-	signal(SIGCHLD, &sig_chld);
-	setup_sigsegv_handler();
-}
-
-pid_t fork_lazy_child(void)
-{
-	pid_t forkret;
-
-	forkret = fork();
-	pkey_assert(forkret >= 0);
-	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
-
-	if (!forkret) {
-		/* in the child */
-		while (1) {
-			dprintf1("child sleeping...\n");
-			sleep(30);
-		}
-	}
-	return forkret;
-}
-
-int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
-		unsigned long pkey)
-{
-	int sret;
-
-	dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
-			ptr, size, orig_prot, pkey);
-
-	errno = 0;
-	sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
-	if (errno) {
-		dprintf2("SYS_mprotect_key sret: %d\n", sret);
-		dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
-		dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
-		if (DEBUG_LEVEL >= 2)
-			perror("SYS_mprotect_pkey");
-	}
-	return sret;
-}
-
-int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
-{
-	int ret = syscall(SYS_pkey_alloc, flags, init_val);
-	dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
-			__func__, flags, init_val, ret, errno);
-	return ret;
-}
-
-int alloc_pkey(void)
-{
-	int ret;
-	unsigned long init_val = 0x0;
-
-	dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
-			__func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
-	ret = sys_pkey_alloc(0, init_val);
-	/*
-	 * pkey_alloc() sets PKEY register, so we need to reflect it in
-	 * shadow_pkey_reg:
-	 */
-	dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-			" shadow: 0x%016llx\n",
-			__func__, __LINE__, ret, __read_pkey_reg(),
-			shadow_pkey_reg);
-	if (ret > 0) {
-		/* clear both the bits: */
-		shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
-						~PKEY_MASK);
-		dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-				" shadow: 0x%016llx\n",
-				__func__,
-				__LINE__, ret, __read_pkey_reg(),
-				shadow_pkey_reg);
-		/*
-		 * move the new state in from init_val
-		 * (remember, we cheated and init_val == pkey_reg format)
-		 */
-		shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
-						init_val);
-	}
-	dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-			" shadow: 0x%016llx\n",
-			__func__, __LINE__, ret, __read_pkey_reg(),
-			shadow_pkey_reg);
-	dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno);
-	/* for shadow checking: */
-	read_pkey_reg();
-	dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-		 " shadow: 0x%016llx\n",
-		__func__, __LINE__, ret, __read_pkey_reg(),
-		shadow_pkey_reg);
-	return ret;
-}
-
-int sys_pkey_free(unsigned long pkey)
-{
-	int ret = syscall(SYS_pkey_free, pkey);
-	dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
-	return ret;
-}
-
-/*
- * I had a bug where pkey bits could be set by mprotect() but
- * not cleared.  This ensures we get lots of random bit sets
- * and clears on the vma and pte pkey bits.
- */
-int alloc_random_pkey(void)
-{
-	int max_nr_pkey_allocs;
-	int ret;
-	int i;
-	int alloced_pkeys[NR_PKEYS];
-	int nr_alloced = 0;
-	int random_index;
-	memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
-
-	/* allocate every possible key and make a note of which ones we got */
-	max_nr_pkey_allocs = NR_PKEYS;
-	for (i = 0; i < max_nr_pkey_allocs; i++) {
-		int new_pkey = alloc_pkey();
-		if (new_pkey < 0)
-			break;
-		alloced_pkeys[nr_alloced++] = new_pkey;
-	}
-
-	pkey_assert(nr_alloced > 0);
-	/* select a random one out of the allocated ones */
-	random_index = rand() % nr_alloced;
-	ret = alloced_pkeys[random_index];
-	/* now zero it out so we don't free it next */
-	alloced_pkeys[random_index] = 0;
-
-	/* go through the allocated ones that we did not want and free them */
-	for (i = 0; i < nr_alloced; i++) {
-		int free_ret;
-		if (!alloced_pkeys[i])
-			continue;
-		free_ret = sys_pkey_free(alloced_pkeys[i]);
-		pkey_assert(!free_ret);
-	}
-	dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-			 " shadow: 0x%016llx\n", __func__,
-			__LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
-	return ret;
-}
-
-int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
-		unsigned long pkey)
-{
-	int nr_iterations = random() % 100;
-	int ret;
-
-	while (0) {
-		int rpkey = alloc_random_pkey();
-		ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
-		dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
-				ptr, size, orig_prot, pkey, ret);
-		if (nr_iterations-- < 0)
-			break;
-
-		dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-			" shadow: 0x%016llx\n",
-			__func__, __LINE__, ret, __read_pkey_reg(),
-			shadow_pkey_reg);
-		sys_pkey_free(rpkey);
-		dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-			" shadow: 0x%016llx\n",
-			__func__, __LINE__, ret, __read_pkey_reg(),
-			shadow_pkey_reg);
-	}
-	pkey_assert(pkey < NR_PKEYS);
-
-	ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
-	dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
-			ptr, size, orig_prot, pkey, ret);
-	pkey_assert(!ret);
-	dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-			" shadow: 0x%016llx\n", __func__,
-			__LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
-	return ret;
-}
-
-struct pkey_malloc_record {
-	void *ptr;
-	long size;
-	int prot;
-};
-struct pkey_malloc_record *pkey_malloc_records;
-struct pkey_malloc_record *pkey_last_malloc_record;
-long nr_pkey_malloc_records;
-void record_pkey_malloc(void *ptr, long size, int prot)
-{
-	long i;
-	struct pkey_malloc_record *rec = NULL;
-
-	for (i = 0; i < nr_pkey_malloc_records; i++) {
-		rec = &pkey_malloc_records[i];
-		/* find a free record */
-		if (rec)
-			break;
-	}
-	if (!rec) {
-		/* every record is full */
-		size_t old_nr_records = nr_pkey_malloc_records;
-		size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
-		size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
-		dprintf2("new_nr_records: %zd\n", new_nr_records);
-		dprintf2("new_size: %zd\n", new_size);
-		pkey_malloc_records = realloc(pkey_malloc_records, new_size);
-		pkey_assert(pkey_malloc_records != NULL);
-		rec = &pkey_malloc_records[nr_pkey_malloc_records];
-		/*
-		 * realloc() does not initialize memory, so zero it from
-		 * the first new record all the way to the end.
-		 */
-		for (i = 0; i < new_nr_records - old_nr_records; i++)
-			memset(rec + i, 0, sizeof(*rec));
-	}
-	dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
-		(int)(rec - pkey_malloc_records), rec, ptr, size);
-	rec->ptr = ptr;
-	rec->size = size;
-	rec->prot = prot;
-	pkey_last_malloc_record = rec;
-	nr_pkey_malloc_records++;
-}
-
-void free_pkey_malloc(void *ptr)
-{
-	long i;
-	int ret;
-	dprintf3("%s(%p)\n", __func__, ptr);
-	for (i = 0; i < nr_pkey_malloc_records; i++) {
-		struct pkey_malloc_record *rec = &pkey_malloc_records[i];
-		dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
-				ptr, i, rec, rec->ptr, rec->size);
-		if ((ptr <  rec->ptr) ||
-		    (ptr >= rec->ptr + rec->size))
-			continue;
-
-		dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
-				ptr, i, rec, rec->ptr, rec->size);
-		nr_pkey_malloc_records--;
-		ret = munmap(rec->ptr, rec->size);
-		dprintf3("munmap ret: %d\n", ret);
-		pkey_assert(!ret);
-		dprintf3("clearing rec->ptr, rec: %p\n", rec);
-		rec->ptr = NULL;
-		dprintf3("done clearing rec->ptr, rec: %p\n", rec);
-		return;
-	}
-	pkey_assert(false);
-}
-
-
-void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
-{
-	void *ptr;
-	int ret;
-
-	read_pkey_reg();
-	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
-			size, prot, pkey);
-	pkey_assert(pkey < NR_PKEYS);
-	ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-	pkey_assert(ptr != (void *)-1);
-	ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
-	pkey_assert(!ret);
-	record_pkey_malloc(ptr, size, prot);
-	read_pkey_reg();
-
-	dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
-	return ptr;
-}
-
-void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
-{
-	int ret;
-	void *ptr;
-
-	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
-			size, prot, pkey);
-	/*
-	 * Guarantee we can fit at least one huge page in the resulting
-	 * allocation by allocating space for 2:
-	 */
-	size = ALIGN_UP(size, HPAGE_SIZE * 2);
-	ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-	pkey_assert(ptr != (void *)-1);
-	record_pkey_malloc(ptr, size, prot);
-	mprotect_pkey(ptr, size, prot, pkey);
-
-	dprintf1("unaligned ptr: %p\n", ptr);
-	ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
-	dprintf1("  aligned ptr: %p\n", ptr);
-	ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
-	dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
-	ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
-	dprintf1("MADV_WILLNEED ret: %d\n", ret);
-	memset(ptr, 0, HPAGE_SIZE);
-
-	dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
-	return ptr;
-}
-
-int hugetlb_setup_ok;
-#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
-#define GET_NR_HUGE_PAGES 10
-void setup_hugetlbfs(void)
-{
-	int err;
-	int fd;
-	char buf[256];
-	long hpagesz_kb;
-	long hpagesz_mb;
-
-	if (geteuid() != 0) {
-		fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
-		return;
-	}
-
-	cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
-
-	/*
-	 * Now go make sure that we got the pages and that they
-	 * are PMD-level pages. Someone might have made PUD-level
-	 * pages the default.
-	 */
-	hpagesz_kb = HPAGE_SIZE / 1024;
-	hpagesz_mb = hpagesz_kb / 1024;
-	sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
-	fd = open(buf, O_RDONLY);
-	if (fd < 0) {
-		fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
-			hpagesz_mb, strerror(errno));
-		return;
-	}
-
-	/* -1 to guarantee leaving the trailing \0 */
-	err = read(fd, buf, sizeof(buf)-1);
-	close(fd);
-	if (err <= 0) {
-		fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
-			hpagesz_mb, strerror(errno));
-		return;
-	}
-
-	if (atoi(buf) != GET_NR_HUGE_PAGES) {
-		fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
-			hpagesz_mb, buf, GET_NR_HUGE_PAGES);
-		return;
-	}
-
-	hugetlb_setup_ok = 1;
-}
-
-void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
-{
-	void *ptr;
-	int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
-
-	if (!hugetlb_setup_ok)
-		return PTR_ERR_ENOTSUP;
-
-	dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
-	size = ALIGN_UP(size, HPAGE_SIZE * 2);
-	pkey_assert(pkey < NR_PKEYS);
-	ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
-	pkey_assert(ptr != (void *)-1);
-	mprotect_pkey(ptr, size, prot, pkey);
-
-	record_pkey_malloc(ptr, size, prot);
-
-	dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
-	return ptr;
-}
-
-void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
-{
-	void *ptr;
-	int fd;
-
-	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
-			size, prot, pkey);
-	pkey_assert(pkey < NR_PKEYS);
-	fd = open("/dax/foo", O_RDWR);
-	pkey_assert(fd >= 0);
-
-	ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
-	pkey_assert(ptr != (void *)-1);
-
-	mprotect_pkey(ptr, size, prot, pkey);
-
-	record_pkey_malloc(ptr, size, prot);
-
-	dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
-	close(fd);
-	return ptr;
-}
-
-void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
-
-	malloc_pkey_with_mprotect,
-	malloc_pkey_with_mprotect_subpage,
-	malloc_pkey_anon_huge,
-	malloc_pkey_hugetlb
-/* can not do direct with the pkey_mprotect() API:
-	malloc_pkey_mmap_direct,
-	malloc_pkey_mmap_dax,
-*/
-};
-
-void *malloc_pkey(long size, int prot, u16 pkey)
-{
-	void *ret;
-	static int malloc_type;
-	int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
-
-	pkey_assert(pkey < NR_PKEYS);
-
-	while (1) {
-		pkey_assert(malloc_type < nr_malloc_types);
-
-		ret = pkey_malloc[malloc_type](size, prot, pkey);
-		pkey_assert(ret != (void *)-1);
-
-		malloc_type++;
-		if (malloc_type >= nr_malloc_types)
-			malloc_type = (random()%nr_malloc_types);
-
-		/* try again if the malloc_type we tried is unsupported */
-		if (ret == PTR_ERR_ENOTSUP)
-			continue;
-
-		break;
-	}
-
-	dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
-			size, prot, pkey, ret);
-	return ret;
-}
-
-int last_pkey_faults;
-#define UNKNOWN_PKEY -2
-void expected_pkey_fault(int pkey)
-{
-	dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
-			__func__, last_pkey_faults, pkey_faults);
-	dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
-	pkey_assert(last_pkey_faults + 1 == pkey_faults);
-
-       /*
-	* For exec-only memory, we do not know the pkey in
-	* advance, so skip this check.
-	*/
-	if (pkey != UNKNOWN_PKEY)
-		pkey_assert(last_si_pkey == pkey);
-
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-	/*
-	 * The signal handler shold have cleared out PKEY register to let the
-	 * test program continue.  We now have to restore it.
-	 */
-	if (__read_pkey_reg() != 0)
-#else /* arch */
-	if (__read_pkey_reg() != shadow_pkey_reg)
-#endif /* arch */
-		pkey_assert(0);
-
-	__write_pkey_reg(shadow_pkey_reg);
-	dprintf1("%s() set pkey_reg=%016llx to restore state after signal "
-		       "nuked it\n", __func__, shadow_pkey_reg);
-	last_pkey_faults = pkey_faults;
-	last_si_pkey = -1;
-}
-
-#define do_not_expect_pkey_fault(msg)	do {			\
-	if (last_pkey_faults != pkey_faults)			\
-		dprintf0("unexpected PKey fault: %s\n", msg);	\
-	pkey_assert(last_pkey_faults == pkey_faults);		\
-} while (0)
-
-int test_fds[10] = { -1 };
-int nr_test_fds;
-void __save_test_fd(int fd)
-{
-	pkey_assert(fd >= 0);
-	pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
-	test_fds[nr_test_fds] = fd;
-	nr_test_fds++;
-}
-
-int get_test_read_fd(void)
-{
-	int test_fd = open("/etc/passwd", O_RDONLY);
-	__save_test_fd(test_fd);
-	return test_fd;
-}
-
-void close_test_fds(void)
-{
-	int i;
-
-	for (i = 0; i < nr_test_fds; i++) {
-		if (test_fds[i] < 0)
-			continue;
-		close(test_fds[i]);
-		test_fds[i] = -1;
-	}
-	nr_test_fds = 0;
-}
-
-#define barrier() __asm__ __volatile__("": : :"memory")
-__attribute__((noinline)) int read_ptr(int *ptr)
-{
-	/*
-	 * Keep GCC from optimizing this away somehow
-	 */
-	barrier();
-	return *ptr;
-}
-
-void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
-{
-	int i, err;
-	int max_nr_pkey_allocs;
-	int alloced_pkeys[NR_PKEYS];
-	int nr_alloced = 0;
-	long size;
-
-	pkey_assert(pkey_last_malloc_record);
-	size = pkey_last_malloc_record->size;
-	/*
-	 * This is a bit of a hack.  But mprotect() requires
-	 * huge-page-aligned sizes when operating on hugetlbfs.
-	 * So, make sure that we use something that's a multiple
-	 * of a huge page when we can.
-	 */
-	if (size >= HPAGE_SIZE)
-		size = HPAGE_SIZE;
-
-	/* allocate every possible key and make sure key-0 never got allocated */
-	max_nr_pkey_allocs = NR_PKEYS;
-	for (i = 0; i < max_nr_pkey_allocs; i++) {
-		int new_pkey = alloc_pkey();
-		pkey_assert(new_pkey != 0);
-
-		if (new_pkey < 0)
-			break;
-		alloced_pkeys[nr_alloced++] = new_pkey;
-	}
-	/* free all the allocated keys */
-	for (i = 0; i < nr_alloced; i++) {
-		int free_ret;
-
-		if (!alloced_pkeys[i])
-			continue;
-		free_ret = sys_pkey_free(alloced_pkeys[i]);
-		pkey_assert(!free_ret);
-	}
-
-	/* attach key-0 in various modes */
-	err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
-	pkey_assert(!err);
-	err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
-	pkey_assert(!err);
-	err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
-	pkey_assert(!err);
-	err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
-	pkey_assert(!err);
-	err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
-	pkey_assert(!err);
-}
-
-void test_read_of_write_disabled_region(int *ptr, u16 pkey)
-{
-	int ptr_contents;
-
-	dprintf1("disabling write access to PKEY[1], doing read\n");
-	pkey_write_deny(pkey);
-	ptr_contents = read_ptr(ptr);
-	dprintf1("*ptr: %d\n", ptr_contents);
-	dprintf1("\n");
-}
-void test_read_of_access_disabled_region(int *ptr, u16 pkey)
-{
-	int ptr_contents;
-
-	dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
-	read_pkey_reg();
-	pkey_access_deny(pkey);
-	ptr_contents = read_ptr(ptr);
-	dprintf1("*ptr: %d\n", ptr_contents);
-	expected_pkey_fault(pkey);
-}
-
-void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
-		u16 pkey)
-{
-	int ptr_contents;
-
-	dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
-				pkey, ptr);
-	ptr_contents = read_ptr(ptr);
-	dprintf1("reading ptr before disabling the read : %d\n",
-			ptr_contents);
-	read_pkey_reg();
-	pkey_access_deny(pkey);
-	ptr_contents = read_ptr(ptr);
-	dprintf1("*ptr: %d\n", ptr_contents);
-	expected_pkey_fault(pkey);
-}
-
-void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
-		u16 pkey)
-{
-	*ptr = __LINE__;
-	dprintf1("disabling write access; after accessing the page, "
-		"to PKEY[%02d], doing write\n", pkey);
-	pkey_write_deny(pkey);
-	*ptr = __LINE__;
-	expected_pkey_fault(pkey);
-}
-
-void test_write_of_write_disabled_region(int *ptr, u16 pkey)
-{
-	dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
-	pkey_write_deny(pkey);
-	*ptr = __LINE__;
-	expected_pkey_fault(pkey);
-}
-void test_write_of_access_disabled_region(int *ptr, u16 pkey)
-{
-	dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
-	pkey_access_deny(pkey);
-	*ptr = __LINE__;
-	expected_pkey_fault(pkey);
-}
-
-void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
-			u16 pkey)
-{
-	*ptr = __LINE__;
-	dprintf1("disabling access; after accessing the page, "
-		" to PKEY[%02d], doing write\n", pkey);
-	pkey_access_deny(pkey);
-	*ptr = __LINE__;
-	expected_pkey_fault(pkey);
-}
-
-void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
-{
-	int ret;
-	int test_fd = get_test_read_fd();
-
-	dprintf1("disabling access to PKEY[%02d], "
-		 "having kernel read() to buffer\n", pkey);
-	pkey_access_deny(pkey);
-	ret = read(test_fd, ptr, 1);
-	dprintf1("read ret: %d\n", ret);
-	pkey_assert(ret);
-}
-void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
-{
-	int ret;
-	int test_fd = get_test_read_fd();
-
-	pkey_write_deny(pkey);
-	ret = read(test_fd, ptr, 100);
-	dprintf1("read ret: %d\n", ret);
-	if (ret < 0 && (DEBUG_LEVEL > 0))
-		perror("verbose read result (OK for this to be bad)");
-	pkey_assert(ret);
-}
-
-void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
-{
-	int pipe_ret, vmsplice_ret;
-	struct iovec iov;
-	int pipe_fds[2];
-
-	pipe_ret = pipe(pipe_fds);
-
-	pkey_assert(pipe_ret == 0);
-	dprintf1("disabling access to PKEY[%02d], "
-		 "having kernel vmsplice from buffer\n", pkey);
-	pkey_access_deny(pkey);
-	iov.iov_base = ptr;
-	iov.iov_len = PAGE_SIZE;
-	vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
-	dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
-	pkey_assert(vmsplice_ret == -1);
-
-	close(pipe_fds[0]);
-	close(pipe_fds[1]);
-}
-
-void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
-{
-	int ignored = 0xdada;
-	int futex_ret;
-	int some_int = __LINE__;
-
-	dprintf1("disabling write to PKEY[%02d], "
-		 "doing futex gunk in buffer\n", pkey);
-	*ptr = some_int;
-	pkey_write_deny(pkey);
-	futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
-			&ignored, ignored);
-	if (DEBUG_LEVEL > 0)
-		perror("futex");
-	dprintf1("futex() ret: %d\n", futex_ret);
-}
-
-/* Assumes that all pkeys other than 'pkey' are unallocated */
-void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
-{
-	int err;
-	int i;
-
-	/* Note: 0 is the default pkey, so don't mess with it */
-	for (i = 1; i < NR_PKEYS; i++) {
-		if (pkey == i)
-			continue;
-
-		dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
-		err = sys_pkey_free(i);
-		pkey_assert(err);
-
-		err = sys_pkey_free(i);
-		pkey_assert(err);
-
-		err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
-		pkey_assert(err);
-	}
-}
-
-/* Assumes that all pkeys other than 'pkey' are unallocated */
-void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
-{
-	int err;
-	int bad_pkey = NR_PKEYS+99;
-
-	/* pass a known-invalid pkey in: */
-	err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
-	pkey_assert(err);
-}
-
-void become_child(void)
-{
-	pid_t forkret;
-
-	forkret = fork();
-	pkey_assert(forkret >= 0);
-	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
-
-	if (!forkret) {
-		/* in the child */
-		return;
-	}
-	exit(0);
-}
-
-/* Assumes that all pkeys other than 'pkey' are unallocated */
-void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
-{
-	int err;
-	int allocated_pkeys[NR_PKEYS] = {0};
-	int nr_allocated_pkeys = 0;
-	int i;
-
-	for (i = 0; i < NR_PKEYS*3; i++) {
-		int new_pkey;
-		dprintf1("%s() alloc loop: %d\n", __func__, i);
-		new_pkey = alloc_pkey();
-		dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
-				" shadow: 0x%016llx\n",
-				__func__, __LINE__, err, __read_pkey_reg(),
-				shadow_pkey_reg);
-		read_pkey_reg(); /* for shadow checking */
-		dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
-		if ((new_pkey == -1) && (errno == ENOSPC)) {
-			dprintf2("%s() failed to allocate pkey after %d tries\n",
-				__func__, nr_allocated_pkeys);
-		} else {
-			/*
-			 * Ensure the number of successes never
-			 * exceeds the number of keys supported
-			 * in the hardware.
-			 */
-			pkey_assert(nr_allocated_pkeys < NR_PKEYS);
-			allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
-		}
-
-		/*
-		 * Make sure that allocation state is properly
-		 * preserved across fork().
-		 */
-		if (i == NR_PKEYS*2)
-			become_child();
-	}
-
-	dprintf3("%s()::%d\n", __func__, __LINE__);
-
-	/*
-	 * On x86:
-	 * There are 16 pkeys supported in hardware.  Three are
-	 * allocated by the time we get here:
-	 *   1. The default key (0)
-	 *   2. One possibly consumed by an execute-only mapping.
-	 *   3. One allocated by the test code and passed in via
-	 *      'pkey' to this function.
-	 * Ensure that we can allocate at least another 13 (16-3).
-	 *
-	 * On powerpc:
-	 * There are either 5, 28, 29 or 32 pkeys supported in
-	 * hardware depending on the page size (4K or 64K) and
-	 * platform (powernv or powervm). Four are allocated by
-	 * the time we get here. These include pkey-0, pkey-1,
-	 * exec-only pkey and the one allocated by the test code.
-	 * Ensure that we can allocate the remaining.
-	 */
-	pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
-
-	for (i = 0; i < nr_allocated_pkeys; i++) {
-		err = sys_pkey_free(allocated_pkeys[i]);
-		pkey_assert(!err);
-		read_pkey_reg(); /* for shadow checking */
-	}
-}
-
-void arch_force_pkey_reg_init(void)
-{
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-	u64 *buf;
-
-	/*
-	 * All keys should be allocated and set to allow reads and
-	 * writes, so the register should be all 0.  If not, just
-	 * skip the test.
-	 */
-	if (read_pkey_reg())
-		return;
-
-	/*
-	 * Just allocate an absurd about of memory rather than
-	 * doing the XSAVE size enumeration dance.
-	 */
-	buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-
-	/* These __builtins require compiling with -mxsave */
-
-	/* XSAVE to build a valid buffer: */
-	__builtin_ia32_xsave(buf, XSTATE_PKEY);
-	/* Clear XSTATE_BV[PKRU]: */
-	buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY;
-	/* XRSTOR will likely get PKRU back to the init state: */
-	__builtin_ia32_xrstor(buf, XSTATE_PKEY);
-
-	munmap(buf, 1*MB);
-#endif
-}
-
-
-/*
- * This is mostly useless on ppc for now.  But it will not
- * hurt anything and should give some better coverage as
- * a long-running test that continually checks the pkey
- * register.
- */
-void test_pkey_init_state(int *ptr, u16 pkey)
-{
-	int err;
-	int allocated_pkeys[NR_PKEYS] = {0};
-	int nr_allocated_pkeys = 0;
-	int i;
-
-	for (i = 0; i < NR_PKEYS; i++) {
-		int new_pkey = alloc_pkey();
-
-		if (new_pkey < 0)
-			continue;
-		allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
-	}
-
-	dprintf3("%s()::%d\n", __func__, __LINE__);
-
-	arch_force_pkey_reg_init();
-
-	/*
-	 * Loop for a bit, hoping to get exercise the kernel
-	 * context switch code.
-	 */
-	for (i = 0; i < 1000000; i++)
-		read_pkey_reg();
-
-	for (i = 0; i < nr_allocated_pkeys; i++) {
-		err = sys_pkey_free(allocated_pkeys[i]);
-		pkey_assert(!err);
-		read_pkey_reg(); /* for shadow checking */
-	}
-}
-
-/*
- * pkey 0 is special.  It is allocated by default, so you do not
- * have to call pkey_alloc() to use it first.  Make sure that it
- * is usable.
- */
-void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
-{
-	long size;
-	int prot;
-
-	assert(pkey_last_malloc_record);
-	size = pkey_last_malloc_record->size;
-	/*
-	 * This is a bit of a hack.  But mprotect() requires
-	 * huge-page-aligned sizes when operating on hugetlbfs.
-	 * So, make sure that we use something that's a multiple
-	 * of a huge page when we can.
-	 */
-	if (size >= HPAGE_SIZE)
-		size = HPAGE_SIZE;
-	prot = pkey_last_malloc_record->prot;
-
-	/* Use pkey 0 */
-	mprotect_pkey(ptr, size, prot, 0);
-
-	/* Make sure that we can set it back to the original pkey. */
-	mprotect_pkey(ptr, size, prot, pkey);
-}
-
-void test_ptrace_of_child(int *ptr, u16 pkey)
-{
-	__attribute__((__unused__)) int peek_result;
-	pid_t child_pid;
-	void *ignored = 0;
-	long ret;
-	int status;
-	/*
-	 * This is the "control" for our little expermient.  Make sure
-	 * we can always access it when ptracing.
-	 */
-	int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
-	int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
-
-	/*
-	 * Fork a child which is an exact copy of this process, of course.
-	 * That means we can do all of our tests via ptrace() and then plain
-	 * memory access and ensure they work differently.
-	 */
-	child_pid = fork_lazy_child();
-	dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
-
-	ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
-	if (ret)
-		perror("attach");
-	dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
-	pkey_assert(ret != -1);
-	ret = waitpid(child_pid, &status, WUNTRACED);
-	if ((ret != child_pid) || !(WIFSTOPPED(status))) {
-		fprintf(stderr, "weird waitpid result %ld stat %x\n",
-				ret, status);
-		pkey_assert(0);
-	}
-	dprintf2("waitpid ret: %ld\n", ret);
-	dprintf2("waitpid status: %d\n", status);
-
-	pkey_access_deny(pkey);
-	pkey_write_deny(pkey);
-
-	/* Write access, untested for now:
-	ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
-	pkey_assert(ret != -1);
-	dprintf1("poke at %p: %ld\n", peek_at, ret);
-	*/
-
-	/*
-	 * Try to access the pkey-protected "ptr" via ptrace:
-	 */
-	ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
-	/* expect it to work, without an error: */
-	pkey_assert(ret != -1);
-	/* Now access from the current task, and expect an exception: */
-	peek_result = read_ptr(ptr);
-	expected_pkey_fault(pkey);
-
-	/*
-	 * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
-	 */
-	ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
-	/* expect it to work, without an error: */
-	pkey_assert(ret != -1);
-	/* Now access from the current task, and expect NO exception: */
-	peek_result = read_ptr(plain_ptr);
-	do_not_expect_pkey_fault("read plain pointer after ptrace");
-
-	ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
-	pkey_assert(ret != -1);
-
-	ret = kill(child_pid, SIGKILL);
-	pkey_assert(ret != -1);
-
-	wait(&status);
-
-	free(plain_ptr_unaligned);
-}
-
-void *get_pointer_to_instructions(void)
-{
-	void *p1;
-
-	p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
-	dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
-	/* lots_o_noops_around_write should be page-aligned already */
-	assert(p1 == &lots_o_noops_around_write);
-
-	/* Point 'p1' at the *second* page of the function: */
-	p1 += PAGE_SIZE;
-
-	/*
-	 * Try to ensure we fault this in on next touch to ensure
-	 * we get an instruction fault as opposed to a data one
-	 */
-	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
-
-	return p1;
-}
-
-void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
-{
-	void *p1;
-	int scratch;
-	int ptr_contents;
-	int ret;
-
-	p1 = get_pointer_to_instructions();
-	lots_o_noops_around_write(&scratch);
-	ptr_contents = read_ptr(p1);
-	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
-
-	ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
-	pkey_assert(!ret);
-	pkey_access_deny(pkey);
-
-	dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
-
-	/*
-	 * Make sure this is an *instruction* fault
-	 */
-	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
-	lots_o_noops_around_write(&scratch);
-	do_not_expect_pkey_fault("executing on PROT_EXEC memory");
-	expect_fault_on_read_execonly_key(p1, pkey);
-}
-
-void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
-{
-	void *p1;
-	int scratch;
-	int ptr_contents;
-	int ret;
-
-	dprintf1("%s() start\n", __func__);
-
-	p1 = get_pointer_to_instructions();
-	lots_o_noops_around_write(&scratch);
-	ptr_contents = read_ptr(p1);
-	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
-
-	/* Use a *normal* mprotect(), not mprotect_pkey(): */
-	ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
-	pkey_assert(!ret);
-
-	/*
-	 * Reset the shadow, assuming that the above mprotect()
-	 * correctly changed PKRU, but to an unknown value since
-	 * the actual allocated pkey is unknown.
-	 */
-	shadow_pkey_reg = __read_pkey_reg();
-
-	dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
-
-	/* Make sure this is an *instruction* fault */
-	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
-	lots_o_noops_around_write(&scratch);
-	do_not_expect_pkey_fault("executing on PROT_EXEC memory");
-	expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
-
-	/*
-	 * Put the memory back to non-PROT_EXEC.  Should clear the
-	 * exec-only pkey off the VMA and allow it to be readable
-	 * again.  Go to PROT_NONE first to check for a kernel bug
-	 * that did not clear the pkey when doing PROT_NONE.
-	 */
-	ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
-	pkey_assert(!ret);
-
-	ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
-	pkey_assert(!ret);
-	ptr_contents = read_ptr(p1);
-	do_not_expect_pkey_fault("plain read on recently PROT_EXEC area");
-}
-
-#if defined(__i386__) || defined(__x86_64__)
-void test_ptrace_modifies_pkru(int *ptr, u16 pkey)
-{
-	u32 new_pkru;
-	pid_t child;
-	int status, ret;
-	int pkey_offset = pkey_reg_xstate_offset();
-	size_t xsave_size = cpu_max_xsave_size();
-	void *xsave;
-	u32 *pkey_register;
-	u64 *xstate_bv;
-	struct iovec iov;
-
-	new_pkru = ~read_pkey_reg();
-	/* Don't make PROT_EXEC mappings inaccessible */
-	new_pkru &= ~3;
-
-	child = fork();
-	pkey_assert(child >= 0);
-	dprintf3("[%d] fork() ret: %d\n", getpid(), child);
-	if (!child) {
-		ptrace(PTRACE_TRACEME, 0, 0, 0);
-		/* Stop and allow the tracer to modify PKRU directly */
-		raise(SIGSTOP);
-
-		/*
-		 * need __read_pkey_reg() version so we do not do shadow_pkey_reg
-		 * checking
-		 */
-		if (__read_pkey_reg() != new_pkru)
-			exit(1);
-
-		/* Stop and allow the tracer to clear XSTATE_BV for PKRU */
-		raise(SIGSTOP);
-
-		if (__read_pkey_reg() != 0)
-			exit(1);
-
-		/* Stop and allow the tracer to examine PKRU */
-		raise(SIGSTOP);
-
-		exit(0);
-	}
-
-	pkey_assert(child == waitpid(child, &status, 0));
-	dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
-	pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
-
-	xsave = (void *)malloc(xsave_size);
-	pkey_assert(xsave > 0);
-
-	/* Modify the PKRU register directly */
-	iov.iov_base = xsave;
-	iov.iov_len = xsave_size;
-	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-	pkey_assert(ret == 0);
-
-	pkey_register = (u32 *)(xsave + pkey_offset);
-	pkey_assert(*pkey_register == read_pkey_reg());
-
-	*pkey_register = new_pkru;
-
-	ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-	pkey_assert(ret == 0);
-
-	/* Test that the modification is visible in ptrace before any execution */
-	memset(xsave, 0xCC, xsave_size);
-	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-	pkey_assert(ret == 0);
-	pkey_assert(*pkey_register == new_pkru);
-
-	/* Execute the tracee */
-	ret = ptrace(PTRACE_CONT, child, 0, 0);
-	pkey_assert(ret == 0);
-
-	/* Test that the tracee saw the PKRU value change */
-	pkey_assert(child == waitpid(child, &status, 0));
-	dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
-	pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
-
-	/* Test that the modification is visible in ptrace after execution */
-	memset(xsave, 0xCC, xsave_size);
-	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-	pkey_assert(ret == 0);
-	pkey_assert(*pkey_register == new_pkru);
-
-	/* Clear the PKRU bit from XSTATE_BV */
-	xstate_bv = (u64 *)(xsave + 512);
-	*xstate_bv &= ~(1 << 9);
-
-	ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-	pkey_assert(ret == 0);
-
-	/* Test that the modification is visible in ptrace before any execution */
-	memset(xsave, 0xCC, xsave_size);
-	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-	pkey_assert(ret == 0);
-	pkey_assert(*pkey_register == 0);
-
-	ret = ptrace(PTRACE_CONT, child, 0, 0);
-	pkey_assert(ret == 0);
-
-	/* Test that the tracee saw the PKRU value go to 0 */
-	pkey_assert(child == waitpid(child, &status, 0));
-	dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
-	pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
-
-	/* Test that the modification is visible in ptrace after execution */
-	memset(xsave, 0xCC, xsave_size);
-	ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-	pkey_assert(ret == 0);
-	pkey_assert(*pkey_register == 0);
-
-	ret = ptrace(PTRACE_CONT, child, 0, 0);
-	pkey_assert(ret == 0);
-	pkey_assert(child == waitpid(child, &status, 0));
-	dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
-	pkey_assert(WIFEXITED(status));
-	pkey_assert(WEXITSTATUS(status) == 0);
-	free(xsave);
-}
-#endif
-
-void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
-{
-	int size = PAGE_SIZE;
-	int sret;
-
-	if (cpu_has_pkeys()) {
-		dprintf1("SKIP: %s: no CPU support\n", __func__);
-		return;
-	}
-
-	sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
-	pkey_assert(sret < 0);
-}
-
-void (*pkey_tests[])(int *ptr, u16 pkey) = {
-	test_read_of_write_disabled_region,
-	test_read_of_access_disabled_region,
-	test_read_of_access_disabled_region_with_page_already_mapped,
-	test_write_of_write_disabled_region,
-	test_write_of_write_disabled_region_with_page_already_mapped,
-	test_write_of_access_disabled_region,
-	test_write_of_access_disabled_region_with_page_already_mapped,
-	test_kernel_write_of_access_disabled_region,
-	test_kernel_write_of_write_disabled_region,
-	test_kernel_gup_of_access_disabled_region,
-	test_kernel_gup_write_to_write_disabled_region,
-	test_executing_on_unreadable_memory,
-	test_implicit_mprotect_exec_only_memory,
-	test_mprotect_with_pkey_0,
-	test_ptrace_of_child,
-	test_pkey_init_state,
-	test_pkey_syscalls_on_non_allocated_pkey,
-	test_pkey_syscalls_bad_args,
-	test_pkey_alloc_exhaust,
-	test_pkey_alloc_free_attach_pkey0,
-#if defined(__i386__) || defined(__x86_64__)
-	test_ptrace_modifies_pkru,
-#endif
-};
-
-void run_tests_once(void)
-{
-	int *ptr;
-	int prot = PROT_READ|PROT_WRITE;
-
-	for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
-		int pkey;
-		int orig_pkey_faults = pkey_faults;
-
-		dprintf1("======================\n");
-		dprintf1("test %d preparing...\n", test_nr);
-
-		tracing_on();
-		pkey = alloc_random_pkey();
-		dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
-		ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
-		dprintf1("test %d starting...\n", test_nr);
-		pkey_tests[test_nr](ptr, pkey);
-		dprintf1("freeing test memory: %p\n", ptr);
-		free_pkey_malloc(ptr);
-		sys_pkey_free(pkey);
-
-		dprintf1("pkey_faults: %d\n", pkey_faults);
-		dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults);
-
-		tracing_off();
-		close_test_fds();
-
-		printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
-		dprintf1("======================\n\n");
-	}
-	iteration_nr++;
-}
-
-void pkey_setup_shadow(void)
-{
-	shadow_pkey_reg = __read_pkey_reg();
-}
-
-int main(void)
-{
-	int nr_iterations = 22;
-	int pkeys_supported = is_pkeys_supported();
-
-	srand((unsigned int)time(NULL));
-
-	setup_handlers();
-
-	printf("has pkeys: %d\n", pkeys_supported);
-
-	if (!pkeys_supported) {
-		int size = PAGE_SIZE;
-		int *ptr;
-
-		printf("running PKEY tests for unsupported CPU/OS\n");
-
-		ptr  = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-		assert(ptr != (void *)-1);
-		test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
-		exit(0);
-	}
-
-	pkey_setup_shadow();
-	printf("startup pkey_reg: %016llx\n", read_pkey_reg());
-	setup_hugetlbfs();
-
-	while (nr_iterations-- > 0)
-		run_tests_once();
-
-	printf("done (all tests OK)\n");
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
deleted file mode 100755
index 8984e0bb58c7..000000000000
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ /dev/null
@@ -1,274 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-# Please run as root
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-exitcode=0
-
-usage() {
-	cat <<EOF
-usage: ${BASH_SOURCE[0]:-$0} [ -h | -t "<categories>"]
-  -t: specify specific categories to tests to run
-  -h: display this message
-
-The default behavior is to run all tests.
-
-Alternatively, specific groups tests can be run by passing a string
-to the -t argument containing one or more of the following categories
-separated by spaces:
-- mmap
-	tests for mmap(2)
-- gup_test
-	tests for gup using gup_test interface
-- userfaultfd
-	tests for  userfaultfd(2)
-- compaction
-	a test for the patch "Allow compaction of unevictable pages"
-- mlock
-	tests for mlock(2)
-- mremap
-	tests for mremap(2)
-- hugevm
-	tests for very large virtual address space
-- vmalloc
-	vmalloc smoke tests
-- hmm
-	hmm smoke tests
-- madv_populate
-	test memadvise(2) MADV_POPULATE_{READ,WRITE} options
-- memfd_secret
-	test memfd_secret(2)
-- process_mrelease
-	test process_mrelease(2)
-- ksm
-	ksm tests that do not require >=2 NUMA nodes
-- ksm_numa
-	ksm tests that require >=2 NUMA nodes
-- pkey
-	memory protection key tests
-- soft_dirty
-	test soft dirty page bit semantics
-- cow
-	test copy-on-write semantics
-example: ./run_vmtests.sh -t "hmm mmap ksm"
-EOF
-	exit 0
-}
-
-
-while getopts "ht:" OPT; do
-	case ${OPT} in
-		"h") usage ;;
-		"t") VM_SELFTEST_ITEMS=${OPTARG} ;;
-	esac
-done
-shift $((OPTIND -1))
-
-# default behavior: run all tests
-VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default}
-
-test_selected() {
-	if [ "$VM_SELFTEST_ITEMS" == "default" ]; then
-		# If no VM_SELFTEST_ITEMS are specified, run all tests
-		return 0
-	fi
-	# If test selected argument is one of the test items
-	if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then
-	        return 0
-	else
-	        return 1
-	fi
-}
-
-# get huge pagesize and freepages from /proc/meminfo
-while read -r name size unit; do
-	if [ "$name" = "HugePages_Free:" ]; then
-		freepgs="$size"
-	fi
-	if [ "$name" = "Hugepagesize:" ]; then
-		hpgsize_KB="$size"
-	fi
-done < /proc/meminfo
-
-# Simple hugetlbfs tests have a hardcoded minimum requirement of
-# huge pages totaling 256MB (262144KB) in size.  The userfaultfd
-# hugetlb test requires a minimum of 2 * nr_cpus huge pages.  Take
-# both of these requirements into account and attempt to increase
-# number of huge pages available.
-nr_cpus=$(nproc)
-hpgsize_MB=$((hpgsize_KB / 1024))
-half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
-needmem_KB=$((half_ufd_size_MB * 2 * 1024))
-
-# set proper nr_hugepages
-if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
-	nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
-	needpgs=$((needmem_KB / hpgsize_KB))
-	tries=2
-	while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do
-		lackpgs=$((needpgs - freepgs))
-		echo 3 > /proc/sys/vm/drop_caches
-		if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then
-			echo "Please run this test as root"
-			exit $ksft_skip
-		fi
-		while read -r name size unit; do
-			if [ "$name" = "HugePages_Free:" ]; then
-				freepgs=$size
-			fi
-		done < /proc/meminfo
-		tries=$((tries - 1))
-	done
-	if [ "$freepgs" -lt "$needpgs" ]; then
-		printf "Not enough huge pages available (%d < %d)\n" \
-		       "$freepgs" "$needpgs"
-		exit 1
-	fi
-else
-	echo "no hugetlbfs support in kernel?"
-	exit 1
-fi
-
-# filter 64bit architectures
-ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64"
-if [ -z "$ARCH" ]; then
-	ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/')
-fi
-VADDR64=0
-echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1
-
-# Usage: run_test [test binary] [arbitrary test arguments...]
-run_test() {
-	if test_selected ${CATEGORY}; then
-		local title="running $*"
-		local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
-		printf "%s\n%s\n%s\n" "$sep" "$title" "$sep"
-
-		"$@"
-		local ret=$?
-		if [ $ret -eq 0 ]; then
-			echo "[PASS]"
-		elif [ $ret -eq $ksft_skip ]; then
-			echo "[SKIP]"
-			exitcode=$ksft_skip
-		else
-			echo "[FAIL]"
-			exitcode=1
-		fi
-	fi # test_selected
-}
-
-CATEGORY="hugetlb" run_test ./hugepage-mmap
-
-shmmax=$(cat /proc/sys/kernel/shmmax)
-shmall=$(cat /proc/sys/kernel/shmall)
-echo 268435456 > /proc/sys/kernel/shmmax
-echo 4194304 > /proc/sys/kernel/shmall
-CATEGORY="hugetlb" run_test ./hugepage-shm
-echo "$shmmax" > /proc/sys/kernel/shmmax
-echo "$shmall" > /proc/sys/kernel/shmall
-
-CATEGORY="hugetlb" run_test ./map_hugetlb
-CATEGORY="hugetlb" run_test ./hugepage-mremap
-CATEGORY="hugetlb" run_test ./hugepage-vmemmap
-CATEGORY="hugetlb" run_test ./hugetlb-madvise
-
-if test_selected "hugetlb"; then
-	echo "NOTE: These hugetlb tests provide minimal coverage.  Use"
-	echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
-	echo "      hugetlb regression testing."
-fi
-
-CATEGORY="mmap" run_test ./map_fixed_noreplace
-
-# get_user_pages_fast() benchmark
-CATEGORY="gup_test" run_test ./gup_test -u
-# pin_user_pages_fast() benchmark
-CATEGORY="gup_test" run_test ./gup_test -a
-# Dump pages 0, 19, and 4096, using pin_user_pages:
-CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000
-
-uffd_mods=("" ":dev")
-for mod in "${uffd_mods[@]}"; do
-	CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16
-	# Hugetlb tests require source and destination huge pages. Pass in half
-	# the size ($half_ufd_size_MB), which is used for *each*.
-	CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
-	CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
-	CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16
-done
-
-#cleanup
-echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
-
-CATEGORY="compaction" run_test ./compaction_test
-
-CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
-
-CATEGORY="mmap" run_test ./map_populate
-
-CATEGORY="mlock" run_test ./mlock-random-test
-
-CATEGORY="mlock" run_test ./mlock2-tests
-
-CATEGORY="process_mrelease" run_test ./mrelease_test
-
-CATEGORY="mremap" run_test ./mremap_test
-
-CATEGORY="hugetlb" run_test ./thuge-gen
-
-if [ $VADDR64 -ne 0 ]; then
-	CATEGORY="hugevm" run_test ./virtual_address_range
-
-	# virtual address 128TB switch test
-	CATEGORY="hugevm" run_test ./va_128TBswitch.sh
-fi # VADDR64
-
-# vmalloc stability smoke test
-CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke
-
-CATEGORY="mremap" run_test ./mremap_dontunmap
-
-CATEGORY="hmm" run_test ./test_hmm.sh smoke
-
-# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
-CATEGORY="madv_populate" run_test ./madv_populate
-
-CATEGORY="memfd_secret" run_test ./memfd_secret
-
-# KSM MADV_MERGEABLE test with 10 identical pages
-CATEGORY="ksm" run_test ./ksm_tests -M -p 10
-# KSM unmerge test
-CATEGORY="ksm" run_test ./ksm_tests -U
-# KSM test with 10 zero pages and use_zero_pages = 0
-CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0
-# KSM test with 10 zero pages and use_zero_pages = 1
-CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1
-# KSM test with 2 NUMA nodes and merge_across_nodes = 1
-CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1
-# KSM test with 2 NUMA nodes and merge_across_nodes = 0
-CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
-
-CATEGORY="ksm" run_test ./ksm_functional_tests
-
-run_test ./ksm_functional_tests
-
-# protection_keys tests
-if [ -x ./protection_keys_32 ]
-then
-	CATEGORY="pkey" run_test ./protection_keys_32
-fi
-
-if [ -x ./protection_keys_64 ]
-then
-	CATEGORY="pkey" run_test ./protection_keys_64
-fi
-
-CATEGORY="soft_dirty" run_test ./soft-dirty
-
-# COW tests
-CATEGORY="cow" run_test ./cow
-
-exit $exitcode
diff --git a/tools/testing/selftests/vm/settings b/tools/testing/selftests/vm/settings
deleted file mode 100644
index 9abfc60e9e6f..000000000000
--- a/tools/testing/selftests/vm/settings
+++ /dev/null
@@ -1 +0,0 @@
-timeout=45
diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c
deleted file mode 100644
index 21d8830c5f24..000000000000
--- a/tools/testing/selftests/vm/soft-dirty.c
+++ /dev/null
@@ -1,210 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <stdio.h>
-#include <string.h>
-#include <stdbool.h>
-#include <fcntl.h>
-#include <stdint.h>
-#include <malloc.h>
-#include <sys/mman.h>
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#define PAGEMAP_FILE_PATH "/proc/self/pagemap"
-#define TEST_ITERATIONS 10000
-
-static void test_simple(int pagemap_fd, int pagesize)
-{
-	int i;
-	char *map;
-
-	map = aligned_alloc(pagesize, pagesize);
-	if (!map)
-		ksft_exit_fail_msg("mmap failed\n");
-
-	clear_softdirty();
-
-	for (i = 0 ; i < TEST_ITERATIONS; i++) {
-		if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
-			ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
-			break;
-		}
-
-		clear_softdirty();
-		// Write something to the page to get the dirty bit enabled on the page
-		map[0]++;
-
-		if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
-			ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
-			break;
-		}
-
-		clear_softdirty();
-	}
-	free(map);
-
-	ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__);
-}
-
-static void test_vma_reuse(int pagemap_fd, int pagesize)
-{
-	char *map, *map2;
-
-	map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
-	if (map == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed");
-
-	// The kernel always marks new regions as soft dirty
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
-			 "Test %s dirty bit of allocated page\n", __func__);
-
-	clear_softdirty();
-	munmap(map, pagesize);
-
-	map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
-	if (map2 == MAP_FAILED)
-		ksft_exit_fail_msg("mmap failed");
-
-	// Dirty bit is set for new regions even if they are reused
-	if (map == map2)
-		ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1,
-				 "Test %s dirty bit of reused address page\n", __func__);
-	else
-		ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__);
-
-	munmap(map2, pagesize);
-}
-
-static void test_hugepage(int pagemap_fd, int pagesize)
-{
-	char *map;
-	int i, ret;
-	size_t hpage_len = read_pmd_pagesize();
-
-	map = memalign(hpage_len, hpage_len);
-	if (!map)
-		ksft_exit_fail_msg("memalign failed\n");
-
-	ret = madvise(map, hpage_len, MADV_HUGEPAGE);
-	if (ret)
-		ksft_exit_fail_msg("madvise failed %d\n", ret);
-
-	for (i = 0; i < hpage_len; i++)
-		map[i] = (char)i;
-
-	if (check_huge_anon(map, 1, hpage_len)) {
-		ksft_test_result_pass("Test %s huge page allocation\n", __func__);
-
-		clear_softdirty();
-		for (i = 0 ; i < TEST_ITERATIONS ; i++) {
-			if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
-				ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
-				break;
-			}
-
-			clear_softdirty();
-			// Write something to the page to get the dirty bit enabled on the page
-			map[0]++;
-
-			if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
-				ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
-				break;
-			}
-			clear_softdirty();
-		}
-
-		ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__);
-	} else {
-		// hugepage allocation failed. skip these tests
-		ksft_test_result_skip("Test %s huge page allocation\n", __func__);
-		ksft_test_result_skip("Test %s huge page dirty bit\n", __func__);
-	}
-	free(map);
-}
-
-static void test_mprotect(int pagemap_fd, int pagesize, bool anon)
-{
-	const char *type[] = {"file", "anon"};
-	const char *fname = "./soft-dirty-test-file";
-	int test_fd;
-	char *map;
-
-	if (anon) {
-		map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
-			   MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-		if (!map)
-			ksft_exit_fail_msg("anon mmap failed\n");
-	} else {
-		test_fd = open(fname, O_RDWR | O_CREAT);
-		if (test_fd < 0) {
-			ksft_test_result_skip("Test %s open() file failed\n", __func__);
-			return;
-		}
-		unlink(fname);
-		ftruncate(test_fd, pagesize);
-		map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
-			   MAP_SHARED, test_fd, 0);
-		if (!map)
-			ksft_exit_fail_msg("file mmap failed\n");
-	}
-
-	*map = 1;
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
-			 "Test %s-%s dirty bit of new written page\n",
-			 __func__, type[anon]);
-	clear_softdirty();
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
-			 "Test %s-%s soft-dirty clear after clear_refs\n",
-			 __func__, type[anon]);
-	mprotect(map, pagesize, PROT_READ);
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
-			 "Test %s-%s soft-dirty clear after marking RO\n",
-			 __func__, type[anon]);
-	mprotect(map, pagesize, PROT_READ|PROT_WRITE);
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
-			 "Test %s-%s soft-dirty clear after marking RW\n",
-			 __func__, type[anon]);
-	*map = 2;
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
-			 "Test %s-%s soft-dirty after rewritten\n",
-			 __func__, type[anon]);
-
-	munmap(map, pagesize);
-
-	if (!anon)
-		close(test_fd);
-}
-
-static void test_mprotect_anon(int pagemap_fd, int pagesize)
-{
-	test_mprotect(pagemap_fd, pagesize, true);
-}
-
-static void test_mprotect_file(int pagemap_fd, int pagesize)
-{
-	test_mprotect(pagemap_fd, pagesize, false);
-}
-
-int main(int argc, char **argv)
-{
-	int pagemap_fd;
-	int pagesize;
-
-	ksft_print_header();
-	ksft_set_plan(15);
-
-	pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY);
-	if (pagemap_fd < 0)
-		ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH);
-
-	pagesize = getpagesize();
-
-	test_simple(pagemap_fd, pagesize);
-	test_vma_reuse(pagemap_fd, pagesize);
-	test_hugepage(pagemap_fd, pagesize);
-	test_mprotect_anon(pagemap_fd, pagesize);
-	test_mprotect_file(pagemap_fd, pagesize);
-
-	close(pagemap_fd);
-
-	return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
deleted file mode 100644
index 76e1c36dd9e5..000000000000
--- a/tools/testing/selftests/vm/split_huge_page_test.c
+++ /dev/null
@@ -1,309 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
- * address range in a process via <debugfs>/split_huge_pages interface.
- */
-
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <inttypes.h>
-#include <string.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/mount.h>
-#include <malloc.h>
-#include <stdbool.h>
-#include "vm_util.h"
-
-uint64_t pagesize;
-unsigned int pageshift;
-uint64_t pmd_pagesize;
-
-#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
-#define INPUT_MAX 80
-
-#define PID_FMT "%d,0x%lx,0x%lx"
-#define PATH_FMT "%s,0x%lx,0x%lx"
-
-#define PFN_MASK     ((1UL<<55)-1)
-#define KPF_THP      (1UL<<22)
-
-int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
-{
-	uint64_t paddr;
-	uint64_t page_flags;
-
-	if (pagemap_file) {
-		pread(pagemap_file, &paddr, sizeof(paddr),
-			((long)vaddr >> pageshift) * sizeof(paddr));
-
-		if (kpageflags_file) {
-			pread(kpageflags_file, &page_flags, sizeof(page_flags),
-				(paddr & PFN_MASK) * sizeof(page_flags));
-
-			return !!(page_flags & KPF_THP);
-		}
-	}
-	return 0;
-}
-
-static int write_file(const char *path, const char *buf, size_t buflen)
-{
-	int fd;
-	ssize_t numwritten;
-
-	fd = open(path, O_WRONLY);
-	if (fd == -1)
-		return 0;
-
-	numwritten = write(fd, buf, buflen - 1);
-	close(fd);
-	if (numwritten < 1)
-		return 0;
-
-	return (unsigned int) numwritten;
-}
-
-static void write_debugfs(const char *fmt, ...)
-{
-	char input[INPUT_MAX];
-	int ret;
-	va_list argp;
-
-	va_start(argp, fmt);
-	ret = vsnprintf(input, INPUT_MAX, fmt, argp);
-	va_end(argp);
-
-	if (ret >= INPUT_MAX) {
-		printf("%s: Debugfs input is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-
-	if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
-		perror(SPLIT_DEBUGFS);
-		exit(EXIT_FAILURE);
-	}
-}
-
-void split_pmd_thp(void)
-{
-	char *one_page;
-	size_t len = 4 * pmd_pagesize;
-	size_t i;
-
-	one_page = memalign(pmd_pagesize, len);
-
-	if (!one_page) {
-		printf("Fail to allocate memory\n");
-		exit(EXIT_FAILURE);
-	}
-
-	madvise(one_page, len, MADV_HUGEPAGE);
-
-	for (i = 0; i < len; i++)
-		one_page[i] = (char)i;
-
-	if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
-		printf("No THP is allocated\n");
-		exit(EXIT_FAILURE);
-	}
-
-	/* split all THPs */
-	write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
-		(uint64_t)one_page + len);
-
-	for (i = 0; i < len; i++)
-		if (one_page[i] != (char)i) {
-			printf("%ld byte corrupted\n", i);
-			exit(EXIT_FAILURE);
-		}
-
-
-	if (check_huge_anon(one_page, 0, pmd_pagesize)) {
-		printf("Still AnonHugePages not split\n");
-		exit(EXIT_FAILURE);
-	}
-
-	printf("Split huge pages successful\n");
-	free(one_page);
-}
-
-void split_pte_mapped_thp(void)
-{
-	char *one_page, *pte_mapped, *pte_mapped2;
-	size_t len = 4 * pmd_pagesize;
-	uint64_t thp_size;
-	size_t i;
-	const char *pagemap_template = "/proc/%d/pagemap";
-	const char *kpageflags_proc = "/proc/kpageflags";
-	char pagemap_proc[255];
-	int pagemap_fd;
-	int kpageflags_fd;
-
-	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
-		perror("get pagemap proc error");
-		exit(EXIT_FAILURE);
-	}
-	pagemap_fd = open(pagemap_proc, O_RDONLY);
-
-	if (pagemap_fd == -1) {
-		perror("read pagemap:");
-		exit(EXIT_FAILURE);
-	}
-
-	kpageflags_fd = open(kpageflags_proc, O_RDONLY);
-
-	if (kpageflags_fd == -1) {
-		perror("read kpageflags:");
-		exit(EXIT_FAILURE);
-	}
-
-	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
-			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-	madvise(one_page, len, MADV_HUGEPAGE);
-
-	for (i = 0; i < len; i++)
-		one_page[i] = (char)i;
-
-	if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
-		printf("No THP is allocated\n");
-		exit(EXIT_FAILURE);
-	}
-
-	/* remap the first pagesize of first THP */
-	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
-
-	/* remap the Nth pagesize of Nth THP */
-	for (i = 1; i < 4; i++) {
-		pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
-				     pagesize, pagesize,
-				     MREMAP_MAYMOVE|MREMAP_FIXED,
-				     pte_mapped + pagesize * i);
-		if (pte_mapped2 == (char *)-1) {
-			perror("mremap failed");
-			exit(EXIT_FAILURE);
-		}
-	}
-
-	/* smap does not show THPs after mremap, use kpageflags instead */
-	thp_size = 0;
-	for (i = 0; i < pagesize * 4; i++)
-		if (i % pagesize == 0 &&
-		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
-			thp_size++;
-
-	if (thp_size != 4) {
-		printf("Some THPs are missing during mremap\n");
-		exit(EXIT_FAILURE);
-	}
-
-	/* split all remapped THPs */
-	write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
-		      (uint64_t)pte_mapped + pagesize * 4);
-
-	/* smap does not show THPs after mremap, use kpageflags instead */
-	thp_size = 0;
-	for (i = 0; i < pagesize * 4; i++) {
-		if (pte_mapped[i] != (char)i) {
-			printf("%ld byte corrupted\n", i);
-			exit(EXIT_FAILURE);
-		}
-		if (i % pagesize == 0 &&
-		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
-			thp_size++;
-	}
-
-	if (thp_size) {
-		printf("Still %ld THPs not split\n", thp_size);
-		exit(EXIT_FAILURE);
-	}
-
-	printf("Split PTE-mapped huge pages successful\n");
-	munmap(one_page, len);
-	close(pagemap_fd);
-	close(kpageflags_fd);
-}
-
-void split_file_backed_thp(void)
-{
-	int status;
-	int fd;
-	ssize_t num_written;
-	char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
-	const char *tmpfs_loc = mkdtemp(tmpfs_template);
-	char testfile[INPUT_MAX];
-	uint64_t pgoff_start = 0, pgoff_end = 1024;
-
-	printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n");
-
-	status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
-
-	if (status) {
-		printf("Unable to create a tmpfs for testing\n");
-		exit(EXIT_FAILURE);
-	}
-
-	status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
-	if (status >= INPUT_MAX) {
-		printf("Fail to create file-backed THP split testing file\n");
-		goto cleanup;
-	}
-
-	fd = open(testfile, O_CREAT|O_WRONLY);
-	if (fd == -1) {
-		perror("Cannot open testing file\n");
-		goto cleanup;
-	}
-
-	/* write something to the file, so a file-backed THP can be allocated */
-	num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1);
-	close(fd);
-
-	if (num_written < 1) {
-		printf("Fail to write data to testing file\n");
-		goto cleanup;
-	}
-
-	/* split the file-backed THP */
-	write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end);
-
-	status = unlink(testfile);
-	if (status)
-		perror("Cannot remove testing file\n");
-
-cleanup:
-	status = umount(tmpfs_loc);
-	if (status) {
-		printf("Unable to umount %s\n", tmpfs_loc);
-		exit(EXIT_FAILURE);
-	}
-	status = rmdir(tmpfs_loc);
-	if (status) {
-		perror("cannot remove tmp dir");
-		exit(EXIT_FAILURE);
-	}
-
-	printf("file-backed THP split test done, please check dmesg for more information\n");
-}
-
-int main(int argc, char **argv)
-{
-	if (geteuid() != 0) {
-		printf("Please run the benchmark as root\n");
-		exit(EXIT_FAILURE);
-	}
-
-	pagesize = getpagesize();
-	pageshift = ffs(pagesize) - 1;
-	pmd_pagesize = read_pmd_pagesize();
-
-	split_pmd_thp();
-	split_pte_mapped_thp();
-	split_file_backed_thp();
-
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh
deleted file mode 100755
index 46e19b5d648d..000000000000
--- a/tools/testing/selftests/vm/test_hmm.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
-#
-# This is a test script for the kernel test driver to analyse vmalloc
-# allocator. Therefore it is just a kernel module loader. You can specify
-# and pass different parameters in order to:
-#     a) analyse performance of vmalloc allocations;
-#     b) stressing and stability check of vmalloc subsystem.
-
-TEST_NAME="test_hmm"
-DRIVER="test_hmm"
-
-# 1 if fails
-exitcode=1
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-check_test_requirements()
-{
-	uid=$(id -u)
-	if [ $uid -ne 0 ]; then
-		echo "$0: Must be run as root"
-		exit $ksft_skip
-	fi
-
-	if ! which modprobe > /dev/null 2>&1; then
-		echo "$0: You need modprobe installed"
-		exit $ksft_skip
-	fi
-
-	if ! modinfo $DRIVER > /dev/null 2>&1; then
-		echo "$0: You must have the following enabled in your kernel:"
-		echo "CONFIG_TEST_HMM=m"
-		exit $ksft_skip
-	fi
-}
-
-load_driver()
-{
-	if [ $# -eq 0 ]; then
-		modprobe $DRIVER > /dev/null 2>&1
-	else
-		if [ $# -eq 2 ]; then
-			modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2
-				> /dev/null 2>&1
-		else
-			echo "Missing module parameters. Make sure pass"\
-			"spm_addr_dev0 and spm_addr_dev1"
-			usage
-		fi
-	fi
-}
-
-unload_driver()
-{
-	modprobe -r $DRIVER > /dev/null 2>&1
-}
-
-run_smoke()
-{
-	echo "Running smoke test. Note, this test provides basic coverage."
-
-	load_driver $1 $2
-	$(dirname "${BASH_SOURCE[0]}")/hmm-tests
-	unload_driver
-}
-
-usage()
-{
-	echo -n "Usage: $0"
-	echo
-	echo "Example usage:"
-	echo
-	echo "# Shows help message"
-	echo "./${TEST_NAME}.sh"
-	echo
-	echo "# Smoke testing"
-	echo "./${TEST_NAME}.sh smoke"
-	echo
-	echo "# Smoke testing with SPM enabled"
-	echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
-	echo
-	exit 0
-}
-
-function run_test()
-{
-	if [ $# -eq 0 ]; then
-		usage
-	else
-		if [ "$1" = "smoke" ]; then
-			run_smoke $2 $3
-		else
-			usage
-		fi
-	fi
-}
-
-check_test_requirements
-run_test $@
-
-exit 0
diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh
deleted file mode 100755
index d73b846736f1..000000000000
--- a/tools/testing/selftests/vm/test_vmalloc.sh
+++ /dev/null
@@ -1,177 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
-#
-# This is a test script for the kernel test driver to analyse vmalloc
-# allocator. Therefore it is just a kernel module loader. You can specify
-# and pass different parameters in order to:
-#     a) analyse performance of vmalloc allocations;
-#     b) stressing and stability check of vmalloc subsystem.
-
-TEST_NAME="vmalloc"
-DRIVER="test_${TEST_NAME}"
-NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
-
-# 1 if fails
-exitcode=1
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-#
-# Static templates for performance, stressing and smoke tests.
-# Also it is possible to pass any supported parameters manualy.
-#
-PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
-SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
-STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
-
-check_test_requirements()
-{
-	uid=$(id -u)
-	if [ $uid -ne 0 ]; then
-		echo "$0: Must be run as root"
-		exit $ksft_skip
-	fi
-
-	if ! which modprobe > /dev/null 2>&1; then
-		echo "$0: You need modprobe installed"
-		exit $ksft_skip
-	fi
-
-	if ! modinfo $DRIVER > /dev/null 2>&1; then
-		echo "$0: You must have the following enabled in your kernel:"
-		echo "CONFIG_TEST_VMALLOC=m"
-		exit $ksft_skip
-	fi
-}
-
-run_perfformance_check()
-{
-	echo "Run performance tests to evaluate how fast vmalloc allocation is."
-	echo "It runs all test cases on one single CPU with sequential order."
-
-	modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
-	echo "Done."
-	echo "Ccheck the kernel message buffer to see the summary."
-}
-
-run_stability_check()
-{
-	echo "Run stability tests. In order to stress vmalloc subsystem all"
-	echo "available test cases are run by NUM_CPUS workers simultaneously."
-	echo "It will take time, so be patient."
-
-	modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
-	echo "Done."
-	echo "Check the kernel ring buffer to see the summary."
-}
-
-run_smoke_check()
-{
-	echo "Run smoke test. Note, this test provides basic coverage."
-	echo "Please check $0 output how it can be used"
-	echo "for deep performance analysis as well as stress testing."
-
-	modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
-	echo "Done."
-	echo "Check the kernel ring buffer to see the summary."
-}
-
-usage()
-{
-	echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | "
-	echo "manual parameters"
-	echo
-	echo "Valid tests and parameters:"
-	echo
-	modinfo $DRIVER
-	echo
-	echo "Example usage:"
-	echo
-	echo "# Shows help message"
-	echo "./${DRIVER}.sh"
-	echo
-	echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers"
-	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5"
-	echo
-	echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
-	echo "sequential order"
-	echo -n "./${DRIVER}.sh sequential_test_order=1 "
-	echo "run_test_mask=23"
-	echo
-	echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats "
-	echo "20 times"
-	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20"
-	echo
-	echo "# Performance analysis"
-	echo "./${DRIVER}.sh performance"
-	echo
-	echo "# Stress testing"
-	echo "./${DRIVER}.sh stress"
-	echo
-	exit 0
-}
-
-function validate_passed_args()
-{
-	VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
-
-	#
-	# Something has been passed, check it.
-	#
-	for passed_arg in $@; do
-		key=${passed_arg//=*/}
-		val="${passed_arg:$((${#key}+1))}"
-		valid=0
-
-		for valid_arg in $VALID_ARGS; do
-			if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
-				valid=1
-				break
-			fi
-		done
-
-		if [[ $valid -ne 1 ]]; then
-			echo "Error: key or value is not correct: ${key} $val"
-			exit $exitcode
-		fi
-	done
-}
-
-function run_manual_check()
-{
-	#
-	# Validate passed parameters. If there is wrong one,
-	# the script exists and does not execute further.
-	#
-	validate_passed_args $@
-
-	echo "Run the test with following parameters: $@"
-	modprobe $DRIVER $@ > /dev/null 2>&1
-	echo "Done."
-	echo "Check the kernel ring buffer to see the summary."
-}
-
-function run_test()
-{
-	if [ $# -eq 0 ]; then
-		usage
-	else
-		if [[ "$1" = "performance" ]]; then
-			run_perfformance_check
-		elif [[ "$1" = "stress" ]]; then
-			run_stability_check
-		elif [[ "$1" = "smoke" ]]; then
-			run_smoke_check
-		else
-			run_manual_check $@
-		fi
-	fi
-}
-
-check_test_requirements
-run_test $@
-
-exit 0
diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/vm/thuge-gen.c
deleted file mode 100644
index 361ef7192cc6..000000000000
--- a/tools/testing/selftests/vm/thuge-gen.c
+++ /dev/null
@@ -1,257 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Test selecting other page sizes for mmap/shmget.
-
-   Before running this huge pages for each huge page size must have been
-   reserved.
-   For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
-   Also shmmax must be increased.
-   And you need to run as root to work around some weird permissions in shm.
-   And nothing using huge pages should run in parallel.
-   When the program aborts you may need to clean up the shm segments with
-   ipcrm -m by hand, like this
-   sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
-   (warning this will remove all if someone else uses them) */
-
-#define _GNU_SOURCE 1
-#include <sys/mman.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <sys/stat.h>
-#include <glob.h>
-#include <assert.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <string.h>
-
-#define err(x) perror(x), exit(1)
-
-#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_SHIFT  26
-#define MAP_HUGE_MASK   0x3f
-#if !defined(MAP_HUGETLB)
-#define MAP_HUGETLB	0x40000
-#endif
-
-#define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
-#define SHM_HUGE_SHIFT  26
-#define SHM_HUGE_MASK   0x3f
-#define SHM_HUGE_2MB    (21 << SHM_HUGE_SHIFT)
-#define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
-
-#define NUM_PAGESIZES   5
-
-#define NUM_PAGES 4
-
-#define Dprintf(fmt...) // printf(fmt)
-
-unsigned long page_sizes[NUM_PAGESIZES];
-int num_page_sizes;
-
-int ilog2(unsigned long v)
-{
-	int l = 0;
-	while ((1UL << l) < v)
-		l++;
-	return l;
-}
-
-void find_pagesizes(void)
-{
-	glob_t g;
-	int i;
-	glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
-	assert(g.gl_pathc <= NUM_PAGESIZES);
-	for (i = 0; i < g.gl_pathc; i++) {
-		sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
-				&page_sizes[i]);
-		page_sizes[i] <<= 10;
-		printf("Found %luMB\n", page_sizes[i] >> 20);
-	}
-	num_page_sizes = g.gl_pathc;
-	globfree(&g);
-}
-
-unsigned long default_huge_page_size(void)
-{
-	unsigned long hps = 0;
-	char *line = NULL;
-	size_t linelen = 0;
-	FILE *f = fopen("/proc/meminfo", "r");
-	if (!f)
-		return 0;
-	while (getline(&line, &linelen, f) > 0) {
-		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
-			hps <<= 10;
-			break;
-		}
-	}
-	free(line);
-	return hps;
-}
-
-void show(unsigned long ps)
-{
-	char buf[100];
-	if (ps == getpagesize())
-		return;
-	printf("%luMB: ", ps >> 20);
-	fflush(stdout);
-	snprintf(buf, sizeof buf,
-		"cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
-		ps >> 10);
-	system(buf);
-}
-
-unsigned long read_sysfs(int warn, char *fmt, ...)
-{
-	char *line = NULL;
-	size_t linelen = 0;
-	char buf[100];
-	FILE *f;
-	va_list ap;
-	unsigned long val = 0;
-
-	va_start(ap, fmt);
-	vsnprintf(buf, sizeof buf, fmt, ap);
-	va_end(ap);
-
-	f = fopen(buf, "r");
-	if (!f) {
-		if (warn)
-			printf("missing %s\n", buf);
-		return 0;
-	}
-	if (getline(&line, &linelen, f) > 0) {
-		sscanf(line, "%lu", &val);
-	}
-	fclose(f);
-	free(line);
-	return val;
-}
-
-unsigned long read_free(unsigned long ps)
-{
-	return read_sysfs(ps != getpagesize(),
-			"/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
-			ps >> 10);
-}
-
-void test_mmap(unsigned long size, unsigned flags)
-{
-	char *map;
-	unsigned long before, after;
-	int err;
-
-	before = read_free(size);
-	map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
-			MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
-
-	if (map == (char *)-1) err("mmap");
-	memset(map, 0xff, size*NUM_PAGES);
-	after = read_free(size);
-	Dprintf("before %lu after %lu diff %ld size %lu\n",
-		before, after, before - after, size);
-	assert(size == getpagesize() || (before - after) == NUM_PAGES);
-	show(size);
-	err = munmap(map, size);
-	assert(!err);
-}
-
-void test_shmget(unsigned long size, unsigned flags)
-{
-	int id;
-	unsigned long before, after;
-	int err;
-
-	before = read_free(size);
-	id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
-	if (id < 0) err("shmget");
-
-	struct shm_info i;
-	if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
-	Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
-
-
-	Dprintf("id %d\n", id);
-	char *map = shmat(id, NULL, 0600);
-	if (map == (char*)-1) err("shmat");
-
-	shmctl(id, IPC_RMID, NULL);
-
-	memset(map, 0xff, size*NUM_PAGES);
-	after = read_free(size);
-
-	Dprintf("before %lu after %lu diff %ld size %lu\n",
-		before, after, before - after, size);
-	assert(size == getpagesize() || (before - after) == NUM_PAGES);
-	show(size);
-	err = shmdt(map);
-	assert(!err);
-}
-
-void sanity_checks(void)
-{
-	int i;
-	unsigned long largest = getpagesize();
-
-	for (i = 0; i < num_page_sizes; i++) {
-		if (page_sizes[i] > largest)
-			largest = page_sizes[i];
-
-		if (read_free(page_sizes[i]) < NUM_PAGES) {
-			printf("Not enough huge pages for page size %lu MB, need %u\n",
-				page_sizes[i] >> 20,
-				NUM_PAGES);
-			exit(0);
-		}
-	}
-
-	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
-		printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES);
-		exit(0);
-	}
-
-#if defined(__x86_64__)
-	if (largest != 1U<<30) {
-		printf("No GB pages available on x86-64\n"
-		       "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
-		exit(0);
-	}
-#endif
-}
-
-int main(void)
-{
-	int i;
-	unsigned default_hps = default_huge_page_size();
-
-	find_pagesizes();
-
-	sanity_checks();
-
-	for (i = 0; i < num_page_sizes; i++) {
-		unsigned long ps = page_sizes[i];
-		int arg = ilog2(ps) << MAP_HUGE_SHIFT;
-		printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
-		test_mmap(ps, MAP_HUGETLB | arg);
-	}
-	printf("Testing default huge mmap\n");
-	test_mmap(default_hps, SHM_HUGETLB);
-
-	puts("Testing non-huge shmget");
-	test_shmget(getpagesize(), 0);
-
-	for (i = 0; i < num_page_sizes; i++) {
-		unsigned long ps = page_sizes[i];
-		int arg = ilog2(ps) << SHM_HUGE_SHIFT;
-		printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
-		test_shmget(ps, SHM_HUGETLB | arg);
-	}
-	puts("default huge shmget");
-	test_shmget(default_hps, SHM_HUGETLB);
-
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c
deleted file mode 100644
index e3f00adb1b82..000000000000
--- a/tools/testing/selftests/vm/transhuge-stress.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Stress test for transparent huge pages, memory compaction and migration.
- *
- * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
- *
- * This is free and unencumbered software released into the public domain.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <err.h>
-#include <time.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/mman.h>
-#include "util.h"
-
-int backing_fd = -1;
-int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
-#define PROT_RW (PROT_READ | PROT_WRITE)
-
-int main(int argc, char **argv)
-{
-	size_t ram, len;
-	void *ptr, *p;
-	struct timespec a, b;
-	int i = 0;
-	char *name = NULL;
-	double s;
-	uint8_t *map;
-	size_t map_len;
-	int pagemap_fd;
-
-	ram = sysconf(_SC_PHYS_PAGES);
-	if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
-		ram = SIZE_MAX / 4;
-	else
-		ram *= sysconf(_SC_PAGESIZE);
-	len = ram;
-
-	while (++i < argc) {
-		if (!strcmp(argv[i], "-h"))
-			errx(1, "usage: %s [size in MiB]", argv[0]);
-		else if (!strcmp(argv[i], "-f"))
-			name = argv[++i];
-		else
-			len = atoll(argv[i]) << 20;
-	}
-
-	if (name) {
-		backing_fd = open(name, O_RDWR);
-		if (backing_fd == -1)
-			errx(2, "open %s", name);
-		mmap_flags = MAP_SHARED;
-	}
-
-	warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
-	      " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
-	      ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
-
-	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-	if (pagemap_fd < 0)
-		err(2, "open pagemap");
-
-	len -= len % HPAGE_SIZE;
-	ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0);
-	if (ptr == MAP_FAILED)
-		err(2, "initial mmap");
-	ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
-
-	if (madvise(ptr, len, MADV_HUGEPAGE))
-		err(2, "MADV_HUGEPAGE");
-
-	map_len = ram >> (HPAGE_SHIFT - 1);
-	map = malloc(map_len);
-	if (!map)
-		errx(2, "map malloc");
-
-	while (1) {
-		int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
-
-		memset(map, 0, map_len);
-
-		clock_gettime(CLOCK_MONOTONIC, &a);
-		for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
-			int64_t pfn;
-
-			pfn = allocate_transhuge(p, pagemap_fd);
-
-			if (pfn < 0) {
-				nr_failed++;
-			} else {
-				size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);
-
-				nr_succeed++;
-				if (idx >= map_len) {
-					map = realloc(map, idx + 1);
-					if (!map)
-						errx(2, "map realloc");
-					memset(map + map_len, 0, idx + 1 - map_len);
-					map_len = idx + 1;
-				}
-				if (!map[idx])
-					nr_pages++;
-				map[idx] = 1;
-			}
-
-			/* split transhuge page, keep last page */
-			if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
-				err(2, "MADV_DONTNEED");
-		}
-		clock_gettime(CLOCK_MONOTONIC, &b);
-		s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
-
-		warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
-		      "%4d succeed, %4d failed, %4d different pages",
-		      s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
-		      nr_succeed, nr_failed, nr_pages);
-	}
-}
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
deleted file mode 100644
index 7f22844ed704..000000000000
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ /dev/null
@@ -1,1858 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Stress userfaultfd syscall.
- *
- *  Copyright (C) 2015  Red Hat, Inc.
- *
- * This test allocates two virtual areas and bounces the physical
- * memory across the two virtual areas (from area_src to area_dst)
- * using userfaultfd.
- *
- * There are three threads running per CPU:
- *
- * 1) one per-CPU thread takes a per-page pthread_mutex in a random
- *    page of the area_dst (while the physical page may still be in
- *    area_src), and increments a per-page counter in the same page,
- *    and checks its value against a verification region.
- *
- * 2) another per-CPU thread handles the userfaults generated by
- *    thread 1 above. userfaultfd blocking reads or poll() modes are
- *    exercised interleaved.
- *
- * 3) one last per-CPU thread transfers the memory in the background
- *    at maximum bandwidth (if not already transferred by thread
- *    2). Each cpu thread takes cares of transferring a portion of the
- *    area.
- *
- * When all threads of type 3 completed the transfer, one bounce is
- * complete. area_src and area_dst are then swapped. All threads are
- * respawned and so the bounce is immediately restarted in the
- * opposite direction.
- *
- * per-CPU threads 1 by triggering userfaults inside
- * pthread_mutex_lock will also verify the atomicity of the memory
- * transfer (UFFDIO_COPY).
- */
-
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <time.h>
-#include <signal.h>
-#include <poll.h>
-#include <string.h>
-#include <linux/mman.h>
-#include <sys/mman.h>
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/wait.h>
-#include <pthread.h>
-#include <linux/userfaultfd.h>
-#include <setjmp.h>
-#include <stdbool.h>
-#include <assert.h>
-#include <inttypes.h>
-#include <stdint.h>
-#include <sys/random.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#ifdef __NR_userfaultfd
-
-static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
-
-#define BOUNCE_RANDOM		(1<<0)
-#define BOUNCE_RACINGFAULTS	(1<<1)
-#define BOUNCE_VERIFY		(1<<2)
-#define BOUNCE_POLL		(1<<3)
-static int bounces;
-
-#define TEST_ANON	1
-#define TEST_HUGETLB	2
-#define TEST_SHMEM	3
-static int test_type;
-
-#define UFFD_FLAGS	(O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
-
-#define BASE_PMD_ADDR ((void *)(1UL << 30))
-
-/* test using /dev/userfaultfd, instead of userfaultfd(2) */
-static bool test_dev_userfaultfd;
-
-/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
-#define ALARM_INTERVAL_SECS 10
-static volatile bool test_uffdio_copy_eexist = true;
-static volatile bool test_uffdio_zeropage_eexist = true;
-/* Whether to test uffd write-protection */
-static bool test_uffdio_wp = true;
-/* Whether to test uffd minor faults */
-static bool test_uffdio_minor = false;
-static bool map_shared;
-static int mem_fd;
-static unsigned long long *count_verify;
-static int uffd = -1;
-static int uffd_flags, finished, *pipefd;
-static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
-static char *zeropage;
-pthread_attr_t attr;
-static bool test_collapse;
-
-/* Userfaultfd test statistics */
-struct uffd_stats {
-	int cpu;
-	unsigned long missing_faults;
-	unsigned long wp_faults;
-	unsigned long minor_faults;
-};
-
-/* pthread_mutex_t starts at page offset 0 */
-#define area_mutex(___area, ___nr)					\
-	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
-/*
- * count is placed in the page after pthread_mutex_t naturally aligned
- * to avoid non alignment faults on non-x86 archs.
- */
-#define area_count(___area, ___nr)					\
-	((volatile unsigned long long *) ((unsigned long)		\
-				 ((___area) + (___nr)*page_size +	\
-				  sizeof(pthread_mutex_t) +		\
-				  sizeof(unsigned long long) - 1) &	\
-				 ~(unsigned long)(sizeof(unsigned long long) \
-						  -  1)))
-
-#define swap(a, b) \
-	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
-
-#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1)))
-
-const char *examples =
-    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
-    "./userfaultfd anon 100 99999\n\n"
-    "# Run the same anonymous memory test, but using /dev/userfaultfd:\n"
-    "./userfaultfd anon:dev 100 99999\n\n"
-    "# Run share memory test on 1GiB region with 99 bounces:\n"
-    "./userfaultfd shmem 1000 99\n\n"
-    "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
-    "./userfaultfd hugetlb 256 50\n\n"
-    "# Run the same hugetlb test but using shared file:\n"
-    "./userfaultfd hugetlb_shared 256 50\n\n"
-    "# 10MiB-~6GiB 999 bounces anonymous test, "
-    "continue forever unless an error triggers\n"
-    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
-
-static void usage(void)
-{
-	fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
-		"[hugetlbfs_file]\n\n");
-	fprintf(stderr, "Supported <test type>: anon, hugetlb, "
-		"hugetlb_shared, shmem\n\n");
-	fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. "
-		"Supported mods:\n");
-	fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n");
-	fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n");
-	fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n"
-		"memory\n");
-	fprintf(stderr, "\nExample test mod usage:\n");
-	fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n");
-	fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n");
-
-	fprintf(stderr, "Examples:\n\n");
-	fprintf(stderr, "%s", examples);
-	exit(1);
-}
-
-#define _err(fmt, ...)						\
-	do {							\
-		int ret = errno;				\
-		fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);	\
-		fprintf(stderr, " (errno=%d, line=%d)\n",	\
-			ret, __LINE__);				\
-	} while (0)
-
-#define errexit(exitcode, fmt, ...)		\
-	do {					\
-		_err(fmt, ##__VA_ARGS__);	\
-		exit(exitcode);			\
-	} while (0)
-
-#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
-
-static void uffd_stats_reset(struct uffd_stats *uffd_stats,
-			     unsigned long n_cpus)
-{
-	int i;
-
-	for (i = 0; i < n_cpus; i++) {
-		uffd_stats[i].cpu = i;
-		uffd_stats[i].missing_faults = 0;
-		uffd_stats[i].wp_faults = 0;
-		uffd_stats[i].minor_faults = 0;
-	}
-}
-
-static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
-{
-	int i;
-	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
-
-	for (i = 0; i < n_cpus; i++) {
-		miss_total += stats[i].missing_faults;
-		wp_total += stats[i].wp_faults;
-		minor_total += stats[i].minor_faults;
-	}
-
-	printf("userfaults: ");
-	if (miss_total) {
-		printf("%llu missing (", miss_total);
-		for (i = 0; i < n_cpus; i++)
-			printf("%lu+", stats[i].missing_faults);
-		printf("\b) ");
-	}
-	if (wp_total) {
-		printf("%llu wp (", wp_total);
-		for (i = 0; i < n_cpus; i++)
-			printf("%lu+", stats[i].wp_faults);
-		printf("\b) ");
-	}
-	if (minor_total) {
-		printf("%llu minor (", minor_total);
-		for (i = 0; i < n_cpus; i++)
-			printf("%lu+", stats[i].minor_faults);
-		printf("\b)");
-	}
-	printf("\n");
-}
-
-static void anon_release_pages(char *rel_area)
-{
-	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
-		err("madvise(MADV_DONTNEED) failed");
-}
-
-static void anon_allocate_area(void **alloc_area, bool is_src)
-{
-	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
-			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-}
-
-static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
-}
-
-static void hugetlb_release_pages(char *rel_area)
-{
-	if (!map_shared) {
-		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
-			err("madvise(MADV_DONTNEED) failed");
-	} else {
-		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
-			err("madvise(MADV_REMOVE) failed");
-	}
-}
-
-static void hugetlb_allocate_area(void **alloc_area, bool is_src)
-{
-	off_t size = nr_pages * page_size;
-	off_t offset = is_src ? 0 : size;
-	void *area_alias = NULL;
-	char **alloc_area_alias;
-
-	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
-			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
-			   (is_src ? 0 : MAP_NORESERVE),
-			   mem_fd, offset);
-	if (*alloc_area == MAP_FAILED)
-		err("mmap of hugetlbfs file failed");
-
-	if (map_shared) {
-		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
-				  MAP_SHARED, mem_fd, offset);
-		if (area_alias == MAP_FAILED)
-			err("mmap of hugetlb file alias failed");
-	}
-
-	if (is_src) {
-		alloc_area_alias = &area_src_alias;
-	} else {
-		alloc_area_alias = &area_dst_alias;
-	}
-	if (area_alias)
-		*alloc_area_alias = area_alias;
-}
-
-static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
-	if (!map_shared)
-		return;
-
-	*start = (unsigned long) area_dst_alias + offset;
-}
-
-static void shmem_release_pages(char *rel_area)
-{
-	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
-		err("madvise(MADV_REMOVE) failed");
-}
-
-static void shmem_allocate_area(void **alloc_area, bool is_src)
-{
-	void *area_alias = NULL;
-	size_t bytes = nr_pages * page_size;
-	unsigned long offset = is_src ? 0 : bytes;
-	char *p = NULL, *p_alias = NULL;
-
-	if (test_collapse) {
-		p = BASE_PMD_ADDR;
-		if (!is_src)
-			/* src map + alias + interleaved hpages */
-			p += 2 * (bytes + hpage_size);
-		p_alias = p;
-		p_alias += bytes;
-		p_alias += hpage_size;  /* Prevent src/dst VMA merge */
-	}
-
-	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
-			   mem_fd, offset);
-	if (*alloc_area == MAP_FAILED)
-		err("mmap of memfd failed");
-	if (test_collapse && *alloc_area != p)
-		err("mmap of memfd failed at %p", p);
-
-	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
-			  mem_fd, offset);
-	if (area_alias == MAP_FAILED)
-		err("mmap of memfd alias failed");
-	if (test_collapse && area_alias != p_alias)
-		err("mmap of anonymous memory failed at %p", p_alias);
-
-	if (is_src)
-		area_src_alias = area_alias;
-	else
-		area_dst_alias = area_alias;
-}
-
-static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
-	*start = (unsigned long)area_dst_alias + offset;
-}
-
-static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
-{
-	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size))
-		err("Did not find expected %d number of hugepages",
-		    expect_nr_hpages);
-}
-
-struct uffd_test_ops {
-	void (*allocate_area)(void **alloc_area, bool is_src);
-	void (*release_pages)(char *rel_area);
-	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
-	void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
-};
-
-static struct uffd_test_ops anon_uffd_test_ops = {
-	.allocate_area	= anon_allocate_area,
-	.release_pages	= anon_release_pages,
-	.alias_mapping = noop_alias_mapping,
-	.check_pmd_mapping = NULL,
-};
-
-static struct uffd_test_ops shmem_uffd_test_ops = {
-	.allocate_area	= shmem_allocate_area,
-	.release_pages	= shmem_release_pages,
-	.alias_mapping = shmem_alias_mapping,
-	.check_pmd_mapping = shmem_check_pmd_mapping,
-};
-
-static struct uffd_test_ops hugetlb_uffd_test_ops = {
-	.allocate_area	= hugetlb_allocate_area,
-	.release_pages	= hugetlb_release_pages,
-	.alias_mapping = hugetlb_alias_mapping,
-	.check_pmd_mapping = NULL,
-};
-
-static struct uffd_test_ops *uffd_test_ops;
-
-static inline uint64_t uffd_minor_feature(void)
-{
-	if (test_type == TEST_HUGETLB && map_shared)
-		return UFFD_FEATURE_MINOR_HUGETLBFS;
-	else if (test_type == TEST_SHMEM)
-		return UFFD_FEATURE_MINOR_SHMEM;
-	else
-		return 0;
-}
-
-static uint64_t get_expected_ioctls(uint64_t mode)
-{
-	uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
-
-	if (test_type == TEST_HUGETLB)
-		ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
-
-	if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
-		ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
-
-	if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
-		ioctls &= ~(1 << _UFFDIO_CONTINUE);
-
-	return ioctls;
-}
-
-static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
-{
-	uint64_t expected = get_expected_ioctls(mode);
-	uint64_t actual = ioctls & expected;
-
-	if (actual != expected) {
-		err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
-		    expected, actual);
-	}
-}
-
-static int __userfaultfd_open_dev(void)
-{
-	int fd, _uffd;
-
-	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
-	if (fd < 0)
-		errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
-
-	_uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
-	if (_uffd < 0)
-		errexit(errno == ENOTTY ? KSFT_SKIP : 1,
-			"creating userfaultfd failed");
-	close(fd);
-	return _uffd;
-}
-
-static void userfaultfd_open(uint64_t *features)
-{
-	struct uffdio_api uffdio_api;
-
-	if (test_dev_userfaultfd)
-		uffd = __userfaultfd_open_dev();
-	else {
-		uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
-		if (uffd < 0)
-			errexit(errno == ENOSYS ? KSFT_SKIP : 1,
-				"creating userfaultfd failed");
-	}
-	uffd_flags = fcntl(uffd, F_GETFD, NULL);
-
-	uffdio_api.api = UFFD_API;
-	uffdio_api.features = *features;
-	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
-		err("UFFDIO_API failed.\nPlease make sure to "
-		    "run with either root or ptrace capability.");
-	if (uffdio_api.api != UFFD_API)
-		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
-
-	*features = uffdio_api.features;
-}
-
-static inline void munmap_area(void **area)
-{
-	if (*area)
-		if (munmap(*area, nr_pages * page_size))
-			err("munmap");
-
-	*area = NULL;
-}
-
-static void uffd_test_ctx_clear(void)
-{
-	size_t i;
-
-	if (pipefd) {
-		for (i = 0; i < nr_cpus * 2; ++i) {
-			if (close(pipefd[i]))
-				err("close pipefd");
-		}
-		free(pipefd);
-		pipefd = NULL;
-	}
-
-	if (count_verify) {
-		free(count_verify);
-		count_verify = NULL;
-	}
-
-	if (uffd != -1) {
-		if (close(uffd))
-			err("close uffd");
-		uffd = -1;
-	}
-
-	munmap_area((void **)&area_src);
-	munmap_area((void **)&area_src_alias);
-	munmap_area((void **)&area_dst);
-	munmap_area((void **)&area_dst_alias);
-	munmap_area((void **)&area_remap);
-}
-
-static void uffd_test_ctx_init(uint64_t features)
-{
-	unsigned long nr, cpu;
-
-	uffd_test_ctx_clear();
-
-	uffd_test_ops->allocate_area((void **)&area_src, true);
-	uffd_test_ops->allocate_area((void **)&area_dst, false);
-
-	userfaultfd_open(&features);
-
-	count_verify = malloc(nr_pages * sizeof(unsigned long long));
-	if (!count_verify)
-		err("count_verify");
-
-	for (nr = 0; nr < nr_pages; nr++) {
-		*area_mutex(area_src, nr) =
-			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
-		count_verify[nr] = *area_count(area_src, nr) = 1;
-		/*
-		 * In the transition between 255 to 256, powerpc will
-		 * read out of order in my_bcmp and see both bytes as
-		 * zero, so leave a placeholder below always non-zero
-		 * after the count, to avoid my_bcmp to trigger false
-		 * positives.
-		 */
-		*(area_count(area_src, nr) + 1) = 1;
-	}
-
-	/*
-	 * After initialization of area_src, we must explicitly release pages
-	 * for area_dst to make sure it's fully empty.  Otherwise we could have
-	 * some area_dst pages be errornously initialized with zero pages,
-	 * hence we could hit memory corruption later in the test.
-	 *
-	 * One example is when THP is globally enabled, above allocate_area()
-	 * calls could have the two areas merged into a single VMA (as they
-	 * will have the same VMA flags so they're mergeable).  When we
-	 * initialize the area_src above, it's possible that some part of
-	 * area_dst could have been faulted in via one huge THP that will be
-	 * shared between area_src and area_dst.  It could cause some of the
-	 * area_dst won't be trapped by missing userfaults.
-	 *
-	 * This release_pages() will guarantee even if that happened, we'll
-	 * proactively split the thp and drop any accidentally initialized
-	 * pages within area_dst.
-	 */
-	uffd_test_ops->release_pages(area_dst);
-
-	pipefd = malloc(sizeof(int) * nr_cpus * 2);
-	if (!pipefd)
-		err("pipefd");
-	for (cpu = 0; cpu < nr_cpus; cpu++)
-		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
-			err("pipe");
-}
-
-static int my_bcmp(char *str1, char *str2, size_t n)
-{
-	unsigned long i;
-	for (i = 0; i < n; i++)
-		if (str1[i] != str2[i])
-			return 1;
-	return 0;
-}
-
-static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
-{
-	struct uffdio_writeprotect prms;
-
-	/* Write protection page faults */
-	prms.range.start = start;
-	prms.range.len = len;
-	/* Undo write-protect, do wakeup after that */
-	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
-
-	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
-		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
-}
-
-static void continue_range(int ufd, __u64 start, __u64 len)
-{
-	struct uffdio_continue req;
-	int ret;
-
-	req.range.start = start;
-	req.range.len = len;
-	req.mode = 0;
-
-	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
-		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
-		    (uint64_t)start);
-
-	/*
-	 * Error handling within the kernel for continue is subtly different
-	 * from copy or zeropage, so it may be a source of bugs. Trigger an
-	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
-	 */
-	req.mapped = 0;
-	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
-	if (ret >= 0 || req.mapped != -EEXIST)
-		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
-		    ret, (int64_t) req.mapped);
-}
-
-static void *locking_thread(void *arg)
-{
-	unsigned long cpu = (unsigned long) arg;
-	unsigned long page_nr;
-	unsigned long long count;
-
-	if (!(bounces & BOUNCE_RANDOM)) {
-		page_nr = -bounces;
-		if (!(bounces & BOUNCE_RACINGFAULTS))
-			page_nr += cpu * nr_pages_per_cpu;
-	}
-
-	while (!finished) {
-		if (bounces & BOUNCE_RANDOM) {
-			if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
-				err("getrandom failed");
-		} else
-			page_nr += 1;
-		page_nr %= nr_pages;
-		pthread_mutex_lock(area_mutex(area_dst, page_nr));
-		count = *area_count(area_dst, page_nr);
-		if (count != count_verify[page_nr])
-			err("page_nr %lu memory corruption %llu %llu",
-			    page_nr, count, count_verify[page_nr]);
-		count++;
-		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
-		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
-	}
-
-	return NULL;
-}
-
-static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
-			    unsigned long offset)
-{
-	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
-				     uffdio_copy->len,
-				     offset);
-	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
-		/* real retval in ufdio_copy.copy */
-		if (uffdio_copy->copy != -EEXIST)
-			err("UFFDIO_COPY retry error: %"PRId64,
-			    (int64_t)uffdio_copy->copy);
-	} else {
-		err("UFFDIO_COPY retry unexpected: %"PRId64,
-		    (int64_t)uffdio_copy->copy);
-	}
-}
-
-static void wake_range(int ufd, unsigned long addr, unsigned long len)
-{
-	struct uffdio_range uffdio_wake;
-
-	uffdio_wake.start = addr;
-	uffdio_wake.len = len;
-
-	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
-		fprintf(stderr, "error waking %lu\n",
-			addr), exit(1);
-}
-
-static int __copy_page(int ufd, unsigned long offset, bool retry)
-{
-	struct uffdio_copy uffdio_copy;
-
-	if (offset >= nr_pages * page_size)
-		err("unexpected offset %lu\n", offset);
-	uffdio_copy.dst = (unsigned long) area_dst + offset;
-	uffdio_copy.src = (unsigned long) area_src + offset;
-	uffdio_copy.len = page_size;
-	if (test_uffdio_wp)
-		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
-	else
-		uffdio_copy.mode = 0;
-	uffdio_copy.copy = 0;
-	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
-		/* real retval in ufdio_copy.copy */
-		if (uffdio_copy.copy != -EEXIST)
-			err("UFFDIO_COPY error: %"PRId64,
-			    (int64_t)uffdio_copy.copy);
-		wake_range(ufd, uffdio_copy.dst, page_size);
-	} else if (uffdio_copy.copy != page_size) {
-		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
-	} else {
-		if (test_uffdio_copy_eexist && retry) {
-			test_uffdio_copy_eexist = false;
-			retry_copy_page(ufd, &uffdio_copy, offset);
-		}
-		return 1;
-	}
-	return 0;
-}
-
-static int copy_page_retry(int ufd, unsigned long offset)
-{
-	return __copy_page(ufd, offset, true);
-}
-
-static int copy_page(int ufd, unsigned long offset)
-{
-	return __copy_page(ufd, offset, false);
-}
-
-static int uffd_read_msg(int ufd, struct uffd_msg *msg)
-{
-	int ret = read(uffd, msg, sizeof(*msg));
-
-	if (ret != sizeof(*msg)) {
-		if (ret < 0) {
-			if (errno == EAGAIN || errno == EINTR)
-				return 1;
-			err("blocking read error");
-		} else {
-			err("short read");
-		}
-	}
-
-	return 0;
-}
-
-static void uffd_handle_page_fault(struct uffd_msg *msg,
-				   struct uffd_stats *stats)
-{
-	unsigned long offset;
-
-	if (msg->event != UFFD_EVENT_PAGEFAULT)
-		err("unexpected msg event %u", msg->event);
-
-	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
-		/* Write protect page faults */
-		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
-		stats->wp_faults++;
-	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
-		uint8_t *area;
-		int b;
-
-		/*
-		 * Minor page faults
-		 *
-		 * To prove we can modify the original range for testing
-		 * purposes, we're going to bit flip this range before
-		 * continuing.
-		 *
-		 * Note that this requires all minor page fault tests operate on
-		 * area_dst (non-UFFD-registered) and area_dst_alias
-		 * (UFFD-registered).
-		 */
-
-		area = (uint8_t *)(area_dst +
-				   ((char *)msg->arg.pagefault.address -
-				    area_dst_alias));
-		for (b = 0; b < page_size; ++b)
-			area[b] = ~area[b];
-		continue_range(uffd, msg->arg.pagefault.address, page_size);
-		stats->minor_faults++;
-	} else {
-		/*
-		 * Missing page faults.
-		 *
-		 * Here we force a write check for each of the missing mode
-		 * faults.  It's guaranteed because the only threads that
-		 * will trigger uffd faults are the locking threads, and
-		 * their first instruction to touch the missing page will
-		 * always be pthread_mutex_lock().
-		 *
-		 * Note that here we relied on an NPTL glibc impl detail to
-		 * always read the lock type at the entry of the lock op
-		 * (pthread_mutex_t.__data.__type, offset 0x10) before
-		 * doing any locking operations to guarantee that.  It's
-		 * actually not good to rely on this impl detail because
-		 * logically a pthread-compatible lib can implement the
-		 * locks without types and we can fail when linking with
-		 * them.  However since we used to find bugs with this
-		 * strict check we still keep it around.  Hopefully this
-		 * could be a good hint when it fails again.  If one day
-		 * it'll break on some other impl of glibc we'll revisit.
-		 */
-		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
-			err("unexpected write fault");
-
-		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
-		offset &= ~(page_size-1);
-
-		if (copy_page(uffd, offset))
-			stats->missing_faults++;
-	}
-}
-
-static void *uffd_poll_thread(void *arg)
-{
-	struct uffd_stats *stats = (struct uffd_stats *)arg;
-	unsigned long cpu = stats->cpu;
-	struct pollfd pollfd[2];
-	struct uffd_msg msg;
-	struct uffdio_register uffd_reg;
-	int ret;
-	char tmp_chr;
-
-	pollfd[0].fd = uffd;
-	pollfd[0].events = POLLIN;
-	pollfd[1].fd = pipefd[cpu*2];
-	pollfd[1].events = POLLIN;
-
-	for (;;) {
-		ret = poll(pollfd, 2, -1);
-		if (ret <= 0) {
-			if (errno == EINTR || errno == EAGAIN)
-				continue;
-			err("poll error: %d", ret);
-		}
-		if (pollfd[1].revents & POLLIN) {
-			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
-				err("read pipefd error");
-			break;
-		}
-		if (!(pollfd[0].revents & POLLIN))
-			err("pollfd[0].revents %d", pollfd[0].revents);
-		if (uffd_read_msg(uffd, &msg))
-			continue;
-		switch (msg.event) {
-		default:
-			err("unexpected msg event %u\n", msg.event);
-			break;
-		case UFFD_EVENT_PAGEFAULT:
-			uffd_handle_page_fault(&msg, stats);
-			break;
-		case UFFD_EVENT_FORK:
-			close(uffd);
-			uffd = msg.arg.fork.ufd;
-			pollfd[0].fd = uffd;
-			break;
-		case UFFD_EVENT_REMOVE:
-			uffd_reg.range.start = msg.arg.remove.start;
-			uffd_reg.range.len = msg.arg.remove.end -
-				msg.arg.remove.start;
-			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
-				err("remove failure");
-			break;
-		case UFFD_EVENT_REMAP:
-			area_remap = area_dst;  /* save for later unmap */
-			area_dst = (char *)(unsigned long)msg.arg.remap.to;
-			break;
-		}
-	}
-
-	return NULL;
-}
-
-pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-static void *uffd_read_thread(void *arg)
-{
-	struct uffd_stats *stats = (struct uffd_stats *)arg;
-	struct uffd_msg msg;
-
-	pthread_mutex_unlock(&uffd_read_mutex);
-	/* from here cancellation is ok */
-
-	for (;;) {
-		if (uffd_read_msg(uffd, &msg))
-			continue;
-		uffd_handle_page_fault(&msg, stats);
-	}
-
-	return NULL;
-}
-
-static void *background_thread(void *arg)
-{
-	unsigned long cpu = (unsigned long) arg;
-	unsigned long page_nr, start_nr, mid_nr, end_nr;
-
-	start_nr = cpu * nr_pages_per_cpu;
-	end_nr = (cpu+1) * nr_pages_per_cpu;
-	mid_nr = (start_nr + end_nr) / 2;
-
-	/* Copy the first half of the pages */
-	for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
-		copy_page_retry(uffd, page_nr * page_size);
-
-	/*
-	 * If we need to test uffd-wp, set it up now.  Then we'll have
-	 * at least the first half of the pages mapped already which
-	 * can be write-protected for testing
-	 */
-	if (test_uffdio_wp)
-		wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
-			nr_pages_per_cpu * page_size, true);
-
-	/*
-	 * Continue the 2nd half of the page copying, handling write
-	 * protection faults if any
-	 */
-	for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
-		copy_page_retry(uffd, page_nr * page_size);
-
-	return NULL;
-}
-
-static int stress(struct uffd_stats *uffd_stats)
-{
-	unsigned long cpu;
-	pthread_t locking_threads[nr_cpus];
-	pthread_t uffd_threads[nr_cpus];
-	pthread_t background_threads[nr_cpus];
-
-	finished = 0;
-	for (cpu = 0; cpu < nr_cpus; cpu++) {
-		if (pthread_create(&locking_threads[cpu], &attr,
-				   locking_thread, (void *)cpu))
-			return 1;
-		if (bounces & BOUNCE_POLL) {
-			if (pthread_create(&uffd_threads[cpu], &attr,
-					   uffd_poll_thread,
-					   (void *)&uffd_stats[cpu]))
-				return 1;
-		} else {
-			if (pthread_create(&uffd_threads[cpu], &attr,
-					   uffd_read_thread,
-					   (void *)&uffd_stats[cpu]))
-				return 1;
-			pthread_mutex_lock(&uffd_read_mutex);
-		}
-		if (pthread_create(&background_threads[cpu], &attr,
-				   background_thread, (void *)cpu))
-			return 1;
-	}
-	for (cpu = 0; cpu < nr_cpus; cpu++)
-		if (pthread_join(background_threads[cpu], NULL))
-			return 1;
-
-	/*
-	 * Be strict and immediately zap area_src, the whole area has
-	 * been transferred already by the background treads. The
-	 * area_src could then be faulted in a racy way by still
-	 * running uffdio_threads reading zeropages after we zapped
-	 * area_src (but they're guaranteed to get -EEXIST from
-	 * UFFDIO_COPY without writing zero pages into area_dst
-	 * because the background threads already completed).
-	 */
-	uffd_test_ops->release_pages(area_src);
-
-	finished = 1;
-	for (cpu = 0; cpu < nr_cpus; cpu++)
-		if (pthread_join(locking_threads[cpu], NULL))
-			return 1;
-
-	for (cpu = 0; cpu < nr_cpus; cpu++) {
-		char c;
-		if (bounces & BOUNCE_POLL) {
-			if (write(pipefd[cpu*2+1], &c, 1) != 1)
-				err("pipefd write error");
-			if (pthread_join(uffd_threads[cpu],
-					 (void *)&uffd_stats[cpu]))
-				return 1;
-		} else {
-			if (pthread_cancel(uffd_threads[cpu]))
-				return 1;
-			if (pthread_join(uffd_threads[cpu], NULL))
-				return 1;
-		}
-	}
-
-	return 0;
-}
-
-sigjmp_buf jbuf, *sigbuf;
-
-static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
-{
-	if (sig == SIGBUS) {
-		if (sigbuf)
-			siglongjmp(*sigbuf, 1);
-		abort();
-	}
-}
-
-/*
- * For non-cooperative userfaultfd test we fork() a process that will
- * generate pagefaults, will mremap the area monitored by the
- * userfaultfd and at last this process will release the monitored
- * area.
- * For the anonymous and shared memory the area is divided into two
- * parts, the first part is accessed before mremap, and the second
- * part is accessed after mremap. Since hugetlbfs does not support
- * mremap, the entire monitored area is accessed in a single pass for
- * HUGETLB_TEST.
- * The release of the pages currently generates event for shmem and
- * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
- * for hugetlb.
- * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
- * monitored area, generate pagefaults and test that signal is delivered.
- * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
- * test robustness use case - we release monitored area, fork a process
- * that will generate pagefaults and verify signal is generated.
- * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
- * feature. Using monitor thread, verify no userfault events are generated.
- */
-static int faulting_process(int signal_test)
-{
-	unsigned long nr;
-	unsigned long long count;
-	unsigned long split_nr_pages;
-	unsigned long lastnr;
-	struct sigaction act;
-	volatile unsigned long signalled = 0;
-
-	split_nr_pages = (nr_pages + 1) / 2;
-
-	if (signal_test) {
-		sigbuf = &jbuf;
-		memset(&act, 0, sizeof(act));
-		act.sa_sigaction = sighndl;
-		act.sa_flags = SA_SIGINFO;
-		if (sigaction(SIGBUS, &act, 0))
-			err("sigaction");
-		lastnr = (unsigned long)-1;
-	}
-
-	for (nr = 0; nr < split_nr_pages; nr++) {
-		volatile int steps = 1;
-		unsigned long offset = nr * page_size;
-
-		if (signal_test) {
-			if (sigsetjmp(*sigbuf, 1) != 0) {
-				if (steps == 1 && nr == lastnr)
-					err("Signal repeated");
-
-				lastnr = nr;
-				if (signal_test == 1) {
-					if (steps == 1) {
-						/* This is a MISSING request */
-						steps++;
-						if (copy_page(uffd, offset))
-							signalled++;
-					} else {
-						/* This is a WP request */
-						assert(steps == 2);
-						wp_range(uffd,
-							 (__u64)area_dst +
-							 offset,
-							 page_size, false);
-					}
-				} else {
-					signalled++;
-					continue;
-				}
-			}
-		}
-
-		count = *area_count(area_dst, nr);
-		if (count != count_verify[nr])
-			err("nr %lu memory corruption %llu %llu\n",
-			    nr, count, count_verify[nr]);
-		/*
-		 * Trigger write protection if there is by writing
-		 * the same value back.
-		 */
-		*area_count(area_dst, nr) = count;
-	}
-
-	if (signal_test)
-		return signalled != split_nr_pages;
-
-	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
-			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
-	if (area_dst == MAP_FAILED)
-		err("mremap");
-	/* Reset area_src since we just clobbered it */
-	area_src = NULL;
-
-	for (; nr < nr_pages; nr++) {
-		count = *area_count(area_dst, nr);
-		if (count != count_verify[nr]) {
-			err("nr %lu memory corruption %llu %llu\n",
-			    nr, count, count_verify[nr]);
-		}
-		/*
-		 * Trigger write protection if there is by writing
-		 * the same value back.
-		 */
-		*area_count(area_dst, nr) = count;
-	}
-
-	uffd_test_ops->release_pages(area_dst);
-
-	for (nr = 0; nr < nr_pages; nr++)
-		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
-			err("nr %lu is not zero", nr);
-
-	return 0;
-}
-
-static void retry_uffdio_zeropage(int ufd,
-				  struct uffdio_zeropage *uffdio_zeropage,
-				  unsigned long offset)
-{
-	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
-				     uffdio_zeropage->range.len,
-				     offset);
-	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
-		if (uffdio_zeropage->zeropage != -EEXIST)
-			err("UFFDIO_ZEROPAGE error: %"PRId64,
-			    (int64_t)uffdio_zeropage->zeropage);
-	} else {
-		err("UFFDIO_ZEROPAGE error: %"PRId64,
-		    (int64_t)uffdio_zeropage->zeropage);
-	}
-}
-
-static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
-{
-	struct uffdio_zeropage uffdio_zeropage;
-	int ret;
-	bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
-	__s64 res;
-
-	if (offset >= nr_pages * page_size)
-		err("unexpected offset %lu", offset);
-	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
-	uffdio_zeropage.range.len = page_size;
-	uffdio_zeropage.mode = 0;
-	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
-	res = uffdio_zeropage.zeropage;
-	if (ret) {
-		/* real retval in ufdio_zeropage.zeropage */
-		if (has_zeropage)
-			err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
-		else if (res != -EINVAL)
-			err("UFFDIO_ZEROPAGE not -EINVAL");
-	} else if (has_zeropage) {
-		if (res != page_size) {
-			err("UFFDIO_ZEROPAGE unexpected size");
-		} else {
-			if (test_uffdio_zeropage_eexist && retry) {
-				test_uffdio_zeropage_eexist = false;
-				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
-						      offset);
-			}
-			return 1;
-		}
-	} else
-		err("UFFDIO_ZEROPAGE succeeded");
-
-	return 0;
-}
-
-static int uffdio_zeropage(int ufd, unsigned long offset)
-{
-	return __uffdio_zeropage(ufd, offset, false);
-}
-
-/* exercise UFFDIO_ZEROPAGE */
-static int userfaultfd_zeropage_test(void)
-{
-	struct uffdio_register uffdio_register;
-
-	printf("testing UFFDIO_ZEROPAGE: ");
-	fflush(stdout);
-
-	uffd_test_ctx_init(0);
-
-	uffdio_register.range.start = (unsigned long) area_dst;
-	uffdio_register.range.len = nr_pages * page_size;
-	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-	if (test_uffdio_wp)
-		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
-	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-		err("register failure");
-
-	assert_expected_ioctls_present(
-		uffdio_register.mode, uffdio_register.ioctls);
-
-	if (uffdio_zeropage(uffd, 0))
-		if (my_bcmp(area_dst, zeropage, page_size))
-			err("zeropage is not zero");
-
-	printf("done.\n");
-	return 0;
-}
-
-static int userfaultfd_events_test(void)
-{
-	struct uffdio_register uffdio_register;
-	pthread_t uffd_mon;
-	int err, features;
-	pid_t pid;
-	char c;
-	struct uffd_stats stats = { 0 };
-
-	printf("testing events (fork, remap, remove): ");
-	fflush(stdout);
-
-	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
-		UFFD_FEATURE_EVENT_REMOVE;
-	uffd_test_ctx_init(features);
-
-	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
-
-	uffdio_register.range.start = (unsigned long) area_dst;
-	uffdio_register.range.len = nr_pages * page_size;
-	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-	if (test_uffdio_wp)
-		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
-	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-		err("register failure");
-
-	assert_expected_ioctls_present(
-		uffdio_register.mode, uffdio_register.ioctls);
-
-	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
-		err("uffd_poll_thread create");
-
-	pid = fork();
-	if (pid < 0)
-		err("fork");
-
-	if (!pid)
-		exit(faulting_process(0));
-
-	waitpid(pid, &err, 0);
-	if (err)
-		err("faulting process failed");
-	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
-		err("pipe write");
-	if (pthread_join(uffd_mon, NULL))
-		return 1;
-
-	uffd_stats_report(&stats, 1);
-
-	return stats.missing_faults != nr_pages;
-}
-
-static int userfaultfd_sig_test(void)
-{
-	struct uffdio_register uffdio_register;
-	unsigned long userfaults;
-	pthread_t uffd_mon;
-	int err, features;
-	pid_t pid;
-	char c;
-	struct uffd_stats stats = { 0 };
-
-	printf("testing signal delivery: ");
-	fflush(stdout);
-
-	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
-	uffd_test_ctx_init(features);
-
-	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
-
-	uffdio_register.range.start = (unsigned long) area_dst;
-	uffdio_register.range.len = nr_pages * page_size;
-	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-	if (test_uffdio_wp)
-		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
-	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-		err("register failure");
-
-	assert_expected_ioctls_present(
-		uffdio_register.mode, uffdio_register.ioctls);
-
-	if (faulting_process(1))
-		err("faulting process failed");
-
-	uffd_test_ops->release_pages(area_dst);
-
-	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
-		err("uffd_poll_thread create");
-
-	pid = fork();
-	if (pid < 0)
-		err("fork");
-
-	if (!pid)
-		exit(faulting_process(2));
-
-	waitpid(pid, &err, 0);
-	if (err)
-		err("faulting process failed");
-	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
-		err("pipe write");
-	if (pthread_join(uffd_mon, (void **)&userfaults))
-		return 1;
-
-	printf("done.\n");
-	if (userfaults)
-		err("Signal test failed, userfaults: %ld", userfaults);
-
-	return userfaults != 0;
-}
-
-void check_memory_contents(char *p)
-{
-	unsigned long i;
-	uint8_t expected_byte;
-	void *expected_page;
-
-	if (posix_memalign(&expected_page, page_size, page_size))
-		err("out of memory");
-
-	for (i = 0; i < nr_pages; ++i) {
-		expected_byte = ~((uint8_t)(i % ((uint8_t)-1)));
-		memset(expected_page, expected_byte, page_size);
-		if (my_bcmp(expected_page, p + (i * page_size), page_size))
-			err("unexpected page contents after minor fault");
-	}
-
-	free(expected_page);
-}
-
-static int userfaultfd_minor_test(void)
-{
-	unsigned long p;
-	struct uffdio_register uffdio_register;
-	pthread_t uffd_mon;
-	char c;
-	struct uffd_stats stats = { 0 };
-
-	if (!test_uffdio_minor)
-		return 0;
-
-	printf("testing minor faults: ");
-	fflush(stdout);
-
-	uffd_test_ctx_init(uffd_minor_feature());
-
-	uffdio_register.range.start = (unsigned long)area_dst_alias;
-	uffdio_register.range.len = nr_pages * page_size;
-	uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
-	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-		err("register failure");
-
-	assert_expected_ioctls_present(
-		uffdio_register.mode, uffdio_register.ioctls);
-
-	/*
-	 * After registering with UFFD, populate the non-UFFD-registered side of
-	 * the shared mapping. This should *not* trigger any UFFD minor faults.
-	 */
-	for (p = 0; p < nr_pages; ++p) {
-		memset(area_dst + (p * page_size), p % ((uint8_t)-1),
-		       page_size);
-	}
-
-	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
-		err("uffd_poll_thread create");
-
-	/*
-	 * Read each of the pages back using the UFFD-registered mapping. We
-	 * expect that the first time we touch a page, it will result in a minor
-	 * fault. uffd_poll_thread will resolve the fault by bit-flipping the
-	 * page's contents, and then issuing a CONTINUE ioctl.
-	 */
-	check_memory_contents(area_dst_alias);
-
-	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
-		err("pipe write");
-	if (pthread_join(uffd_mon, NULL))
-		return 1;
-
-	uffd_stats_report(&stats, 1);
-
-	if (test_collapse) {
-		printf("testing collapse of uffd memory into PMD-mapped THPs:");
-		if (madvise(area_dst_alias, nr_pages * page_size,
-			    MADV_COLLAPSE))
-			err("madvise(MADV_COLLAPSE)");
-
-		uffd_test_ops->check_pmd_mapping(area_dst,
-						 nr_pages * page_size /
-						 hpage_size);
-		/*
-		 * This won't cause uffd-fault - it purely just makes sure there
-		 * was no corruption.
-		 */
-		check_memory_contents(area_dst_alias);
-		printf(" done.\n");
-	}
-
-	return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
-}
-
-#define BIT_ULL(nr)                   (1ULL << (nr))
-#define PM_SOFT_DIRTY                 BIT_ULL(55)
-#define PM_MMAP_EXCLUSIVE             BIT_ULL(56)
-#define PM_UFFD_WP                    BIT_ULL(57)
-#define PM_FILE                       BIT_ULL(61)
-#define PM_SWAP                       BIT_ULL(62)
-#define PM_PRESENT                    BIT_ULL(63)
-
-static int pagemap_open(void)
-{
-	int fd = open("/proc/self/pagemap", O_RDONLY);
-
-	if (fd < 0)
-		err("open pagemap");
-
-	return fd;
-}
-
-static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
-{
-	uint64_t value;
-	int ret;
-
-	ret = pread(fd, &value, sizeof(uint64_t),
-		    ((uint64_t)vaddr >> 12) * sizeof(uint64_t));
-	if (ret != sizeof(uint64_t))
-		err("pread() on pagemap failed");
-
-	return value;
-}
-
-/* This macro let __LINE__ works in err() */
-#define  pagemap_check_wp(value, wp) do {				\
-		if (!!(value & PM_UFFD_WP) != wp)			\
-			err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
-	} while (0)
-
-static int pagemap_test_fork(bool present)
-{
-	pid_t child = fork();
-	uint64_t value;
-	int fd, result;
-
-	if (!child) {
-		/* Open the pagemap fd of the child itself */
-		fd = pagemap_open();
-		value = pagemap_read_vaddr(fd, area_dst);
-		/*
-		 * After fork() uffd-wp bit should be gone as long as we're
-		 * without UFFD_FEATURE_EVENT_FORK
-		 */
-		pagemap_check_wp(value, false);
-		/* Succeed */
-		exit(0);
-	}
-	waitpid(child, &result, 0);
-	return result;
-}
-
-static void userfaultfd_pagemap_test(unsigned int test_pgsize)
-{
-	struct uffdio_register uffdio_register;
-	int pagemap_fd;
-	uint64_t value;
-
-	/* Pagemap tests uffd-wp only */
-	if (!test_uffdio_wp)
-		return;
-
-	/* Not enough memory to test this page size */
-	if (test_pgsize > nr_pages * page_size)
-		return;
-
-	printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
-	/* Flush so it doesn't flush twice in parent/child later */
-	fflush(stdout);
-
-	uffd_test_ctx_init(0);
-
-	if (test_pgsize > page_size) {
-		/* This is a thp test */
-		if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
-			err("madvise(MADV_HUGEPAGE) failed");
-	} else if (test_pgsize == page_size) {
-		/* This is normal page test; force no thp */
-		if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
-			err("madvise(MADV_NOHUGEPAGE) failed");
-	}
-
-	uffdio_register.range.start = (unsigned long) area_dst;
-	uffdio_register.range.len = nr_pages * page_size;
-	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
-	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-		err("register failed");
-
-	pagemap_fd = pagemap_open();
-
-	/* Touch the page */
-	*area_dst = 1;
-	wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
-	value = pagemap_read_vaddr(pagemap_fd, area_dst);
-	pagemap_check_wp(value, true);
-	/* Make sure uffd-wp bit dropped when fork */
-	if (pagemap_test_fork(true))
-		err("Detected stall uffd-wp bit in child");
-
-	/* Exclusive required or PAGEOUT won't work */
-	if (!(value & PM_MMAP_EXCLUSIVE))
-		err("multiple mapping detected: 0x%"PRIx64, value);
-
-	if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
-		err("madvise(MADV_PAGEOUT) failed");
-
-	/* Uffd-wp should persist even swapped out */
-	value = pagemap_read_vaddr(pagemap_fd, area_dst);
-	pagemap_check_wp(value, true);
-	/* Make sure uffd-wp bit dropped when fork */
-	if (pagemap_test_fork(false))
-		err("Detected stall uffd-wp bit in child");
-
-	/* Unprotect; this tests swap pte modifications */
-	wp_range(uffd, (uint64_t)area_dst, page_size, false);
-	value = pagemap_read_vaddr(pagemap_fd, area_dst);
-	pagemap_check_wp(value, false);
-
-	/* Fault in the page from disk */
-	*area_dst = 2;
-	value = pagemap_read_vaddr(pagemap_fd, area_dst);
-	pagemap_check_wp(value, false);
-
-	close(pagemap_fd);
-	printf("done\n");
-}
-
-static int userfaultfd_stress(void)
-{
-	void *area;
-	unsigned long nr;
-	struct uffdio_register uffdio_register;
-	struct uffd_stats uffd_stats[nr_cpus];
-
-	uffd_test_ctx_init(0);
-
-	if (posix_memalign(&area, page_size, page_size))
-		err("out of memory");
-	zeropage = area;
-	bzero(zeropage, page_size);
-
-	pthread_mutex_lock(&uffd_read_mutex);
-
-	pthread_attr_init(&attr);
-	pthread_attr_setstacksize(&attr, 16*1024*1024);
-
-	while (bounces--) {
-		printf("bounces: %d, mode:", bounces);
-		if (bounces & BOUNCE_RANDOM)
-			printf(" rnd");
-		if (bounces & BOUNCE_RACINGFAULTS)
-			printf(" racing");
-		if (bounces & BOUNCE_VERIFY)
-			printf(" ver");
-		if (bounces & BOUNCE_POLL)
-			printf(" poll");
-		else
-			printf(" read");
-		printf(", ");
-		fflush(stdout);
-
-		if (bounces & BOUNCE_POLL)
-			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
-		else
-			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
-
-		/* register */
-		uffdio_register.range.start = (unsigned long) area_dst;
-		uffdio_register.range.len = nr_pages * page_size;
-		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-		if (test_uffdio_wp)
-			uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
-		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-			err("register failure");
-		assert_expected_ioctls_present(
-			uffdio_register.mode, uffdio_register.ioctls);
-
-		if (area_dst_alias) {
-			uffdio_register.range.start = (unsigned long)
-				area_dst_alias;
-			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-				err("register failure alias");
-		}
-
-		/*
-		 * The madvise done previously isn't enough: some
-		 * uffd_thread could have read userfaults (one of
-		 * those already resolved by the background thread)
-		 * and it may be in the process of calling
-		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
-		 * area_src and it would map a zero page in it (of
-		 * course such a UFFDIO_COPY is perfectly safe as it'd
-		 * return -EEXIST). The problem comes at the next
-		 * bounce though: that racing UFFDIO_COPY would
-		 * generate zeropages in the area_src, so invalidating
-		 * the previous MADV_DONTNEED. Without this additional
-		 * MADV_DONTNEED those zeropages leftovers in the
-		 * area_src would lead to -EEXIST failure during the
-		 * next bounce, effectively leaving a zeropage in the
-		 * area_dst.
-		 *
-		 * Try to comment this out madvise to see the memory
-		 * corruption being caught pretty quick.
-		 *
-		 * khugepaged is also inhibited to collapse THP after
-		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
-		 * required to MADV_DONTNEED here.
-		 */
-		uffd_test_ops->release_pages(area_dst);
-
-		uffd_stats_reset(uffd_stats, nr_cpus);
-
-		/* bounce pass */
-		if (stress(uffd_stats))
-			return 1;
-
-		/* Clear all the write protections if there is any */
-		if (test_uffdio_wp)
-			wp_range(uffd, (unsigned long)area_dst,
-				 nr_pages * page_size, false);
-
-		/* unregister */
-		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
-			err("unregister failure");
-		if (area_dst_alias) {
-			uffdio_register.range.start = (unsigned long) area_dst;
-			if (ioctl(uffd, UFFDIO_UNREGISTER,
-				  &uffdio_register.range))
-				err("unregister failure alias");
-		}
-
-		/* verification */
-		if (bounces & BOUNCE_VERIFY)
-			for (nr = 0; nr < nr_pages; nr++)
-				if (*area_count(area_dst, nr) != count_verify[nr])
-					err("error area_count %llu %llu %lu\n",
-					    *area_count(area_src, nr),
-					    count_verify[nr], nr);
-
-		/* prepare next bounce */
-		swap(area_src, area_dst);
-
-		swap(area_src_alias, area_dst_alias);
-
-		uffd_stats_report(uffd_stats, nr_cpus);
-	}
-
-	if (test_type == TEST_ANON) {
-		/*
-		 * shmem/hugetlb won't be able to run since they have different
-		 * behavior on fork() (file-backed memory normally drops ptes
-		 * directly when fork), meanwhile the pagemap test will verify
-		 * pgtable entry of fork()ed child.
-		 */
-		userfaultfd_pagemap_test(page_size);
-		/*
-		 * Hard-code for x86_64 for now for 2M THP, as x86_64 is
-		 * currently the only one that supports uffd-wp
-		 */
-		userfaultfd_pagemap_test(page_size * 512);
-	}
-
-	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
-		|| userfaultfd_events_test() || userfaultfd_minor_test();
-}
-
-/*
- * Copied from mlock2-tests.c
- */
-unsigned long default_huge_page_size(void)
-{
-	unsigned long hps = 0;
-	char *line = NULL;
-	size_t linelen = 0;
-	FILE *f = fopen("/proc/meminfo", "r");
-
-	if (!f)
-		return 0;
-	while (getline(&line, &linelen, f) > 0) {
-		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
-			hps <<= 10;
-			break;
-		}
-	}
-
-	free(line);
-	fclose(f);
-	return hps;
-}
-
-static void set_test_type(const char *type)
-{
-	if (!strcmp(type, "anon")) {
-		test_type = TEST_ANON;
-		uffd_test_ops = &anon_uffd_test_ops;
-	} else if (!strcmp(type, "hugetlb")) {
-		test_type = TEST_HUGETLB;
-		uffd_test_ops = &hugetlb_uffd_test_ops;
-	} else if (!strcmp(type, "hugetlb_shared")) {
-		map_shared = true;
-		test_type = TEST_HUGETLB;
-		uffd_test_ops = &hugetlb_uffd_test_ops;
-		/* Minor faults require shared hugetlb; only enable here. */
-		test_uffdio_minor = true;
-	} else if (!strcmp(type, "shmem")) {
-		map_shared = true;
-		test_type = TEST_SHMEM;
-		uffd_test_ops = &shmem_uffd_test_ops;
-		test_uffdio_minor = true;
-	}
-}
-
-static void parse_test_type_arg(const char *raw_type)
-{
-	char *buf = strdup(raw_type);
-	uint64_t features = UFFD_API_FEATURES;
-
-	while (buf) {
-		const char *token = strsep(&buf, ":");
-
-		if (!test_type)
-			set_test_type(token);
-		else if (!strcmp(token, "dev"))
-			test_dev_userfaultfd = true;
-		else if (!strcmp(token, "syscall"))
-			test_dev_userfaultfd = false;
-		else if (!strcmp(token, "collapse"))
-			test_collapse = true;
-		else
-			err("unrecognized test mod '%s'", token);
-	}
-
-	if (!test_type)
-		err("failed to parse test type argument: '%s'", raw_type);
-
-	if (test_collapse && test_type != TEST_SHMEM)
-		err("Unsupported test: %s", raw_type);
-
-	if (test_type == TEST_HUGETLB)
-		page_size = hpage_size;
-	else
-		page_size = sysconf(_SC_PAGE_SIZE);
-
-	if (!page_size)
-		err("Unable to determine page size");
-	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
-	    > page_size)
-		err("Impossible to run this test");
-
-	/*
-	 * Whether we can test certain features depends not just on test type,
-	 * but also on whether or not this particular kernel supports the
-	 * feature.
-	 */
-
-	userfaultfd_open(&features);
-
-	test_uffdio_wp = test_uffdio_wp &&
-		(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
-	test_uffdio_minor = test_uffdio_minor &&
-		(features & uffd_minor_feature());
-
-	close(uffd);
-	uffd = -1;
-}
-
-static void sigalrm(int sig)
-{
-	if (sig != SIGALRM)
-		abort();
-	test_uffdio_copy_eexist = true;
-	test_uffdio_zeropage_eexist = true;
-	alarm(ALARM_INTERVAL_SECS);
-}
-
-int main(int argc, char **argv)
-{
-	size_t bytes;
-
-	if (argc < 4)
-		usage();
-
-	if (signal(SIGALRM, sigalrm) == SIG_ERR)
-		err("failed to arm SIGALRM");
-	alarm(ALARM_INTERVAL_SECS);
-
-	hpage_size = default_huge_page_size();
-	parse_test_type_arg(argv[1]);
-	bytes = atol(argv[2]) * 1024 * 1024;
-
-	if (test_collapse && bytes & (hpage_size - 1))
-		err("MiB must be multiple of %lu if :collapse mod set",
-		    hpage_size >> 20);
-
-	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-
-	if (test_collapse) {
-		/* nr_cpus must divide (bytes / page_size), otherwise,
-		 * area allocations of (nr_pages * paze_size) won't be a
-		 * multiple of hpage_size, even if bytes is a multiple of
-		 * hpage_size.
-		 *
-		 * This means that nr_cpus must divide (N * (2 << (H-P))
-		 * where:
-		 *	bytes = hpage_size * N
-		 *	hpage_size = 2 << H
-		 *	page_size = 2 << P
-		 *
-		 * And we want to chose nr_cpus to be the largest value
-		 * satisfying this constraint, not larger than the number
-		 * of online CPUs. Unfortunately, prime factorization of
-		 * N and nr_cpus may be arbitrary, so have to search for it.
-		 * Instead, just use the highest power of 2 dividing both
-		 * nr_cpus and (bytes / page_size).
-		 */
-		int x = factor_of_2(nr_cpus);
-		int y = factor_of_2(bytes / page_size);
-
-		nr_cpus = x < y ? x : y;
-	}
-	nr_pages_per_cpu = bytes / page_size / nr_cpus;
-	if (!nr_pages_per_cpu) {
-		_err("invalid MiB");
-		usage();
-	}
-
-	bounces = atoi(argv[3]);
-	if (bounces <= 0) {
-		_err("invalid bounces");
-		usage();
-	}
-	nr_pages = nr_pages_per_cpu * nr_cpus;
-
-	if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) {
-		unsigned int memfd_flags = 0;
-
-		if (test_type == TEST_HUGETLB)
-			memfd_flags = MFD_HUGETLB;
-		mem_fd = memfd_create(argv[0], memfd_flags);
-		if (mem_fd < 0)
-			err("memfd_create");
-		if (ftruncate(mem_fd, nr_pages * page_size * 2))
-			err("ftruncate");
-		if (fallocate(mem_fd,
-			      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
-			      nr_pages * page_size * 2))
-			err("fallocate");
-	}
-	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
-	       nr_pages, nr_pages_per_cpu);
-	return userfaultfd_stress();
-}
-
-#else /* __NR_userfaultfd */
-
-#warning "missing __NR_userfaultfd definition"
-
-int main(void)
-{
-	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
-	return KSFT_SKIP;
-}
-
-#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/vm/util.h
deleted file mode 100644
index b27d26199334..000000000000
--- a/tools/testing/selftests/vm/util.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __KSELFTEST_VM_UTIL_H
-#define __KSELFTEST_VM_UTIL_H
-
-#include <stdint.h>
-#include <sys/mman.h>
-#include <err.h>
-#include <string.h> /* ffsl() */
-#include <unistd.h> /* _SC_PAGESIZE */
-
-static unsigned int __page_size;
-static unsigned int __page_shift;
-
-static inline unsigned int page_size(void)
-{
-	if (!__page_size)
-		__page_size = sysconf(_SC_PAGESIZE);
-	return __page_size;
-}
-
-static inline unsigned int page_shift(void)
-{
-	if (!__page_shift)
-		__page_shift = (ffsl(page_size()) - 1);
-	return __page_shift;
-}
-
-#define PAGE_SHIFT	(page_shift())
-#define PAGE_SIZE	(page_size())
-/*
- * On ppc64 this will only work with radix 2M hugepage size
- */
-#define HPAGE_SHIFT 21
-#define HPAGE_SIZE (1 << HPAGE_SHIFT)
-
-#define PAGEMAP_PRESENT(ent)	(((ent) & (1ull << 63)) != 0)
-#define PAGEMAP_PFN(ent)	((ent) & ((1ull << 55) - 1))
-
-
-static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd)
-{
-	uint64_t ent[2];
-
-	/* drop pmd */
-	if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
-		 MAP_FIXED | MAP_ANONYMOUS |
-		 MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
-		errx(2, "mmap transhuge");
-
-	if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
-		err(2, "MADV_HUGEPAGE");
-
-	/* allocate transparent huge page */
-	*(volatile void **)ptr = ptr;
-
-	if (pread(pagemap_fd, ent, sizeof(ent),
-		  (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
-		err(2, "read pagemap");
-
-	if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
-	    PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
-	    !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
-		return PAGEMAP_PFN(ent[0]);
-
-	return -1;
-}
-
-#endif
diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c
deleted file mode 100644
index 1d2068989883..000000000000
--- a/tools/testing/selftests/vm/va_128TBswitch.c
+++ /dev/null
@@ -1,289 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *
- * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
- * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- */
-
-#include <stdio.h>
-#include <sys/mman.h>
-#include <string.h>
-
-#include "../kselftest.h"
-
-#ifdef __powerpc64__
-#define PAGE_SIZE	(64 << 10)
-/*
- * This will work with 16M and 2M hugepage size
- */
-#define HUGETLB_SIZE	(16 << 20)
-#else
-#define PAGE_SIZE	(4 << 10)
-#define HUGETLB_SIZE	(2 << 20)
-#endif
-
-/*
- * >= 128TB is the hint addr value we used to select
- * large address space.
- */
-#define ADDR_SWITCH_HINT (1UL << 47)
-#define LOW_ADDR	((void *) (1UL << 30))
-#define HIGH_ADDR	((void *) (1UL << 48))
-
-struct testcase {
-	void *addr;
-	unsigned long size;
-	unsigned long flags;
-	const char *msg;
-	unsigned int low_addr_required:1;
-	unsigned int keep_mapped:1;
-};
-
-static struct testcase testcases[] = {
-	{
-		/*
-		 * If stack is moved, we could possibly allocate
-		 * this at the requested address.
-		 */
-		.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
-		.size = PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
-		.low_addr_required = 1,
-	},
-	{
-		/*
-		 * We should never allocate at the requested address or above it
-		 * The len cross the 128TB boundary. Without MAP_FIXED
-		 * we will always search in the lower address space.
-		 */
-		.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))",
-		.low_addr_required = 1,
-	},
-	{
-		/*
-		 * Exact mapping at 128TB, the area is free we should get that
-		 * even without MAP_FIXED.
-		 */
-		.addr = ((void *)(ADDR_SWITCH_HINT)),
-		.size = PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
-		.keep_mapped = 1,
-	},
-	{
-		.addr = (void *)(ADDR_SWITCH_HINT),
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-		.msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
-	},
-	{
-		.addr = NULL,
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(NULL)",
-		.low_addr_required = 1,
-	},
-	{
-		.addr = LOW_ADDR,
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(LOW_ADDR)",
-		.low_addr_required = 1,
-	},
-	{
-		.addr = HIGH_ADDR,
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(HIGH_ADDR)",
-		.keep_mapped = 1,
-	},
-	{
-		.addr = HIGH_ADDR,
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(HIGH_ADDR) again",
-		.keep_mapped = 1,
-	},
-	{
-		.addr = HIGH_ADDR,
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-		.msg = "mmap(HIGH_ADDR, MAP_FIXED)",
-	},
-	{
-		.addr = (void *) -1,
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(-1)",
-		.keep_mapped = 1,
-	},
-	{
-		.addr = (void *) -1,
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(-1) again",
-	},
-	{
-		.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
-		.size = PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
-		.low_addr_required = 1,
-	},
-	{
-		.addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)",
-		.low_addr_required = 1,
-		.keep_mapped = 1,
-	},
-	{
-		.addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2),
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)",
-		.low_addr_required = 1,
-		.keep_mapped = 1,
-	},
-	{
-		.addr = ((void *)(ADDR_SWITCH_HINT)),
-		.size = PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
-	},
-	{
-		.addr = (void *)(ADDR_SWITCH_HINT),
-		.size = 2 * PAGE_SIZE,
-		.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-		.msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
-	},
-};
-
-static struct testcase hugetlb_testcases[] = {
-	{
-		.addr = NULL,
-		.size = HUGETLB_SIZE,
-		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(NULL, MAP_HUGETLB)",
-		.low_addr_required = 1,
-	},
-	{
-		.addr = LOW_ADDR,
-		.size = HUGETLB_SIZE,
-		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
-		.low_addr_required = 1,
-	},
-	{
-		.addr = HIGH_ADDR,
-		.size = HUGETLB_SIZE,
-		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
-		.keep_mapped = 1,
-	},
-	{
-		.addr = HIGH_ADDR,
-		.size = HUGETLB_SIZE,
-		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
-		.keep_mapped = 1,
-	},
-	{
-		.addr = HIGH_ADDR,
-		.size = HUGETLB_SIZE,
-		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-		.msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
-	},
-	{
-		.addr = (void *) -1,
-		.size = HUGETLB_SIZE,
-		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(-1, MAP_HUGETLB)",
-		.keep_mapped = 1,
-	},
-	{
-		.addr = (void *) -1,
-		.size = HUGETLB_SIZE,
-		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(-1, MAP_HUGETLB) again",
-	},
-	{
-		.addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
-		.size = 2 * HUGETLB_SIZE,
-		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)",
-		.low_addr_required = 1,
-		.keep_mapped = 1,
-	},
-	{
-		.addr = (void *)(ADDR_SWITCH_HINT),
-		.size = 2 * HUGETLB_SIZE,
-		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-		.msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)",
-	},
-};
-
-static int run_test(struct testcase *test, int count)
-{
-	void *p;
-	int i, ret = KSFT_PASS;
-
-	for (i = 0; i < count; i++) {
-		struct testcase *t = test + i;
-
-		p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0);
-
-		printf("%s: %p - ", t->msg, p);
-
-		if (p == MAP_FAILED) {
-			printf("FAILED\n");
-			ret = KSFT_FAIL;
-			continue;
-		}
-
-		if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) {
-			printf("FAILED\n");
-			ret = KSFT_FAIL;
-		} else {
-			/*
-			 * Do a dereference of the address returned so that we catch
-			 * bugs in page fault handling
-			 */
-			memset(p, 0, t->size);
-			printf("OK\n");
-		}
-		if (!t->keep_mapped)
-			munmap(p, t->size);
-	}
-
-	return ret;
-}
-
-static int supported_arch(void)
-{
-#if defined(__powerpc64__)
-	return 1;
-#elif defined(__x86_64__)
-	return 1;
-#else
-	return 0;
-#endif
-}
-
-int main(int argc, char **argv)
-{
-	int ret;
-
-	if (!supported_arch())
-		return KSFT_SKIP;
-
-	ret = run_test(testcases, ARRAY_SIZE(testcases));
-	if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
-		ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases));
-	return ret;
-}
diff --git a/tools/testing/selftests/vm/va_128TBswitch.sh b/tools/testing/selftests/vm/va_128TBswitch.sh
deleted file mode 100755
index 41580751dc51..000000000000
--- a/tools/testing/selftests/vm/va_128TBswitch.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Copyright (C) 2022 Adam Sindelar (Meta) <adam@wowsignal.io>
-#
-# This is a test for mmap behavior with 5-level paging. This script wraps the
-# real test to check that the kernel is configured to support at least 5
-# pagetable levels.
-
-# 1 means the test failed
-exitcode=1
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-fail()
-{
-	echo "$1"
-	exit $exitcode
-}
-
-check_supported_x86_64()
-{
-	local config="/proc/config.gz"
-	[[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
-	[[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot"
-
-	# gzip -dcfq automatically handles both compressed and plaintext input.
-	# See man 1 gzip under '-f'.
-	local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
-
-	if [[ "${pg_table_levels}" -lt 5 ]]; then
-		echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
-		exit $ksft_skip
-	fi
-}
-
-check_test_requirements()
-{
-	# The test supports x86_64 and powerpc64. We currently have no useful
-	# eligibility check for powerpc64, and the test itself will reject other
-	# architectures.
-	case `uname -m` in
-		"x86_64")
-			check_supported_x86_64
-		;;
-		*)
-			return 0
-		;;
-	esac
-}
-
-check_test_requirements
-./va_128TBswitch
diff --git a/tools/testing/selftests/vm/virtual_address_range.c b/tools/testing/selftests/vm/virtual_address_range.c
deleted file mode 100644
index c0592646ed93..000000000000
--- a/tools/testing/selftests/vm/virtual_address_range.c
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2017, Anshuman Khandual, IBM Corp.
- *
- * Works on architectures which support 128TB virtual
- * address range and beyond.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <errno.h>
-#include <sys/mman.h>
-#include <sys/time.h>
-
-/*
- * Maximum address range mapped with a single mmap()
- * call is little bit more than 16GB. Hence 16GB is
- * chosen as the single chunk size for address space
- * mapping.
- */
-#define MAP_CHUNK_SIZE   17179869184UL /* 16GB */
-
-/*
- * Address space till 128TB is mapped without any hint
- * and is enabled by default. Address space beyond 128TB
- * till 512TB is obtained by passing hint address as the
- * first argument into mmap() system call.
- *
- * The process heap address space is divided into two
- * different areas one below 128TB and one above 128TB
- * till it reaches 512TB. One with size 128TB and the
- * other being 384TB.
- *
- * On Arm64 the address space is 256TB and no high mappings
- * are supported so far.
- */
-
-#define NR_CHUNKS_128TB   8192UL /* Number of 16GB chunks for 128TB */
-#define NR_CHUNKS_256TB   (NR_CHUNKS_128TB * 2UL)
-#define NR_CHUNKS_384TB   (NR_CHUNKS_128TB * 3UL)
-
-#define ADDR_MARK_128TB  (1UL << 47) /* First address beyond 128TB */
-#define ADDR_MARK_256TB  (1UL << 48) /* First address beyond 256TB */
-
-#ifdef __aarch64__
-#define HIGH_ADDR_MARK  ADDR_MARK_256TB
-#define HIGH_ADDR_SHIFT 49
-#define NR_CHUNKS_LOW   NR_CHUNKS_256TB
-#define NR_CHUNKS_HIGH  0
-#else
-#define HIGH_ADDR_MARK  ADDR_MARK_128TB
-#define HIGH_ADDR_SHIFT 48
-#define NR_CHUNKS_LOW   NR_CHUNKS_128TB
-#define NR_CHUNKS_HIGH  NR_CHUNKS_384TB
-#endif
-
-static char *hind_addr(void)
-{
-	int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
-
-	return (char *) (1UL << bits);
-}
-
-static int validate_addr(char *ptr, int high_addr)
-{
-	unsigned long addr = (unsigned long) ptr;
-
-	if (high_addr) {
-		if (addr < HIGH_ADDR_MARK) {
-			printf("Bad address %lx\n", addr);
-			return 1;
-		}
-		return 0;
-	}
-
-	if (addr > HIGH_ADDR_MARK) {
-		printf("Bad address %lx\n", addr);
-		return 1;
-	}
-	return 0;
-}
-
-static int validate_lower_address_hint(void)
-{
-	char *ptr;
-
-	ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ |
-			PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-	if (ptr == MAP_FAILED)
-		return 0;
-
-	return 1;
-}
-
-int main(int argc, char *argv[])
-{
-	char *ptr[NR_CHUNKS_LOW];
-	char *hptr[NR_CHUNKS_HIGH];
-	char *hint;
-	unsigned long i, lchunks, hchunks;
-
-	for (i = 0; i < NR_CHUNKS_LOW; i++) {
-		ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
-					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-		if (ptr[i] == MAP_FAILED) {
-			if (validate_lower_address_hint())
-				return 1;
-			break;
-		}
-
-		if (validate_addr(ptr[i], 0))
-			return 1;
-	}
-	lchunks = i;
-
-	for (i = 0; i < NR_CHUNKS_HIGH; i++) {
-		hint = hind_addr();
-		hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
-					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-		if (hptr[i] == MAP_FAILED)
-			break;
-
-		if (validate_addr(hptr[i], 1))
-			return 1;
-	}
-	hchunks = i;
-
-	for (i = 0; i < lchunks; i++)
-		munmap(ptr[i], MAP_CHUNK_SIZE);
-
-	for (i = 0; i < hchunks; i++)
-		munmap(hptr[i], MAP_CHUNK_SIZE);
-
-	return 0;
-}
diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c
deleted file mode 100644
index 40e795624ff3..000000000000
--- a/tools/testing/selftests/vm/vm_util.c
+++ /dev/null
@@ -1,151 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <string.h>
-#include <fcntl.h>
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
-#define SMAP_FILE_PATH "/proc/self/smaps"
-#define MAX_LINE_LENGTH 500
-
-uint64_t pagemap_get_entry(int fd, char *start)
-{
-	const unsigned long pfn = (unsigned long)start / getpagesize();
-	uint64_t entry;
-	int ret;
-
-	ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry));
-	if (ret != sizeof(entry))
-		ksft_exit_fail_msg("reading pagemap failed\n");
-	return entry;
-}
-
-bool pagemap_is_softdirty(int fd, char *start)
-{
-	uint64_t entry = pagemap_get_entry(fd, start);
-
-	// Check if dirty bit (55th bit) is set
-	return entry & 0x0080000000000000ull;
-}
-
-bool pagemap_is_swapped(int fd, char *start)
-{
-	uint64_t entry = pagemap_get_entry(fd, start);
-
-	return entry & 0x4000000000000000ull;
-}
-
-bool pagemap_is_populated(int fd, char *start)
-{
-	uint64_t entry = pagemap_get_entry(fd, start);
-
-	/* Present or swapped. */
-	return entry & 0xc000000000000000ull;
-}
-
-unsigned long pagemap_get_pfn(int fd, char *start)
-{
-	uint64_t entry = pagemap_get_entry(fd, start);
-
-	/* If present (63th bit), PFN is at bit 0 -- 54. */
-	if (entry & 0x8000000000000000ull)
-		return entry & 0x007fffffffffffffull;
-	return -1ul;
-}
-
-void clear_softdirty(void)
-{
-	int ret;
-	const char *ctrl = "4";
-	int fd = open("/proc/self/clear_refs", O_WRONLY);
-
-	if (fd < 0)
-		ksft_exit_fail_msg("opening clear_refs failed\n");
-	ret = write(fd, ctrl, strlen(ctrl));
-	close(fd);
-	if (ret != strlen(ctrl))
-		ksft_exit_fail_msg("writing clear_refs failed\n");
-}
-
-bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len)
-{
-	while (fgets(buf, len, fp)) {
-		if (!strncmp(buf, pattern, strlen(pattern)))
-			return true;
-	}
-	return false;
-}
-
-uint64_t read_pmd_pagesize(void)
-{
-	int fd;
-	char buf[20];
-	ssize_t num_read;
-
-	fd = open(PMD_SIZE_FILE_PATH, O_RDONLY);
-	if (fd == -1)
-		ksft_exit_fail_msg("Open hpage_pmd_size failed\n");
-
-	num_read = read(fd, buf, 19);
-	if (num_read < 1) {
-		close(fd);
-		ksft_exit_fail_msg("Read hpage_pmd_size failed\n");
-	}
-	buf[num_read] = '\0';
-	close(fd);
-
-	return strtoul(buf, NULL, 10);
-}
-
-bool __check_huge(void *addr, char *pattern, int nr_hpages,
-		  uint64_t hpage_size)
-{
-	uint64_t thp = -1;
-	int ret;
-	FILE *fp;
-	char buffer[MAX_LINE_LENGTH];
-	char addr_pattern[MAX_LINE_LENGTH];
-
-	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
-		       (unsigned long) addr);
-	if (ret >= MAX_LINE_LENGTH)
-		ksft_exit_fail_msg("%s: Pattern is too long\n", __func__);
-
-	fp = fopen(SMAP_FILE_PATH, "r");
-	if (!fp)
-		ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH);
-
-	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
-		goto err_out;
-
-	/*
-	 * Fetch the pattern in the same block and check the number of
-	 * hugepages.
-	 */
-	if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer)))
-		goto err_out;
-
-	snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern);
-
-	if (sscanf(buffer, addr_pattern, &thp) != 1)
-		ksft_exit_fail_msg("Reading smap error\n");
-
-err_out:
-	fclose(fp);
-	return thp == (nr_hpages * (hpage_size >> 10));
-}
-
-bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size)
-{
-	return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size);
-}
-
-bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size)
-{
-	return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size);
-}
-
-bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size)
-{
-	return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size);
-}
diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h
deleted file mode 100644
index 1995ee911ef2..000000000000
--- a/tools/testing/selftests/vm/vm_util.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <stdint.h>
-#include <stdbool.h>
-
-uint64_t pagemap_get_entry(int fd, char *start);
-bool pagemap_is_softdirty(int fd, char *start);
-bool pagemap_is_swapped(int fd, char *start);
-bool pagemap_is_populated(int fd, char *start);
-unsigned long pagemap_get_pfn(int fd, char *start);
-void clear_softdirty(void);
-bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
-uint64_t read_pmd_pagesize(void);
-bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
-bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
-bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/vm/write_hugetlb_memory.sh
deleted file mode 100644
index 70a02301f4c2..000000000000
--- a/tools/testing/selftests/vm/write_hugetlb_memory.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-set -e
-
-size=$1
-populate=$2
-write=$3
-cgroup=$4
-path=$5
-method=$6
-private=$7
-want_sleep=$8
-reserve=$9
-
-echo "Putting task in cgroup '$cgroup'"
-echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs
-
-echo "Method is $method"
-
-set +e
-./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \
-      "$private" "$want_sleep" "$reserve"
diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/vm/write_to_hugetlbfs.c
deleted file mode 100644
index 6a2caba19ee1..000000000000
--- a/tools/testing/selftests/vm/write_to_hugetlbfs.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This program reserves and uses hugetlb memory, supporting a bunch of
- * scenarios needed by the charged_reserved_hugetlb.sh test.
- */
-
-#include <err.h>
-#include <errno.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/shm.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-/* Global definitions. */
-enum method {
-	HUGETLBFS,
-	MMAP_MAP_HUGETLB,
-	SHM,
-	MAX_METHOD
-};
-
-
-/* Global variables. */
-static const char *self;
-static char *shmaddr;
-static int shmid;
-
-/*
- * Show usage and exit.
- */
-static void exit_usage(void)
-{
-	printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> "
-	       "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] "
-	       "[-o] [-w] [-n]\n",
-	       self);
-	exit(EXIT_FAILURE);
-}
-
-void sig_handler(int signo)
-{
-	printf("Received %d.\n", signo);
-	if (signo == SIGINT) {
-		printf("Deleting the memory\n");
-		if (shmdt((const void *)shmaddr) != 0) {
-			perror("Detach failure");
-			shmctl(shmid, IPC_RMID, NULL);
-			exit(4);
-		}
-
-		shmctl(shmid, IPC_RMID, NULL);
-		printf("Done deleting the memory\n");
-	}
-	exit(2);
-}
-
-int main(int argc, char **argv)
-{
-	int fd = 0;
-	int key = 0;
-	int *ptr = NULL;
-	int c = 0;
-	int size = 0;
-	char path[256] = "";
-	enum method method = MAX_METHOD;
-	int want_sleep = 0, private = 0;
-	int populate = 0;
-	int write = 0;
-	int reserve = 1;
-
-	if (signal(SIGINT, sig_handler) == SIG_ERR)
-		err(1, "\ncan't catch SIGINT\n");
-
-	/* Parse command-line arguments. */
-	setvbuf(stdout, NULL, _IONBF, 0);
-	self = argv[0];
-
-	while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) {
-		switch (c) {
-		case 's':
-			size = atoi(optarg);
-			break;
-		case 'p':
-			strncpy(path, optarg, sizeof(path));
-			break;
-		case 'm':
-			if (atoi(optarg) >= MAX_METHOD) {
-				errno = EINVAL;
-				perror("Invalid -m.");
-				exit_usage();
-			}
-			method = atoi(optarg);
-			break;
-		case 'o':
-			populate = 1;
-			break;
-		case 'w':
-			write = 1;
-			break;
-		case 'l':
-			want_sleep = 1;
-			break;
-		case 'r':
-		    private
-			= 1;
-			break;
-		case 'n':
-			reserve = 0;
-			break;
-		default:
-			errno = EINVAL;
-			perror("Invalid arg");
-			exit_usage();
-		}
-	}
-
-	if (strncmp(path, "", sizeof(path)) != 0) {
-		printf("Writing to this path: %s\n", path);
-	} else {
-		errno = EINVAL;
-		perror("path not found");
-		exit_usage();
-	}
-
-	if (size != 0) {
-		printf("Writing this size: %d\n", size);
-	} else {
-		errno = EINVAL;
-		perror("size not found");
-		exit_usage();
-	}
-
-	if (!populate)
-		printf("Not populating.\n");
-	else
-		printf("Populating.\n");
-
-	if (!write)
-		printf("Not writing to memory.\n");
-
-	if (method == MAX_METHOD) {
-		errno = EINVAL;
-		perror("-m Invalid");
-		exit_usage();
-	} else
-		printf("Using method=%d\n", method);
-
-	if (!private)
-		printf("Shared mapping.\n");
-	else
-		printf("Private mapping.\n");
-
-	if (!reserve)
-		printf("NO_RESERVE mapping.\n");
-	else
-		printf("RESERVE mapping.\n");
-
-	switch (method) {
-	case HUGETLBFS:
-		printf("Allocating using HUGETLBFS.\n");
-		fd = open(path, O_CREAT | O_RDWR, 0777);
-		if (fd == -1)
-			err(1, "Failed to open file.");
-
-		ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
-			   (private ? MAP_PRIVATE : MAP_SHARED) |
-				   (populate ? MAP_POPULATE : 0) |
-				   (reserve ? 0 : MAP_NORESERVE),
-			   fd, 0);
-
-		if (ptr == MAP_FAILED) {
-			close(fd);
-			err(1, "Error mapping the file");
-		}
-		break;
-	case MMAP_MAP_HUGETLB:
-		printf("Allocating using MAP_HUGETLB.\n");
-		ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
-			   (private ? (MAP_PRIVATE | MAP_ANONYMOUS) :
-				      MAP_SHARED) |
-				   MAP_HUGETLB | (populate ? MAP_POPULATE : 0) |
-				   (reserve ? 0 : MAP_NORESERVE),
-			   -1, 0);
-
-		if (ptr == MAP_FAILED)
-			err(1, "mmap");
-
-		printf("Returned address is %p\n", ptr);
-		break;
-	case SHM:
-		printf("Allocating using SHM.\n");
-		shmid = shmget(key, size,
-			       SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
-		if (shmid < 0) {
-			shmid = shmget(++key, size,
-				       SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
-			if (shmid < 0)
-				err(1, "shmget");
-		}
-		printf("shmid: 0x%x, shmget key:%d\n", shmid, key);
-
-		ptr = shmat(shmid, NULL, 0);
-		if (ptr == (int *)-1) {
-			perror("Shared memory attach failure");
-			shmctl(shmid, IPC_RMID, NULL);
-			exit(2);
-		}
-		printf("shmaddr: %p\n", ptr);
-
-		break;
-	default:
-		errno = EINVAL;
-		err(1, "Invalid method.");
-	}
-
-	if (write) {
-		printf("Writing to memory.\n");
-		memset(ptr, 1, size);
-	}
-
-	if (want_sleep) {
-		/* Signal to caller that we're done. */
-		printf("DONE\n");
-
-		/* Hold memory until external kill signal is delivered. */
-		while (1)
-			sleep(100);
-	}
-
-	if (method == HUGETLBFS)
-		close(fd);
-
-	return 0;
-}
-- 
cgit 


From da0618c146ca0e1412173a8a229dd737a73b1a4f Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Mon, 2 Jan 2023 16:11:26 +0000
Subject: selftest/vm: add mremap expand merge offset test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a test to assert that we can mremap() and expand a mapping starting
from an offset within an existing mapping.  We unmap the last page in a 3
page mapping to ensure that the remap should always succeed, before
remapping from the 2nd page.

This is additionally a regression test for the issue solved in "mm,
mremap: fix mremap() expanding vma with addr inside vma" and confirmed to
fail prior to the change and pass after it.

Finally, this patch updates the existing mremap expand merge test to check
error conditions and reduce code duplication between the two tests.

[lstoakes@gmail.com: increment num_expand_tests so test doesn't complain about unexpected tests being run]
  Link: https://lkml.kernel.org/r/8ff3ba3cadc0b6c1b2688ae5c851bf73aa062d57.1673701836.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/02b117a8ffd52acc01dc66c2fb39754f08d92c0e.1672675824.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jakub Matěna <matenajakub@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/mremap_test.c | 119 +++++++++++++++++++++++++------
 1 file changed, 96 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
index 9496346973d4..5c3773de9f0f 100644
--- a/tools/testing/selftests/mm/mremap_test.c
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -119,47 +119,109 @@ static unsigned long long get_mmap_min_addr(void)
 }
 
 /*
- * This test validates that merge is called when expanding a mapping.
- * Mapping containing three pages is created, middle page is unmapped
- * and then the mapping containing the first page is expanded so that
- * it fills the created hole. The two parts should merge creating
- * single mapping with three pages.
+ * Using /proc/self/maps, assert that the specified address range is contained
+ * within a single mapping.
  */
-static void mremap_expand_merge(unsigned long page_size)
+static bool is_range_mapped(FILE *maps_fp, void *start, void *end)
 {
-	char *test_name = "mremap expand merge";
-	FILE *fp;
 	char *line = NULL;
 	size_t len = 0;
 	bool success = false;
-	char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
-			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-	munmap(start + page_size, page_size);
-	mremap(start, page_size, 2 * page_size, 0);
 
-	fp = fopen("/proc/self/maps", "r");
-	if (fp == NULL) {
-		ksft_test_result_fail("%s\n", test_name);
-		return;
-	}
+	rewind(maps_fp);
 
-	while (getline(&line, &len, fp) != -1) {
+	while (getline(&line, &len, maps_fp) != -1) {
 		char *first = strtok(line, "- ");
 		void *first_val = (void *)strtol(first, NULL, 16);
 		char *second = strtok(NULL, "- ");
 		void *second_val = (void *) strtol(second, NULL, 16);
 
-		if (first_val == start && second_val == start + 3 * page_size) {
+		if (first_val <= start && second_val >= end) {
 			success = true;
 			break;
 		}
 	}
+
+	return success;
+}
+
+/*
+ * This test validates that merge is called when expanding a mapping.
+ * Mapping containing three pages is created, middle page is unmapped
+ * and then the mapping containing the first page is expanded so that
+ * it fills the created hole. The two parts should merge creating
+ * single mapping with three pages.
+ */
+static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size)
+{
+	char *test_name = "mremap expand merge";
+	bool success = false;
+	char *remap, *start;
+
+	start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	if (start == MAP_FAILED) {
+		ksft_print_msg("mmap failed: %s\n", strerror(errno));
+		goto out;
+	}
+
+	munmap(start + page_size, page_size);
+	remap = mremap(start, page_size, 2 * page_size, 0);
+	if (remap == MAP_FAILED) {
+		ksft_print_msg("mremap failed: %s\n", strerror(errno));
+		munmap(start, page_size);
+		munmap(start + 2 * page_size, page_size);
+		goto out;
+	}
+
+	success = is_range_mapped(maps_fp, start, start + 3 * page_size);
+	munmap(start, 3 * page_size);
+
+out:
+	if (success)
+		ksft_test_result_pass("%s\n", test_name);
+	else
+		ksft_test_result_fail("%s\n", test_name);
+}
+
+/*
+ * Similar to mremap_expand_merge() except instead of removing the middle page,
+ * we remove the last then attempt to remap offset from the second page. This
+ * should result in the mapping being restored to its former state.
+ */
+static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size)
+{
+
+	char *test_name = "mremap expand merge offset";
+	bool success = false;
+	char *remap, *start;
+
+	start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	if (start == MAP_FAILED) {
+		ksft_print_msg("mmap failed: %s\n", strerror(errno));
+		goto out;
+	}
+
+	/* Unmap final page to ensure we have space to expand. */
+	munmap(start + 2 * page_size, page_size);
+	remap = mremap(start + page_size, page_size, 2 * page_size, 0);
+	if (remap == MAP_FAILED) {
+		ksft_print_msg("mremap failed: %s\n", strerror(errno));
+		munmap(start, 2 * page_size);
+		goto out;
+	}
+
+	success = is_range_mapped(maps_fp, start, start + 3 * page_size);
+	munmap(start, 3 * page_size);
+
+out:
 	if (success)
 		ksft_test_result_pass("%s\n", test_name);
 	else
 		ksft_test_result_fail("%s\n", test_name);
-	fclose(fp);
 }
 
 /*
@@ -380,11 +442,12 @@ int main(int argc, char **argv)
 	int i, run_perf_tests;
 	unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
 	unsigned int pattern_seed;
-	int num_expand_tests = 1;
+	int num_expand_tests = 2;
 	struct test test_cases[MAX_TEST];
 	struct test perf_test_cases[MAX_PERF_TEST];
 	int page_size;
 	time_t t;
+	FILE *maps_fp;
 
 	pattern_seed = (unsigned int) time(&t);
 
@@ -458,7 +521,17 @@ int main(int argc, char **argv)
 		run_mremap_test_case(test_cases[i], &failures, threshold_mb,
 				     pattern_seed);
 
-	mremap_expand_merge(page_size);
+	maps_fp = fopen("/proc/self/maps", "r");
+
+	if (maps_fp == NULL) {
+		ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno));
+		exit(KSFT_FAIL);
+	}
+
+	mremap_expand_merge(maps_fp, page_size);
+	mremap_expand_merge_offset(maps_fp, page_size);
+
+	fclose(maps_fp);
 
 	if (run_perf_tests) {
 		ksft_print_msg("\n%s\n",
-- 
cgit 


From f4d9139f1394cbe2de158ab8771fea4e587004d4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 9 Jan 2023 18:12:55 +0100
Subject: selftests/mm: define MADV_PAGEOUT to fix compilation issues

If MADV_PAGEOUT is not defined (e.g., on AlmaLinux 8), compilation will
fail.  Let's fix that like khugepaged.c does by conditionally defining
MADV_PAGEOUT.

Link: https://lkml.kernel.org/r/20230109171255.488749-1-david@redhat.com
Fixes: 69c66add5663 ("selftests/vm: anon_cow: test COW handling of anonymous memory")
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: Mirsad Goran Todorovac <mirsad.todorovac@alu.unizg.hr>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/cow.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
index 16216d893d96..0eb2e8180aa5 100644
--- a/tools/testing/selftests/mm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -30,6 +30,9 @@
 #include "../kselftest.h"
 #include "vm_util.h"
 
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
 #ifndef MADV_COLLAPSE
 #define MADV_COLLAPSE 25
 #endif
-- 
cgit 


From d526643f155c431e8dfef643195f2d636d4e4bb5 Mon Sep 17 00:00:00 2001
From: Alexander Pantyukhin <apantykhin@gmail.com>
Date: Sun, 8 Jan 2023 15:50:23 +0500
Subject: tools:cgroup:memcg_shrinker remove redundant import

Remove redundant import of the sys module.

Also use the sort function instead of sorted.  It sorts the direct array
without create the new one in memory.

Link: https://lkml.kernel.org/r/20230108105023.4289-1-apantykhin@gmail.com
Signed-off-by: Alexander Pantyukhin <apantykhin@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/cgroup/memcg_shrinker.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/cgroup/memcg_shrinker.py b/tools/cgroup/memcg_shrinker.py
index 706ab27666a4..e81c3017ada9 100644
--- a/tools/cgroup/memcg_shrinker.py
+++ b/tools/cgroup/memcg_shrinker.py
@@ -5,7 +5,6 @@
 
 import os
 import argparse
-import sys
 
 
 def scan_cgroups(cgroup_root):
@@ -44,7 +43,7 @@ def main():
 
     cgroups = scan_cgroups("/sys/fs/cgroup/")
     shrinkers = scan_shrinkers("/sys/kernel/debug/shrinker/")
-    shrinkers = sorted(shrinkers, reverse = True, key = lambda x: x[0])
+    shrinkers.sort(reverse = True, key = lambda x: x[0])
 
     n = 0
     for s in shrinkers:
-- 
cgit 


From 9a3f21fe5cb9f5654ccad7ba712d868f7de66e39 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn@rivosinc.com>
Date: Mon, 9 Jan 2023 12:42:51 +0100
Subject: selftests: vm: enable cross-compilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Selftests vm builds break when doing cross-compilation.  The Makefile
MACHINE variable incorrectly picks upp the host machine architecture.

If the CROSS_COMPILE variable is set, dig out the target host architecture
from CROSS_COMPILE, instead of calling uname.

Link: https://lkml.kernel.org/r/20230109114251.3349638-1-bjorn@kernel.org
Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 6a4b639b2b2b..0a44d77f8437 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -5,7 +5,11 @@ LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h
 
 include local_config.mk
 
+ifeq ($(CROSS_COMPILE),)
 uname_M := $(shell uname -m 2>/dev/null || echo not)
+else
+uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+')
+endif
 MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
 
 # Without this, failed build products remain, with up-to-date timestamps,
-- 
cgit 


From 3c107f36db061603bee7564fbd6388b1f1879fd3 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 18 Jan 2023 10:09:27 +0800
Subject: selftests/net: mv bpf/nat6to4.c to net folder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are some issues with the bpf/nat6to4.c building.

1. It use TEST_CUSTOM_PROGS, which will add the nat6to4.o to
   kselftest-list file and run by common run_tests.
2. When building the test via `make -C tools/testing/selftests/
   TARGETS="net"`, the nat6to4.o will be build in selftests/net/bpf/
   folder. But in test udpgro_frglist.sh it refers to ../bpf/nat6to4.o.
   The correct path should be ./bpf/nat6to4.o.
3. If building the test via `make -C tools/testing/selftests/ TARGETS="net"
   install`. The nat6to4.o will be installed to kselftest_install/net/
   folder. Then the udpgro_frglist.sh should refer to ./nat6to4.o.

To fix the confusing test path, let's just move the nat6to4.c to net folder
and build it as TEST_GEN_FILES.

Fixes: edae34a3ed92 ("selftests net: add UDP GRO fraglist + bpf self-tests")
Tested-by: Björn Töpel <bjorn@kernel.org>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Link: https://lore.kernel.org/r/20230118020927.3971864-1-liuhangbin@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/Makefile          |  50 ++++-
 tools/testing/selftests/net/bpf/Makefile      |  51 -----
 tools/testing/selftests/net/bpf/nat6to4.c     | 285 --------------------------
 tools/testing/selftests/net/nat6to4.c         | 285 ++++++++++++++++++++++++++
 tools/testing/selftests/net/udpgro_frglist.sh |   8 +-
 5 files changed, 337 insertions(+), 342 deletions(-)
 delete mode 100644 tools/testing/selftests/net/bpf/Makefile
 delete mode 100644 tools/testing/selftests/net/bpf/nat6to4.c
 create mode 100644 tools/testing/selftests/net/nat6to4.c

(limited to 'tools')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 3007e98a6d64..47314f0b3006 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -75,14 +75,60 @@ TEST_GEN_PROGS += so_incoming_cpu
 TEST_PROGS += sctp_vrf.sh
 TEST_GEN_FILES += sctp_hello
 TEST_GEN_FILES += csum
+TEST_GEN_FILES += nat6to4.o
 
 TEST_FILES := settings
 
 include ../lib.mk
 
-include bpf/Makefile
-
 $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread
 $(OUTPUT)/tcp_inq: LDLIBS += -lpthread
 $(OUTPUT)/bind_bhash: LDLIBS += -lpthread
+
+# Rules to generate bpf obj nat6to4.o
+CLANG ?= clang
+SCRATCH_DIR := $(OUTPUT)/tools
+BUILD_DIR := $(SCRATCH_DIR)/build
+BPFDIR := $(abspath ../../../lib/bpf)
+APIDIR := $(abspath ../../../include/uapi)
+
+CCINCLUDE += -I../bpf
+CCINCLUDE += -I../../../../usr/include/
+CCINCLUDE += -I$(SCRATCH_DIR)/include
+
+BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
+
+MAKE_DIRS := $(BUILD_DIR)/libbpf
+$(MAKE_DIRS):
+	mkdir -p $@
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '-target bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) $(2) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
+endef
+
+ifneq ($(CROSS_COMPILE),)
+CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif
+
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
+
+$(OUTPUT)/nat6to4.o: nat6to4.c $(BPFOBJ) | $(MAKE_DIRS)
+	$(CLANG) -O2 -target bpf -c $< $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@
+
+$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
+	   $(APIDIR)/linux/bpf.h					       \
+	   | $(BUILD_DIR)/libbpf
+	$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/     \
+		    EXTRA_CFLAGS='-g -O0'				       \
+		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
+
+EXTRA_CLEAN := $(SCRATCH_DIR)
diff --git a/tools/testing/selftests/net/bpf/Makefile b/tools/testing/selftests/net/bpf/Makefile
deleted file mode 100644
index 4abaf16d2077..000000000000
--- a/tools/testing/selftests/net/bpf/Makefile
+++ /dev/null
@@ -1,51 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-CLANG ?= clang
-SCRATCH_DIR := $(OUTPUT)/tools
-BUILD_DIR := $(SCRATCH_DIR)/build
-BPFDIR := $(abspath ../../../lib/bpf)
-APIDIR := $(abspath ../../../include/uapi)
-
-CCINCLUDE += -I../../bpf
-CCINCLUDE += -I../../../../../usr/include/
-CCINCLUDE += -I$(SCRATCH_DIR)/include
-
-BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
-
-MAKE_DIRS := $(BUILD_DIR)/libbpf $(OUTPUT)/bpf
-$(MAKE_DIRS):
-	mkdir -p $@
-
-TEST_CUSTOM_PROGS = $(OUTPUT)/bpf/nat6to4.o
-all: $(TEST_CUSTOM_PROGS)
-
-# Get Clang's default includes on this system, as opposed to those seen by
-# '-target bpf'. This fixes "missing" files on some architectures/distros,
-# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
-#
-# Use '-idirafter': Don't interfere with include mechanics except where the
-# build would have failed anyways.
-define get_sys_includes
-$(shell $(1) $(2) -v -E - </dev/null 2>&1 \
-	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
-$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
-endef
-
-ifneq ($(CROSS_COMPILE),)
-CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
-endif
-
-CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
-
-$(TEST_CUSTOM_PROGS): $(OUTPUT)/%.o: %.c $(BPFOBJ) | $(MAKE_DIRS)
-	$(CLANG) -O2 -target bpf -c $< $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@
-
-$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
-	   $(APIDIR)/linux/bpf.h					       \
-	   | $(BUILD_DIR)/libbpf
-	$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/     \
-		    EXTRA_CFLAGS='-g -O0'				       \
-		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
-
-EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR)
-
diff --git a/tools/testing/selftests/net/bpf/nat6to4.c b/tools/testing/selftests/net/bpf/nat6to4.c
deleted file mode 100644
index ac54c36b25fc..000000000000
--- a/tools/testing/selftests/net/bpf/nat6to4.c
+++ /dev/null
@@ -1,285 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * This code is taken from the Android Open Source Project and the author
- * (Maciej Żenczykowski) has gave permission to relicense it under the
- * GPLv2. Therefore this program is free software;
- * You can redistribute it and/or modify it under the terms of the GNU
- * General Public License version 2 as published by the Free Software
- * Foundation
-
- * The original headers, including the original license headers, are
- * included below for completeness.
- *
- * Copyright (C) 2019 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <linux/bpf.h>
-#include <linux/if.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/in.h>
-#include <linux/in6.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/pkt_cls.h>
-#include <linux/swab.h>
-#include <stdbool.h>
-#include <stdint.h>
-
-
-#include <linux/udp.h>
-
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_endian.h>
-
-#define IP_DF 0x4000  // Flag: "Don't Fragment"
-
-SEC("schedcls/ingress6/nat_6")
-int sched_cls_ingress6_nat_6_prog(struct __sk_buff *skb)
-{
-	const int l2_header_size =  sizeof(struct ethhdr);
-	void *data = (void *)(long)skb->data;
-	const void *data_end = (void *)(long)skb->data_end;
-	const struct ethhdr * const eth = data;  // used iff is_ethernet
-	const struct ipv6hdr * const ip6 =  (void *)(eth + 1);
-
-	// Require ethernet dst mac address to be our unicast address.
-	if  (skb->pkt_type != PACKET_HOST)
-		return TC_ACT_OK;
-
-	// Must be meta-ethernet IPv6 frame
-	if (skb->protocol != bpf_htons(ETH_P_IPV6))
-		return TC_ACT_OK;
-
-	// Must have (ethernet and) ipv6 header
-	if (data + l2_header_size + sizeof(*ip6) > data_end)
-		return TC_ACT_OK;
-
-	// Ethertype - if present - must be IPv6
-	if (eth->h_proto != bpf_htons(ETH_P_IPV6))
-		return TC_ACT_OK;
-
-	// IP version must be 6
-	if (ip6->version != 6)
-		return TC_ACT_OK;
-	// Maximum IPv6 payload length that can be translated to IPv4
-	if (bpf_ntohs(ip6->payload_len) > 0xFFFF - sizeof(struct iphdr))
-		return TC_ACT_OK;
-	switch (ip6->nexthdr) {
-	case IPPROTO_TCP:  // For TCP & UDP the checksum neutrality of the chosen IPv6
-	case IPPROTO_UDP:  // address means there is no need to update their checksums.
-	case IPPROTO_GRE:  // We do not need to bother looking at GRE/ESP headers,
-	case IPPROTO_ESP:  // since there is never a checksum to update.
-		break;
-	default:  // do not know how to handle anything else
-		return TC_ACT_OK;
-	}
-
-	struct ethhdr eth2;  // used iff is_ethernet
-
-	eth2 = *eth;                     // Copy over the ethernet header (src/dst mac)
-	eth2.h_proto = bpf_htons(ETH_P_IP);  // But replace the ethertype
-
-	struct iphdr ip = {
-		.version = 4,                                                      // u4
-		.ihl = sizeof(struct iphdr) / sizeof(__u32),                       // u4
-		.tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4),             // u8
-		.tot_len = bpf_htons(bpf_ntohs(ip6->payload_len) + sizeof(struct iphdr)),  // u16
-		.id = 0,                                                           // u16
-		.frag_off = bpf_htons(IP_DF),                                          // u16
-		.ttl = ip6->hop_limit,                                             // u8
-		.protocol = ip6->nexthdr,                                          // u8
-		.check = 0,                                                        // u16
-		.saddr = 0x0201a8c0,                            // u32
-		.daddr = 0x0101a8c0,                                         // u32
-	};
-
-	// Calculate the IPv4 one's complement checksum of the IPv4 header.
-	__wsum sum4 = 0;
-
-	for (int i = 0; i < sizeof(ip) / sizeof(__u16); ++i)
-		sum4 += ((__u16 *)&ip)[i];
-
-	// Note that sum4 is guaranteed to be non-zero by virtue of ip.version == 4
-	sum4 = (sum4 & 0xFFFF) + (sum4 >> 16);  // collapse u32 into range 1 .. 0x1FFFE
-	sum4 = (sum4 & 0xFFFF) + (sum4 >> 16);  // collapse any potential carry into u16
-	ip.check = (__u16)~sum4;                // sum4 cannot be zero, so this is never 0xFFFF
-
-	// Calculate the *negative* IPv6 16-bit one's complement checksum of the IPv6 header.
-	__wsum sum6 = 0;
-	// We'll end up with a non-zero sum due to ip6->version == 6 (which has '0' bits)
-	for (int i = 0; i < sizeof(*ip6) / sizeof(__u16); ++i)
-		sum6 += ~((__u16 *)ip6)[i];  // note the bitwise negation
-
-	// Note that there is no L4 checksum update: we are relying on the checksum neutrality
-	// of the ipv6 address chosen by netd's ClatdController.
-
-	// Packet mutations begin - point of no return, but if this first modification fails
-	// the packet is probably still pristine, so let clatd handle it.
-	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0))
-		return TC_ACT_OK;
-	bpf_csum_update(skb, sum6);
-
-	data = (void *)(long)skb->data;
-	data_end = (void *)(long)skb->data_end;
-	if (data + l2_header_size + sizeof(struct iphdr) > data_end)
-		return TC_ACT_SHOT;
-
-	struct ethhdr *new_eth = data;
-
-	// Copy over the updated ethernet header
-	*new_eth = eth2;
-
-	// Copy over the new ipv4 header.
-	*(struct iphdr *)(new_eth + 1) = ip;
-	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
-}
-
-SEC("schedcls/egress4/snat4")
-int sched_cls_egress4_snat4_prog(struct __sk_buff *skb)
-{
-	const int l2_header_size =  sizeof(struct ethhdr);
-	void *data = (void *)(long)skb->data;
-	const void *data_end = (void *)(long)skb->data_end;
-	const struct ethhdr *const eth = data;  // used iff is_ethernet
-	const struct iphdr *const ip4 = (void *)(eth + 1);
-
-	// Must be meta-ethernet IPv4 frame
-	if (skb->protocol != bpf_htons(ETH_P_IP))
-		return TC_ACT_OK;
-
-	// Must have ipv4 header
-	if (data + l2_header_size + sizeof(struct ipv6hdr) > data_end)
-		return TC_ACT_OK;
-
-	// Ethertype - if present - must be IPv4
-	if (eth->h_proto != bpf_htons(ETH_P_IP))
-		return TC_ACT_OK;
-
-	// IP version must be 4
-	if (ip4->version != 4)
-		return TC_ACT_OK;
-
-	// We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header
-	if (ip4->ihl != 5)
-		return TC_ACT_OK;
-
-	// Maximum IPv6 payload length that can be translated to IPv4
-	if (bpf_htons(ip4->tot_len) > 0xFFFF - sizeof(struct ipv6hdr))
-		return TC_ACT_OK;
-
-	// Calculate the IPv4 one's complement checksum of the IPv4 header.
-	__wsum sum4 = 0;
-
-	for (int i = 0; i < sizeof(*ip4) / sizeof(__u16); ++i)
-		sum4 += ((__u16 *)ip4)[i];
-
-	// Note that sum4 is guaranteed to be non-zero by virtue of ip4->version == 4
-	sum4 = (sum4 & 0xFFFF) + (sum4 >> 16);  // collapse u32 into range 1 .. 0x1FFFE
-	sum4 = (sum4 & 0xFFFF) + (sum4 >> 16);  // collapse any potential carry into u16
-	// for a correct checksum we should get *a* zero, but sum4 must be positive, ie 0xFFFF
-	if (sum4 != 0xFFFF)
-		return TC_ACT_OK;
-
-	// Minimum IPv4 total length is the size of the header
-	if (bpf_ntohs(ip4->tot_len) < sizeof(*ip4))
-		return TC_ACT_OK;
-
-	// We are incapable of dealing with IPv4 fragments
-	if (ip4->frag_off & ~bpf_htons(IP_DF))
-		return TC_ACT_OK;
-
-	switch (ip4->protocol) {
-	case IPPROTO_TCP:  // For TCP & UDP the checksum neutrality of the chosen IPv6
-	case IPPROTO_GRE:  // address means there is no need to update their checksums.
-	case IPPROTO_ESP:  // We do not need to bother looking at GRE/ESP headers,
-		break;         // since there is never a checksum to update.
-
-	case IPPROTO_UDP:  // See above comment, but must also have UDP header...
-		if (data + sizeof(*ip4) + sizeof(struct udphdr) > data_end)
-			return TC_ACT_OK;
-		const struct udphdr *uh = (const struct udphdr *)(ip4 + 1);
-		// If IPv4/UDP checksum is 0 then fallback to clatd so it can calculate the
-		// checksum.  Otherwise the network or more likely the NAT64 gateway might
-		// drop the packet because in most cases IPv6/UDP packets with a zero checksum
-		// are invalid. See RFC 6935.  TODO: calculate checksum via bpf_csum_diff()
-		if (!uh->check)
-			return TC_ACT_OK;
-		break;
-
-	default:  // do not know how to handle anything else
-		return TC_ACT_OK;
-	}
-	struct ethhdr eth2;  // used iff is_ethernet
-
-	eth2 = *eth;                     // Copy over the ethernet header (src/dst mac)
-	eth2.h_proto = bpf_htons(ETH_P_IPV6);  // But replace the ethertype
-
-	struct ipv6hdr ip6 = {
-		.version = 6,                                    // __u8:4
-		.priority = ip4->tos >> 4,                       // __u8:4
-		.flow_lbl = {(ip4->tos & 0xF) << 4, 0, 0},       // __u8[3]
-		.payload_len = bpf_htons(bpf_ntohs(ip4->tot_len) - 20),  // __be16
-		.nexthdr = ip4->protocol,                        // __u8
-		.hop_limit = ip4->ttl,                           // __u8
-	};
-	ip6.saddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8);
-	ip6.saddr.in6_u.u6_addr32[1] = 0;
-	ip6.saddr.in6_u.u6_addr32[2] = 0;
-	ip6.saddr.in6_u.u6_addr32[3] = bpf_htonl(1);
-	ip6.daddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8);
-	ip6.daddr.in6_u.u6_addr32[1] = 0;
-	ip6.daddr.in6_u.u6_addr32[2] = 0;
-	ip6.daddr.in6_u.u6_addr32[3] = bpf_htonl(2);
-
-	// Calculate the IPv6 16-bit one's complement checksum of the IPv6 header.
-	__wsum sum6 = 0;
-	// We'll end up with a non-zero sum due to ip6.version == 6
-	for (int i = 0; i < sizeof(ip6) / sizeof(__u16); ++i)
-		sum6 += ((__u16 *)&ip6)[i];
-
-	// Packet mutations begin - point of no return, but if this first modification fails
-	// the packet is probably still pristine, so let clatd handle it.
-	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
-		return TC_ACT_OK;
-
-	// This takes care of updating the skb->csum field for a CHECKSUM_COMPLETE packet.
-	// In such a case, skb->csum is a 16-bit one's complement sum of the entire payload,
-	// thus we need to subtract out the ipv4 header's sum, and add in the ipv6 header's sum.
-	// However, we've already verified the ipv4 checksum is correct and thus 0.
-	// Thus we only need to add the ipv6 header's sum.
-	//
-	// bpf_csum_update() always succeeds if the skb is CHECKSUM_COMPLETE and returns an error
-	// (-ENOTSUPP) if it isn't.  So we just ignore the return code (see above for more details).
-	bpf_csum_update(skb, sum6);
-
-	// bpf_skb_change_proto() invalidates all pointers - reload them.
-	data = (void *)(long)skb->data;
-	data_end = (void *)(long)skb->data_end;
-
-	// I cannot think of any valid way for this error condition to trigger, however I do
-	// believe the explicit check is required to keep the in kernel ebpf verifier happy.
-	if (data + l2_header_size + sizeof(ip6) > data_end)
-		return TC_ACT_SHOT;
-
-	struct ethhdr *new_eth = data;
-
-	// Copy over the updated ethernet header
-	*new_eth = eth2;
-	// Copy over the new ipv4 header.
-	*(struct ipv6hdr *)(new_eth + 1) = ip6;
-	return TC_ACT_OK;
-}
-
-char _license[] SEC("license") = ("GPL");
diff --git a/tools/testing/selftests/net/nat6to4.c b/tools/testing/selftests/net/nat6to4.c
new file mode 100644
index 000000000000..ac54c36b25fc
--- /dev/null
+++ b/tools/testing/selftests/net/nat6to4.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This code is taken from the Android Open Source Project and the author
+ * (Maciej Żenczykowski) has gave permission to relicense it under the
+ * GPLv2. Therefore this program is free software;
+ * You can redistribute it and/or modify it under the terms of the GNU
+ * General Public License version 2 as published by the Free Software
+ * Foundation
+
+ * The original headers, including the original license headers, are
+ * included below for completeness.
+ *
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <linux/bpf.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/swab.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+
+#include <linux/udp.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define IP_DF 0x4000  // Flag: "Don't Fragment"
+
+SEC("schedcls/ingress6/nat_6")
+int sched_cls_ingress6_nat_6_prog(struct __sk_buff *skb)
+{
+	const int l2_header_size =  sizeof(struct ethhdr);
+	void *data = (void *)(long)skb->data;
+	const void *data_end = (void *)(long)skb->data_end;
+	const struct ethhdr * const eth = data;  // used iff is_ethernet
+	const struct ipv6hdr * const ip6 =  (void *)(eth + 1);
+
+	// Require ethernet dst mac address to be our unicast address.
+	if  (skb->pkt_type != PACKET_HOST)
+		return TC_ACT_OK;
+
+	// Must be meta-ethernet IPv6 frame
+	if (skb->protocol != bpf_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	// Must have (ethernet and) ipv6 header
+	if (data + l2_header_size + sizeof(*ip6) > data_end)
+		return TC_ACT_OK;
+
+	// Ethertype - if present - must be IPv6
+	if (eth->h_proto != bpf_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	// IP version must be 6
+	if (ip6->version != 6)
+		return TC_ACT_OK;
+	// Maximum IPv6 payload length that can be translated to IPv4
+	if (bpf_ntohs(ip6->payload_len) > 0xFFFF - sizeof(struct iphdr))
+		return TC_ACT_OK;
+	switch (ip6->nexthdr) {
+	case IPPROTO_TCP:  // For TCP & UDP the checksum neutrality of the chosen IPv6
+	case IPPROTO_UDP:  // address means there is no need to update their checksums.
+	case IPPROTO_GRE:  // We do not need to bother looking at GRE/ESP headers,
+	case IPPROTO_ESP:  // since there is never a checksum to update.
+		break;
+	default:  // do not know how to handle anything else
+		return TC_ACT_OK;
+	}
+
+	struct ethhdr eth2;  // used iff is_ethernet
+
+	eth2 = *eth;                     // Copy over the ethernet header (src/dst mac)
+	eth2.h_proto = bpf_htons(ETH_P_IP);  // But replace the ethertype
+
+	struct iphdr ip = {
+		.version = 4,                                                      // u4
+		.ihl = sizeof(struct iphdr) / sizeof(__u32),                       // u4
+		.tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4),             // u8
+		.tot_len = bpf_htons(bpf_ntohs(ip6->payload_len) + sizeof(struct iphdr)),  // u16
+		.id = 0,                                                           // u16
+		.frag_off = bpf_htons(IP_DF),                                          // u16
+		.ttl = ip6->hop_limit,                                             // u8
+		.protocol = ip6->nexthdr,                                          // u8
+		.check = 0,                                                        // u16
+		.saddr = 0x0201a8c0,                            // u32
+		.daddr = 0x0101a8c0,                                         // u32
+	};
+
+	// Calculate the IPv4 one's complement checksum of the IPv4 header.
+	__wsum sum4 = 0;
+
+	for (int i = 0; i < sizeof(ip) / sizeof(__u16); ++i)
+		sum4 += ((__u16 *)&ip)[i];
+
+	// Note that sum4 is guaranteed to be non-zero by virtue of ip.version == 4
+	sum4 = (sum4 & 0xFFFF) + (sum4 >> 16);  // collapse u32 into range 1 .. 0x1FFFE
+	sum4 = (sum4 & 0xFFFF) + (sum4 >> 16);  // collapse any potential carry into u16
+	ip.check = (__u16)~sum4;                // sum4 cannot be zero, so this is never 0xFFFF
+
+	// Calculate the *negative* IPv6 16-bit one's complement checksum of the IPv6 header.
+	__wsum sum6 = 0;
+	// We'll end up with a non-zero sum due to ip6->version == 6 (which has '0' bits)
+	for (int i = 0; i < sizeof(*ip6) / sizeof(__u16); ++i)
+		sum6 += ~((__u16 *)ip6)[i];  // note the bitwise negation
+
+	// Note that there is no L4 checksum update: we are relying on the checksum neutrality
+	// of the ipv6 address chosen by netd's ClatdController.
+
+	// Packet mutations begin - point of no return, but if this first modification fails
+	// the packet is probably still pristine, so let clatd handle it.
+	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0))
+		return TC_ACT_OK;
+	bpf_csum_update(skb, sum6);
+
+	data = (void *)(long)skb->data;
+	data_end = (void *)(long)skb->data_end;
+	if (data + l2_header_size + sizeof(struct iphdr) > data_end)
+		return TC_ACT_SHOT;
+
+	struct ethhdr *new_eth = data;
+
+	// Copy over the updated ethernet header
+	*new_eth = eth2;
+
+	// Copy over the new ipv4 header.
+	*(struct iphdr *)(new_eth + 1) = ip;
+	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
+}
+
+SEC("schedcls/egress4/snat4")
+int sched_cls_egress4_snat4_prog(struct __sk_buff *skb)
+{
+	const int l2_header_size =  sizeof(struct ethhdr);
+	void *data = (void *)(long)skb->data;
+	const void *data_end = (void *)(long)skb->data_end;
+	const struct ethhdr *const eth = data;  // used iff is_ethernet
+	const struct iphdr *const ip4 = (void *)(eth + 1);
+
+	// Must be meta-ethernet IPv4 frame
+	if (skb->protocol != bpf_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	// Must have ipv4 header
+	if (data + l2_header_size + sizeof(struct ipv6hdr) > data_end)
+		return TC_ACT_OK;
+
+	// Ethertype - if present - must be IPv4
+	if (eth->h_proto != bpf_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	// IP version must be 4
+	if (ip4->version != 4)
+		return TC_ACT_OK;
+
+	// We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header
+	if (ip4->ihl != 5)
+		return TC_ACT_OK;
+
+	// Maximum IPv6 payload length that can be translated to IPv4
+	if (bpf_htons(ip4->tot_len) > 0xFFFF - sizeof(struct ipv6hdr))
+		return TC_ACT_OK;
+
+	// Calculate the IPv4 one's complement checksum of the IPv4 header.
+	__wsum sum4 = 0;
+
+	for (int i = 0; i < sizeof(*ip4) / sizeof(__u16); ++i)
+		sum4 += ((__u16 *)ip4)[i];
+
+	// Note that sum4 is guaranteed to be non-zero by virtue of ip4->version == 4
+	sum4 = (sum4 & 0xFFFF) + (sum4 >> 16);  // collapse u32 into range 1 .. 0x1FFFE
+	sum4 = (sum4 & 0xFFFF) + (sum4 >> 16);  // collapse any potential carry into u16
+	// for a correct checksum we should get *a* zero, but sum4 must be positive, ie 0xFFFF
+	if (sum4 != 0xFFFF)
+		return TC_ACT_OK;
+
+	// Minimum IPv4 total length is the size of the header
+	if (bpf_ntohs(ip4->tot_len) < sizeof(*ip4))
+		return TC_ACT_OK;
+
+	// We are incapable of dealing with IPv4 fragments
+	if (ip4->frag_off & ~bpf_htons(IP_DF))
+		return TC_ACT_OK;
+
+	switch (ip4->protocol) {
+	case IPPROTO_TCP:  // For TCP & UDP the checksum neutrality of the chosen IPv6
+	case IPPROTO_GRE:  // address means there is no need to update their checksums.
+	case IPPROTO_ESP:  // We do not need to bother looking at GRE/ESP headers,
+		break;         // since there is never a checksum to update.
+
+	case IPPROTO_UDP:  // See above comment, but must also have UDP header...
+		if (data + sizeof(*ip4) + sizeof(struct udphdr) > data_end)
+			return TC_ACT_OK;
+		const struct udphdr *uh = (const struct udphdr *)(ip4 + 1);
+		// If IPv4/UDP checksum is 0 then fallback to clatd so it can calculate the
+		// checksum.  Otherwise the network or more likely the NAT64 gateway might
+		// drop the packet because in most cases IPv6/UDP packets with a zero checksum
+		// are invalid. See RFC 6935.  TODO: calculate checksum via bpf_csum_diff()
+		if (!uh->check)
+			return TC_ACT_OK;
+		break;
+
+	default:  // do not know how to handle anything else
+		return TC_ACT_OK;
+	}
+	struct ethhdr eth2;  // used iff is_ethernet
+
+	eth2 = *eth;                     // Copy over the ethernet header (src/dst mac)
+	eth2.h_proto = bpf_htons(ETH_P_IPV6);  // But replace the ethertype
+
+	struct ipv6hdr ip6 = {
+		.version = 6,                                    // __u8:4
+		.priority = ip4->tos >> 4,                       // __u8:4
+		.flow_lbl = {(ip4->tos & 0xF) << 4, 0, 0},       // __u8[3]
+		.payload_len = bpf_htons(bpf_ntohs(ip4->tot_len) - 20),  // __be16
+		.nexthdr = ip4->protocol,                        // __u8
+		.hop_limit = ip4->ttl,                           // __u8
+	};
+	ip6.saddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8);
+	ip6.saddr.in6_u.u6_addr32[1] = 0;
+	ip6.saddr.in6_u.u6_addr32[2] = 0;
+	ip6.saddr.in6_u.u6_addr32[3] = bpf_htonl(1);
+	ip6.daddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8);
+	ip6.daddr.in6_u.u6_addr32[1] = 0;
+	ip6.daddr.in6_u.u6_addr32[2] = 0;
+	ip6.daddr.in6_u.u6_addr32[3] = bpf_htonl(2);
+
+	// Calculate the IPv6 16-bit one's complement checksum of the IPv6 header.
+	__wsum sum6 = 0;
+	// We'll end up with a non-zero sum due to ip6.version == 6
+	for (int i = 0; i < sizeof(ip6) / sizeof(__u16); ++i)
+		sum6 += ((__u16 *)&ip6)[i];
+
+	// Packet mutations begin - point of no return, but if this first modification fails
+	// the packet is probably still pristine, so let clatd handle it.
+	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
+		return TC_ACT_OK;
+
+	// This takes care of updating the skb->csum field for a CHECKSUM_COMPLETE packet.
+	// In such a case, skb->csum is a 16-bit one's complement sum of the entire payload,
+	// thus we need to subtract out the ipv4 header's sum, and add in the ipv6 header's sum.
+	// However, we've already verified the ipv4 checksum is correct and thus 0.
+	// Thus we only need to add the ipv6 header's sum.
+	//
+	// bpf_csum_update() always succeeds if the skb is CHECKSUM_COMPLETE and returns an error
+	// (-ENOTSUPP) if it isn't.  So we just ignore the return code (see above for more details).
+	bpf_csum_update(skb, sum6);
+
+	// bpf_skb_change_proto() invalidates all pointers - reload them.
+	data = (void *)(long)skb->data;
+	data_end = (void *)(long)skb->data_end;
+
+	// I cannot think of any valid way for this error condition to trigger, however I do
+	// believe the explicit check is required to keep the in kernel ebpf verifier happy.
+	if (data + l2_header_size + sizeof(ip6) > data_end)
+		return TC_ACT_SHOT;
+
+	struct ethhdr *new_eth = data;
+
+	// Copy over the updated ethernet header
+	*new_eth = eth2;
+	// Copy over the new ipv4 header.
+	*(struct ipv6hdr *)(new_eth + 1) = ip6;
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = ("GPL");
diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh
index c9c4b9d65839..0a6359bed0b9 100755
--- a/tools/testing/selftests/net/udpgro_frglist.sh
+++ b/tools/testing/selftests/net/udpgro_frglist.sh
@@ -40,8 +40,8 @@ run_one() {
 
 	ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp
 	tc -n "${PEER_NS}" qdisc add dev veth1 clsact
-	tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file ../bpf/nat6to4.o section schedcls/ingress6/nat_6  direct-action
-	tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file ../bpf/nat6to4.o section schedcls/egress4/snat4 direct-action
+	tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.o section schedcls/ingress6/nat_6  direct-action
+	tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.o section schedcls/egress4/snat4 direct-action
         echo ${rx_args}
 	ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r &
 
@@ -88,8 +88,8 @@ if [ ! -f ${BPF_FILE} ]; then
 	exit -1
 fi
 
-if [ ! -f bpf/nat6to4.o ]; then
-	echo "Missing nat6to4 helper. Build bpfnat6to4.o selftest first"
+if [ ! -f nat6to4.o ]; then
+	echo "Missing nat6to4 helper. Build bpf nat6to4.o selftest first"
 	exit -1
 fi
 
-- 
cgit 


From edac4b5b185ed3d2717b8267e28e0a1187c0bc08 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 16 Jan 2023 11:10:08 +0100
Subject: selftests/bpf: Add
 serial_test_kprobe_multi_bench_attach_kernel/module tests

Add bench test for module portion of the symbols as well.

  # ./test_progs -v -t kprobe_multi_bench_attach_module
  bpf_testmod.ko is already unloaded.
  Loading bpf_testmod.ko...
  Successfully loaded bpf_testmod.ko.
  test_kprobe_multi_bench_attach:PASS:get_syms 0 nsec
  test_kprobe_multi_bench_attach:PASS:kprobe_multi_empty__open_and_load 0 nsec
  test_kprobe_multi_bench_attach:PASS:bpf_program__attach_kprobe_multi_opts 0 nsec
  test_kprobe_multi_bench_attach: found 26620 functions
  test_kprobe_multi_bench_attach: attached in   0.182s
  test_kprobe_multi_bench_attach: detached in   0.082s
  #96      kprobe_multi_bench_attach_module:OK
  Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED
  Successfully unloaded bpf_testmod.ko.

It's useful for testing kprobe multi link modules resolving.

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20230116101009.23694-3-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/kprobe_multi_test.c      | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
index c6f37e825f11..113dba349a57 100644
--- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
@@ -322,7 +322,7 @@ static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused)
 	return strcmp((const char *) key1, (const char *) key2) == 0;
 }
 
-static int get_syms(char ***symsp, size_t *cntp)
+static int get_syms(char ***symsp, size_t *cntp, bool kernel)
 {
 	size_t cap = 0, cnt = 0, i;
 	char *name = NULL, **syms = NULL;
@@ -349,8 +349,9 @@ static int get_syms(char ***symsp, size_t *cntp)
 	}
 
 	while (fgets(buf, sizeof(buf), f)) {
-		/* skip modules */
-		if (strchr(buf, '['))
+		if (kernel && strchr(buf, '['))
+			continue;
+		if (!kernel && !strchr(buf, '['))
 			continue;
 
 		free(name);
@@ -404,7 +405,7 @@ error:
 	return err;
 }
 
-void serial_test_kprobe_multi_bench_attach(void)
+static void test_kprobe_multi_bench_attach(bool kernel)
 {
 	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
 	struct kprobe_multi_empty *skel = NULL;
@@ -415,7 +416,7 @@ void serial_test_kprobe_multi_bench_attach(void)
 	char **syms = NULL;
 	size_t cnt = 0, i;
 
-	if (!ASSERT_OK(get_syms(&syms, &cnt), "get_syms"))
+	if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms"))
 		return;
 
 	skel = kprobe_multi_empty__open_and_load();
@@ -453,6 +454,14 @@ cleanup:
 	}
 }
 
+void serial_test_kprobe_multi_bench_attach(void)
+{
+	if (test__start_subtest("kernel"))
+		test_kprobe_multi_bench_attach(true);
+	if (test__start_subtest("modules"))
+		test_kprobe_multi_bench_attach(false);
+}
+
 void test_kprobe_multi_test(void)
 {
 	if (!ASSERT_OK(load_kallsyms(), "load_kallsyms"))
-- 
cgit 


From 1c07425e902cd3137961c3d45b4271bf8a9b8eb9 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 16 Jan 2023 16:04:49 +0000
Subject: kselftest/arm64: Add a stress test program for ZT0

Following the pattern for the other register sets add a stress test program
for ZT0 which continually loads and verifies patterns in the register in
an effort to discover context switching problems.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-sme2-v4-14-f2fa0aef982f@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/.gitignore |   1 +
 tools/testing/selftests/arm64/fp/Makefile   |   3 +
 tools/testing/selftests/arm64/fp/sme-inst.h |  20 ++
 tools/testing/selftests/arm64/fp/zt-test.S  | 317 ++++++++++++++++++++++++++++
 4 files changed, 341 insertions(+)
 create mode 100644 tools/testing/selftests/arm64/fp/zt-test.S

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore
index df79d29664a1..41bde4c97d47 100644
--- a/tools/testing/selftests/arm64/fp/.gitignore
+++ b/tools/testing/selftests/arm64/fp/.gitignore
@@ -12,3 +12,4 @@ vlset
 za-fork
 za-ptrace
 za-test
+zt-test
diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile
index 36db61358ed5..aff3026d3dff 100644
--- a/tools/testing/selftests/arm64/fp/Makefile
+++ b/tools/testing/selftests/arm64/fp/Makefile
@@ -14,6 +14,7 @@ TEST_GEN_PROGS_EXTENDED := fp-pidbench fpsimd-test \
 	sve-test \
 	ssve-test \
 	za-test \
+	zt-test \
 	vlset
 TEST_PROGS_EXTENDED := fpsimd-stress sve-stress ssve-stress za-stress
 
@@ -41,5 +42,7 @@ $(OUTPUT)/za-fork: za-fork.c $(OUTPUT)/za-fork-asm.o
 $(OUTPUT)/za-ptrace: za-ptrace.c
 $(OUTPUT)/za-test: za-test.S $(OUTPUT)/asm-utils.o
 	$(CC) -nostdlib $^ -o $@
+$(OUTPUT)/zt-test: zt-test.S $(OUTPUT)/asm-utils.o
+	$(CC) -nostdlib $^ -o $@
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/arm64/fp/sme-inst.h b/tools/testing/selftests/arm64/fp/sme-inst.h
index 7191e53ca1c0..9292bba5400b 100644
--- a/tools/testing/selftests/arm64/fp/sme-inst.h
+++ b/tools/testing/selftests/arm64/fp/sme-inst.h
@@ -48,4 +48,24 @@
 		| ((\offset) & 7)
 .endm
 
+/*
+ * LDR (ZT0)
+ *
+ *	LDR ZT0, nx
+ */
+.macro _ldr_zt nx
+	.inst	0xe11f8000			\
+		| (((\nx) & 0x1f) << 5)
+.endm
+
+/*
+ * STR (ZT0)
+ *
+ *	STR ZT0, nx
+ */
+.macro _str_zt nx
+	.inst	0xe13f8000			\
+		| (((\nx) & 0x1f) << 5)
+.endm
+
 #endif
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S
new file mode 100644
index 000000000000..7ec90976cf5e
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2021-2 ARM Limited.
+// Original author: Mark Brown <broonie@kernel.org>
+//
+// Scalable Matrix Extension ZT context switch test
+// Repeatedly writes unique test patterns into ZT0
+// and reads them back to verify integrity.
+
+#include <asm/unistd.h>
+#include "assembler.h"
+#include "asm-offsets.h"
+#include "sme-inst.h"
+
+.arch_extension sve
+
+#define ZT_SZ	512
+#define ZT_B	(ZT_SZ / 8)
+
+// Declare some storage space to shadow ZT register contents and a
+// scratch buffer.
+.pushsection .text
+.data
+.align 4
+ztref:
+	.space	ZT_B
+scratch:
+	.space	ZT_B
+.popsection
+
+
+// Generate a test pattern for storage in ZT
+// x0: pid
+// x1: generation
+
+// These values are used to construct a 32-bit pattern that is repeated in the
+// scratch buffer as many times as will fit:
+// bits 31:24	generation number (increments once per test_loop)
+// bits 23: 8	pid
+// bits  7: 0	32-bit lane index
+
+function pattern
+	mov	w3, wzr
+	bfi	w3, w0, #8, #16		// PID
+	bfi	w3, w1, #24, #8		// Generation
+
+	ldr	x0, =scratch
+	mov	w1, #ZT_B / 4
+
+0:	str	w3, [x0], #4
+	add	w3, w3, #1		// Lane
+	subs	w1, w1, #1
+	b.ne	0b
+
+	ret
+endfunction
+
+// Set up test pattern in a ZT horizontal vector
+// x0: pid
+// x1: generation
+function setup_zt
+	mov	x4, x30
+
+	bl	pattern			// Get pattern in scratch buffer
+	ldr	x0, =ztref
+	ldr	x1, =scratch
+	mov	x2, #ZT_B
+	bl	memcpy
+
+	ldr	x0, =ztref
+	_ldr_zt 0			// load zt0 from pointer x0
+
+	ret	x4
+endfunction
+
+// Trivial memory compare: compare x2 bytes starting at address x0 with
+// bytes starting at address x1.
+// Returns only if all bytes match; otherwise, the program is aborted.
+// Clobbers x0-x5.
+function memcmp
+	cbz	x2, 2f
+
+	stp	x0, x1, [sp, #-0x20]!
+	str	x2, [sp, #0x10]
+
+	mov	x5, #0
+0:	ldrb	w3, [x0, x5]
+	ldrb	w4, [x1, x5]
+	add	x5, x5, #1
+	cmp	w3, w4
+	b.ne	1f
+	subs	x2, x2, #1
+	b.ne	0b
+
+1:	ldr	x2, [sp, #0x10]
+	ldp	x0, x1, [sp], #0x20
+	b.ne	barf
+
+2:	ret
+endfunction
+
+// Verify that a ZT vector matches its shadow in memory, else abort
+// Clobbers x0-x3
+function check_zt
+	mov	x3, x30
+
+	ldr	x0, =scratch		// Poison scratch
+	mov	x1, #ZT_B
+	bl	memfill_ae
+
+	ldr	x0, =scratch
+	_str_zt 0
+
+	ldr	x0, =ztref
+	ldr	x1, =scratch
+	mov	x2, #ZT_B
+	mov	x30, x3
+	b	memcmp
+endfunction
+
+// Any SME register modified here can cause corruption in the main
+// thread -- but *only* the locations modified here.
+function irritator_handler
+	// Increment the irritation signal count (x23):
+	ldr	x0, [x2, #ucontext_regs + 8 * 23]
+	add	x0, x0, #1
+	str	x0, [x2, #ucontext_regs + 8 * 23]
+
+	// Corrupt some random ZT data
+#if 0
+	adr	x0, .text + (irritator_handler - .text) / 16 * 16
+	movi	v0.8b, #1
+	movi	v9.16b, #2
+	movi	v31.8b, #3
+#endif
+
+	ret
+endfunction
+
+function tickle_handler
+	// Increment the signal count (x23):
+	ldr	x0, [x2, #ucontext_regs + 8 * 23]
+	add	x0, x0, #1
+	str	x0, [x2, #ucontext_regs + 8 * 23]
+
+	ret
+endfunction
+
+function terminate_handler
+	mov	w21, w0
+	mov	x20, x2
+
+	puts	"Terminated by signal "
+	mov	w0, w21
+	bl	putdec
+	puts	", no error, iterations="
+	ldr	x0, [x20, #ucontext_regs + 8 * 22]
+	bl	putdec
+	puts	", signals="
+	ldr	x0, [x20, #ucontext_regs + 8 * 23]
+	bl	putdecn
+
+	mov	x0, #0
+	mov	x8, #__NR_exit
+	svc	#0
+endfunction
+
+// w0: signal number
+// x1: sa_action
+// w2: sa_flags
+// Clobbers x0-x6,x8
+function setsignal
+	str	x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]!
+
+	mov	w4, w0
+	mov	x5, x1
+	mov	w6, w2
+
+	add	x0, sp, #16
+	mov	x1, #sa_sz
+	bl	memclr
+
+	mov	w0, w4
+	add	x1, sp, #16
+	str	w6, [x1, #sa_flags]
+	str	x5, [x1, #sa_handler]
+	mov	x2, #0
+	mov	x3, #sa_mask_sz
+	mov	x8, #__NR_rt_sigaction
+	svc	#0
+
+	cbz	w0, 1f
+
+	puts	"sigaction failure\n"
+	b	.Labort
+
+1:	ldr	x30, [sp], #((sa_sz + 15) / 16 * 16 + 16)
+	ret
+endfunction
+
+// Main program entry point
+.globl _start
+function _start
+_start:
+	mov	x23, #0		// signal count
+
+	mov	w0, #SIGINT
+	adr	x1, terminate_handler
+	mov	w2, #SA_SIGINFO
+	bl	setsignal
+
+	mov	w0, #SIGTERM
+	adr	x1, terminate_handler
+	mov	w2, #SA_SIGINFO
+	bl	setsignal
+
+	mov	w0, #SIGUSR1
+	adr	x1, irritator_handler
+	mov	w2, #SA_SIGINFO
+	orr	w2, w2, #SA_NODEFER
+	bl	setsignal
+
+	mov	w0, #SIGUSR2
+	adr	x1, tickle_handler
+	mov	w2, #SA_SIGINFO
+	orr	w2, w2, #SA_NODEFER
+	bl	setsignal
+
+	smstart_za
+
+	// Obtain our PID, to ensure test pattern uniqueness between processes
+	mov	x8, #__NR_getpid
+	svc	#0
+	mov	x20, x0
+
+	puts	"PID:\t"
+	mov	x0, x20
+	bl	putdecn
+
+	mov	x22, #0		// generation number, increments per iteration
+.Ltest_loop:
+	mov	x0, x20
+	mov	x1, x22
+	bl	setup_zt
+
+	mov	x8, #__NR_sched_yield	// Encourage preemption
+	svc	#0
+
+	mrs	x0, S3_3_C4_C2_2	// SVCR should have ZA=1,SM=0
+	and	x1, x0, #3
+	cmp	x1, #2
+	b.ne	svcr_barf
+
+	bl	check_zt
+
+	add	x22, x22, #1	// Everything still working
+	b	.Ltest_loop
+
+.Labort:
+	mov	x0, #0
+	mov	x1, #SIGABRT
+	mov	x8, #__NR_kill
+	svc	#0
+endfunction
+
+function barf
+// fpsimd.c acitivty log dump hack
+//	ldr	w0, =0xdeadc0de
+//	mov	w8, #__NR_exit
+//	svc	#0
+// end hack
+	smstop
+	mov	x10, x0	// expected data
+	mov	x11, x1	// actual data
+	mov	x12, x2	// data size
+
+	puts	"Mismatch: PID="
+	mov	x0, x20
+	bl	putdec
+	puts	", iteration="
+	mov	x0, x22
+	bl	putdec
+	puts	"\tExpected ["
+	mov	x0, x10
+	mov	x1, x12
+	bl	dumphex
+	puts	"]\n\tGot      ["
+	mov	x0, x11
+	mov	x1, x12
+	bl	dumphex
+	puts	"]\n"
+
+	mov	x8, #__NR_getpid
+	svc	#0
+// fpsimd.c acitivty log dump hack
+//	ldr	w0, =0xdeadc0de
+//	mov	w8, #__NR_exit
+//	svc	#0
+// ^ end of hack
+	mov	x1, #SIGABRT
+	mov	x8, #__NR_kill
+	svc	#0
+//	mov	x8, #__NR_exit
+//	mov	x1, #1
+//	svc	#0
+endfunction
+
+function svcr_barf
+	mov	x10, x0
+
+	puts	"Bad SVCR: "
+	mov	x0, x10
+	bl	putdecn
+
+	mov	x8, #__NR_exit
+	mov	x1, #1
+	svc	#0
+endfunction
-- 
cgit 


From f63a9f15b2d4a8c9588a9260d49bc3890118fece Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 16 Jan 2023 16:04:50 +0000
Subject: kselftest/arm64: Cover ZT in the FP stress test

Hook up the newly added zt-test program in the FPSIMD stress tests, start
a copy per CPU when SME2 is supported.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-sme2-v4-15-f2fa0aef982f@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-stress.c | 29 ++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index f8b2f41aac36..520385fcfede 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -370,6 +370,19 @@ static void start_za(struct child_data *child, int vl, int cpu)
 	ksft_print_msg("Started %s\n", child->name);
 }
 
+static void start_zt(struct child_data *child, int cpu)
+{
+	int ret;
+
+	ret = asprintf(&child->name, "ZT-%d", cpu);
+	if (ret == -1)
+		ksft_exit_fail_msg("asprintf() failed\n");
+
+	child_start(child, "./zt-test");
+
+	ksft_print_msg("Started %s\n", child->name);
+}
+
 static void probe_vls(int vls[], int *vl_count, int set_vl)
 {
 	unsigned int vq;
@@ -426,6 +439,7 @@ int main(int argc, char **argv)
 	bool all_children_started = false;
 	int seen_children;
 	int sve_vls[MAX_VLS], sme_vls[MAX_VLS];
+	bool have_sme2;
 	struct sigaction sa;
 
 	while ((c = getopt_long(argc, argv, "t:", options, NULL)) != -1) {
@@ -458,6 +472,13 @@ int main(int argc, char **argv)
 		sme_vl_count = 0;
 	}
 
+	if (getauxval(AT_HWCAP2) & HWCAP2_SME2) {
+		tests += cpus;
+		have_sme2 = true;
+	} else {
+		have_sme2 = false;
+	}
+
 	/* Force context switching if we only have FPSIMD */
 	if (!sve_vl_count && !sme_vl_count)
 		fpsimd_per_cpu = 2;
@@ -468,8 +489,9 @@ int main(int argc, char **argv)
 	ksft_print_header();
 	ksft_set_plan(tests);
 
-	ksft_print_msg("%d CPUs, %d SVE VLs, %d SME VLs\n",
-		       cpus, sve_vl_count, sme_vl_count);
+	ksft_print_msg("%d CPUs, %d SVE VLs, %d SME VLs, SME2 %s\n",
+		       cpus, sve_vl_count, sme_vl_count,
+		       have_sme2 ? "present" : "absent");
 
 	if (timeout > 0)
 		ksft_print_msg("Will run for %ds\n", timeout);
@@ -527,6 +549,9 @@ int main(int argc, char **argv)
 			start_ssve(&children[num_children++], sme_vls[j], i);
 			start_za(&children[num_children++], sme_vls[j], i);
 		}
+
+		if (have_sme2)
+			start_zt(&children[num_children++], i);
 	}
 
 	/*
-- 
cgit 


From 63829373260898bdd2eb95a03e2e1c936998d6ca Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 16 Jan 2023 16:04:51 +0000
Subject: kselftest/arm64: Enumerate SME2 in the signal test utility code

Support test cases for SME2 by adding it to the set of features that we
enumerate so test cases can check for it.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-sme2-v4-16-f2fa0aef982f@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/test_signals.h       | 2 ++
 tools/testing/selftests/arm64/signal/test_signals_utils.c | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h
index 0c645834ddc3..1e6273d81575 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.h
+++ b/tools/testing/selftests/arm64/signal/test_signals.h
@@ -34,6 +34,7 @@ enum {
 	FSVE_BIT,
 	FSME_BIT,
 	FSME_FA64_BIT,
+	FSME2_BIT,
 	FMAX_END
 };
 
@@ -41,6 +42,7 @@ enum {
 #define FEAT_SVE		(1UL << FSVE_BIT)
 #define FEAT_SME		(1UL << FSME_BIT)
 #define FEAT_SME_FA64		(1UL << FSME_FA64_BIT)
+#define FEAT_SME2		(1UL << FSME2_BIT)
 
 /*
  * A descriptor used to describe and configure a test case.
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index 308e229e58ab..07f518f0e58d 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -29,6 +29,7 @@ static char const *const feats_names[FMAX_END] = {
 	" SVE ",
 	" SME ",
 	" FA64 ",
+	" SME2 ",
 };
 
 #define MAX_FEATS_SZ	128
@@ -323,6 +324,8 @@ int test_init(struct tdescr *td)
 			td->feats_supported |= FEAT_SME;
 		if (getauxval(AT_HWCAP2) & HWCAP2_SME_FA64)
 			td->feats_supported |= FEAT_SME_FA64;
+		if (getauxval(AT_HWCAP2) & HWCAP2_SME2)
+			td->feats_supported |= FEAT_SME2;
 		if (feats_ok(td)) {
 			if (td->feats_required & td->feats_supported)
 				fprintf(stderr,
-- 
cgit 


From afe6f182752625cadf4ff97613bd2f362383bcbf Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 16 Jan 2023 16:04:52 +0000
Subject: kselftest/arm64: Teach the generic signal context validation about ZT

Add ZT to the set of signal contexts that the shared code understands and
validates the form of.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-sme2-v4-17-f2fa0aef982f@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../selftests/arm64/signal/testcases/testcases.c   | 36 ++++++++++++++++++++++
 .../selftests/arm64/signal/testcases/testcases.h   |  1 +
 2 files changed, 37 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c
index d2eda7b5de26..27d495fa52f8 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.c
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c
@@ -108,6 +108,26 @@ bool validate_za_context(struct za_context *za, char **err)
 	return true;
 }
 
+bool validate_zt_context(struct zt_context *zt, char **err)
+{
+	if (!zt || !err)
+		return false;
+
+	/* If the context is present there should be at least one register */
+	if (zt->nregs == 0) {
+		*err = "no registers";
+		return false;
+	}
+
+	/* Size should agree with the number of registers */
+	if (zt->head.size != ZT_SIG_CONTEXT_SIZE(zt->nregs)) {
+		*err = "register count does not match size";
+		return false;
+	}
+
+	return true;
+}
+
 bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
 {
 	bool terminated = false;
@@ -117,6 +137,7 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
 	struct extra_context *extra = NULL;
 	struct sve_context *sve = NULL;
 	struct za_context *za = NULL;
+	struct zt_context *zt = NULL;
 	struct _aarch64_ctx *head =
 		(struct _aarch64_ctx *)uc->uc_mcontext.__reserved;
 	void *extra_data = NULL;
@@ -177,6 +198,13 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
 			za = (struct za_context *)head;
 			new_flags |= ZA_CTX;
 			break;
+		case ZT_MAGIC:
+			if (flags & ZT_CTX)
+				*err = "Multiple ZT_MAGIC";
+			/* Size is validated in validate_za_context() */
+			zt = (struct zt_context *)head;
+			new_flags |= ZT_CTX;
+			break;
 		case EXTRA_MAGIC:
 			if (flags & EXTRA_CTX)
 				*err = "Multiple EXTRA_MAGIC";
@@ -234,6 +262,9 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
 		if (new_flags & ZA_CTX)
 			if (!validate_za_context(za, err))
 				return false;
+		if (new_flags & ZT_CTX)
+			if (!validate_zt_context(zt, err))
+				return false;
 
 		flags |= new_flags;
 
@@ -245,6 +276,11 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
 		return false;
 	}
 
+	if (terminated && (flags & ZT_CTX) && !(flags & ZA_CTX)) {
+		*err = "ZT context but no ZA context";
+		return false;
+	}
+
 	return true;
 }
 
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h
index 040afded0b76..a08ab0d6207a 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.h
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h
@@ -18,6 +18,7 @@
 #define SVE_CTX		(1 << 1)
 #define ZA_CTX		(1 << 2)
 #define EXTRA_CTX	(1 << 3)
+#define ZT_CTX		(1 << 4)
 
 #define KSFT_BAD_MAGIC	0xdeadbeef
 
-- 
cgit 


From 18f8729ab3d56ec0bfd980d6b3660a50af43b82a Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 16 Jan 2023 16:04:53 +0000
Subject: kselftest/arm64: Add test coverage for ZT register signal frames

We should have a ZT register frame with an expected size when ZA is enabled
and have no ZT frame when ZA is disabled. Since we don't load any data into
ZT we expect the data to all be zeros since the architecture guarantees it
will be set to 0 as ZA is enabled.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-sme2-v4-18-f2fa0aef982f@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/.gitignore    |  1 +
 .../selftests/arm64/signal/testcases/zt_no_regs.c  | 51 +++++++++++++
 .../selftests/arm64/signal/testcases/zt_regs.c     | 85 ++++++++++++++++++++++
 3 files changed, 137 insertions(+)
 create mode 100644 tools/testing/selftests/arm64/signal/testcases/zt_no_regs.c
 create mode 100644 tools/testing/selftests/arm64/signal/testcases/zt_regs.c

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore
index e8d2b57f73ec..b7fbb65183e8 100644
--- a/tools/testing/selftests/arm64/signal/.gitignore
+++ b/tools/testing/selftests/arm64/signal/.gitignore
@@ -5,4 +5,5 @@ sme_*
 ssve_*
 sve_*
 za_*
+zt_*
 !*.[ch]
diff --git a/tools/testing/selftests/arm64/signal/testcases/zt_no_regs.c b/tools/testing/selftests/arm64/signal/testcases/zt_no_regs.c
new file mode 100644
index 000000000000..34f69bcf821e
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/zt_no_regs.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 ARM Limited
+ *
+ * Verify that using an instruction not supported in streaming mode
+ * traps when in streaming mode.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static union {
+	ucontext_t uc;
+	char buf[1024 * 128];
+} context;
+
+int zt_no_regs_run(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
+{
+	size_t offset;
+	struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context);
+
+	/*
+	 * Get a signal context which should not have a ZT frame and
+	 * registers in it.
+	 */
+	if (!get_current_context(td, &context.uc, sizeof(context)))
+		return 1;
+
+	head = get_header(head, ZT_MAGIC, GET_BUF_RESV_SIZE(context), &offset);
+	if (head) {
+		fprintf(stderr, "Got unexpected ZT context\n");
+		return 1;
+	}
+
+	td->pass = 1;
+
+	return 0;
+}
+
+struct tdescr tde = {
+	.name = "ZT register data not present",
+	.descr = "Validate that ZT is not present when ZA is disabled",
+	.feats_required = FEAT_SME2,
+	.timeout = 3,
+	.sanity_disabled = true,
+	.run = zt_no_regs_run,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/zt_regs.c b/tools/testing/selftests/arm64/signal/testcases/zt_regs.c
new file mode 100644
index 000000000000..e1eb4d5c027a
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/zt_regs.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 ARM Limited
+ *
+ * Verify that using an instruction not supported in streaming mode
+ * traps when in streaming mode.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static union {
+	ucontext_t uc;
+	char buf[1024 * 128];
+} context;
+
+static void enable_za(void)
+{
+	/* smstart za; real data is TODO */
+	asm volatile(".inst 0xd503457f" : : : );
+}
+
+int zt_regs_run(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
+{
+	size_t offset;
+	struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context);
+	struct zt_context *zt;
+	char *zeros;
+
+	/*
+	 * Get a signal context which should have a ZT frame and registers
+	 * in it.
+	 */
+	enable_za();
+	if (!get_current_context(td, &context.uc, sizeof(context)))
+		return 1;
+
+	head = get_header(head, ZT_MAGIC, GET_BUF_RESV_SIZE(context), &offset);
+	if (!head) {
+		fprintf(stderr, "No ZT context\n");
+		return 1;
+	}
+
+	zt = (struct zt_context *)head;
+	if (zt->nregs == 0) {
+		fprintf(stderr, "Got context with no registers\n");
+		return 1;
+	}
+
+	fprintf(stderr, "Got expected size %u for %d registers\n",
+		head->size, zt->nregs);
+
+	/* We didn't load any data into ZT so it should be all zeros */
+	zeros = malloc(ZT_SIG_REGS_SIZE(zt->nregs));
+	if (!zeros) {
+		fprintf(stderr, "Out of memory, nregs=%u\n", zt->nregs);
+		return 1;
+	}
+	memset(zeros, 0, ZT_SIG_REGS_SIZE(zt->nregs));
+
+	if (memcmp(zeros, (char *)zt + ZT_SIG_REGS_OFFSET,
+		   ZT_SIG_REGS_SIZE(zt->nregs)) != 0) {
+		fprintf(stderr, "ZT data invalid\n");
+		return 1;
+	}
+
+	free(zeros);
+
+	td->pass = 1;
+
+	return 0;
+}
+
+struct tdescr tde = {
+	.name = "ZT register data",
+	.descr = "Validate that ZT is present and has data when ZA is enabled",
+	.feats_required = FEAT_SME2,
+	.timeout = 3,
+	.sanity_disabled = true,
+	.run = zt_regs_run,
+};
-- 
cgit 


From 49886aa9ab33b1825439015a6b2c91ed9b294ba3 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 16 Jan 2023 16:04:54 +0000
Subject: kselftest/arm64: Add SME2 coverage to syscall-abi

Verify that ZT0 is preserved over syscalls when it is present and
PSTATE.ZA is set.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-sme2-v4-19-f2fa0aef982f@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../testing/selftests/arm64/abi/syscall-abi-asm.S  | 43 +++++++++++++++++++++-
 tools/testing/selftests/arm64/abi/syscall-abi.c    | 40 +++++++++++++++++++-
 2 files changed, 80 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
index acd5e9f3bc0b..6ddf392329c9 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
+++ b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
@@ -23,6 +23,9 @@
 
 .arch_extension sve
 
+#define ID_AA64SMFR0_EL1_SMEver_SHIFT           56
+#define ID_AA64SMFR0_EL1_SMEver_WIDTH           4
+
 /*
  * LDR (vector to ZA array):
  *	LDR ZA[\nw, #\offset], [X\nxbase, #\offset, MUL VL]
@@ -45,6 +48,26 @@
 		| ((\offset) & 7)
 .endm
 
+/*
+ * LDR (ZT0)
+ *
+ *	LDR ZT0, nx
+ */
+.macro _ldr_zt nx
+	.inst	0xe11f8000			\
+		| (((\nx) & 0x1f) << 5)
+.endm
+
+/*
+ * STR (ZT0)
+ *
+ *	STR ZT0, nx
+ */
+.macro _str_zt nx
+	.inst	0xe13f8000			\
+		| (((\nx) & 0x1f) << 5)
+.endm
+
 .globl do_syscall
 do_syscall:
 	// Store callee saved registers x19-x29 (80 bytes) plus x0 and x1
@@ -64,7 +87,7 @@ do_syscall:
 	msr	S3_3_C4_C2_2, x2
 1:
 
-	// Load ZA if it's enabled - uses x12 as scratch due to SME LDR
+	// Load ZA and ZT0 if enabled - uses x12 as scratch due to SME LDR
 	tbz	x2, #SVCR_ZA_SHIFT, 1f
 	mov	w12, #0
 	ldr	x2, =za_in
@@ -73,6 +96,15 @@ do_syscall:
 	add	x12, x12, #1
 	cmp	x1, x12
 	bne	2b
+
+	// ZT0
+	mrs	x2, S3_0_C0_C4_5	// ID_AA64SMFR0_EL1
+	ubfx	x2, x2, #ID_AA64SMFR0_EL1_SMEver_SHIFT, \
+			 #ID_AA64SMFR0_EL1_SMEver_WIDTH
+	cbz	x2, 1f
+	adrp	x2, zt_in
+	add	x2, x2, :lo12:zt_in
+	_ldr_zt 2
 1:
 
 	// Load GPRs x8-x28, and save our SP/FP for later comparison
@@ -235,6 +267,15 @@ do_syscall:
 	add	x12, x12, #1
 	cmp	x1, x12
 	bne	2b
+
+	// ZT0
+	mrs	x2, S3_0_C0_C4_5	// ID_AA64SMFR0_EL1
+	ubfx	x2, x2, #ID_AA64SMFR0_EL1_SMEver_SHIFT, \
+			#ID_AA64SMFR0_EL1_SMEver_WIDTH
+	cbz	x2, 1f
+	adrp	x2, zt_out
+	add	x2, x2, :lo12:zt_out
+	_str_zt 2
 1:
 
 	// Save the SVE state if we have some
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index dd7ebe536d05..9800f9dc6b35 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -311,6 +311,35 @@ static int check_za(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 	return errors;
 }
 
+uint8_t zt_in[ZT_SIG_REG_BYTES] __attribute__((aligned(16)));
+uint8_t zt_out[ZT_SIG_REG_BYTES] __attribute__((aligned(16)));
+
+static void setup_zt(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
+		     uint64_t svcr)
+{
+	fill_random(zt_in, sizeof(zt_in));
+	memset(zt_out, 0, sizeof(zt_out));
+}
+
+static int check_zt(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
+		    uint64_t svcr)
+{
+	int errors = 0;
+
+	if (!(getauxval(AT_HWCAP2) & HWCAP2_SME2))
+		return 0;
+
+	if (!(svcr & SVCR_ZA_MASK))
+		return 0;
+
+	if (memcmp(zt_in, zt_out, sizeof(zt_in)) != 0) {
+		ksft_print_msg("SME VL %d ZT does not match\n", sme_vl);
+		errors++;
+	}
+
+	return errors;
+}
+
 typedef void (*setup_fn)(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 			 uint64_t svcr);
 typedef int (*check_fn)(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
@@ -334,6 +363,7 @@ static struct {
 	{ setup_ffr, check_ffr },
 	{ setup_svcr, check_svcr },
 	{ setup_za, check_za },
+	{ setup_zt, check_zt },
 };
 
 static bool do_test(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
@@ -474,6 +504,7 @@ int main(void)
 {
 	int i;
 	int tests = 1;  /* FPSIMD */
+	int sme_ver;
 
 	srandom(getpid());
 
@@ -482,10 +513,15 @@ int main(void)
 	tests += (sve_count_vls() * sme_count_vls()) * 3;
 	ksft_set_plan(ARRAY_SIZE(syscalls) * tests);
 
+	if (getauxval(AT_HWCAP2) & HWCAP2_SME2)
+		sme_ver = 2;
+	else
+		sme_ver = 1;
+
 	if (getauxval(AT_HWCAP2) & HWCAP2_SME_FA64)
-		ksft_print_msg("SME with FA64\n");
+		ksft_print_msg("SME%d with FA64\n", sme_ver);
 	else if (getauxval(AT_HWCAP2) & HWCAP2_SME)
-		ksft_print_msg("SME without FA64\n");
+		ksft_print_msg("SME%d without FA64\n", sme_ver);
 
 	for (i = 0; i < ARRAY_SIZE(syscalls); i++)
 		test_one_syscall(&syscalls[i]);
-- 
cgit 


From 4e1aa1a18f1becb479c3627b73f1b7ebe2d46924 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 16 Jan 2023 16:04:55 +0000
Subject: kselftest/arm64: Add coverage of the ZT ptrace regset

Add coverage of the ZT ptrace interface.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-sme2-v4-20-f2fa0aef982f@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/.gitignore  |   1 +
 tools/testing/selftests/arm64/fp/Makefile    |   2 +
 tools/testing/selftests/arm64/fp/zt-ptrace.c | 365 +++++++++++++++++++++++++++
 3 files changed, 368 insertions(+)
 create mode 100644 tools/testing/selftests/arm64/fp/zt-ptrace.c

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore
index 41bde4c97d47..ebc86757bdd8 100644
--- a/tools/testing/selftests/arm64/fp/.gitignore
+++ b/tools/testing/selftests/arm64/fp/.gitignore
@@ -12,4 +12,5 @@ vlset
 za-fork
 za-ptrace
 za-test
+zt-ptrace
 zt-test
diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile
index aff3026d3dff..50a70220ba6c 100644
--- a/tools/testing/selftests/arm64/fp/Makefile
+++ b/tools/testing/selftests/arm64/fp/Makefile
@@ -14,6 +14,7 @@ TEST_GEN_PROGS_EXTENDED := fp-pidbench fpsimd-test \
 	sve-test \
 	ssve-test \
 	za-test \
+	zt-ptrace \
 	zt-test \
 	vlset
 TEST_PROGS_EXTENDED := fpsimd-stress sve-stress ssve-stress za-stress
@@ -42,6 +43,7 @@ $(OUTPUT)/za-fork: za-fork.c $(OUTPUT)/za-fork-asm.o
 $(OUTPUT)/za-ptrace: za-ptrace.c
 $(OUTPUT)/za-test: za-test.S $(OUTPUT)/asm-utils.o
 	$(CC) -nostdlib $^ -o $@
+$(OUTPUT)/zt-ptrace: zt-ptrace.c
 $(OUTPUT)/zt-test: zt-test.S $(OUTPUT)/asm-utils.o
 	$(CC) -nostdlib $^ -o $@
 
diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c
new file mode 100644
index 000000000000..996d9614a131
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c
@@ -0,0 +1,365 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 ARM Limited.
+ */
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/wait.h>
+#include <asm/sigcontext.h>
+#include <asm/ptrace.h>
+
+#include "../../kselftest.h"
+
+/* <linux/elf.h> and <sys/auxv.h> don't like each other, so: */
+#ifndef NT_ARM_ZA
+#define NT_ARM_ZA 0x40c
+#endif
+#ifndef NT_ARM_ZT
+#define NT_ARM_ZT 0x40d
+#endif
+
+#define EXPECTED_TESTS 3
+
+static int sme_vl;
+
+static void fill_buf(char *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < size; i++)
+		buf[i] = random();
+}
+
+static int do_child(void)
+{
+	if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
+		ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno));
+
+	if (raise(SIGSTOP))
+		ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno));
+
+	return EXIT_SUCCESS;
+}
+
+static struct user_za_header *get_za(pid_t pid, void **buf, size_t *size)
+{
+	struct user_za_header *za;
+	void *p;
+	size_t sz = sizeof(*za);
+	struct iovec iov;
+
+	while (1) {
+		if (*size < sz) {
+			p = realloc(*buf, sz);
+			if (!p) {
+				errno = ENOMEM;
+				goto error;
+			}
+
+			*buf = p;
+			*size = sz;
+		}
+
+		iov.iov_base = *buf;
+		iov.iov_len = sz;
+		if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZA, &iov))
+			goto error;
+
+		za = *buf;
+		if (za->size <= sz)
+			break;
+
+		sz = za->size;
+	}
+
+	return za;
+
+error:
+	return NULL;
+}
+
+static int set_za(pid_t pid, const struct user_za_header *za)
+{
+	struct iovec iov;
+
+	iov.iov_base = (void *)za;
+	iov.iov_len = za->size;
+	return ptrace(PTRACE_SETREGSET, pid, NT_ARM_ZA, &iov);
+}
+
+static int get_zt(pid_t pid, char zt[ZT_SIG_REG_BYTES])
+{
+	struct iovec iov;
+
+	iov.iov_base = zt;
+	iov.iov_len = ZT_SIG_REG_BYTES;
+	return ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZT, &iov);
+}
+
+
+static int set_zt(pid_t pid, const char zt[ZT_SIG_REG_BYTES])
+{
+	struct iovec iov;
+
+	iov.iov_base = (void *)zt;
+	iov.iov_len = ZT_SIG_REG_BYTES;
+	return ptrace(PTRACE_SETREGSET, pid, NT_ARM_ZT, &iov);
+}
+
+/* Reading with ZA disabled returns all zeros */
+static void ptrace_za_disabled_read_zt(pid_t child)
+{
+	struct user_za_header za;
+	char zt[ZT_SIG_REG_BYTES];
+	int ret, i;
+	bool fail = false;
+
+	/* Disable PSTATE.ZA using the ZA interface */
+	memset(&za, 0, sizeof(za));
+	za.vl = sme_vl;
+	za.size = sizeof(za);
+
+	ret = set_za(child, &za);
+	if (ret != 0) {
+		ksft_print_msg("Failed to disable ZA\n");
+		fail = true;
+	}
+
+	/* Read back ZT */
+	ret = get_zt(child, zt);
+	if (ret != 0) {
+		ksft_print_msg("Failed to read ZT\n");
+		fail = true;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(zt); i++) {
+		if (zt[i]) {
+			ksft_print_msg("zt[%d]: 0x%x != 0\n", i, zt[i]);
+			fail = true;
+		}
+	}
+
+	ksft_test_result(!fail, "ptrace_za_disabled_read_zt\n");
+}
+
+/* Writing then reading ZT should return the data written */
+static void ptrace_set_get_zt(pid_t child)
+{
+	char zt_in[ZT_SIG_REG_BYTES];
+	char zt_out[ZT_SIG_REG_BYTES];
+	int ret, i;
+	bool fail = false;
+
+	fill_buf(zt_in, sizeof(zt_in));
+
+	ret = set_zt(child, zt_in);
+	if (ret != 0) {
+		ksft_print_msg("Failed to set ZT\n");
+		fail = true;
+	}
+
+	ret = get_zt(child, zt_out);
+	if (ret != 0) {
+		ksft_print_msg("Failed to read ZT\n");
+		fail = true;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(zt_in); i++) {
+		if (zt_in[i] != zt_out[i]) {
+			ksft_print_msg("zt[%d]: 0x%x != 0x%x\n", i, 
+				       zt_in[i], zt_out[i]);
+			fail = true;
+		}
+	}
+
+	ksft_test_result(!fail, "ptrace_set_get_zt\n");
+}
+
+/* Writing ZT should set PSTATE.ZA */
+static void ptrace_enable_za_via_zt(pid_t child)
+{
+	struct user_za_header za_in;
+	struct user_za_header *za_out;
+	char zt[ZT_SIG_REG_BYTES];
+	char *za_data;
+	size_t za_out_size;
+	int ret, i, vq;
+	bool fail = false;
+
+	/* Disable PSTATE.ZA using the ZA interface */
+	memset(&za_in, 0, sizeof(za_in));
+	za_in.vl = sme_vl;
+	za_in.size = sizeof(za_in);
+
+	ret = set_za(child, &za_in);
+	if (ret != 0) {
+		ksft_print_msg("Failed to disable ZA\n");
+		fail = true;
+	}
+
+	/* Write ZT */
+	fill_buf(zt, sizeof(zt));
+	ret = set_zt(child, zt);
+	if (ret != 0) {
+		ksft_print_msg("Failed to set ZT\n");
+		fail = true;
+	}
+
+	/* Read back ZA and check for register data */
+	za_out = NULL;
+	za_out_size = 0;
+	if (get_za(child, (void **)&za_out, &za_out_size)) {
+		/* Should have an unchanged VL */
+		if (za_out->vl != sme_vl) {
+			ksft_print_msg("VL changed from %d to %d\n",
+				       sme_vl, za_out->vl);
+			fail = true;
+		}
+		vq = __sve_vq_from_vl(za_out->vl);
+		za_data = (char *)za_out + ZA_PT_ZA_OFFSET;
+
+		/* Should have register data */
+		if (za_out->size < ZA_PT_SIZE(vq)) {
+			ksft_print_msg("ZA data less than expected: %u < %u\n",
+				       za_out->size, ZA_PT_SIZE(vq));
+			fail = true;
+			vq = 0;
+		}
+
+		/* That register data should be non-zero */
+		for (i = 0; i < ZA_PT_ZA_SIZE(vq); i++) {
+			if (za_data[i]) {
+				ksft_print_msg("ZA byte %d is %x\n",
+					       i, za_data[i]);
+				fail = true;
+			}
+		}
+	} else {
+		ksft_print_msg("Failed to read ZA\n");
+		fail = true;
+	}
+
+	ksft_test_result(!fail, "ptrace_enable_za_via_zt\n");
+}
+
+static int do_parent(pid_t child)
+{
+	int ret = EXIT_FAILURE;
+	pid_t pid;
+	int status;
+	siginfo_t si;
+
+	/* Attach to the child */
+	while (1) {
+		int sig;
+
+		pid = wait(&status);
+		if (pid == -1) {
+			perror("wait");
+			goto error;
+		}
+
+		/*
+		 * This should never happen but it's hard to flag in
+		 * the framework.
+		 */
+		if (pid != child)
+			continue;
+
+		if (WIFEXITED(status) || WIFSIGNALED(status))
+			ksft_exit_fail_msg("Child died unexpectedly\n");
+
+		if (!WIFSTOPPED(status))
+			goto error;
+
+		sig = WSTOPSIG(status);
+
+		if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si)) {
+			if (errno == ESRCH)
+				goto disappeared;
+
+			if (errno == EINVAL) {
+				sig = 0; /* bust group-stop */
+				goto cont;
+			}
+
+			ksft_test_result_fail("PTRACE_GETSIGINFO: %s\n",
+					      strerror(errno));
+			goto error;
+		}
+
+		if (sig == SIGSTOP && si.si_code == SI_TKILL &&
+		    si.si_pid == pid)
+			break;
+
+	cont:
+		if (ptrace(PTRACE_CONT, pid, NULL, sig)) {
+			if (errno == ESRCH)
+				goto disappeared;
+
+			ksft_test_result_fail("PTRACE_CONT: %s\n",
+					      strerror(errno));
+			goto error;
+		}
+	}
+
+	ksft_print_msg("Parent is %d, child is %d\n", getpid(), child);
+
+	ptrace_za_disabled_read_zt(child);
+	ptrace_set_get_zt(child);
+	ptrace_enable_za_via_zt(child);
+
+	ret = EXIT_SUCCESS;
+
+error:
+	kill(child, SIGKILL);
+
+disappeared:
+	return ret;
+}
+
+int main(void)
+{
+	int ret = EXIT_SUCCESS;
+	pid_t child;
+
+	srandom(getpid());
+
+	ksft_print_header();
+
+	if (!(getauxval(AT_HWCAP2) & HWCAP2_SME2)) {
+		ksft_set_plan(1);
+		ksft_exit_skip("SME2 not available\n");
+	}
+
+	/* We need a valid SME VL to enable/disable ZA */
+	sme_vl = prctl(PR_SME_GET_VL);
+	if (sme_vl == -1) {
+		ksft_set_plan(1);
+		ksft_exit_skip("Failed to read SME VL: %d (%s)\n",
+			       errno, strerror(errno));
+	}
+
+	ksft_set_plan(EXPECTED_TESTS);
+
+	child = fork();
+	if (!child)
+		return do_child();
+
+	if (do_parent(child))
+		ret = EXIT_FAILURE;
+
+	ksft_print_cnts();
+
+	return ret;
+}
-- 
cgit 


From 3eb1b41fba97a1586e3ecca8c10547071f541567 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 16 Jan 2023 16:04:56 +0000
Subject: kselftest/arm64: Add coverage of SME 2 and 2.1 hwcaps

Add the hwcaps defined by SME 2 and 2.1 to the hwcaps test.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-sme2-v4-21-f2fa0aef982f@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/hwcap.c | 115 ++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index 9f255bc5f31c..93333a90bf3a 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -50,6 +50,78 @@ static void sme_sigill(void)
 	asm volatile(".inst 0x04bf5800" : : : "x0");
 }
 
+static void sme2_sigill(void)
+{
+	/* SMSTART ZA */
+	asm volatile("msr S0_3_C4_C5_3, xzr" : : : );
+
+	/* ZERO ZT0 */
+	asm volatile(".inst 0xc0480001" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void sme2p1_sigill(void)
+{
+	/* SMSTART SM */
+	asm volatile("msr S0_3_C4_C3_3, xzr" : : : );
+
+	/* BFCLAMP { Z0.H - Z1.H }, Z0.H, Z0.H */
+	asm volatile(".inst 0xc120C000" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void smei16i32_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* SMOPA ZA0.S, P0/M, P0/M, Z0.B, Z0.B */
+	asm volatile(".inst 0xa0800000" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void smebi32i32_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* BMOPA ZA0.S, P0/M, P0/M, Z0.B, Z0.B */
+	asm volatile(".inst 0x80800008" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void smeb16b16_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* BFADD ZA.H[W0, 0], {Z0.H-Z1.H} */
+	asm volatile(".inst 0xC1E41C00" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void smef16f16_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* FADD ZA.H[W0, 0], { Z0.H-Z1.H } */
+	asm volatile(".inst 0xc1a41C00" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
 static void sve_sigill(void)
 {
 	/* RDVL x0, #0 */
@@ -158,6 +230,49 @@ static const struct hwcap_data {
 		.sigill_fn = sme_sigill,
 		.sigill_reliable = true,
 	},
+	{
+		.name = "SME2",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME2,
+		.cpuinfo = "sme2",
+		.sigill_fn = sme2_sigill,
+		.sigill_reliable = true,
+	},
+	{
+		.name = "SME 2.1",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME2P1,
+		.cpuinfo = "sme2p1",
+		.sigill_fn = sme2p1_sigill,
+	},
+	{
+		.name = "SME I16I32",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_I16I32,
+		.cpuinfo = "smei16i32",
+		.sigill_fn = smei16i32_sigill,
+	},
+	{
+		.name = "SME BI32I32",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_BI32I32,
+		.cpuinfo = "smebi32i32",
+		.sigill_fn = smebi32i32_sigill,
+	},
+	{
+		.name = "SME B16B16",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_B16B16,
+		.cpuinfo = "smeb16b16",
+		.sigill_fn = smeb16b16_sigill,
+	},
+	{
+		.name = "SME F16F16",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_F16F16,
+		.cpuinfo = "smef16f16",
+		.sigill_fn = smef16f16_sigill,
+	},
 	{
 		.name = "SVE",
 		.at_hwcap = AT_HWCAP,
-- 
cgit 


From bae393dabf353172784987c12c99a55cbe8075bc Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 14:20:42 +0000
Subject: kselftest/arm64: Add TPIDR2 to the set of known signal context
 records

When validating the set of signal context records check that any TPIDR2
record has the correct size, also suppressing warnings due to seeing an
unknown record type.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-tpidr2-sig-v3-3-c77c6c8775f4@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/testcases/testcases.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c
index d2eda7b5de26..23487c458240 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.c
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c
@@ -163,6 +163,10 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
 			if (head->size != sizeof(struct esr_context))
 				*err = "Bad size for esr_context";
 			break;
+		case TPIDR2_MAGIC:
+			if (head->size != sizeof(struct tpidr2_context))
+				*err = "Bad size for tpidr2_context";
+			break;
 		case SVE_MAGIC:
 			if (flags & SVE_CTX)
 				*err = "Multiple SVE_MAGIC";
-- 
cgit 


From 8ced928019353eaecbffee566d7ed6a9a9e60e78 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Dec 2022 14:20:43 +0000
Subject: kselftest/arm64: Add test case for TPIDR2 signal frame records

Ensure that we get signal context for TPIDR2 if and only if SME is present
on the system. Since TPIDR2 is owned by libc we merely validate that the
value is whatever it was set to, this isn't ideal since it's likely to
just be the default of 0 with current systems but it avoids future false
positives.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221208-arm64-tpidr2-sig-v3-4-c77c6c8775f4@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/.gitignore    |  1 +
 .../arm64/signal/testcases/tpidr2_siginfo.c        | 90 ++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 tools/testing/selftests/arm64/signal/testcases/tpidr2_siginfo.c

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore
index e8d2b57f73ec..e1b6c4d961b5 100644
--- a/tools/testing/selftests/arm64/signal/.gitignore
+++ b/tools/testing/selftests/arm64/signal/.gitignore
@@ -4,5 +4,6 @@ fake_sigreturn_*
 sme_*
 ssve_*
 sve_*
+tpidr2_siginfo
 za_*
 !*.[ch]
diff --git a/tools/testing/selftests/arm64/signal/testcases/tpidr2_siginfo.c b/tools/testing/selftests/arm64/signal/testcases/tpidr2_siginfo.c
new file mode 100644
index 000000000000..6a2c82bf7ead
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/tpidr2_siginfo.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 ARM Limited
+ *
+ * Verify that the TPIDR2 register context in signal frames is set up as
+ * expected.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+#include <asm/sigcontext.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static union {
+	ucontext_t uc;
+	char buf[1024 * 128];
+} context;
+
+#define SYS_TPIDR2 "S3_3_C13_C0_5"
+
+static uint64_t get_tpidr2(void)
+{
+	uint64_t val;
+
+	asm volatile (
+		"mrs	%0, " SYS_TPIDR2 "\n"
+		: "=r"(val)
+		:
+		: "cc");
+
+	return val;
+}
+
+int tpidr2_present(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
+{
+	struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context);
+	struct tpidr2_context *tpidr2_ctx;
+	size_t offset;
+	bool in_sigframe;
+	bool have_sme;
+	__u64 orig_tpidr2;
+
+	have_sme = getauxval(AT_HWCAP2) & HWCAP2_SME;
+	if (have_sme)
+		orig_tpidr2 = get_tpidr2();
+
+	if (!get_current_context(td, &context.uc, sizeof(context)))
+		return 1;
+
+	tpidr2_ctx = (struct tpidr2_context *)
+		get_header(head, TPIDR2_MAGIC, td->live_sz, &offset);
+
+	in_sigframe = tpidr2_ctx != NULL;
+
+	fprintf(stderr, "TPIDR2 sigframe %s on system %s SME\n",
+		in_sigframe ? "present" : "absent",
+		have_sme ? "with" : "without");
+
+	td->pass = (in_sigframe == have_sme);
+
+	/*
+	 * Check that the value we read back was the one present at
+	 * the time that the signal was triggered.  TPIDR2 is owned by
+	 * libc so we can't safely choose the value and it is possible
+	 * that we may need to revisit this in future if something
+	 * starts deciding to set a new TPIDR2 between us reading and
+	 * the signal.
+	 */
+	if (have_sme && tpidr2_ctx) {
+		if (tpidr2_ctx->tpidr2 != orig_tpidr2) {
+			fprintf(stderr, "TPIDR2 in frame is %llx, was %llx\n",
+				tpidr2_ctx->tpidr2, orig_tpidr2);
+			td->pass = false;
+		}
+	}
+
+	return 0;
+}
+
+struct tdescr tde = {
+	.name = "TPIDR2",
+	.descr = "Validate that TPIDR2 is present as expected",
+	.timeout = 3,
+	.run = tpidr2_present,
+};
-- 
cgit 


From aa58ace3499a678768f5ce9a5973919bfccd8a4e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 12 Jan 2023 19:51:47 +0000
Subject: kselftest/arm64: Fix .pushsection for strings in FP tests

The .pushsection directive used to store the strings used with the .puts
macro in the floating point helpers does not provide a section type but
according to the gas documentation this should be mandatory and with the
clang built in as it actually is. Provide one so that we can build these
tests with LLVM=1.

No functional change.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/20230111-arm64-kselftest-clang-v1-1-89c69d377727@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/assembler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/assembler.h b/tools/testing/selftests/arm64/fp/assembler.h
index 90bd433d2665..9b38a0da407d 100644
--- a/tools/testing/selftests/arm64/fp/assembler.h
+++ b/tools/testing/selftests/arm64/fp/assembler.h
@@ -57,7 +57,7 @@ endfunction
 // Utility macro to print a literal string
 // Clobbers x0-x4,x8
 .macro puts string
-	.pushsection .rodata.str1.1, "aMS", 1
+	.pushsection .rodata.str1.1, "aMS", @progbits, 1
 .L__puts_literal\@: .string "\string"
 	.popsection
 
-- 
cgit 


From cd57a6584fe596f4daf128007bff71b1c2ba16c8 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 12 Jan 2023 19:51:48 +0000
Subject: kselftest/arm64: Remove redundant _start labels from FP tests

There are a number of freestanding static executables used in floating
point testing that have no runtime at all. These all define the main entry
point as:

   .globl _start
   function _start
   _start:

but clang's integrated assembler complains that:

  error: symbol '_start' is already defined

due to having both a label and function directive. Remove the label to
allow building with clang.

No functional change.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/20230111-arm64-kselftest-clang-v1-2-89c69d377727@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/fp-pidbench.S | 1 -
 tools/testing/selftests/arm64/fp/fpsimd-test.S | 1 -
 tools/testing/selftests/arm64/fp/sve-test.S    | 1 -
 tools/testing/selftests/arm64/fp/za-test.S     | 1 -
 4 files changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S
index 16a436389bfc..73830f6bc99b 100644
--- a/tools/testing/selftests/arm64/fp/fp-pidbench.S
+++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S
@@ -31,7 +31,6 @@
 // Main program entry point
 .globl _start
 function _start
-_start:
 	puts	"Iterations per test: "
 	mov	x20, #10000
 	lsl	x20, x20, #8
diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S
index 918d04885a33..8b960d01ed2e 100644
--- a/tools/testing/selftests/arm64/fp/fpsimd-test.S
+++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S
@@ -215,7 +215,6 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
-_start:
 	mov	x23, #0		// signal count
 
 	mov	w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S
index 2a18cb4c528c..4328895dfc87 100644
--- a/tools/testing/selftests/arm64/fp/sve-test.S
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -378,7 +378,6 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
-_start:
 	mov	x23, #0		// Irritation signal count
 
 	mov	w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S
index 53c54af65704..9dcd70911397 100644
--- a/tools/testing/selftests/arm64/fp/za-test.S
+++ b/tools/testing/selftests/arm64/fp/za-test.S
@@ -231,7 +231,6 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
-_start:
 	mov	x23, #0		// signal count
 
 	mov	w0, #SIGINT
-- 
cgit 


From a884f7970e57aef78c6011561e29d238e46b3a9f Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 12 Jan 2023 19:51:49 +0000
Subject: kselftest/arm64: Don't pass headers to the compiler as source

The signal Makefile rules pass all the dependencies for each executable,
including headers, to the compiler which GCC is happy enough with but
clang rejects:

   clang --target=aarch64-none-linux-gnu -fintegrated-as -Wall -O2 -g -I/home/broonie/git/linux/tools/testing/selftests/ -isystem /home/broonie/git/linux/usr/include -D_GNU_SOURCE -std=gnu99 -I.  test_signals.c test_signals_utils.c testcases/testcases.c signals.S testcases/fake_sigreturn_bad_magic.c test_signals.h test_signals_utils.h testcases/testcases.h -o testcases/fake_sigreturn_bad_magic
  clang: error: cannot specify -o when generating multiple output files

This happens because clang gets confused about what to do with the
header files, failing to identify them as source.  This is not amazing
behaviour on clang's part and should ideally be fixed but even if that
happens we'd still need a new clang release so let's instead rework the
Makefile so we use variables for the lists of header and source files,
allowing us to only pass the source files to the compiler and keep clang
happy.

As a bonus the resulting Makefile is a bit easier to read.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/20230111-arm64-kselftest-clang-v1-3-89c69d377727@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/Makefile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile
index be7520a863b0..8f5febaf1a9a 100644
--- a/tools/testing/selftests/arm64/signal/Makefile
+++ b/tools/testing/selftests/arm64/signal/Makefile
@@ -22,6 +22,10 @@ $(TEST_GEN_PROGS): $(PROGS)
 
 # Common test-unit targets to build common-layout test-cases executables
 # Needs secondary expansion to properly include the testcase c-file in pre-reqs
+COMMON_SOURCES := test_signals.c test_signals_utils.c testcases/testcases.c \
+	signals.S
+COMMON_HEADERS := test_signals.h test_signals_utils.h testcases/testcases.h
+
 .SECONDEXPANSION:
-$(PROGS): test_signals.c test_signals_utils.c testcases/testcases.c signals.S $$@.c test_signals.h test_signals_utils.h testcases/testcases.h
-	$(CC) $(CFLAGS) $^ -o $@
+$(PROGS): $$@.c ${COMMON_SOURCES} ${COMMON_HEADERS}
+	$(CC) $(CFLAGS) ${@}.c ${COMMON_SOURCES} -o $@
-- 
cgit 


From 6e4b4f0eca88e47def703f90a403fef5b96730d5 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 12 Jan 2023 19:51:50 +0000
Subject: kselftest/arm64: Initialise current at build time in signal tests

When building with clang the toolchain refuses to link the signals
testcases since the assembly code has a reference to current which has
no initialiser so is placed in the BSS:

  /tmp/signals-af2042.o: in function `fake_sigreturn':
  <unknown>:51:(.text+0x40): relocation truncated to fit: R_AARCH64_LD_PREL_LO19 against symbol `current' defined in .bss section in /tmp/test_signals-ec1160.o

Since the first statement in main() initialises current we may as well
fix this by moving the initialisation to build time so the variable
doesn't end up in the BSS.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/20230111-arm64-kselftest-clang-v1-4-89c69d377727@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/test_signals.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/test_signals.c b/tools/testing/selftests/arm64/signal/test_signals.c
index 416b1ff43199..00051b40d71e 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.c
+++ b/tools/testing/selftests/arm64/signal/test_signals.c
@@ -12,12 +12,10 @@
 #include "test_signals.h"
 #include "test_signals_utils.h"
 
-struct tdescr *current;
+struct tdescr *current = &tde;
 
 int main(int argc, char *argv[])
 {
-	current = &tde;
-
 	ksft_print_msg("%s :: %s\n", current->name, current->descr);
 	if (test_setup(current) && test_init(current)) {
 		test_run(current);
-- 
cgit 


From 343d59119e776af3060000f7af70553fc531230e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 12 Jan 2023 19:51:51 +0000
Subject: kselftest/arm64: Support build of MTE tests with clang

The assembly portions of the MTE selftests need to be built with a
toolchain supporting MTE.  Since we support GCC versions that lack MTE
support we have logic to suppress build of these tests when using such a
toolchain but that logic is broken for LLVM=1 builds, it uses CC but CC
is only set for LLVM builds in libs.mk which needs to be included after
we have selected which test programs to build.

Since all supported LLVM versions support MTE we can simply assume MTE
support when LLVM is set.  This is not a thing of beauty but it does the
job.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/20230111-arm64-kselftest-clang-v1-5-89c69d377727@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/mte/Makefile | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile
index 037046f5784e..fdb9acdca42b 100644
--- a/tools/testing/selftests/arm64/mte/Makefile
+++ b/tools/testing/selftests/arm64/mte/Makefile
@@ -1,19 +1,29 @@
 # SPDX-License-Identifier: GPL-2.0
 # Copyright (C) 2020 ARM Limited
 
-# preserve CC value from top level Makefile
-ifeq ($(CC),cc)
-CC := $(CROSS_COMPILE)gcc
-endif
-
 CFLAGS += -std=gnu99 -I. -pthread
 LDFLAGS += -pthread
 SRCS := $(filter-out mte_common_util.c,$(wildcard *.c))
 PROGS := $(patsubst %.c,%,$(SRCS))
 
+ifeq ($(LLVM),)
+# For GCC check that the toolchain has MTE support.
+
+# preserve CC value from top level Makefile
+ifeq ($(CC),cc)
+CC := $(CROSS_COMPILE)gcc
+endif
+
 #check if the compiler works well
 mte_cc_support := $(shell if ($(CC) $(CFLAGS) -march=armv8.5-a+memtag -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi)
 
+else
+
+# All supported clang versions also support MTE.
+mte_cc_support := 1
+
+endif
+
 ifeq ($(mte_cc_support),1)
 # Generated binaries to be installed by top KSFT script
 TEST_GEN_PROGS := $(PROGS)
-- 
cgit 


From 89d72c035f88c8338bf3ab604a07c7320dc9f800 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 12 Jan 2023 19:51:52 +0000
Subject: kselftest/arm64: Remove spurious comment from MTE test Makefile

There's a stray comment in the MTE test Makefile which documents
something that's since been removed, delete it.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/20230111-arm64-kselftest-clang-v1-6-89c69d377727@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/mte/Makefile | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile
index fdb9acdca42b..0d7ac3db8390 100644
--- a/tools/testing/selftests/arm64/mte/Makefile
+++ b/tools/testing/selftests/arm64/mte/Makefile
@@ -28,7 +28,6 @@ ifeq ($(mte_cc_support),1)
 # Generated binaries to be installed by top KSFT script
 TEST_GEN_PROGS := $(PROGS)
 
-# Get Kernel headers installed and use them.
 else
     $(warning compiler "$(CC)" does not support the ARMv8.5 MTE extension.)
     $(warning test program "mte" will not be created.)
-- 
cgit 


From f76cb73a2d7c6e80bed04098b7f100872fd9a475 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 20 Jan 2023 12:04:08 +0000
Subject: kselftest/arm64: Verify that SSVE signal context has SVE_SIG_FLAG_SM
 set

Streaming mode SVE signal context should have SVE_SIG_FLAG_SM set but we
were not actually validating this. Add a check.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230117-arm64-test-ssve-za-v1-1-203c00150154@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/testcases/ssve_regs.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
index d0a178945b1a..cd738265cdcd 100644
--- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
@@ -92,6 +92,11 @@ static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc,
 		return 1;
 	}
 
+	if (!(ssve->flags & SVE_SIG_FLAG_SM)) {
+		fprintf(stderr, "SVE_SIG_FLAG_SM not set in SVE record\n");
+		return 1;
+	}
+
 	/* The actual size validation is done in get_current_context() */
 	fprintf(stderr, "Got expected size %u and VL %d\n",
 		head->size, ssve->vl);
-- 
cgit 


From bc69da5ff087c40d1fc4f30596f1ee1b71924577 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 20 Jan 2023 12:04:09 +0000
Subject: kselftest/arm64: Verify simultaneous SSVE and ZA context generation

Add a test that generates SSVE and ZA context in a single signal frame to
ensure that nothing is going wrong in that case for any reason.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230117-arm64-test-ssve-za-v1-2-203c00150154@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../arm64/signal/testcases/ssve_za_regs.c          | 162 +++++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100644 tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c
new file mode 100644
index 000000000000..954a21f6121a
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 ARM Limited
+ *
+ * Verify that both the streaming SVE and ZA register context in
+ * signal frames is set up as expected when enabled simultaneously.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static union {
+	ucontext_t uc;
+	char buf[1024 * 128];
+} context;
+static unsigned int vls[SVE_VQ_MAX];
+unsigned int nvls = 0;
+
+static bool sme_get_vls(struct tdescr *td)
+{
+	int vq, vl;
+
+	/*
+	 * Enumerate up to SVE_VQ_MAX vector lengths
+	 */
+	for (vq = SVE_VQ_MAX; vq > 0; --vq) {
+		vl = prctl(PR_SME_SET_VL, vq * 16);
+		if (vl == -1)
+			return false;
+
+		vl &= PR_SME_VL_LEN_MASK;
+
+		/* Skip missing VLs */
+		vq = sve_vq_from_vl(vl);
+
+		vls[nvls++] = vl;
+	}
+
+	/* We need at least one VL */
+	if (nvls < 1) {
+		fprintf(stderr, "Only %d VL supported\n", nvls);
+		return false;
+	}
+
+	return true;
+}
+
+static void setup_regs(void)
+{
+	/* smstart sm; real data is TODO */
+	asm volatile(".inst 0xd503437f" : : : );
+
+	/* smstart za; real data is TODO */
+	asm volatile(".inst 0xd503457f" : : : );
+}
+
+static char zeros[ZA_SIG_REGS_SIZE(SVE_VQ_MAX)];
+
+static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc,
+			 unsigned int vl)
+{
+	size_t offset;
+	struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context);
+	struct _aarch64_ctx *regs;
+	struct sve_context *ssve;
+	struct za_context *za;
+	int ret;
+
+	fprintf(stderr, "Testing VL %d\n", vl);
+
+	ret = prctl(PR_SME_SET_VL, vl);
+	if (ret != vl) {
+		fprintf(stderr, "Failed to set VL, got %d\n", ret);
+		return 1;
+	}
+
+	/*
+	 * Get a signal context which should have the SVE and ZA
+	 * frames in it.
+	 */
+	setup_regs();
+	if (!get_current_context(td, &context.uc, sizeof(context)))
+		return 1;
+
+	regs = get_header(head, SVE_MAGIC, GET_BUF_RESV_SIZE(context),
+			  &offset);
+	if (!regs) {
+		fprintf(stderr, "No SVE context\n");
+		return 1;
+	}
+
+	ssve = (struct sve_context *)regs;
+	if (ssve->vl != vl) {
+		fprintf(stderr, "Got SSVE VL %d, expected %d\n", ssve->vl, vl);
+		return 1;
+	}
+
+	if (!(ssve->flags & SVE_SIG_FLAG_SM)) {
+		fprintf(stderr, "SVE_SIG_FLAG_SM not set in SVE record\n");
+		return 1;
+	}
+
+	fprintf(stderr, "Got expected SSVE size %u and VL %d\n",
+		regs->size, ssve->vl);
+
+	regs = get_header(head, ZA_MAGIC, GET_BUF_RESV_SIZE(context),
+			  &offset);
+	if (!regs) {
+		fprintf(stderr, "No ZA context\n");
+		return 1;
+	}
+
+	za = (struct za_context *)regs;
+	if (za->vl != vl) {
+		fprintf(stderr, "Got ZA VL %d, expected %d\n", za->vl, vl);
+		return 1;
+	}
+
+	fprintf(stderr, "Got expected ZA size %u and VL %d\n",
+		regs->size, za->vl);
+
+	/* We didn't load any data into ZA so it should be all zeros */
+	if (memcmp(zeros, (char *)za + ZA_SIG_REGS_OFFSET,
+		   ZA_SIG_REGS_SIZE(sve_vq_from_vl(za->vl))) != 0) {
+		fprintf(stderr, "ZA data invalid\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static int sme_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
+{
+	int i;
+
+	for (i = 0; i < nvls; i++) {
+		if (do_one_sme_vl(td, si, uc, vls[i]))
+			return 1;
+	}
+
+	td->pass = 1;
+
+	return 0;
+}
+
+struct tdescr tde = {
+	.name = "Streaming SVE registers",
+	.descr = "Check that we get the right Streaming SVE registers reported",
+	/*
+	 * We shouldn't require FA64 but things like memset() used in the
+	 * helpers might use unsupported instructions so for now disable
+	 * the test unless we've got the full instruction set.
+	 */
+	.feats_required = FEAT_SME | FEAT_SME_FA64,
+	.timeout = 3,
+	.init = sme_get_vls,
+	.run = sme_regs,
+};
-- 
cgit 


From 00598857e38f56963116a6d70f7d64a29959bce9 Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Sun, 18 Dec 2022 17:29:41 +0800
Subject: kselftest/arm64: Remove the local NUM_VL definition

It was introduced in commit b77e995e3b96 ("kselftest/arm64: Add a test
program to exercise the syscall ABI") but never actually used. Remove it.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221218092942.1940-1-yuzenghui@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/syscall-abi.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index 8afcbf6861fd..01aea12ea252 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -20,8 +20,6 @@
 
 #include "syscall-abi.h"
 
-#define NUM_VL ((SVE_VQ_MAX - SVE_VQ_MIN) + 1)
-
 static int default_sme_vl;
 
 static int sve_vl_count;
-- 
cgit 


From daac835347a52d9d141be281e4657cc08a360e97 Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Sun, 18 Dec 2022 17:29:42 +0800
Subject: kselftest/arm64: Correct buffer size for SME ZA storage

It looks like a copy-paste error to describe the ZA buffer size using (the
number of P registers * the maximum size of a Z register). This doesn't
have practical impact though as we're always allocating enough space even
for the architectural maximum ZA storage, with SVL equals to 2048 bits.

Switch to use ZA_SIG_REGS_SIZE(SVE_VQ_MAX). setup_za() will need to
initialize two 64MB arraies with this change and can be optimized later (if
someone complain).

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20221218092942.1940-2-yuzenghui@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/syscall-abi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index 01aea12ea252..834616fde23e 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -300,8 +300,8 @@ static int check_svcr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 	return errors;
 }
 
-uint8_t za_in[SVE_NUM_PREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
-uint8_t za_out[SVE_NUM_PREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
+uint8_t za_in[ZA_SIG_REGS_SIZE(SVE_VQ_MAX)];
+uint8_t za_out[ZA_SIG_REGS_SIZE(SVE_VQ_MAX)];
 
 static void setup_za(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
 		     uint64_t svcr)
-- 
cgit 


From 79168a669d8125453c8a271115f1ffd4294e61f6 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 21 Jan 2023 05:52:31 +0530
Subject: bpf: Fix missing var_off check for ARG_PTR_TO_DYNPTR

Currently, the dynptr function is not checking the variable offset part
of PTR_TO_STACK that it needs to check. The fixed offset is considered
when computing the stack pointer index, but if the variable offset was
not a constant (such that it could not be accumulated in reg->off), we
will end up a discrepency where runtime pointer does not point to the
actual stack slot we mark as STACK_DYNPTR.

It is impossible to precisely track dynptr state when variable offset is
not constant, hence, just like bpf_timer, kptr, bpf_spin_lock, etc.
simply reject the case where reg->var_off is not constant. Then,
consider both reg->off and reg->var_off.value when computing the stack
pointer index.

A new helper dynptr_get_spi is introduced to hide over these details
since the dynptr needs to be located in multiple places outside the
process_dynptr_func checks, hence once we know it's a PTR_TO_STACK, we
need to enforce these checks in all places.

Note that it is disallowed for unprivileged users to have a non-constant
var_off, so this problem should only be possible to trigger from
programs having CAP_PERFMON. However, its effects can vary.

Without the fix, it is possible to replace the contents of the dynptr
arbitrarily by making verifier mark different stack slots than actual
location and then doing writes to the actual stack address of dynptr at
runtime.

Fixes: 97e03f521050 ("bpf: Add verifier support for dynptrs")
Acked-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230121002241.2113993-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                              | 84 +++++++++++++++++-----
 .../selftests/bpf/prog_tests/kfunc_dynptr_param.c  |  2 +-
 tools/testing/selftests/bpf/progs/dynptr_fail.c    |  4 +-
 3 files changed, 69 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 39d8ee38c338..76afdbea425a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -638,11 +638,34 @@ static void print_liveness(struct bpf_verifier_env *env,
 		verbose(env, "D");
 }
 
-static int get_spi(s32 off)
+static int __get_spi(s32 off)
 {
 	return (-off - 1) / BPF_REG_SIZE;
 }
 
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	int off, spi;
+
+	if (!tnum_is_const(reg->var_off)) {
+		verbose(env, "dynptr has to be at a constant offset\n");
+		return -EINVAL;
+	}
+
+	off = reg->off + reg->var_off.value;
+	if (off % BPF_REG_SIZE) {
+		verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+		return -EINVAL;
+	}
+
+	spi = __get_spi(off);
+	if (spi < 1) {
+		verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+		return -EINVAL;
+	}
+	return spi;
+}
+
 static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
 {
 	int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
@@ -754,7 +777,9 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
 	enum bpf_dynptr_type type;
 	int spi, i, id;
 
-	spi = get_spi(reg->off);
+	spi = dynptr_get_spi(env, reg);
+	if (spi < 0)
+		return spi;
 
 	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
 		return -EINVAL;
@@ -792,7 +817,9 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
 	struct bpf_func_state *state = func(env, reg);
 	int spi, i;
 
-	spi = get_spi(reg->off);
+	spi = dynptr_get_spi(env, reg);
+	if (spi < 0)
+		return spi;
 
 	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
 		return -EINVAL;
@@ -844,7 +871,11 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
 	if (reg->type == CONST_PTR_TO_DYNPTR)
 		return false;
 
-	spi = get_spi(reg->off);
+	spi = dynptr_get_spi(env, reg);
+	if (spi < 0)
+		return false;
+
+	/* We will do check_mem_access to check and update stack bounds later */
 	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
 		return true;
 
@@ -860,14 +891,15 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
 static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = func(env, reg);
-	int spi;
-	int i;
+	int spi, i;
 
 	/* This already represents first slot of initialized bpf_dynptr */
 	if (reg->type == CONST_PTR_TO_DYNPTR)
 		return true;
 
-	spi = get_spi(reg->off);
+	spi = dynptr_get_spi(env, reg);
+	if (spi < 0)
+		return false;
 	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
 	    !state->stack[spi].spilled_ptr.dynptr.first_slot)
 		return false;
@@ -896,7 +928,9 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 	if (reg->type == CONST_PTR_TO_DYNPTR) {
 		return reg->dynptr.type == dynptr_type;
 	} else {
-		spi = get_spi(reg->off);
+		spi = dynptr_get_spi(env, reg);
+		if (spi < 0)
+			return false;
 		return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
 	}
 }
@@ -2429,7 +2463,9 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
 	 */
 	if (reg->type == CONST_PTR_TO_DYNPTR)
 		return 0;
-	spi = get_spi(reg->off);
+	spi = dynptr_get_spi(env, reg);
+	if (spi < 0)
+		return spi;
 	/* Caller ensures dynptr is valid and initialized, which means spi is in
 	 * bounds and spi is the first dynptr slot. Simply mark stack slot as
 	 * read.
@@ -5992,12 +6028,15 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
 	}
 	/* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
 	 * check_func_arg_reg_off's logic. We only need to check offset
-	 * alignment for PTR_TO_STACK.
+	 * and its alignment for PTR_TO_STACK.
 	 */
-	if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) {
-		verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off);
-		return -EINVAL;
+	if (reg->type == PTR_TO_STACK) {
+		int err = dynptr_get_spi(env, reg);
+
+		if (err < 0)
+			return err;
 	}
+
 	/*  MEM_UNINIT - Points to memory that is an appropriate candidate for
 	 *		 constructing a mutable bpf_dynptr object.
 	 *
@@ -6405,15 +6444,16 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	}
 }
 
-static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = func(env, reg);
 	int spi;
 
 	if (reg->type == CONST_PTR_TO_DYNPTR)
 		return reg->ref_obj_id;
-
-	spi = get_spi(reg->off);
+	spi = dynptr_get_spi(env, reg);
+	if (spi < 0)
+		return spi;
 	return state->stack[spi].spilled_ptr.ref_obj_id;
 }
 
@@ -6487,7 +6527,9 @@ skip_type_check:
 			 * PTR_TO_STACK.
 			 */
 			if (reg->type == PTR_TO_STACK) {
-				spi = get_spi(reg->off);
+				spi = dynptr_get_spi(env, reg);
+				if (spi < 0)
+					return spi;
 				if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
 				    !state->stack[spi].spilled_ptr.ref_obj_id) {
 					verbose(env, "arg %d is an unacquired reference\n", regno);
@@ -7977,13 +8019,19 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
 			if (arg_type_is_dynptr(fn->arg_type[i])) {
 				struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
+				int ref_obj_id;
 
 				if (meta.ref_obj_id) {
 					verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
 					return -EFAULT;
 				}
 
-				meta.ref_obj_id = dynptr_ref_obj_id(env, reg);
+				ref_obj_id = dynptr_ref_obj_id(env, reg);
+				if (ref_obj_id < 0) {
+					verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+					return ref_obj_id;
+				}
+				meta.ref_obj_id = ref_obj_id;
 				break;
 			}
 		}
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
index a9229260a6ce..72800b1e8395 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
@@ -18,7 +18,7 @@ static struct {
 	const char *expected_verifier_err_msg;
 	int expected_runtime_err;
 } kfunc_dynptr_tests[] = {
-	{"not_valid_dynptr", "Expected an initialized dynptr as arg #1", 0},
+	{"not_valid_dynptr", "cannot pass in dynptr at an offset=-8", 0},
 	{"not_ptr_to_stack", "arg#0 expected pointer to stack or dynptr_ptr", 0},
 	{"dynptr_data_null", NULL, -EBADMSG},
 };
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index 78debc1b3820..02d57b95cf6e 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -382,7 +382,7 @@ int invalid_helper1(void *ctx)
 
 /* A dynptr can't be passed into a helper function at a non-zero offset */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #3")
+__failure __msg("cannot pass in dynptr at an offset=-8")
 int invalid_helper2(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -584,7 +584,7 @@ int invalid_read4(void *ctx)
 
 /* Initializing a dynptr on an offset should fail */
 SEC("?raw_tp")
-__failure __msg("invalid write to stack")
+__failure __msg("cannot pass in dynptr at an offset=0")
 int invalid_offset(void *ctx)
 {
 	struct bpf_dynptr ptr;
-- 
cgit 


From ef8fc7a07c0e161841779d6fe3f6acd5a05c547c Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 21 Jan 2023 05:52:32 +0530
Subject: bpf: Fix partial dynptr stack slot reads/writes

Currently, while reads are disallowed for dynptr stack slots, writes are
not. Reads don't work from both direct access and helpers, while writes
do work in both cases, but have the effect of overwriting the slot_type.

While this is fine, handling for a few edge cases is missing. Firstly,
a user can overwrite the stack slots of dynptr partially.

Consider the following layout:
spi: [d][d][?]
      2  1  0

First slot is at spi 2, second at spi 1.
Now, do a write of 1 to 8 bytes for spi 1.

This will essentially either write STACK_MISC for all slot_types or
STACK_MISC and STACK_ZERO (in case of size < BPF_REG_SIZE partial write
of zeroes). The end result is that slot is scrubbed.

Now, the layout is:
spi: [d][m][?]
      2  1  0

Suppose if user initializes spi = 1 as dynptr.
We get:
spi: [d][d][d]
      2  1  0

But this time, both spi 2 and spi 1 have first_slot = true.

Now, when passing spi 2 to dynptr helper, it will consider it as
initialized as it does not check whether second slot has first_slot ==
false. And spi 1 should already work as normal.

This effectively replaced size + offset of first dynptr, hence allowing
invalid OOB reads and writes.

Make a few changes to protect against this:
When writing to PTR_TO_STACK using BPF insns, when we touch spi of a
STACK_DYNPTR type, mark both first and second slot (regardless of which
slot we touch) as STACK_INVALID. Reads are already prevented.

Second, prevent writing	to stack memory from helpers if the range may
contain any STACK_DYNPTR slots. Reads are already prevented.

For helpers, we cannot allow it to destroy dynptrs from the writes as
depending on arguments, helper may take uninit_mem and dynptr both at
the same time. This would mean that helper may write to uninit_mem
before it reads the dynptr, which would be bad.

PTR_TO_MEM: [?????dd]

Depending on the code inside the helper, it may end up overwriting the
dynptr contents first and then read those as the dynptr argument.

Verifier would only simulate destruction when it does byte by byte
access simulation in check_helper_call for meta.access_size, and
fail to catch this case, as it happens after argument checks.

The same would need to be done for any other non-trivial objects created
on the stack in the future, such as bpf_list_head on stack, or
bpf_rb_root on stack.

A common misunderstanding in the current code is that MEM_UNINIT means
writes, but note that writes may also be performed even without
MEM_UNINIT in case of helpers, in that case the code after handling meta
&& meta->raw_mode will complain when it sees STACK_DYNPTR. So that
invalid read case also covers writes to potential STACK_DYNPTR slots.
The only loophole was in case of meta->raw_mode which simulated writes
through instructions which could overwrite them.

A future series sequenced after this will focus on the clean up of
helper access checks and bugs around that.

Fixes: 97e03f521050 ("bpf: Add verifier support for dynptrs")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230121002241.2113993-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                           | 88 +++++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/dynptr_fail.c |  6 +-
 2 files changed, 91 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 76afdbea425a..5c7f29ca94ec 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -769,6 +769,8 @@ static void mark_dynptr_cb_reg(struct bpf_reg_state *reg,
 	__mark_dynptr_reg(reg, type, true);
 }
 
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+				        struct bpf_func_state *state, int spi);
 
 static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 				   enum bpf_arg_type arg_type, int insn_idx)
@@ -863,6 +865,55 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
 	return 0;
 }
 
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+			       struct bpf_reg_state *reg);
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+				        struct bpf_func_state *state, int spi)
+{
+	int i;
+
+	/* We always ensure that STACK_DYNPTR is never set partially,
+	 * hence just checking for slot_type[0] is enough. This is
+	 * different for STACK_SPILL, where it may be only set for
+	 * 1 byte, so code has to use is_spilled_reg.
+	 */
+	if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
+		return 0;
+
+	/* Reposition spi to first slot */
+	if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+		spi = spi + 1;
+
+	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+		verbose(env, "cannot overwrite referenced dynptr\n");
+		return -EINVAL;
+	}
+
+	mark_stack_slot_scratched(env, spi);
+	mark_stack_slot_scratched(env, spi - 1);
+
+	/* Writing partially to one dynptr stack slot destroys both. */
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		state->stack[spi].slot_type[i] = STACK_INVALID;
+		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+	}
+
+	/* TODO: Invalidate any slices associated with this dynptr */
+
+	/* Do not release reference state, we are destroying dynptr on stack,
+	 * not using some helper to release it. Just reset register.
+	 */
+	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+	/* Same reason as unmark_stack_slots_dynptr above */
+	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+	state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+	return 0;
+}
+
 static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = func(env, reg);
@@ -3391,6 +3442,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 			env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
 	}
 
+	err = destroy_if_dynptr_stack_slot(env, state, spi);
+	if (err)
+		return err;
+
 	mark_stack_slot_scratched(env, spi);
 	if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
 	    !register_is_null(reg) && env->bpf_capable) {
@@ -3504,6 +3559,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
 	if (err)
 		return err;
 
+	for (i = min_off; i < max_off; i++) {
+		int spi;
+
+		spi = __get_spi(i);
+		err = destroy_if_dynptr_stack_slot(env, state, spi);
+		if (err)
+			return err;
+	}
 
 	/* Variable offset writes destroy any spilled pointers in range. */
 	for (i = min_off; i < max_off; i++) {
@@ -5531,6 +5594,31 @@ static int check_stack_range_initialized(
 	}
 
 	if (meta && meta->raw_mode) {
+		/* Ensure we won't be overwriting dynptrs when simulating byte
+		 * by byte access in check_helper_call using meta.access_size.
+		 * This would be a problem if we have a helper in the future
+		 * which takes:
+		 *
+		 *	helper(uninit_mem, len, dynptr)
+		 *
+		 * Now, uninint_mem may overlap with dynptr pointer. Hence, it
+		 * may end up writing to dynptr itself when touching memory from
+		 * arg 1. This can be relaxed on a case by case basis for known
+		 * safe cases, but reject due to the possibilitiy of aliasing by
+		 * default.
+		 */
+		for (i = min_off; i < max_off + access_size; i++) {
+			int stack_off = -i - 1;
+
+			spi = __get_spi(i);
+			/* raw_mode may write past allocated_stack */
+			if (state->allocated_stack <= stack_off)
+				continue;
+			if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
+				verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
+				return -EACCES;
+			}
+		}
 		meta->access_size = access_size;
 		meta->regno = regno;
 		return 0;
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index 02d57b95cf6e..9dc3f23a8270 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -420,7 +420,7 @@ int invalid_write1(void *ctx)
  * offset
  */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #3")
+__failure __msg("cannot overwrite referenced dynptr")
 int invalid_write2(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -444,7 +444,7 @@ int invalid_write2(void *ctx)
  * non-const offset
  */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #1")
+__failure __msg("cannot overwrite referenced dynptr")
 int invalid_write3(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -476,7 +476,7 @@ static int invalid_write4_callback(__u32 index, void *data)
  * be invalidated as a dynptr
  */
 SEC("?raw_tp")
-__failure __msg("arg 1 is an unacquired reference")
+__failure __msg("cannot overwrite referenced dynptr")
 int invalid_write4(void *ctx)
 {
 	struct bpf_dynptr ptr;
-- 
cgit 


From f8064ab90d6644bc8338d2d7ff6a0d6e7a1b2ef3 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 21 Jan 2023 05:52:33 +0530
Subject: bpf: Invalidate slices on destruction of dynptrs on stack

The previous commit implemented destroy_if_dynptr_stack_slot. It
destroys the dynptr which given spi belongs to, but still doesn't
invalidate the slices that belong to such a dynptr. While for the case
of referenced dynptr, we don't allow their overwrite and return an error
early, we still allow it and destroy the dynptr for unreferenced dynptr.

To be able to enable precise and scoped invalidation of dynptr slices in
this case, we must be able to associate the source dynptr of slices that
have been obtained using bpf_dynptr_data. When doing destruction, only
slices belonging to the dynptr being destructed should be invalidated,
and nothing else. Currently, dynptr slices belonging to different
dynptrs are indistinguishible.

Hence, allocate a unique id to each dynptr (CONST_PTR_TO_DYNPTR and
those on stack). This will be stored as part of reg->id. Whenever using
bpf_dynptr_data, transfer this unique dynptr id to the returned
PTR_TO_MEM_OR_NULL slice pointer, and store it in a new per-PTR_TO_MEM
dynptr_id register state member.

Finally, after establishing such a relationship between dynptrs and
their slices, implement precise invalidation logic that only invalidates
slices belong to the destroyed dynptr in destroy_if_dynptr_stack_slot.

Acked-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230121002241.2113993-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                    |  5 +-
 kernel/bpf/verifier.c                           | 74 +++++++++++++++++++++----
 tools/testing/selftests/bpf/progs/dynptr_fail.c |  4 +-
 3 files changed, 68 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 127058cfec47..aa83de1fe755 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -70,7 +70,10 @@ struct bpf_reg_state {
 			u32 btf_id;
 		};
 
-		u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */
+		struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */
+			u32 mem_size;
+			u32 dynptr_id; /* for dynptr slices */
+		};
 
 		/* For dynptr stack slots */
 		struct {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5c7f29ca94ec..01cb802776fd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -255,6 +255,7 @@ struct bpf_call_arg_meta {
 	int mem_size;
 	u64 msize_max_value;
 	int ref_obj_id;
+	int dynptr_id;
 	int map_uid;
 	int func_id;
 	struct btf *btf;
@@ -750,23 +751,27 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
 
 static void __mark_dynptr_reg(struct bpf_reg_state *reg,
 			      enum bpf_dynptr_type type,
-			      bool first_slot);
+			      bool first_slot, int dynptr_id);
 
 static void __mark_reg_not_init(const struct bpf_verifier_env *env,
 				struct bpf_reg_state *reg);
 
-static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1,
+static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *sreg1,
 				   struct bpf_reg_state *sreg2,
 				   enum bpf_dynptr_type type)
 {
-	__mark_dynptr_reg(sreg1, type, true);
-	__mark_dynptr_reg(sreg2, type, false);
+	int id = ++env->id_gen;
+
+	__mark_dynptr_reg(sreg1, type, true, id);
+	__mark_dynptr_reg(sreg2, type, false, id);
 }
 
-static void mark_dynptr_cb_reg(struct bpf_reg_state *reg,
+static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
+			       struct bpf_reg_state *reg,
 			       enum bpf_dynptr_type type)
 {
-	__mark_dynptr_reg(reg, type, true);
+	__mark_dynptr_reg(reg, type, true, ++env->id_gen);
 }
 
 static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
@@ -795,7 +800,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
 	if (type == BPF_DYNPTR_TYPE_INVALID)
 		return -EINVAL;
 
-	mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr,
+	mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
 			       &state->stack[spi - 1].spilled_ptr, type);
 
 	if (dynptr_type_refcounted(type)) {
@@ -871,7 +876,9 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
 static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
 				        struct bpf_func_state *state, int spi)
 {
-	int i;
+	struct bpf_func_state *fstate;
+	struct bpf_reg_state *dreg;
+	int i, dynptr_id;
 
 	/* We always ensure that STACK_DYNPTR is never set partially,
 	 * hence just checking for slot_type[0] is enough. This is
@@ -899,7 +906,19 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
 		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
 	}
 
-	/* TODO: Invalidate any slices associated with this dynptr */
+	dynptr_id = state->stack[spi].spilled_ptr.id;
+	/* Invalidate any slices associated with this dynptr */
+	bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
+		/* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
+		if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
+			continue;
+		if (dreg->dynptr_id == dynptr_id) {
+			if (!env->allow_ptr_leaks)
+				__mark_reg_not_init(env, dreg);
+			else
+				__mark_reg_unknown(env, dreg);
+		}
+	}));
 
 	/* Do not release reference state, we are destroying dynptr on stack,
 	 * not using some helper to release it. Just reset register.
@@ -1562,7 +1581,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
 }
 
 static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
-			      bool first_slot)
+			      bool first_slot, int dynptr_id)
 {
 	/* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
 	 * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
@@ -1570,6 +1589,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty
 	 */
 	__mark_reg_known_zero(reg);
 	reg->type = CONST_PTR_TO_DYNPTR;
+	/* Give each dynptr a unique id to uniquely associate slices to it. */
+	reg->id = dynptr_id;
 	reg->dynptr.type = type;
 	reg->dynptr.first_slot = first_slot;
 }
@@ -6532,6 +6553,19 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	}
 }
 
+static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi;
+
+	if (reg->type == CONST_PTR_TO_DYNPTR)
+		return reg->id;
+	spi = dynptr_get_spi(env, reg);
+	if (spi < 0)
+		return spi;
+	return state->stack[spi].spilled_ptr.id;
+}
+
 static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = func(env, reg);
@@ -7601,7 +7635,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
 	 * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
 	 */
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
-	mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
+	mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
 	callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
 
 	/* unused */
@@ -8107,18 +8141,31 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
 			if (arg_type_is_dynptr(fn->arg_type[i])) {
 				struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
-				int ref_obj_id;
+				int id, ref_obj_id;
+
+				if (meta.dynptr_id) {
+					verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+					return -EFAULT;
+				}
 
 				if (meta.ref_obj_id) {
 					verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
 					return -EFAULT;
 				}
 
+				id = dynptr_id(env, reg);
+				if (id < 0) {
+					verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+					return id;
+				}
+
 				ref_obj_id = dynptr_ref_obj_id(env, reg);
 				if (ref_obj_id < 0) {
 					verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
 					return ref_obj_id;
 				}
+
+				meta.dynptr_id = id;
 				meta.ref_obj_id = ref_obj_id;
 				break;
 			}
@@ -8275,6 +8322,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		return -EFAULT;
 	}
 
+	if (is_dynptr_ref_function(func_id))
+		regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+
 	if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
 		/* For release_reference() */
 		regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index 9dc3f23a8270..e43000c63c66 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -67,7 +67,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr)
  * bpf_ringbuf_submit/discard_dynptr call
  */
 SEC("?raw_tp")
-__failure __msg("Unreleased reference id=1")
+__failure __msg("Unreleased reference id=2")
 int ringbuf_missing_release1(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -80,7 +80,7 @@ int ringbuf_missing_release1(void *ctx)
 }
 
 SEC("?raw_tp")
-__failure __msg("Unreleased reference id=2")
+__failure __msg("Unreleased reference id=4")
 int ringbuf_missing_release2(void *ctx)
 {
 	struct bpf_dynptr ptr1, ptr2;
-- 
cgit 


From 91b875a5e43b3a8dec4fbdca067c8860004b5f0e Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Sat, 21 Jan 2023 05:52:37 +0530
Subject: selftests/bpf: convenience macro for use with 'asm volatile' blocks

A set of macros useful for writing naked BPF functions using inline
assembly. E.g. as follows:

struct map_struct {
	...
} map SEC(".maps");

SEC(...)
__naked int foo_test(void)
{
	asm volatile(
		"r0 = 0;"
		"*(u64*)(r10 - 8) = r0;"
		"r1 = %[map] ll;"
		"r2 = r10;"
		"r2 += -8;"
		"call %[bpf_map_lookup_elem];"
		"r0 = 0;"
		"exit;"
		:
		: __imm(bpf_map_lookup_elem),
		  __imm_addr(map)
		: __clobber_all);
}

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
[ Kartikeya: Add acks, include __clobber_common from Andrii ]
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230121002241.2113993-9-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/bpf_misc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index 4a01ea9113bf..2d7b89b447b2 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -7,6 +7,13 @@
 #define __success		__attribute__((btf_decl_tag("comment:test_expect_success")))
 #define __log_level(lvl)	__attribute__((btf_decl_tag("comment:test_log_level="#lvl)))
 
+/* Convenience macro for use with 'asm volatile' blocks */
+#define __naked __attribute__((naked))
+#define __clobber_all "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "memory"
+#define __clobber_common "r0", "r1", "r2", "r3", "r4", "r5", "memory"
+#define __imm(name) [name]"i"(name)
+#define __imm_addr(name) [name]"i"(&name)
+
 #if defined(__TARGET_ARCH_x86)
 #define SYSCALL_WRAPPER 1
 #define SYS_PREFIX "__x64_"
-- 
cgit 


From f4d24edf1b9249e43282ac2572d43d9ad10faf43 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 21 Jan 2023 05:52:38 +0530
Subject: selftests/bpf: Add dynptr pruning tests

Add verifier tests that verify the new pruning behavior for STACK_DYNPTR
slots, and ensure that state equivalence takes into account changes to
the old and current verifier state correctly. Also ensure that the
stacksafe changes are actually enabling pruning in case states are
equivalent from pruning PoV.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230121002241.2113993-10-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/dynptr_fail.c | 141 ++++++++++++++++++++++++
 1 file changed, 141 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index e43000c63c66..f1e047877279 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -35,6 +35,13 @@ struct {
 	__type(value, __u32);
 } array_map3 SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, __u64);
+} array_map4 SEC(".maps");
+
 struct sample {
 	int pid;
 	long value;
@@ -653,3 +660,137 @@ int dynptr_from_mem_invalid_api(void *ctx)
 
 	return 0;
 }
+
+SEC("?tc")
+__failure __msg("cannot overwrite referenced dynptr") __log_level(2)
+int dynptr_pruning_overwrite(struct __sk_buff *ctx)
+{
+	asm volatile (
+		"r9 = 0xeB9F;				\
+		 r6 = %[ringbuf] ll;			\
+		 r1 = r6;				\
+		 r2 = 8;				\
+		 r3 = 0;				\
+		 r4 = r10;				\
+		 r4 += -16;				\
+		 call %[bpf_ringbuf_reserve_dynptr];	\
+		 if r0 == 0 goto pjmp1;			\
+		 goto pjmp2;				\
+	pjmp1:						\
+		 *(u64 *)(r10 - 16) = r9;		\
+	pjmp2:						\
+		 r1 = r10;				\
+		 r1 += -16;				\
+		 r2 = 0;				\
+		 call %[bpf_ringbuf_discard_dynptr];	"
+		:
+		: __imm(bpf_ringbuf_reserve_dynptr),
+		  __imm(bpf_ringbuf_discard_dynptr),
+		  __imm_addr(ringbuf)
+		: __clobber_all
+	);
+	return 0;
+}
+
+SEC("?tc")
+__success __msg("12: safe") __log_level(2)
+int dynptr_pruning_stacksafe(struct __sk_buff *ctx)
+{
+	asm volatile (
+		"r9 = 0xeB9F;				\
+		 r6 = %[ringbuf] ll;			\
+		 r1 = r6;				\
+		 r2 = 8;				\
+		 r3 = 0;				\
+		 r4 = r10;				\
+		 r4 += -16;				\
+		 call %[bpf_ringbuf_reserve_dynptr];	\
+		 if r0 == 0 goto stjmp1;		\
+		 goto stjmp2;				\
+	stjmp1:						\
+		 r9 = r9;				\
+	stjmp2:						\
+		 r1 = r10;				\
+		 r1 += -16;				\
+		 r2 = 0;				\
+		 call %[bpf_ringbuf_discard_dynptr];	"
+		:
+		: __imm(bpf_ringbuf_reserve_dynptr),
+		  __imm(bpf_ringbuf_discard_dynptr),
+		  __imm_addr(ringbuf)
+		: __clobber_all
+	);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("cannot overwrite referenced dynptr") __log_level(2)
+int dynptr_pruning_type_confusion(struct __sk_buff *ctx)
+{
+	asm volatile (
+		"r6 = %[array_map4] ll;			\
+		 r7 = %[ringbuf] ll;			\
+		 r1 = r6;				\
+		 r2 = r10;				\
+		 r2 += -8;				\
+		 r9 = 0;				\
+		 *(u64 *)(r2 + 0) = r9;			\
+		 r3 = r10;				\
+		 r3 += -24;				\
+		 r9 = 0xeB9FeB9F;			\
+		 *(u64 *)(r10 - 16) = r9;		\
+		 *(u64 *)(r10 - 24) = r9;		\
+		 r9 = 0;				\
+		 r4 = 0;				\
+		 r8 = r2;				\
+		 call %[bpf_map_update_elem];		\
+		 r1 = r6;				\
+		 r2 = r8;				\
+		 call %[bpf_map_lookup_elem];		\
+		 if r0 != 0 goto tjmp1;			\
+		 exit;					\
+	tjmp1:						\
+		 r8 = r0;				\
+		 r1 = r7;				\
+		 r2 = 8;				\
+		 r3 = 0;				\
+		 r4 = r10;				\
+		 r4 += -16;				\
+		 r0 = *(u64 *)(r0 + 0);			\
+		 call %[bpf_ringbuf_reserve_dynptr];	\
+		 if r0 == 0 goto tjmp2;			\
+		 r8 = r8;				\
+		 r8 = r8;				\
+		 r8 = r8;				\
+		 r8 = r8;				\
+		 r8 = r8;				\
+		 r8 = r8;				\
+		 r8 = r8;				\
+		 goto tjmp3;				\
+	tjmp2:						\
+		 *(u64 *)(r10 - 8) = r9;		\
+		 *(u64 *)(r10 - 16) = r9;		\
+		 r1 = r8;				\
+		 r1 += 8;				\
+		 r2 = 0;				\
+		 r3 = 0;				\
+		 r4 = r10;				\
+		 r4 += -16;				\
+		 call %[bpf_dynptr_from_mem];		\
+	tjmp3:						\
+		 r1 = r10;				\
+		 r1 += -16;				\
+		 r2 = 0;				\
+		 call %[bpf_ringbuf_discard_dynptr];	"
+		:
+		: __imm(bpf_map_update_elem),
+		  __imm(bpf_map_lookup_elem),
+		  __imm(bpf_ringbuf_reserve_dynptr),
+		  __imm(bpf_dynptr_from_mem),
+		  __imm(bpf_ringbuf_discard_dynptr),
+		  __imm_addr(array_map4),
+		  __imm_addr(ringbuf)
+		: __clobber_all
+	);
+	return 0;
+}
-- 
cgit 


From ef4810135396735c1a6b1c343c3cc4fe4be96a43 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 21 Jan 2023 05:52:39 +0530
Subject: selftests/bpf: Add dynptr var_off tests

Ensure that variable offset is handled correctly, and verifier takes
both fixed and variable part into account. Also ensures that only
constant var_off is allowed.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230121002241.2113993-11-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/dynptr_fail.c | 40 +++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index f1e047877279..2d899f2bebb0 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -794,3 +794,43 @@ int dynptr_pruning_type_confusion(struct __sk_buff *ctx)
 	);
 	return 0;
 }
+
+SEC("?tc")
+__failure __msg("dynptr has to be at a constant offset") __log_level(2)
+int dynptr_var_off_overwrite(struct __sk_buff *ctx)
+{
+	asm volatile (
+		"r9 = 16;				\
+		 *(u32 *)(r10 - 4) = r9;		\
+		 r8 = *(u32 *)(r10 - 4);		\
+		 if r8 >= 0 goto vjmp1;			\
+		 r0 = 1;				\
+		 exit;					\
+	vjmp1:						\
+		 if r8 <= 16 goto vjmp2;		\
+		 r0 = 1;				\
+		 exit;					\
+	vjmp2:						\
+		 r8 &= 16;				\
+		 r1 = %[ringbuf] ll;			\
+		 r2 = 8;				\
+		 r3 = 0;				\
+		 r4 = r10;				\
+		 r4 += -32;				\
+		 r4 += r8;				\
+		 call %[bpf_ringbuf_reserve_dynptr];	\
+		 r9 = 0xeB9F;				\
+		 *(u64 *)(r10 - 16) = r9;		\
+		 r1 = r10;				\
+		 r1 += -32;				\
+		 r1 += r8;				\
+		 r2 = 0;				\
+		 call %[bpf_ringbuf_discard_dynptr];	"
+		:
+		: __imm(bpf_ringbuf_reserve_dynptr),
+		  __imm(bpf_ringbuf_discard_dynptr),
+		  __imm_addr(ringbuf)
+		: __clobber_all
+	);
+	return 0;
+}
-- 
cgit 


From 011edc8e49b8551dfb6cfcc8601d05e029cf5994 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 21 Jan 2023 05:52:40 +0530
Subject: selftests/bpf: Add dynptr partial slot overwrite tests

Try creating a dynptr, then overwriting second slot with first slot of
another dynptr. Then, the first slot of first dynptr should also be
invalidated, but without our fix that does not happen. As a consequence,
the unfixed case allows passing first dynptr (as the kernel check only
checks for slot_type and then first_slot == true).

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230121002241.2113993-12-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/dynptr_fail.c | 66 +++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index 2d899f2bebb0..1cbec5468879 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -834,3 +834,69 @@ int dynptr_var_off_overwrite(struct __sk_buff *ctx)
 	);
 	return 0;
 }
+
+SEC("?tc")
+__failure __msg("cannot overwrite referenced dynptr") __log_level(2)
+int dynptr_partial_slot_invalidate(struct __sk_buff *ctx)
+{
+	asm volatile (
+		"r6 = %[ringbuf] ll;			\
+		 r7 = %[array_map4] ll;			\
+		 r1 = r7;				\
+		 r2 = r10;				\
+		 r2 += -8;				\
+		 r9 = 0;				\
+		 *(u64 *)(r2 + 0) = r9;			\
+		 r3 = r2;				\
+		 r4 = 0;				\
+		 r8 = r2;				\
+		 call %[bpf_map_update_elem];		\
+		 r1 = r7;				\
+		 r2 = r8;				\
+		 call %[bpf_map_lookup_elem];		\
+		 if r0 != 0 goto sjmp1;			\
+		 exit;					\
+	sjmp1:						\
+		 r7 = r0;				\
+		 r1 = r6;				\
+		 r2 = 8;				\
+		 r3 = 0;				\
+		 r4 = r10;				\
+		 r4 += -24;				\
+		 call %[bpf_ringbuf_reserve_dynptr];	\
+		 *(u64 *)(r10 - 16) = r9;		\
+		 r1 = r7;				\
+		 r2 = 8;				\
+		 r3 = 0;				\
+		 r4 = r10;				\
+		 r4 += -16;				\
+		 call %[bpf_dynptr_from_mem];		\
+		 r1 = r10;				\
+		 r1 += -512;				\
+		 r2 = 488;				\
+		 r3 = r10;				\
+		 r3 += -24;				\
+		 r4 = 0;				\
+		 r5 = 0;				\
+		 call %[bpf_dynptr_read];		\
+		 r8 = 1;				\
+		 if r0 != 0 goto sjmp2;			\
+		 r8 = 0;				\
+	sjmp2:						\
+		 r1 = r10;				\
+		 r1 += -24;				\
+		 r2 = 0;				\
+		 call %[bpf_ringbuf_discard_dynptr];	"
+		:
+		: __imm(bpf_map_update_elem),
+		  __imm(bpf_map_lookup_elem),
+		  __imm(bpf_ringbuf_reserve_dynptr),
+		  __imm(bpf_ringbuf_discard_dynptr),
+		  __imm(bpf_dynptr_from_mem),
+		  __imm(bpf_dynptr_read),
+		  __imm_addr(ringbuf),
+		  __imm_addr(array_map4)
+		: __clobber_all
+	);
+	return 0;
+}
-- 
cgit 


From ae8e354c497af625eaecd3d86e04f9087762d42b Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 21 Jan 2023 05:52:41 +0530
Subject: selftests/bpf: Add dynptr helper tests

First test that we allow overwriting dynptr slots and reinitializing
them in unreferenced case, and disallow overwriting for referenced case.
Include tests to ensure slices obtained from destroyed dynptrs are being
invalidated on their destruction. The destruction needs to be scoped, as
in slices of dynptr A should not be invalidated when dynptr B is
destroyed. Next, test that MEM_UNINIT doesn't allow writing dynptr stack
slots.

Acked-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230121002241.2113993-13-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/dynptr_fail.c | 192 ++++++++++++++++++++++++
 1 file changed, 192 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index 1cbec5468879..5950ad6ec2e6 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -900,3 +900,195 @@ int dynptr_partial_slot_invalidate(struct __sk_buff *ctx)
 	);
 	return 0;
 }
+
+/* Test that it is allowed to overwrite unreferenced dynptr. */
+SEC("?raw_tp")
+__success
+int dynptr_overwrite_unref(void *ctx)
+{
+	struct bpf_dynptr ptr;
+
+	if (get_map_val_dynptr(&ptr))
+		return 0;
+	if (get_map_val_dynptr(&ptr))
+		return 0;
+	if (get_map_val_dynptr(&ptr))
+		return 0;
+
+	return 0;
+}
+
+/* Test that slices are invalidated on reinitializing a dynptr. */
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
+int dynptr_invalidate_slice_reinit(void *ctx)
+{
+	struct bpf_dynptr ptr;
+	__u8 *p;
+
+	if (get_map_val_dynptr(&ptr))
+		return 0;
+	p = bpf_dynptr_data(&ptr, 0, 1);
+	if (!p)
+		return 0;
+	if (get_map_val_dynptr(&ptr))
+		return 0;
+	/* this should fail */
+	return *p;
+}
+
+/* Invalidation of dynptr slices on destruction of dynptr should not miss
+ * mem_or_null pointers.
+ */
+SEC("?raw_tp")
+__failure __msg("R1 type=scalar expected=percpu_ptr_")
+int dynptr_invalidate_slice_or_null(void *ctx)
+{
+	struct bpf_dynptr ptr;
+	__u8 *p;
+
+	if (get_map_val_dynptr(&ptr))
+		return 0;
+
+	p = bpf_dynptr_data(&ptr, 0, 1);
+	*(__u8 *)&ptr = 0;
+	/* this should fail */
+	bpf_this_cpu_ptr(p);
+	return 0;
+}
+
+/* Destruction of dynptr should also any slices obtained from it */
+SEC("?raw_tp")
+__failure __msg("R7 invalid mem access 'scalar'")
+int dynptr_invalidate_slice_failure(void *ctx)
+{
+	struct bpf_dynptr ptr1;
+	struct bpf_dynptr ptr2;
+	__u8 *p1, *p2;
+
+	if (get_map_val_dynptr(&ptr1))
+		return 0;
+	if (get_map_val_dynptr(&ptr2))
+		return 0;
+
+	p1 = bpf_dynptr_data(&ptr1, 0, 1);
+	if (!p1)
+		return 0;
+	p2 = bpf_dynptr_data(&ptr2, 0, 1);
+	if (!p2)
+		return 0;
+
+	*(__u8 *)&ptr1 = 0;
+	/* this should fail */
+	return *p1;
+}
+
+/* Invalidation of slices should be scoped and should not prevent dereferencing
+ * slices of another dynptr after destroying unrelated dynptr
+ */
+SEC("?raw_tp")
+__success
+int dynptr_invalidate_slice_success(void *ctx)
+{
+	struct bpf_dynptr ptr1;
+	struct bpf_dynptr ptr2;
+	__u8 *p1, *p2;
+
+	if (get_map_val_dynptr(&ptr1))
+		return 1;
+	if (get_map_val_dynptr(&ptr2))
+		return 1;
+
+	p1 = bpf_dynptr_data(&ptr1, 0, 1);
+	if (!p1)
+		return 1;
+	p2 = bpf_dynptr_data(&ptr2, 0, 1);
+	if (!p2)
+		return 1;
+
+	*(__u8 *)&ptr1 = 0;
+	return *p2;
+}
+
+/* Overwriting referenced dynptr should be rejected */
+SEC("?raw_tp")
+__failure __msg("cannot overwrite referenced dynptr")
+int dynptr_overwrite_ref(void *ctx)
+{
+	struct bpf_dynptr ptr;
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &ptr);
+	/* this should fail */
+	if (get_map_val_dynptr(&ptr))
+		bpf_ringbuf_discard_dynptr(&ptr, 0);
+	return 0;
+}
+
+/* Reject writes to dynptr slot from bpf_dynptr_read */
+SEC("?raw_tp")
+__failure __msg("potential write to dynptr at off=-16")
+int dynptr_read_into_slot(void *ctx)
+{
+	union {
+		struct {
+			char _pad[48];
+			struct bpf_dynptr ptr;
+		};
+		char buf[64];
+	} data;
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &data.ptr);
+	/* this should fail */
+	bpf_dynptr_read(data.buf, sizeof(data.buf), &data.ptr, 0, 0);
+
+	return 0;
+}
+
+/* Reject writes to dynptr slot for uninit arg */
+SEC("?raw_tp")
+__failure __msg("potential write to dynptr at off=-16")
+int uninit_write_into_slot(void *ctx)
+{
+	struct {
+		char buf[64];
+		struct bpf_dynptr ptr;
+	} data;
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, 80, 0, &data.ptr);
+	/* this should fail */
+	bpf_get_current_comm(data.buf, 80);
+
+	return 0;
+}
+
+static int callback(__u32 index, void *data)
+{
+        *(__u32 *)data = 123;
+
+        return 0;
+}
+
+/* If the dynptr is written into in a callback function, its data
+ * slices should be invalidated as well.
+ */
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
+int invalid_data_slices(void *ctx)
+{
+	struct bpf_dynptr ptr;
+	__u32 *slice;
+
+	if (get_map_val_dynptr(&ptr))
+		return 0;
+
+	slice = bpf_dynptr_data(&ptr, 0, sizeof(__u32));
+	if (!slice)
+		return 0;
+
+	bpf_loop(10, callback, &ptr, 0);
+
+	/* this should fail */
+	*slice = 1;
+
+	return 0;
+}
-- 
cgit 


From 03d7a1053cf72372be22b43faada5bca12ff183d Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Wed, 18 Jan 2023 11:52:15 +0100
Subject: objtool: Check that module init/exit function is an indirect call
 target

Some out-of-tree modules still do not use module_init() / module_exit()
macros and simply create functions with magic names init_module() and
cleanup_module() instead. As a result, these functions are not recognized
as indirect call targets by objtool and such module fails to load into an
IBT enabled kernel.

This old way is not even documented any more but it is cleaner to issue
a warning than to let the module fail on load without obvious reason.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230118105215.B9DA960514@lion.mk-sys.cz
---
 tools/objtool/Documentation/objtool.txt | 8 ++++++++
 tools/objtool/check.c                   | 7 +++++++
 2 files changed, 15 insertions(+)

(limited to 'tools')

diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt
index 8a671902a187..8e53fc6735ef 100644
--- a/tools/objtool/Documentation/objtool.txt
+++ b/tools/objtool/Documentation/objtool.txt
@@ -410,6 +410,14 @@ the objtool maintainers.
    can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL
    directive right before the call.
 
+12. file.o: warning: func(): not an indirect call target
+
+   This means that objtool is running with --ibt and a function expected
+   to be an indirect call target is not. In particular, this happens for
+   init_module() or cleanup_module() if a module relies on these special
+   names and does not use module_init() / module_exit() macros to create
+   them.
+
 
 If the error doesn't seem to make sense, it could be a bug in objtool.
 Feel free to ask the objtool maintainer for help.
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index cab1a162781c..7c40bd51c75a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -847,8 +847,15 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file)
 	list_for_each_entry(insn, &file->endbr_list, call_node) {
 
 		int *site = (int *)sec->data->d_buf + idx;
+		struct symbol *sym = insn->sym;
 		*site = 0;
 
+		if (opts.module && sym && sym->type == STT_FUNC &&
+		    insn->offset == sym->offset &&
+		    (!strcmp(sym->name, "init_module") ||
+		     !strcmp(sym->name, "cleanup_module")))
+			WARN("%s(): not an indirect call target", sym->name);
+
 		if (elf_add_reloc_to_insn(file->elf, sec,
 					  idx * sizeof(int),
 					  R_X86_64_PC32,
-- 
cgit 


From f2edf0c819a4823cd6c288801ce737e8d4fcde06 Mon Sep 17 00:00:00 2001
From: Yulong Zhang <yulong.zhang@metoak.net>
Date: Tue, 17 Jan 2023 10:51:47 +0800
Subject: tools/iio/iio_utils:fix memory leak

1. fopen sysfs without fclose.
2. asprintf filename without free.
3. if asprintf return error,do not need to free the buffer.

Signed-off-by: Yulong Zhang <yulong.zhang@metoak.net>
Link: https://lore.kernel.org/r/20230117025147.69890-1-yulong.zhang@metoak.net
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 tools/iio/iio_utils.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/iio/iio_utils.c b/tools/iio/iio_utils.c
index 8d35893b2fa8..6a00a6eecaef 100644
--- a/tools/iio/iio_utils.c
+++ b/tools/iio/iio_utils.c
@@ -264,6 +264,7 @@ int iioutils_get_param_float(float *output, const char *param_name,
 			if (fscanf(sysfsfp, "%f", output) != 1)
 				ret = errno ? -errno : -ENODATA;
 
+			fclose(sysfsfp);
 			break;
 		}
 error_free_filename:
@@ -345,9 +346,9 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 			}
 
 			sysfsfp = fopen(filename, "r");
+			free(filename);
 			if (!sysfsfp) {
 				ret = -errno;
-				free(filename);
 				goto error_close_dir;
 			}
 
@@ -357,7 +358,6 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 				if (fclose(sysfsfp))
 					perror("build_channel_array(): Failed to close file");
 
-				free(filename);
 				goto error_close_dir;
 			}
 			if (ret == 1)
@@ -365,11 +365,9 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 
 			if (fclose(sysfsfp)) {
 				ret = -errno;
-				free(filename);
 				goto error_close_dir;
 			}
 
-			free(filename);
 		}
 
 	*ci_array = malloc(sizeof(**ci_array) * (*counter));
@@ -395,9 +393,9 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 			}
 
 			sysfsfp = fopen(filename, "r");
+			free(filename);
 			if (!sysfsfp) {
 				ret = -errno;
-				free(filename);
 				count--;
 				goto error_cleanup_array;
 			}
@@ -405,20 +403,17 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 			errno = 0;
 			if (fscanf(sysfsfp, "%i", &current_enabled) != 1) {
 				ret = errno ? -errno : -ENODATA;
-				free(filename);
 				count--;
 				goto error_cleanup_array;
 			}
 
 			if (fclose(sysfsfp)) {
 				ret = -errno;
-				free(filename);
 				count--;
 				goto error_cleanup_array;
 			}
 
 			if (!current_enabled) {
-				free(filename);
 				count--;
 				continue;
 			}
@@ -429,7 +424,6 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 						strlen(ent->d_name) -
 						strlen("_en"));
 			if (!current->name) {
-				free(filename);
 				ret = -ENOMEM;
 				count--;
 				goto error_cleanup_array;
@@ -439,7 +433,6 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 			ret = iioutils_break_up_name(current->name,
 						     &current->generic_name);
 			if (ret) {
-				free(filename);
 				free(current->name);
 				count--;
 				goto error_cleanup_array;
@@ -450,17 +443,16 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 				       scan_el_dir,
 				       current->name);
 			if (ret < 0) {
-				free(filename);
 				ret = -ENOMEM;
 				goto error_cleanup_array;
 			}
 
 			sysfsfp = fopen(filename, "r");
+			free(filename);
 			if (!sysfsfp) {
 				ret = -errno;
-				fprintf(stderr, "failed to open %s\n",
-					filename);
-				free(filename);
+				fprintf(stderr, "failed to open %s/%s_index\n",
+					scan_el_dir, current->name);
 				goto error_cleanup_array;
 			}
 
@@ -470,17 +462,14 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 				if (fclose(sysfsfp))
 					perror("build_channel_array(): Failed to close file");
 
-				free(filename);
 				goto error_cleanup_array;
 			}
 
 			if (fclose(sysfsfp)) {
 				ret = -errno;
-				free(filename);
 				goto error_cleanup_array;
 			}
 
-			free(filename);
 			/* Find the scale */
 			ret = iioutils_get_param_float(&current->scale,
 						       "scale",
-- 
cgit 


From 2b3486bc2d237ec345b3942b7be5deabf8c8fed1 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 19 Jan 2023 14:15:24 -0800
Subject: bpf: Introduce device-bound XDP programs

New flag BPF_F_XDP_DEV_BOUND_ONLY plus all the infra to have a way
to associate a netdev with a BPF program at load time.

netdevsim checks are dropped in favor of generic check in dev_xdp_attach.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Anatoly Burakov <anatoly.burakov@intel.com>
Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
Cc: Maryam Tahhan <mtahhan@redhat.com>
Cc: xdp-hints@xdp-project.net
Cc: netdev@vger.kernel.org
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230119221536.3349901-6-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 drivers/net/netdevsim/bpf.c    |  4 --
 include/linux/bpf.h            | 24 +++++++++--
 include/uapi/linux/bpf.h       |  5 +++
 kernel/bpf/core.c              |  4 +-
 kernel/bpf/offload.c           | 95 +++++++++++++++++++++++++++++++-----------
 kernel/bpf/syscall.c           |  9 ++--
 net/core/dev.c                 |  5 +++
 tools/include/uapi/linux/bpf.h |  5 +++
 8 files changed, 113 insertions(+), 38 deletions(-)

(limited to 'tools')

diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 50854265864d..f60eb97e3a62 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -315,10 +315,6 @@ nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf)
 		NSIM_EA(bpf->extack, "xdpoffload of non-bound program");
 		return -EINVAL;
 	}
-	if (!bpf_offload_dev_match(bpf->prog, ns->netdev)) {
-		NSIM_EA(bpf->extack, "program bound to different dev");
-		return -EINVAL;
-	}
 
 	state = bpf->prog->aux->offload->dev_priv;
 	if (WARN_ON(strcmp(state->state, "xlated"))) {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1bb525c0130e..b97a05bb47be 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1261,7 +1261,8 @@ struct bpf_prog_aux {
 	enum bpf_prog_type saved_dst_prog_type;
 	enum bpf_attach_type saved_dst_attach_type;
 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
-	bool offload_requested;
+	bool dev_bound; /* Program is bound to the netdev. */
+	bool offload_requested; /* Program is bound and offloaded to the netdev. */
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
 	bool func_proto_unreliable;
 	bool sleepable;
@@ -2451,7 +2452,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
 bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool);
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
-void bpf_prog_offload_destroy(struct bpf_prog *prog);
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog);
 int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 			       struct bpf_prog *prog);
 
@@ -2479,7 +2480,13 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev);
 void unpriv_ebpf_notify(int new_state);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr);
+void bpf_dev_bound_netdev_unregister(struct net_device *dev);
+
+static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux)
+{
+	return aux->dev_bound;
+}
 
 static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux)
 {
@@ -2507,12 +2514,21 @@ void sock_map_unhash(struct sock *sk);
 void sock_map_destroy(struct sock *sk);
 void sock_map_close(struct sock *sk, long timeout);
 #else
-static inline int bpf_prog_offload_init(struct bpf_prog *prog,
+static inline int bpf_prog_dev_bound_init(struct bpf_prog *prog,
 					union bpf_attr *attr)
 {
 	return -EOPNOTSUPP;
 }
 
+static inline void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+}
+
+static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux)
+{
+	return false;
+}
+
 static inline bool bpf_prog_is_offloaded(struct bpf_prog_aux *aux)
 {
 	return false;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index adae5b168f9d..ba0f0cfb5e42 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1156,6 +1156,11 @@ enum bpf_link_type {
  */
 #define BPF_F_XDP_HAS_FRAGS	(1U << 5)
 
+/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded
+ * program becomes device-bound but can access XDP metadata.
+ */
+#define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
+
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
  */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 515f4f08591c..1cf19da3c128 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2553,8 +2553,8 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 #endif
 	bpf_free_used_maps(aux);
 	bpf_free_used_btfs(aux);
-	if (bpf_prog_is_offloaded(aux))
-		bpf_prog_offload_destroy(aux->prog);
+	if (bpf_prog_is_dev_bound(aux))
+		bpf_prog_dev_bound_destroy(aux->prog);
 #ifdef CONFIG_PERF_EVENTS
 	if (aux->prog->has_callchain_buf)
 		put_callchain_buffers();
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index deb06498da0b..f767455ed732 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -41,7 +41,7 @@ struct bpf_offload_dev {
 struct bpf_offload_netdev {
 	struct rhash_head l;
 	struct net_device *netdev;
-	struct bpf_offload_dev *offdev;
+	struct bpf_offload_dev *offdev; /* NULL when bound-only */
 	struct list_head progs;
 	struct list_head maps;
 	struct list_head offdev_netdevs;
@@ -89,19 +89,17 @@ static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
 	INIT_LIST_HEAD(&ondev->progs);
 	INIT_LIST_HEAD(&ondev->maps);
 
-	down_write(&bpf_devs_lock);
 	err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
 	if (err) {
 		netdev_warn(netdev, "failed to register for BPF offload\n");
-		goto err_unlock_free;
+		goto err_free;
 	}
 
-	list_add(&ondev->offdev_netdevs, &offdev->netdevs);
-	up_write(&bpf_devs_lock);
+	if (offdev)
+		list_add(&ondev->offdev_netdevs, &offdev->netdevs);
 	return 0;
 
-err_unlock_free:
-	up_write(&bpf_devs_lock);
+err_free:
 	kfree(ondev);
 	return err;
 }
@@ -149,24 +147,26 @@ static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
 static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
 						struct net_device *netdev)
 {
-	struct bpf_offload_netdev *ondev, *altdev;
+	struct bpf_offload_netdev *ondev, *altdev = NULL;
 	struct bpf_offloaded_map *offmap, *mtmp;
 	struct bpf_prog_offload *offload, *ptmp;
 
 	ASSERT_RTNL();
 
-	down_write(&bpf_devs_lock);
 	ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
 	if (WARN_ON(!ondev))
-		goto unlock;
+		return;
 
 	WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
-	list_del(&ondev->offdev_netdevs);
 
 	/* Try to move the objects to another netdev of the device */
-	altdev = list_first_entry_or_null(&offdev->netdevs,
-					  struct bpf_offload_netdev,
-					  offdev_netdevs);
+	if (offdev) {
+		list_del(&ondev->offdev_netdevs);
+		altdev = list_first_entry_or_null(&offdev->netdevs,
+						  struct bpf_offload_netdev,
+						  offdev_netdevs);
+	}
+
 	if (altdev) {
 		list_for_each_entry(offload, &ondev->progs, offloads)
 			offload->netdev = altdev->netdev;
@@ -185,11 +185,9 @@ static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
 	WARN_ON(!list_empty(&ondev->progs));
 	WARN_ON(!list_empty(&ondev->maps));
 	kfree(ondev);
-unlock:
-	up_write(&bpf_devs_lock);
 }
 
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
 {
 	struct bpf_offload_netdev *ondev;
 	struct bpf_prog_offload *offload;
@@ -199,7 +197,11 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 	    attr->prog_type != BPF_PROG_TYPE_XDP)
 		return -EINVAL;
 
-	if (attr->prog_flags)
+	if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY)
+		return -EINVAL;
+
+	if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
+	    attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
 		return -EINVAL;
 
 	offload = kzalloc(sizeof(*offload), GFP_USER);
@@ -214,11 +216,23 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 	if (err)
 		goto err_maybe_put;
 
+	prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY);
+
 	down_write(&bpf_devs_lock);
 	ondev = bpf_offload_find_netdev(offload->netdev);
 	if (!ondev) {
-		err = -EINVAL;
-		goto err_unlock;
+		if (bpf_prog_is_offloaded(prog->aux)) {
+			err = -EINVAL;
+			goto err_unlock;
+		}
+
+		/* When only binding to the device, explicitly
+		 * create an entry in the hashtable.
+		 */
+		err = __bpf_offload_dev_netdev_register(NULL, offload->netdev);
+		if (err)
+			goto err_unlock;
+		ondev = bpf_offload_find_netdev(offload->netdev);
 	}
 	offload->offdev = ondev->offdev;
 	prog->aux->offload = offload;
@@ -321,12 +335,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
 	up_read(&bpf_devs_lock);
 }
 
-void bpf_prog_offload_destroy(struct bpf_prog *prog)
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)
 {
+	struct bpf_offload_netdev *ondev;
+	struct net_device *netdev;
+
+	rtnl_lock();
 	down_write(&bpf_devs_lock);
-	if (prog->aux->offload)
+	if (prog->aux->offload) {
+		list_del_init(&prog->aux->offload->offloads);
+
+		netdev = prog->aux->offload->netdev;
 		__bpf_prog_offload_destroy(prog);
+
+		ondev = bpf_offload_find_netdev(netdev);
+		if (!ondev->offdev && list_empty(&ondev->progs))
+			__bpf_offload_dev_netdev_unregister(NULL, netdev);
+	}
 	up_write(&bpf_devs_lock);
+	rtnl_unlock();
 }
 
 static int bpf_prog_offload_translate(struct bpf_prog *prog)
@@ -621,7 +648,7 @@ static bool __bpf_offload_dev_match(struct bpf_prog *prog,
 	struct bpf_offload_netdev *ondev1, *ondev2;
 	struct bpf_prog_offload *offload;
 
-	if (!bpf_prog_is_offloaded(prog->aux))
+	if (!bpf_prog_is_dev_bound(prog->aux))
 		return false;
 
 	offload = prog->aux->offload;
@@ -667,14 +694,21 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
 int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
 				    struct net_device *netdev)
 {
-	return __bpf_offload_dev_netdev_register(offdev, netdev);
+	int err;
+
+	down_write(&bpf_devs_lock);
+	err = __bpf_offload_dev_netdev_register(offdev, netdev);
+	up_write(&bpf_devs_lock);
+	return err;
 }
 EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
 
 void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
 				       struct net_device *netdev)
 {
+	down_write(&bpf_devs_lock);
 	__bpf_offload_dev_netdev_unregister(offdev, netdev);
+	up_write(&bpf_devs_lock);
 }
 EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
 
@@ -708,6 +742,19 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
 }
 EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
 
+void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+	struct bpf_offload_netdev *ondev;
+
+	ASSERT_RTNL();
+
+	down_write(&bpf_devs_lock);
+	ondev = bpf_offload_find_netdev(dev);
+	if (ondev && !ondev->offdev)
+		__bpf_offload_dev_netdev_unregister(NULL, ondev->netdev);
+	up_write(&bpf_devs_lock);
+}
+
 static int __init bpf_offload_init(void)
 {
 	return rhashtable_init(&offdevs, &offdevs_params);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5e90b697f908..fdf4ff3d5a7f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2491,7 +2491,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 				 BPF_F_TEST_STATE_FREQ |
 				 BPF_F_SLEEPABLE |
 				 BPF_F_TEST_RND_HI32 |
-				 BPF_F_XDP_HAS_FRAGS))
+				 BPF_F_XDP_HAS_FRAGS |
+				 BPF_F_XDP_DEV_BOUND_ONLY))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2575,7 +2576,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 	prog->aux->attach_btf = attach_btf;
 	prog->aux->attach_btf_id = attr->attach_btf_id;
 	prog->aux->dst_prog = dst_prog;
-	prog->aux->offload_requested = !!attr->prog_ifindex;
+	prog->aux->dev_bound = !!attr->prog_ifindex;
 	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
@@ -2598,8 +2599,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 	atomic64_set(&prog->aux->refcnt, 1);
 	prog->gpl_compatible = is_gpl ? 1 : 0;
 
-	if (bpf_prog_is_offloaded(prog->aux)) {
-		err = bpf_prog_offload_init(prog, attr);
+	if (bpf_prog_is_dev_bound(prog->aux)) {
+		err = bpf_prog_dev_bound_init(prog, attr);
 		if (err)
 			goto free_prog_sec;
 	}
diff --git a/net/core/dev.c b/net/core/dev.c
index a37829de6529..e66da626df84 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9228,6 +9228,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 			NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
 			return -EINVAL;
 		}
+		if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
+			NL_SET_ERR_MSG(extack, "Program bound to different device");
+			return -EINVAL;
+		}
 		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
 			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 			return -EINVAL;
@@ -10830,6 +10834,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
 		dev_shutdown(dev);
 
 		dev_xdp_uninstall(dev);
+		bpf_dev_bound_netdev_unregister(dev);
 
 		netdev_offload_xstats_disable_all(dev);
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 142b81bcbb2e..7f024ac22edd 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1156,6 +1156,11 @@ enum bpf_link_type {
  */
 #define BPF_F_XDP_HAS_FRAGS	(1U << 5)
 
+/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded
+ * program becomes device-bound but can access XDP metadata.
+ */
+#define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
+
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
  */
-- 
cgit 


From 40535704624e980a560fd68635d98c2984ded572 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 19 Jan 2023 14:15:25 -0800
Subject: selftests/bpf: Update expected test_offload.py messages

Generic check has a different error message, update the selftest.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Anatoly Burakov <anatoly.burakov@intel.com>
Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
Cc: Maryam Tahhan <mtahhan@redhat.com>
Cc: xdp-hints@xdp-project.net
Cc: netdev@vger.kernel.org
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230119221536.3349901-7-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/test_offload.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
index 7cb1bc05e5cf..40cba8d368d9 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -1039,7 +1039,7 @@ try:
     offload = bpf_pinned("/sys/fs/bpf/offload")
     ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True)
     fail(ret == 0, "attached offloaded XDP program to drv")
-    check_extack(err, "Using device-bound program without HW_MODE flag is not supported.", args)
+    check_extack(err, "Using offloaded program without HW_MODE flag is not supported.", args)
     rm("/sys/fs/bpf/offload")
     sim.wait_for_flush()
 
@@ -1088,12 +1088,12 @@ try:
     ret, _, err = sim.set_xdp(pinned, "offload",
                               fail=False, include_stderr=True)
     fail(ret == 0, "Pinned program loaded for a different device accepted")
-    check_extack_nsim(err, "program bound to different dev.", args)
+    check_extack(err, "Program bound to different device.", args)
     simdev2.remove()
     ret, _, err = sim.set_xdp(pinned, "offload",
                               fail=False, include_stderr=True)
     fail(ret == 0, "Pinned program loaded for a removed device accepted")
-    check_extack_nsim(err, "xdpoffload of non-bound program.", args)
+    check_extack(err, "Program bound to different device.", args)
     rm(pin_file)
     bpftool_prog_list_wait(expected=0)
 
@@ -1334,12 +1334,12 @@ try:
     ret, _, err = simA.set_xdp(progB, "offload", force=True, JSON=False,
                                fail=False, include_stderr=True)
     fail(ret == 0, "cross-ASIC program allowed")
-    check_extack_nsim(err, "program bound to different dev.", args)
+    check_extack(err, "Program bound to different device.", args)
     for d in simdevB.nsims:
         ret, _, err = d.set_xdp(progA, "offload", force=True, JSON=False,
                                 fail=False, include_stderr=True)
         fail(ret == 0, "cross-ASIC program allowed")
-        check_extack_nsim(err, "program bound to different dev.", args)
+        check_extack(err, "Program bound to different device.", args)
 
     start_test("Test multi-dev ASIC cross-dev map reuse...")
 
-- 
cgit 


From e2a46d54d7a100c79c9d645184db9ce594c8f5f6 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 19 Jan 2023 14:15:30 -0800
Subject: selftests/bpf: Verify xdp_metadata xdp->af_xdp path

- create new netns
- create veth pair (veTX+veRX)
- setup AF_XDP socket for both interfaces
- attach bpf to veRX
- send packet via veTX
- verify the packet has expected metadata at veRX

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Anatoly Burakov <anatoly.burakov@intel.com>
Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
Cc: Maryam Tahhan <mtahhan@redhat.com>
Cc: xdp-hints@xdp-project.net
Cc: netdev@vger.kernel.org
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230119221536.3349901-12-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x         |   1 +
 tools/testing/selftests/bpf/Makefile               |   2 +-
 .../selftests/bpf/prog_tests/xdp_metadata.c        | 410 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/xdp_metadata.c   |  64 ++++
 tools/testing/selftests/bpf/progs/xdp_metadata2.c  |  23 ++
 tools/testing/selftests/bpf/xdp_metadata.h         |  15 +
 6 files changed, 514 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
 create mode 100644 tools/testing/selftests/bpf/progs/xdp_metadata.c
 create mode 100644 tools/testing/selftests/bpf/progs/xdp_metadata2.c
 create mode 100644 tools/testing/selftests/bpf/xdp_metadata.h

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 96e8371f5c2a..b9ef9cc61d36 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -86,5 +86,6 @@ xdp_adjust_tail                          # case-128 err 0 errno 28 retval 1 size
 xdp_bonding                              # failed to auto-attach program 'trace_on_entry': -524                        (trampoline)
 xdp_bpf2bpf                              # failed to auto-attach program 'trace_on_entry': -524                        (trampoline)
 xdp_do_redirect                          # prog_run_max_size unexpected error: -22 (errno 22)
+xdp_metadata                             # JIT does not support calling kernel function                                (kfunc)
 xdp_synproxy                             # JIT does not support calling kernel function                                (kfunc)
 xfrm_info                                # JIT does not support calling kernel function                                (kfunc)
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 22533a18705e..e09bef2b7502 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -527,7 +527,7 @@ TRUNNER_BPF_PROGS_DIR := progs
 TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c	\
 			 network_helpers.c testing_helpers.c		\
 			 btf_helpers.c flow_dissector_load.h		\
-			 cap_helpers.c test_loader.c
+			 cap_helpers.c test_loader.c xsk.c
 TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko	\
 		       $(OUTPUT)/liburandom_read.so			\
 		       $(OUTPUT)/xdp_synproxy				\
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
new file mode 100644
index 000000000000..e033d48288c0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "xdp_metadata.skel.h"
+#include "xdp_metadata2.skel.h"
+#include "xdp_metadata.h"
+#include "xsk.h"
+
+#include <bpf/btf.h>
+#include <linux/errqueue.h>
+#include <linux/if_link.h>
+#include <linux/net_tstamp.h>
+#include <linux/udp.h>
+#include <sys/mman.h>
+#include <net/if.h>
+#include <poll.h>
+
+#define TX_NAME "veTX"
+#define RX_NAME "veRX"
+
+#define UDP_PAYLOAD_BYTES 4
+
+#define AF_XDP_SOURCE_PORT 1234
+#define AF_XDP_CONSUMER_PORT 8080
+
+#define UMEM_NUM 16
+#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
+#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
+#define XDP_FLAGS XDP_FLAGS_DRV_MODE
+#define QUEUE_ID 0
+
+#define TX_ADDR "10.0.0.1"
+#define RX_ADDR "10.0.0.2"
+#define PREFIX_LEN "8"
+#define FAMILY AF_INET
+
+#define SYS(cmd) ({ \
+	if (!ASSERT_OK(system(cmd), (cmd))) \
+		goto out; \
+})
+
+struct xsk {
+	void *umem_area;
+	struct xsk_umem *umem;
+	struct xsk_ring_prod fill;
+	struct xsk_ring_cons comp;
+	struct xsk_ring_prod tx;
+	struct xsk_ring_cons rx;
+	struct xsk_socket *socket;
+};
+
+static int open_xsk(int ifindex, struct xsk *xsk)
+{
+	int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+	const struct xsk_socket_config socket_config = {
+		.rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+		.bind_flags = XDP_COPY,
+	};
+	const struct xsk_umem_config umem_config = {
+		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+		.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
+	};
+	__u32 idx;
+	u64 addr;
+	int ret;
+	int i;
+
+	xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
+	if (!ASSERT_NEQ(xsk->umem_area, MAP_FAILED, "mmap"))
+		return -1;
+
+	ret = xsk_umem__create(&xsk->umem,
+			       xsk->umem_area, UMEM_SIZE,
+			       &xsk->fill,
+			       &xsk->comp,
+			       &umem_config);
+	if (!ASSERT_OK(ret, "xsk_umem__create"))
+		return ret;
+
+	ret = xsk_socket__create(&xsk->socket, ifindex, QUEUE_ID,
+				 xsk->umem,
+				 &xsk->rx,
+				 &xsk->tx,
+				 &socket_config);
+	if (!ASSERT_OK(ret, "xsk_socket__create"))
+		return ret;
+
+	/* First half of umem is for TX. This way address matches 1-to-1
+	 * to the completion queue index.
+	 */
+
+	for (i = 0; i < UMEM_NUM / 2; i++) {
+		addr = i * UMEM_FRAME_SIZE;
+		printf("%p: tx_desc[%d] -> %lx\n", xsk, i, addr);
+	}
+
+	/* Second half of umem is for RX. */
+
+	ret = xsk_ring_prod__reserve(&xsk->fill, UMEM_NUM / 2, &idx);
+	if (!ASSERT_EQ(UMEM_NUM / 2, ret, "xsk_ring_prod__reserve"))
+		return ret;
+	if (!ASSERT_EQ(idx, 0, "fill idx != 0"))
+		return -1;
+
+	for (i = 0; i < UMEM_NUM / 2; i++) {
+		addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
+		printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
+		*xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
+	}
+	xsk_ring_prod__submit(&xsk->fill, ret);
+
+	return 0;
+}
+
+static void close_xsk(struct xsk *xsk)
+{
+	if (xsk->umem)
+		xsk_umem__delete(xsk->umem);
+	if (xsk->socket)
+		xsk_socket__delete(xsk->socket);
+	munmap(xsk->umem, UMEM_SIZE);
+}
+
+static void ip_csum(struct iphdr *iph)
+{
+	__u32 sum = 0;
+	__u16 *p;
+	int i;
+
+	iph->check = 0;
+	p = (void *)iph;
+	for (i = 0; i < sizeof(*iph) / sizeof(*p); i++)
+		sum += p[i];
+
+	while (sum >> 16)
+		sum = (sum & 0xffff) + (sum >> 16);
+
+	iph->check = ~sum;
+}
+
+static int generate_packet(struct xsk *xsk, __u16 dst_port)
+{
+	struct xdp_desc *tx_desc;
+	struct udphdr *udph;
+	struct ethhdr *eth;
+	struct iphdr *iph;
+	void *data;
+	__u32 idx;
+	int ret;
+
+	ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
+	if (!ASSERT_EQ(ret, 1, "xsk_ring_prod__reserve"))
+		return -1;
+
+	tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
+	tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE;
+	printf("%p: tx_desc[%u]->addr=%llx\n", xsk, idx, tx_desc->addr);
+	data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
+
+	eth = data;
+	iph = (void *)(eth + 1);
+	udph = (void *)(iph + 1);
+
+	memcpy(eth->h_dest, "\x00\x00\x00\x00\x00\x02", ETH_ALEN);
+	memcpy(eth->h_source, "\x00\x00\x00\x00\x00\x01", ETH_ALEN);
+	eth->h_proto = htons(ETH_P_IP);
+
+	iph->version = 0x4;
+	iph->ihl = 0x5;
+	iph->tos = 0x9;
+	iph->tot_len = htons(sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES);
+	iph->id = 0;
+	iph->frag_off = 0;
+	iph->ttl = 0;
+	iph->protocol = IPPROTO_UDP;
+	ASSERT_EQ(inet_pton(FAMILY, TX_ADDR, &iph->saddr), 1, "inet_pton(TX_ADDR)");
+	ASSERT_EQ(inet_pton(FAMILY, RX_ADDR, &iph->daddr), 1, "inet_pton(RX_ADDR)");
+	ip_csum(iph);
+
+	udph->source = htons(AF_XDP_SOURCE_PORT);
+	udph->dest = htons(dst_port);
+	udph->len = htons(sizeof(*udph) + UDP_PAYLOAD_BYTES);
+	udph->check = 0;
+
+	memset(udph + 1, 0xAA, UDP_PAYLOAD_BYTES);
+
+	tx_desc->len = sizeof(*eth) + sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES;
+	xsk_ring_prod__submit(&xsk->tx, 1);
+
+	ret = sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
+	if (!ASSERT_GE(ret, 0, "sendto"))
+		return ret;
+
+	return 0;
+}
+
+static void complete_tx(struct xsk *xsk)
+{
+	__u32 idx;
+	__u64 addr;
+
+	if (ASSERT_EQ(xsk_ring_cons__peek(&xsk->comp, 1, &idx), 1, "xsk_ring_cons__peek")) {
+		addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
+
+		printf("%p: refill idx=%u addr=%llx\n", xsk, idx, addr);
+		*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
+		xsk_ring_prod__submit(&xsk->fill, 1);
+	}
+}
+
+static void refill_rx(struct xsk *xsk, __u64 addr)
+{
+	__u32 idx;
+
+	if (ASSERT_EQ(xsk_ring_prod__reserve(&xsk->fill, 1, &idx), 1, "xsk_ring_prod__reserve")) {
+		printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr);
+		*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
+		xsk_ring_prod__submit(&xsk->fill, 1);
+	}
+}
+
+static int verify_xsk_metadata(struct xsk *xsk)
+{
+	const struct xdp_desc *rx_desc;
+	struct pollfd fds = {};
+	struct xdp_meta *meta;
+	struct ethhdr *eth;
+	struct iphdr *iph;
+	__u64 comp_addr;
+	void *data;
+	__u64 addr;
+	__u32 idx;
+	int ret;
+
+	ret = recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL);
+	if (!ASSERT_EQ(ret, 0, "recvfrom"))
+		return -1;
+
+	fds.fd = xsk_socket__fd(xsk->socket);
+	fds.events = POLLIN;
+
+	ret = poll(&fds, 1, 1000);
+	if (!ASSERT_GT(ret, 0, "poll"))
+		return -1;
+
+	ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx);
+	if (!ASSERT_EQ(ret, 1, "xsk_ring_cons__peek"))
+		return -2;
+
+	rx_desc = xsk_ring_cons__rx_desc(&xsk->rx, idx);
+	comp_addr = xsk_umem__extract_addr(rx_desc->addr);
+	addr = xsk_umem__add_offset_to_addr(rx_desc->addr);
+	printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx\n",
+	       xsk, idx, rx_desc->addr, addr, comp_addr);
+	data = xsk_umem__get_data(xsk->umem_area, addr);
+
+	/* Make sure we got the packet offset correctly. */
+
+	eth = data;
+	ASSERT_EQ(eth->h_proto, htons(ETH_P_IP), "eth->h_proto");
+	iph = (void *)(eth + 1);
+	ASSERT_EQ((int)iph->version, 4, "iph->version");
+
+	/* custom metadata */
+
+	meta = data - sizeof(struct xdp_meta);
+
+	if (!ASSERT_NEQ(meta->rx_timestamp, 0, "rx_timestamp"))
+		return -1;
+
+	if (!ASSERT_NEQ(meta->rx_hash, 0, "rx_hash"))
+		return -1;
+
+	xsk_ring_cons__release(&xsk->rx, 1);
+	refill_rx(xsk, comp_addr);
+
+	return 0;
+}
+
+void test_xdp_metadata(void)
+{
+	struct xdp_metadata2 *bpf_obj2 = NULL;
+	struct xdp_metadata *bpf_obj = NULL;
+	struct bpf_program *new_prog, *prog;
+	struct nstoken *tok = NULL;
+	__u32 queue_id = QUEUE_ID;
+	struct bpf_map *prog_arr;
+	struct xsk tx_xsk = {};
+	struct xsk rx_xsk = {};
+	__u32 val, key = 0;
+	int retries = 10;
+	int rx_ifindex;
+	int tx_ifindex;
+	int sock_fd;
+	int ret;
+
+	/* Setup new networking namespace, with a veth pair. */
+
+	SYS("ip netns add xdp_metadata");
+	tok = open_netns("xdp_metadata");
+	SYS("ip link add numtxqueues 1 numrxqueues 1 " TX_NAME
+	    " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1");
+	SYS("ip link set dev " TX_NAME " address 00:00:00:00:00:01");
+	SYS("ip link set dev " RX_NAME " address 00:00:00:00:00:02");
+	SYS("ip link set dev " TX_NAME " up");
+	SYS("ip link set dev " RX_NAME " up");
+	SYS("ip addr add " TX_ADDR "/" PREFIX_LEN " dev " TX_NAME);
+	SYS("ip addr add " RX_ADDR "/" PREFIX_LEN " dev " RX_NAME);
+
+	rx_ifindex = if_nametoindex(RX_NAME);
+	tx_ifindex = if_nametoindex(TX_NAME);
+
+	/* Setup separate AF_XDP for TX and RX interfaces. */
+
+	ret = open_xsk(tx_ifindex, &tx_xsk);
+	if (!ASSERT_OK(ret, "open_xsk(TX_NAME)"))
+		goto out;
+
+	ret = open_xsk(rx_ifindex, &rx_xsk);
+	if (!ASSERT_OK(ret, "open_xsk(RX_NAME)"))
+		goto out;
+
+	bpf_obj = xdp_metadata__open();
+	if (!ASSERT_OK_PTR(bpf_obj, "open skeleton"))
+		goto out;
+
+	prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx");
+	bpf_program__set_ifindex(prog, rx_ifindex);
+	bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY);
+
+	if (!ASSERT_OK(xdp_metadata__load(bpf_obj), "load skeleton"))
+		goto out;
+
+	/* Make sure we can't add dev-bound programs to prog maps. */
+	prog_arr = bpf_object__find_map_by_name(bpf_obj->obj, "prog_arr");
+	if (!ASSERT_OK_PTR(prog_arr, "no prog_arr map"))
+		goto out;
+
+	val = bpf_program__fd(prog);
+	if (!ASSERT_ERR(bpf_map__update_elem(prog_arr, &key, sizeof(key),
+					     &val, sizeof(val), BPF_ANY),
+			"update prog_arr"))
+		goto out;
+
+	/* Attach BPF program to RX interface. */
+
+	ret = bpf_xdp_attach(rx_ifindex,
+			     bpf_program__fd(bpf_obj->progs.rx),
+			     XDP_FLAGS, NULL);
+	if (!ASSERT_GE(ret, 0, "bpf_xdp_attach"))
+		goto out;
+
+	sock_fd = xsk_socket__fd(rx_xsk.socket);
+	ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0);
+	if (!ASSERT_GE(ret, 0, "bpf_map_update_elem"))
+		goto out;
+
+	/* Send packet destined to RX AF_XDP socket. */
+	if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0,
+		       "generate AF_XDP_CONSUMER_PORT"))
+		goto out;
+
+	/* Verify AF_XDP RX packet has proper metadata. */
+	if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk), 0,
+		       "verify_xsk_metadata"))
+		goto out;
+
+	complete_tx(&tx_xsk);
+
+	/* Make sure freplace correctly picks up original bound device
+	 * and doesn't crash.
+	 */
+
+	bpf_obj2 = xdp_metadata2__open();
+	if (!ASSERT_OK_PTR(bpf_obj2, "open skeleton"))
+		goto out;
+
+	new_prog = bpf_object__find_program_by_name(bpf_obj2->obj, "freplace_rx");
+	bpf_program__set_attach_target(new_prog, bpf_program__fd(prog), "rx");
+
+	if (!ASSERT_OK(xdp_metadata2__load(bpf_obj2), "load freplace skeleton"))
+		goto out;
+
+	if (!ASSERT_OK(xdp_metadata2__attach(bpf_obj2), "attach freplace"))
+		goto out;
+
+	/* Send packet to trigger . */
+	if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0,
+		       "generate freplace packet"))
+		goto out;
+
+	while (!retries--) {
+		if (bpf_obj2->bss->called)
+			break;
+		usleep(10);
+	}
+	ASSERT_GT(bpf_obj2->bss->called, 0, "not called");
+
+out:
+	close_xsk(&rx_xsk);
+	close_xsk(&tx_xsk);
+	xdp_metadata2__destroy(bpf_obj2);
+	xdp_metadata__destroy(bpf_obj);
+	if (tok)
+		close_netns(tok);
+	system("ip netns del xdp_metadata");
+}
diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata.c b/tools/testing/selftests/bpf/progs/xdp_metadata.c
new file mode 100644
index 000000000000..77678b034389
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_metadata.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "xdp_metadata.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_XSKMAP);
+	__uint(max_entries, 4);
+	__type(key, __u32);
+	__type(value, __u32);
+} xsk SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, __u32);
+} prog_arr SEC(".maps");
+
+extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx,
+					 __u64 *timestamp) __ksym;
+extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx,
+				    __u32 *hash) __ksym;
+
+SEC("xdp")
+int rx(struct xdp_md *ctx)
+{
+	void *data, *data_meta;
+	struct xdp_meta *meta;
+	u64 timestamp = -1;
+	int ret;
+
+	/* Reserve enough for all custom metadata. */
+
+	ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta));
+	if (ret != 0)
+		return XDP_DROP;
+
+	data = (void *)(long)ctx->data;
+	data_meta = (void *)(long)ctx->data_meta;
+
+	if (data_meta + sizeof(struct xdp_meta) > data)
+		return XDP_DROP;
+
+	meta = data_meta;
+
+	/* Export metadata. */
+
+	/* We expect veth bpf_xdp_metadata_rx_timestamp to return 0 HW
+	 * timestamp, so put some non-zero value into AF_XDP frame for
+	 * the userspace.
+	 */
+	bpf_xdp_metadata_rx_timestamp(ctx, &timestamp);
+	if (timestamp == 0)
+		meta->rx_timestamp = 1;
+
+	bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash);
+
+	return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata2.c b/tools/testing/selftests/bpf/progs/xdp_metadata2.c
new file mode 100644
index 000000000000..cf69d05451c3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_metadata2.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "xdp_metadata.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx,
+				    __u32 *hash) __ksym;
+
+int called;
+
+SEC("freplace/rx")
+int freplace_rx(struct xdp_md *ctx)
+{
+	u32 hash = 0;
+	/* Call _any_ metadata function to make sure we don't crash. */
+	bpf_xdp_metadata_rx_hash(ctx, &hash);
+	called++;
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/xdp_metadata.h b/tools/testing/selftests/bpf/xdp_metadata.h
new file mode 100644
index 000000000000..f6780fbb0a21
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdp_metadata.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#pragma once
+
+#ifndef ETH_P_IP
+#define ETH_P_IP 0x0800
+#endif
+
+#ifndef ETH_P_IPV6
+#define ETH_P_IPV6 0x86DD
+#endif
+
+struct xdp_meta {
+	__u64 rx_timestamp;
+	__u32 rx_hash;
+};
-- 
cgit 


From 297a3f1241550f6969f65a5efeee9162241daae5 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 19 Jan 2023 14:15:36 -0800
Subject: selftests/bpf: Simple program to dump XDP RX metadata

To be used for verification of driver implementations. Note that
the skb path is gone from the series, but I'm still keeping the
implementation for any possible future work.

$ xdp_hw_metadata <ifname>

On the other machine:

$ echo -n xdp | nc -u -q1 <target> 9091 # for AF_XDP
$ echo -n skb | nc -u -q1 <target> 9092 # for skb

Sample output:

  # xdp
  xsk_ring_cons__peek: 1
  0x19f9090: rx_desc[0]->addr=100000000008000 addr=8100 comp_addr=8000
  rx_timestamp_supported: 1
  rx_timestamp: 1667850075063948829
  0x19f9090: complete idx=8 addr=8000

  # skb
  found skb hwtstamp = 1668314052.854274681

Decoding:
  # xdp
  rx_timestamp=1667850075.063948829

  $ date -d @1667850075
  Mon Nov  7 11:41:15 AM PST 2022
  $ date
  Mon Nov  7 11:42:05 AM PST 2022

  # skb
  $ date -d @1668314052
  Sat Nov 12 08:34:12 PM PST 2022
  $ date
  Sat Nov 12 08:37:06 PM PST 2022

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Anatoly Burakov <anatoly.burakov@intel.com>
Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
Cc: Maryam Tahhan <mtahhan@redhat.com>
Cc: xdp-hints@xdp-project.net
Cc: netdev@vger.kernel.org
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230119221536.3349901-18-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/.gitignore             |   1 +
 tools/testing/selftests/bpf/Makefile               |   7 +-
 .../testing/selftests/bpf/progs/xdp_hw_metadata.c  |  81 +++++
 tools/testing/selftests/bpf/xdp_hw_metadata.c      | 403 +++++++++++++++++++++
 4 files changed, 491 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
 create mode 100644 tools/testing/selftests/bpf/xdp_hw_metadata.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 401a75844cc0..4aa5bba956ff 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -47,3 +47,4 @@ test_cpp
 xskxceiver
 xdp_redirect_multi
 xdp_synproxy
+xdp_hw_metadata
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index e09bef2b7502..a554b41fe872 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -83,7 +83,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
 TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
 	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
 	test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
-	xskxceiver xdp_redirect_multi xdp_synproxy veristat
+	xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata
 
 TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/sign-file
 TEST_GEN_FILES += liburandom_read.so
@@ -383,6 +383,7 @@ test_subskeleton.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib
 test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib.bpf.o
 test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o
 xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o
+xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o
 
 LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
 
@@ -580,6 +581,10 @@ $(OUTPUT)/xskxceiver: xskxceiver.c $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
 
+$(OUTPUT)/xdp_hw_metadata: xdp_hw_metadata.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xdp_hw_metadata.skel.h | $(OUTPUT)
+	$(call msg,BINARY,,$@)
+	$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
+
 # Make sure we are able to include and link libbpf against c++.
 $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
 	$(call msg,CXX,,$@)
diff --git a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
new file mode 100644
index 000000000000..25b8178735ee
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "xdp_metadata.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_XSKMAP);
+	__uint(max_entries, 256);
+	__type(key, __u32);
+	__type(value, __u32);
+} xsk SEC(".maps");
+
+extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx,
+					 __u64 *timestamp) __ksym;
+extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx,
+				    __u32 *hash) __ksym;
+
+SEC("xdp")
+int rx(struct xdp_md *ctx)
+{
+	void *data, *data_meta, *data_end;
+	struct ipv6hdr *ip6h = NULL;
+	struct ethhdr *eth = NULL;
+	struct udphdr *udp = NULL;
+	struct iphdr *iph = NULL;
+	struct xdp_meta *meta;
+	int ret;
+
+	data = (void *)(long)ctx->data;
+	data_end = (void *)(long)ctx->data_end;
+	eth = data;
+	if (eth + 1 < data_end) {
+		if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+			iph = (void *)(eth + 1);
+			if (iph + 1 < data_end && iph->protocol == IPPROTO_UDP)
+				udp = (void *)(iph + 1);
+		}
+		if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+			ip6h = (void *)(eth + 1);
+			if (ip6h + 1 < data_end && ip6h->nexthdr == IPPROTO_UDP)
+				udp = (void *)(ip6h + 1);
+		}
+		if (udp && udp + 1 > data_end)
+			udp = NULL;
+	}
+
+	if (!udp)
+		return XDP_PASS;
+
+	if (udp->dest != bpf_htons(9091))
+		return XDP_PASS;
+
+	bpf_printk("forwarding UDP:9091 to AF_XDP");
+
+	ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta));
+	if (ret != 0) {
+		bpf_printk("bpf_xdp_adjust_meta returned %d", ret);
+		return XDP_PASS;
+	}
+
+	data = (void *)(long)ctx->data;
+	data_meta = (void *)(long)ctx->data_meta;
+	meta = data_meta;
+
+	if (meta + 1 > data) {
+		bpf_printk("bpf_xdp_adjust_meta doesn't appear to work");
+		return XDP_PASS;
+	}
+
+	if (!bpf_xdp_metadata_rx_timestamp(ctx, &meta->rx_timestamp))
+		bpf_printk("populated rx_timestamp with %u", meta->rx_timestamp);
+
+	if (!bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash))
+		bpf_printk("populated rx_hash with %u", meta->rx_hash);
+
+	return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
new file mode 100644
index 000000000000..0008f0f239e8
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Reference program for verifying XDP metadata on real HW. Functional test
+ * only, doesn't test the performance.
+ *
+ * RX:
+ * - UDP 9091 packets are diverted into AF_XDP
+ * - Metadata verified:
+ *   - rx_timestamp
+ *   - rx_hash
+ *
+ * TX:
+ * - TBD
+ */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "xdp_hw_metadata.skel.h"
+#include "xsk.h"
+
+#include <error.h>
+#include <linux/errqueue.h>
+#include <linux/if_link.h>
+#include <linux/net_tstamp.h>
+#include <linux/udp.h>
+#include <linux/sockios.h>
+#include <sys/mman.h>
+#include <net/if.h>
+#include <poll.h>
+
+#include "xdp_metadata.h"
+
+#define UMEM_NUM 16
+#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
+#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
+#define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE)
+
+struct xsk {
+	void *umem_area;
+	struct xsk_umem *umem;
+	struct xsk_ring_prod fill;
+	struct xsk_ring_cons comp;
+	struct xsk_ring_prod tx;
+	struct xsk_ring_cons rx;
+	struct xsk_socket *socket;
+};
+
+struct xdp_hw_metadata *bpf_obj;
+struct xsk *rx_xsk;
+const char *ifname;
+int ifindex;
+int rxq;
+
+void test__fail(void) { /* for network_helpers.c */ }
+
+static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
+{
+	int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+	const struct xsk_socket_config socket_config = {
+		.rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+		.bind_flags = XDP_COPY,
+	};
+	const struct xsk_umem_config umem_config = {
+		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+		.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
+	};
+	__u32 idx;
+	u64 addr;
+	int ret;
+	int i;
+
+	xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
+	if (xsk->umem_area == MAP_FAILED)
+		return -ENOMEM;
+
+	ret = xsk_umem__create(&xsk->umem,
+			       xsk->umem_area, UMEM_SIZE,
+			       &xsk->fill,
+			       &xsk->comp,
+			       &umem_config);
+	if (ret)
+		return ret;
+
+	ret = xsk_socket__create(&xsk->socket, ifindex, queue_id,
+				 xsk->umem,
+				 &xsk->rx,
+				 &xsk->tx,
+				 &socket_config);
+	if (ret)
+		return ret;
+
+	/* First half of umem is for TX. This way address matches 1-to-1
+	 * to the completion queue index.
+	 */
+
+	for (i = 0; i < UMEM_NUM / 2; i++) {
+		addr = i * UMEM_FRAME_SIZE;
+		printf("%p: tx_desc[%d] -> %lx\n", xsk, i, addr);
+	}
+
+	/* Second half of umem is for RX. */
+
+	ret = xsk_ring_prod__reserve(&xsk->fill, UMEM_NUM / 2, &idx);
+	for (i = 0; i < UMEM_NUM / 2; i++) {
+		addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
+		printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
+		*xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
+	}
+	xsk_ring_prod__submit(&xsk->fill, ret);
+
+	return 0;
+}
+
+static void close_xsk(struct xsk *xsk)
+{
+	if (xsk->umem)
+		xsk_umem__delete(xsk->umem);
+	if (xsk->socket)
+		xsk_socket__delete(xsk->socket);
+	munmap(xsk->umem, UMEM_SIZE);
+}
+
+static void refill_rx(struct xsk *xsk, __u64 addr)
+{
+	__u32 idx;
+
+	if (xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) {
+		printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr);
+		*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
+		xsk_ring_prod__submit(&xsk->fill, 1);
+	}
+}
+
+static void verify_xdp_metadata(void *data)
+{
+	struct xdp_meta *meta;
+
+	meta = data - sizeof(*meta);
+
+	printf("rx_timestamp: %llu\n", meta->rx_timestamp);
+	printf("rx_hash: %u\n", meta->rx_hash);
+}
+
+static void verify_skb_metadata(int fd)
+{
+	char cmsg_buf[1024];
+	char packet_buf[128];
+
+	struct scm_timestamping *ts;
+	struct iovec packet_iov;
+	struct cmsghdr *cmsg;
+	struct msghdr hdr;
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.msg_iov = &packet_iov;
+	hdr.msg_iovlen = 1;
+	packet_iov.iov_base = packet_buf;
+	packet_iov.iov_len = sizeof(packet_buf);
+
+	hdr.msg_control = cmsg_buf;
+	hdr.msg_controllen = sizeof(cmsg_buf);
+
+	if (recvmsg(fd, &hdr, 0) < 0)
+		error(-1, errno, "recvmsg");
+
+	for (cmsg = CMSG_FIRSTHDR(&hdr); cmsg != NULL;
+	     cmsg = CMSG_NXTHDR(&hdr, cmsg)) {
+
+		if (cmsg->cmsg_level != SOL_SOCKET)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case SCM_TIMESTAMPING:
+			ts = (struct scm_timestamping *)CMSG_DATA(cmsg);
+			if (ts->ts[2].tv_sec || ts->ts[2].tv_nsec) {
+				printf("found skb hwtstamp = %lu.%lu\n",
+				       ts->ts[2].tv_sec, ts->ts[2].tv_nsec);
+				return;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	printf("skb hwtstamp is not found!\n");
+}
+
+static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd)
+{
+	const struct xdp_desc *rx_desc;
+	struct pollfd fds[rxq + 1];
+	__u64 comp_addr;
+	__u64 addr;
+	__u32 idx;
+	int ret;
+	int i;
+
+	for (i = 0; i < rxq; i++) {
+		fds[i].fd = xsk_socket__fd(rx_xsk[i].socket);
+		fds[i].events = POLLIN;
+		fds[i].revents = 0;
+	}
+
+	fds[rxq].fd = server_fd;
+	fds[rxq].events = POLLIN;
+	fds[rxq].revents = 0;
+
+	while (true) {
+		errno = 0;
+		ret = poll(fds, rxq + 1, 1000);
+		printf("poll: %d (%d)\n", ret, errno);
+		if (ret < 0)
+			break;
+		if (ret == 0)
+			continue;
+
+		if (fds[rxq].revents)
+			verify_skb_metadata(server_fd);
+
+		for (i = 0; i < rxq; i++) {
+			if (fds[i].revents == 0)
+				continue;
+
+			struct xsk *xsk = &rx_xsk[i];
+
+			ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx);
+			printf("xsk_ring_cons__peek: %d\n", ret);
+			if (ret != 1)
+				continue;
+
+			rx_desc = xsk_ring_cons__rx_desc(&xsk->rx, idx);
+			comp_addr = xsk_umem__extract_addr(rx_desc->addr);
+			addr = xsk_umem__add_offset_to_addr(rx_desc->addr);
+			printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx\n",
+			       xsk, idx, rx_desc->addr, addr, comp_addr);
+			verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr));
+			xsk_ring_cons__release(&xsk->rx, 1);
+			refill_rx(xsk, comp_addr);
+		}
+	}
+
+	return 0;
+}
+
+struct ethtool_channels {
+	__u32	cmd;
+	__u32	max_rx;
+	__u32	max_tx;
+	__u32	max_other;
+	__u32	max_combined;
+	__u32	rx_count;
+	__u32	tx_count;
+	__u32	other_count;
+	__u32	combined_count;
+};
+
+#define ETHTOOL_GCHANNELS	0x0000003c /* Get no of channels */
+
+static int rxq_num(const char *ifname)
+{
+	struct ethtool_channels ch = {
+		.cmd = ETHTOOL_GCHANNELS,
+	};
+
+	struct ifreq ifr = {
+		.ifr_data = (void *)&ch,
+	};
+	strcpy(ifr.ifr_name, ifname);
+	int fd, ret;
+
+	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (fd < 0)
+		error(-1, errno, "socket");
+
+	ret = ioctl(fd, SIOCETHTOOL, &ifr);
+	if (ret < 0)
+		error(-1, errno, "socket");
+
+	close(fd);
+
+	return ch.rx_count + ch.combined_count;
+}
+
+static void cleanup(void)
+{
+	LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
+	int ret;
+	int i;
+
+	if (bpf_obj) {
+		opts.old_prog_fd = bpf_program__fd(bpf_obj->progs.rx);
+		if (opts.old_prog_fd >= 0) {
+			printf("detaching bpf program....\n");
+			ret = bpf_xdp_detach(ifindex, XDP_FLAGS, &opts);
+			if (ret)
+				printf("failed to detach XDP program: %d\n", ret);
+		}
+	}
+
+	for (i = 0; i < rxq; i++)
+		close_xsk(&rx_xsk[i]);
+
+	if (bpf_obj)
+		xdp_hw_metadata__destroy(bpf_obj);
+}
+
+static void handle_signal(int sig)
+{
+	/* interrupting poll() is all we need */
+}
+
+static void timestamping_enable(int fd, int val)
+{
+	int ret;
+
+	ret = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
+	if (ret < 0)
+		error(-1, errno, "setsockopt(SO_TIMESTAMPING)");
+}
+
+int main(int argc, char *argv[])
+{
+	int server_fd = -1;
+	int ret;
+	int i;
+
+	struct bpf_program *prog;
+
+	if (argc != 2) {
+		fprintf(stderr, "pass device name\n");
+		return -1;
+	}
+
+	ifname = argv[1];
+	ifindex = if_nametoindex(ifname);
+	rxq = rxq_num(ifname);
+
+	printf("rxq: %d\n", rxq);
+
+	rx_xsk = malloc(sizeof(struct xsk) * rxq);
+	if (!rx_xsk)
+		error(-1, ENOMEM, "malloc");
+
+	for (i = 0; i < rxq; i++) {
+		printf("open_xsk(%s, %p, %d)\n", ifname, &rx_xsk[i], i);
+		ret = open_xsk(ifindex, &rx_xsk[i], i);
+		if (ret)
+			error(-1, -ret, "open_xsk");
+
+		printf("xsk_socket__fd() -> %d\n", xsk_socket__fd(rx_xsk[i].socket));
+	}
+
+	printf("open bpf program...\n");
+	bpf_obj = xdp_hw_metadata__open();
+	if (libbpf_get_error(bpf_obj))
+		error(-1, libbpf_get_error(bpf_obj), "xdp_hw_metadata__open");
+
+	prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx");
+	bpf_program__set_ifindex(prog, ifindex);
+	bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY);
+
+	printf("load bpf program...\n");
+	ret = xdp_hw_metadata__load(bpf_obj);
+	if (ret)
+		error(-1, -ret, "xdp_hw_metadata__load");
+
+	printf("prepare skb endpoint...\n");
+	server_fd = start_server(AF_INET6, SOCK_DGRAM, NULL, 9092, 1000);
+	if (server_fd < 0)
+		error(-1, errno, "start_server");
+	timestamping_enable(server_fd,
+			    SOF_TIMESTAMPING_SOFTWARE |
+			    SOF_TIMESTAMPING_RAW_HARDWARE);
+
+	printf("prepare xsk map...\n");
+	for (i = 0; i < rxq; i++) {
+		int sock_fd = xsk_socket__fd(rx_xsk[i].socket);
+		__u32 queue_id = i;
+
+		printf("map[%d] = %d\n", queue_id, sock_fd);
+		ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0);
+		if (ret)
+			error(-1, -ret, "bpf_map_update_elem");
+	}
+
+	printf("attach bpf program...\n");
+	ret = bpf_xdp_attach(ifindex,
+			     bpf_program__fd(bpf_obj->progs.rx),
+			     XDP_FLAGS, NULL);
+	if (ret)
+		error(-1, -ret, "bpf_xdp_attach");
+
+	signal(SIGINT, handle_signal);
+	ret = verify_metadata(rx_xsk, rxq, server_fd);
+	close(server_fd);
+	cleanup();
+	if (ret)
+		error(-1, -ret, "verify_metadata");
+}
-- 
cgit 


From 9fabbdf338b701f2d763d9edbc3e82ce1e7fa1b4 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Thu, 19 Jan 2023 23:42:04 -0800
Subject: selftests: iommu: Fix test_cmd_destroy_access() call in user_copy

The test_cmd_destroy_access() should end with a semicolon, so add one.
There is a test_ioctl_destroy(ioas_id) following already, so drop one.

Fixes: 57f0988706fe ("iommufd: Add a selftest")
Link: https://lore.kernel.org/r/20230120074204.1368-1-nicolinc@nvidia.com
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 tools/testing/selftests/iommu/iommufd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 8aa8a346cf22..fa08209268c4 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -1259,7 +1259,7 @@ TEST_F(iommufd_mock_domain, user_copy)
 
 	test_cmd_destroy_access_pages(
 		access_cmd.id, access_cmd.access_pages.out_access_pages_id);
-	test_cmd_destroy_access(access_cmd.id) test_ioctl_destroy(ioas_id);
+	test_cmd_destroy_access(access_cmd.id);
 
 	test_ioctl_destroy(ioas_id);
 }
-- 
cgit 


From 7525daeefc8c20902bac63f89603096c76808fe0 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Thu, 19 Jan 2023 20:18:44 -0600
Subject: selftests/bpf: Use __failure macro in task kfunc testsuite

In commit 537c3f66eac1 ("selftests/bpf: add generic BPF program tester-loader"),
a new mechanism was added to the BPF selftest framework to allow testsuites to
use macros to define expected failing testcases.

This allows any testsuite which tests verification failure to remove a good
amount of boilerplate code. This patch updates the task_kfunc selftest suite
to use these new macros.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20230120021844.3048244-1-void@manifault.com
---
 .../testing/selftests/bpf/prog_tests/task_kfunc.c  | 71 +---------------------
 .../selftests/bpf/progs/task_kfunc_failure.c       | 18 ++++++
 2 files changed, 19 insertions(+), 70 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
index 18848c31e36f..f79fa5bc9a8d 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
@@ -9,9 +9,6 @@
 #include "task_kfunc_failure.skel.h"
 #include "task_kfunc_success.skel.h"
 
-static size_t log_buf_sz = 1 << 20; /* 1 MB */
-static char obj_log_buf[1048576];
-
 static struct task_kfunc_success *open_load_task_kfunc_skel(void)
 {
 	struct task_kfunc_success *skel;
@@ -83,67 +80,6 @@ static const char * const success_tests[] = {
 	"test_task_from_pid_invalid",
 };
 
-static struct {
-	const char *prog_name;
-	const char *expected_err_msg;
-} failure_tests[] = {
-	{"task_kfunc_acquire_untrusted", "R1 must be referenced or trusted"},
-	{"task_kfunc_acquire_fp", "arg#0 pointer type STRUCT task_struct must point"},
-	{"task_kfunc_acquire_unsafe_kretprobe", "reg type unsupported for arg#0 function"},
-	{"task_kfunc_acquire_trusted_walked", "R1 must be referenced or trusted"},
-	{"task_kfunc_acquire_null", "arg#0 pointer type STRUCT task_struct must point"},
-	{"task_kfunc_acquire_unreleased", "Unreleased reference"},
-	{"task_kfunc_get_non_kptr_param", "arg#0 expected pointer to map value"},
-	{"task_kfunc_get_non_kptr_acquired", "arg#0 expected pointer to map value"},
-	{"task_kfunc_get_null", "arg#0 expected pointer to map value"},
-	{"task_kfunc_xchg_unreleased", "Unreleased reference"},
-	{"task_kfunc_get_unreleased", "Unreleased reference"},
-	{"task_kfunc_release_untrusted", "arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket"},
-	{"task_kfunc_release_fp", "arg#0 pointer type STRUCT task_struct must point"},
-	{"task_kfunc_release_null", "arg#0 is ptr_or_null_ expected ptr_ or socket"},
-	{"task_kfunc_release_unacquired", "release kernel function bpf_task_release expects"},
-	{"task_kfunc_from_pid_no_null_check", "arg#0 is ptr_or_null_ expected ptr_ or socket"},
-	{"task_kfunc_from_lsm_task_free", "reg type unsupported for arg#0 function"},
-};
-
-static void verify_fail(const char *prog_name, const char *expected_err_msg)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	struct task_kfunc_failure *skel;
-	int err, i;
-
-	opts.kernel_log_buf = obj_log_buf;
-	opts.kernel_log_size = log_buf_sz;
-	opts.kernel_log_level = 1;
-
-	skel = task_kfunc_failure__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "task_kfunc_failure__open_opts"))
-		goto cleanup;
-
-	for (i = 0; i < ARRAY_SIZE(failure_tests); i++) {
-		struct bpf_program *prog;
-		const char *curr_name = failure_tests[i].prog_name;
-
-		prog = bpf_object__find_program_by_name(skel->obj, curr_name);
-		if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
-			goto cleanup;
-
-		bpf_program__set_autoload(prog, !strcmp(curr_name, prog_name));
-	}
-
-	err = task_kfunc_failure__load(skel);
-	if (!ASSERT_ERR(err, "unexpected load success"))
-		goto cleanup;
-
-	if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) {
-		fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg);
-		fprintf(stderr, "Verifier output: %s\n", obj_log_buf);
-	}
-
-cleanup:
-	task_kfunc_failure__destroy(skel);
-}
-
 void test_task_kfunc(void)
 {
 	int i;
@@ -155,10 +91,5 @@ void test_task_kfunc(void)
 		run_success_test(success_tests[i]);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(failure_tests); i++) {
-		if (!test__start_subtest(failure_tests[i].prog_name))
-			continue;
-
-		verify_fail(failure_tests[i].prog_name, failure_tests[i].expected_err_msg);
-	}
+	RUN_TESTS(task_kfunc_failure);
 }
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index 1b47b94dbca0..e6950d6a9cf0 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -5,6 +5,7 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_helpers.h>
 
+#include "bpf_misc.h"
 #include "task_kfunc_common.h"
 
 char _license[] SEC("license") = "GPL";
@@ -27,6 +28,7 @@ static struct __tasks_kfunc_map_value *insert_lookup_task(struct task_struct *ta
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("R1 must be referenced or trusted")
 int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -44,6 +46,7 @@ int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_f
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("arg#0 pointer type STRUCT task_struct must point")
 int BPF_PROG(task_kfunc_acquire_fp, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired, *stack_task = (struct task_struct *)&clone_flags;
@@ -56,6 +59,7 @@ int BPF_PROG(task_kfunc_acquire_fp, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("kretprobe/free_task")
+__failure __msg("reg type unsupported for arg#0 function")
 int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -68,6 +72,7 @@ int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe, struct task_struct *task, u64
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("R1 must be referenced or trusted")
 int BPF_PROG(task_kfunc_acquire_trusted_walked, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -81,6 +86,7 @@ int BPF_PROG(task_kfunc_acquire_trusted_walked, struct task_struct *task, u64 cl
 
 
 SEC("tp_btf/task_newtask")
+__failure __msg("arg#0 pointer type STRUCT task_struct must point")
 int BPF_PROG(task_kfunc_acquire_null, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -95,6 +101,7 @@ int BPF_PROG(task_kfunc_acquire_null, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("Unreleased reference")
 int BPF_PROG(task_kfunc_acquire_unreleased, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -107,6 +114,7 @@ int BPF_PROG(task_kfunc_acquire_unreleased, struct task_struct *task, u64 clone_
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("arg#0 expected pointer to map value")
 int BPF_PROG(task_kfunc_get_non_kptr_param, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *kptr;
@@ -122,6 +130,7 @@ int BPF_PROG(task_kfunc_get_non_kptr_param, struct task_struct *task, u64 clone_
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("arg#0 expected pointer to map value")
 int BPF_PROG(task_kfunc_get_non_kptr_acquired, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *kptr, *acquired;
@@ -140,6 +149,7 @@ int BPF_PROG(task_kfunc_get_non_kptr_acquired, struct task_struct *task, u64 clo
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("arg#0 expected pointer to map value")
 int BPF_PROG(task_kfunc_get_null, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *kptr;
@@ -155,6 +165,7 @@ int BPF_PROG(task_kfunc_get_null, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("Unreleased reference")
 int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *kptr;
@@ -174,6 +185,7 @@ int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_fla
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("Unreleased reference")
 int BPF_PROG(task_kfunc_get_unreleased, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *kptr;
@@ -193,6 +205,7 @@ int BPF_PROG(task_kfunc_get_unreleased, struct task_struct *task, u64 clone_flag
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket")
 int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_flags)
 {
 	struct __tasks_kfunc_map_value *v;
@@ -208,6 +221,7 @@ int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_f
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("arg#0 pointer type STRUCT task_struct must point")
 int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired = (struct task_struct *)&clone_flags;
@@ -219,6 +233,7 @@ int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("arg#0 is ptr_or_null_ expected ptr_ or socket")
 int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags)
 {
 	struct __tasks_kfunc_map_value local, *v;
@@ -251,6 +266,7 @@ int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("release kernel function bpf_task_release expects")
 int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_flags)
 {
 	/* Cannot release trusted task pointer which was not acquired. */
@@ -260,6 +276,7 @@ int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_
 }
 
 SEC("tp_btf/task_newtask")
+__failure __msg("arg#0 is ptr_or_null_ expected ptr_ or socket")
 int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -273,6 +290,7 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl
 }
 
 SEC("lsm/task_free")
+__failure __msg("reg type unsupported for arg#0 function")
 int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task)
 {
 	struct task_struct *acquired;
-- 
cgit 


From 3c59623d82947c8bfcc2fbf34c5eab02493b3b28 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:50 -0800
Subject: libbpf: Add support for fetching up to 8 arguments in kprobes

Add BPF_KPROBE() and PT_REGS_PARMx() support for up to 8 arguments, if
target architecture supports this. Currently all architectures are
limited to only 5 register-placed arguments, which is limiting even on
x86-64.

This patch adds generic macro machinery to support up to 8 arguments
both when explicitly fetching it from pt_regs through PT_REGS_PARMx()
macros, as well as more ergonomic access in BPF_KPROBE().

Also, for i386 architecture we now don't have to define fake PARM4 and
PARM5 definitions, they will be generically substituted, just like for
PARM6 through PARM8.

Subsequent patches will fill out architecture-specific definitions,
where appropriate.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com> # arm64
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com> # s390x
Link: https://lore.kernel.org/bpf/20230120200914.3008030-2-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 41 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index bdb0f6b5be84..ef17d8fdd9d6 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -98,12 +98,10 @@
 
 #ifdef __i386__
 
+/* i386 kernel is built with -mregparm=3 */
 #define __PT_PARM1_REG eax
 #define __PT_PARM2_REG edx
 #define __PT_PARM3_REG ecx
-/* i386 kernel is built with -mregparm=3 */
-#define __PT_PARM4_REG __unsupported__
-#define __PT_PARM5_REG __unsupported__
 #define __PT_RET_REG esp
 #define __PT_FP_REG ebp
 #define __PT_RC_REG eax
@@ -287,16 +285,39 @@ struct pt_regs___arm64 {
 
 struct pt_regs;
 
-/* allow some architecutres to override `struct pt_regs` */
+/* allow some architectures to override `struct pt_regs` */
 #ifndef __PT_REGS_CAST
 #define __PT_REGS_CAST(x) (x)
 #endif
 
+/*
+ * Different architectures support different number of arguments passed
+ * through registers. i386 supports just 3, some arches support up to 8.
+ */
+#ifndef __PT_PARM4_REG
+#define __PT_PARM4_REG __unsupported__
+#endif
+#ifndef __PT_PARM5_REG
+#define __PT_PARM5_REG __unsupported__
+#endif
+#ifndef __PT_PARM6_REG
+#define __PT_PARM6_REG __unsupported__
+#endif
+#ifndef __PT_PARM7_REG
+#define __PT_PARM7_REG __unsupported__
+#endif
+#ifndef __PT_PARM8_REG
+#define __PT_PARM8_REG __unsupported__
+#endif
+
 #define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG)
 #define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG)
 #define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG)
 #define PT_REGS_PARM4(x) (__PT_REGS_CAST(x)->__PT_PARM4_REG)
 #define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG)
+#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG)
+#define PT_REGS_PARM7(x) (__PT_REGS_CAST(x)->__PT_PARM7_REG)
+#define PT_REGS_PARM8(x) (__PT_REGS_CAST(x)->__PT_PARM8_REG)
 #define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG)
 #define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG)
 #define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG)
@@ -308,6 +329,9 @@ struct pt_regs;
 #define PT_REGS_PARM3_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_REG)
 #define PT_REGS_PARM4_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_REG)
 #define PT_REGS_PARM5_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_REG)
+#define PT_REGS_PARM6_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_REG)
+#define PT_REGS_PARM7_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_REG)
+#define PT_REGS_PARM8_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM8_REG)
 #define PT_REGS_RET_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RET_REG)
 #define PT_REGS_FP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_FP_REG)
 #define PT_REGS_RC_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RC_REG)
@@ -360,6 +384,9 @@ struct pt_regs;
 #define PT_REGS_PARM3(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM4(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM5(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM6(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM7(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM8(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_RET(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_FP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_RC(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
@@ -371,6 +398,9 @@ struct pt_regs;
 #define PT_REGS_PARM3_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM4_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM5_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM6_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM7_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM8_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_RET_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_FP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_RC_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
@@ -576,6 +606,9 @@ struct pt_regs;
 #define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx)
 #define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx)
 #define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx)
+#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx)
+#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx)
+#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx)
 #define ___bpf_kprobe_args(args...)     ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args)
 
 /*
-- 
cgit 


From 013290329a0707c9900f86a743dce70d6c345330 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:51 -0800
Subject: libbpf: Add 6th argument support for x86-64 in bpf_tracing.h

Add r9 as register containing 6th argument on x86-64 architecture, as
per its ABI. Add also a link to a page describing ABI for easier future
reference.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-3-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index ef17d8fdd9d6..a47504c2f3cb 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -78,6 +78,10 @@
 
 #if defined(bpf_target_x86)
 
+/*
+ * https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI
+ */
+
 #if defined(__KERNEL__) || defined(__VMLINUX_H__)
 
 #define __PT_PARM1_REG di
@@ -85,6 +89,7 @@
 #define __PT_PARM3_REG dx
 #define __PT_PARM4_REG cx
 #define __PT_PARM5_REG r8
+#define __PT_PARM6_REG r9
 #define __PT_RET_REG sp
 #define __PT_FP_REG bp
 #define __PT_RC_REG ax
@@ -115,6 +120,7 @@
 #define __PT_PARM3_REG rdx
 #define __PT_PARM4_REG rcx
 #define __PT_PARM5_REG r8
+#define __PT_PARM6_REG r9
 #define __PT_RET_REG rsp
 #define __PT_FP_REG rbp
 #define __PT_RC_REG rax
-- 
cgit 


From 1dac40ac8742122a5f12b08f8073d03d6915f9ef Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:52 -0800
Subject: libbpf: Fix arm and arm64 specs in bpf_tracing.h

Remove invalid support for PARM5 on 32-bit arm, as per ABI. Add three
more argument registers for arm64. Also leave links to ABI specs for
future reference.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com> # arm64
Link: https://lore.kernel.org/bpf/20230120200914.3008030-4-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index a47504c2f3cb..a263f600e309 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -157,11 +157,14 @@ struct pt_regs___s390 {
 
 #elif defined(bpf_target_arm)
 
+/*
+ * https://github.com/ARM-software/abi-aa/blob/main/aapcs32/aapcs32.rst#machine-registers
+ */
+
 #define __PT_PARM1_REG uregs[0]
 #define __PT_PARM2_REG uregs[1]
 #define __PT_PARM3_REG uregs[2]
 #define __PT_PARM4_REG uregs[3]
-#define __PT_PARM5_REG uregs[4]
 #define __PT_RET_REG uregs[14]
 #define __PT_FP_REG uregs[11]	/* Works only with CONFIG_FRAME_POINTER */
 #define __PT_RC_REG uregs[0]
@@ -170,6 +173,10 @@ struct pt_regs___s390 {
 
 #elif defined(bpf_target_arm64)
 
+/*
+ * https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#machine-registers
+ */
+
 struct pt_regs___arm64 {
 	unsigned long orig_x0;
 };
@@ -181,6 +188,9 @@ struct pt_regs___arm64 {
 #define __PT_PARM3_REG regs[2]
 #define __PT_PARM4_REG regs[3]
 #define __PT_PARM5_REG regs[4]
+#define __PT_PARM6_REG regs[5]
+#define __PT_PARM7_REG regs[6]
+#define __PT_PARM8_REG regs[7]
 #define __PT_RET_REG regs[30]
 #define __PT_FP_REG regs[29]	/* Works only with CONFIG_FRAME_POINTER */
 #define __PT_RC_REG regs[0]
-- 
cgit 


From 1222445a5bf622b37a9bb16c7ad0148e2a21eb0d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:53 -0800
Subject: libbpf: Complete mips spec in bpf_tracing.h

Add registers for PARM6 through PARM8. Add a link to an ABI. We don't
distinguish between O32, N32, and N64, so document that we assume N64
right now.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-5-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index a263f600e309..f356955b059a 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -201,11 +201,19 @@ struct pt_regs___arm64 {
 
 #elif defined(bpf_target_mips)
 
+/*
+ * N64 ABI is assumed right now.
+ * https://en.wikipedia.org/wiki/MIPS_architecture#Calling_conventions
+ */
+
 #define __PT_PARM1_REG regs[4]
 #define __PT_PARM2_REG regs[5]
 #define __PT_PARM3_REG regs[6]
 #define __PT_PARM4_REG regs[7]
 #define __PT_PARM5_REG regs[8]
+#define __PT_PARM6_REG regs[9]
+#define __PT_PARM7_REG regs[10]
+#define __PT_PARM8_REG regs[11]
 #define __PT_RET_REG regs[31]
 #define __PT_FP_REG regs[30]	/* Works only with CONFIG_FRAME_POINTER */
 #define __PT_RC_REG regs[2]
-- 
cgit 


From 2eb2be30b8dac137d9a21c6125a68993b9dc8a00 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:54 -0800
Subject: libbpf: Complete powerpc spec in bpf_tracing.h

Add definitions of PARM6 through PARM8 for powerpc architecture. Add
also a link to a functiona call sequence documentation for future reference.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-6-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index f356955b059a..1b9447f847b0 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -222,11 +222,19 @@ struct pt_regs___arm64 {
 
 #elif defined(bpf_target_powerpc)
 
+/*
+ * http://refspecs.linux-foundation.org/elf/elfspec_ppc.pdf (page 3-14,
+ * section "Function Calling Sequence")
+ */
+
 #define __PT_PARM1_REG gpr[3]
 #define __PT_PARM2_REG gpr[4]
 #define __PT_PARM3_REG gpr[5]
 #define __PT_PARM4_REG gpr[6]
 #define __PT_PARM5_REG gpr[7]
+#define __PT_PARM6_REG gpr[8]
+#define __PT_PARM7_REG gpr[9]
+#define __PT_PARM8_REG gpr[10]
 #define __PT_RET_REG regs[31]
 #define __PT_FP_REG __unsupported__
 #define __PT_RC_REG gpr[3]
-- 
cgit 


From 7f60f5d85e29441b297bf64f0236c70e624a9108 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:55 -0800
Subject: libbpf: Complete sparc spec in bpf_tracing.h

Add PARM6 definition for sparc architecture. Leave a link to calling
convention documentation.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-7-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 1b9447f847b0..0f9640ddbe1c 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -245,11 +245,16 @@ struct pt_regs___arm64 {
 
 #elif defined(bpf_target_sparc)
 
+/*
+ * https://en.wikipedia.org/wiki/Calling_convention#SPARC
+ */
+
 #define __PT_PARM1_REG u_regs[UREG_I0]
 #define __PT_PARM2_REG u_regs[UREG_I1]
 #define __PT_PARM3_REG u_regs[UREG_I2]
 #define __PT_PARM4_REG u_regs[UREG_I3]
 #define __PT_PARM5_REG u_regs[UREG_I4]
+#define __PT_PARM6_REG u_regs[UREG_I5]
 #define __PT_RET_REG u_regs[UREG_I7]
 #define __PT_FP_REG __unsupported__
 #define __PT_RC_REG u_regs[UREG_I0]
-- 
cgit 


From b13ed8ca7fba3d3e61621be021fdccd5c44b2634 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:56 -0800
Subject: libbpf: Complete riscv arch spec in bpf_tracing.h

Add PARM6 through PARM8 definitions for RISC V (riscv) arch. Leave the
link for ABI doc for future reference.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Pu Lehui <pulehui@huawei.com> # RISC-V
Link: https://lore.kernel.org/bpf/20230120200914.3008030-8-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 0f9640ddbe1c..579e2f830532 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -268,12 +268,19 @@ struct pt_regs___arm64 {
 
 #elif defined(bpf_target_riscv)
 
+/*
+ * https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc#risc-v-calling-conventions
+ */
+
 #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x))
 #define __PT_PARM1_REG a0
 #define __PT_PARM2_REG a1
 #define __PT_PARM3_REG a2
 #define __PT_PARM4_REG a3
 #define __PT_PARM5_REG a4
+#define __PT_PARM6_REG a5
+#define __PT_PARM7_REG a6
+#define __PT_PARM8_REG a7
 #define __PT_RET_REG ra
 #define __PT_FP_REG s0
 #define __PT_RC_REG a0
-- 
cgit 


From 0ac086567916e189c7db71da7339e20b5747f8ac Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:57 -0800
Subject: libbpf: Fix and complete ARC spec in bpf_tracing.h

Add PARM6 through PARM8 definitions. Also fix frame pointer (FP)
register definition. Also leave a link to where to find ABI spec.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-9-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 579e2f830532..815c50ccedf4 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -291,6 +291,11 @@ struct pt_regs___arm64 {
 
 #elif defined(bpf_target_arc)
 
+/*
+ * Section "Function Calling Sequence" (page 24):
+ * https://raw.githubusercontent.com/wiki/foss-for-synopsys-dwc-arc-processors/toolchain/files/ARCv2_ABI.pdf
+ */
+
 /* arc provides struct user_pt_regs instead of struct pt_regs to userspace */
 #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x))
 #define __PT_PARM1_REG scratch.r0
@@ -298,8 +303,11 @@ struct pt_regs___arm64 {
 #define __PT_PARM3_REG scratch.r2
 #define __PT_PARM4_REG scratch.r3
 #define __PT_PARM5_REG scratch.r4
+#define __PT_PARM6_REG scratch.r5
+#define __PT_PARM7_REG scratch.r6
+#define __PT_PARM8_REG scratch.r7
 #define __PT_RET_REG scratch.blink
-#define __PT_FP_REG __unsupported__
+#define __PT_FP_REG scratch.fp
 #define __PT_RC_REG scratch.r0
 #define __PT_SP_REG scratch.sp
 #define __PT_IP_REG scratch.ret
-- 
cgit 


From 55ff00d5393b6d6f6cb90530aac546f5aa52456d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:58 -0800
Subject: libbpf: Complete LoongArch (loongarch) spec in bpf_tracing.h

Add PARM6 through PARM8 definitions. Add kernel docs link describing ABI
for LoongArch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-10-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 815c50ccedf4..510df96ff19d 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -316,13 +316,19 @@ struct pt_regs___arm64 {
 
 #elif defined(bpf_target_loongarch)
 
-/* https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html */
+/*
+ * https://docs.kernel.org/loongarch/introduction.html
+ * https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html
+ */
 
 #define __PT_PARM1_REG regs[4]
 #define __PT_PARM2_REG regs[5]
 #define __PT_PARM3_REG regs[6]
 #define __PT_PARM4_REG regs[7]
 #define __PT_PARM5_REG regs[8]
+#define __PT_PARM6_REG regs[9]
+#define __PT_PARM7_REG regs[10]
+#define __PT_PARM8_REG regs[11]
 #define __PT_RET_REG regs[1]
 #define __PT_FP_REG regs[22]
 #define __PT_RC_REG regs[4]
-- 
cgit 


From ac4afd6e6fa4d4f9a0ab6ca76f4953545e4da7e6 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:08:59 -0800
Subject: libbpf: Add BPF_UPROBE and BPF_URETPROBE macro aliases

Add BPF_UPROBE and BPF_URETPROBE macros, aliased to BPF_KPROBE and
BPF_KRETPROBE, respectively. This makes uprobe-based BPF program code
much less confusing, especially to people new to tracing, at no cost in
terms of maintainability. We'll use this macro in selftests in
subsequent patch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-11-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 510df96ff19d..9c4246fe8799 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -781,4 +781,11 @@ ____##name(struct pt_regs *ctx, ##args)
 
 #define BPF_KPROBE_SYSCALL BPF_KSYSCALL
 
+/* BPF_UPROBE and BPF_URETPROBE are identical to BPF_KPROBE and BPF_KRETPROBE,
+ * but are named way less confusingly for SEC("uprobe") and SEC("uretprobe")
+ * use cases.
+ */
+#define BPF_UPROBE(name, args...)  BPF_KPROBE(name, ##args)
+#define BPF_URETPROBE(name, args...)  BPF_KRETPROBE(name, ##args)
+
 #endif
-- 
cgit 


From bc72742bebec8e0766af1e07e13169a873f9119c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:00 -0800
Subject: selftests/bpf: Validate arch-specific argument registers limits

Update uprobe_autoattach selftest to validate architecture-specific
argument passing through registers. Use new BPF_UPROBE and
BPF_URETPROBE, and construct both BPF-side and user-space side in such
a way that for different architectures we are fetching and checking
different number of arguments, matching architecture-specific limit of
how many registers are available for argument passing.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com> # arm64
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com> # s390x
Link: https://lore.kernel.org/bpf/20230120200914.3008030-12-andrii@kernel.org
---
 .../selftests/bpf/prog_tests/uprobe_autoattach.c   | 33 ++++++++++++---
 tools/testing/selftests/bpf/progs/bpf_misc.h       | 25 +++++++++++
 .../selftests/bpf/progs/test_uprobe_autoattach.c   | 48 +++++++++++++++++++++-
 3 files changed, 99 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
index 35b87c7ba5be..82807def0d24 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
@@ -3,18 +3,21 @@
 
 #include <test_progs.h>
 #include "test_uprobe_autoattach.skel.h"
+#include "progs/bpf_misc.h"
 
 /* uprobe attach point */
-static noinline int autoattach_trigger_func(int arg)
+static noinline int autoattach_trigger_func(int arg1, int arg2, int arg3,
+					    int arg4, int arg5, int arg6,
+					    int arg7, int arg8)
 {
 	asm volatile ("");
-	return arg + 1;
+	return arg1 + arg2 + arg3 + arg4 + arg5 + arg6 + arg7 + arg8 + 1;
 }
 
 void test_uprobe_autoattach(void)
 {
 	struct test_uprobe_autoattach *skel;
-	int trigger_val = 100, trigger_ret;
+	int trigger_ret;
 	size_t malloc_sz = 1;
 	char *mem;
 
@@ -28,22 +31,42 @@ void test_uprobe_autoattach(void)
 	skel->bss->test_pid = getpid();
 
 	/* trigger & validate uprobe & uretprobe */
-	trigger_ret = autoattach_trigger_func(trigger_val);
+	trigger_ret = autoattach_trigger_func(1, 2, 3, 4, 5, 6, 7, 8);
 
 	skel->bss->test_pid = getpid();
 
 	/* trigger & validate shared library u[ret]probes attached by name */
 	mem = malloc(malloc_sz);
 
-	ASSERT_EQ(skel->bss->uprobe_byname_parm1, trigger_val, "check_uprobe_byname_parm1");
+	ASSERT_EQ(skel->bss->uprobe_byname_parm1, 1, "check_uprobe_byname_parm1");
 	ASSERT_EQ(skel->bss->uprobe_byname_ran, 1, "check_uprobe_byname_ran");
 	ASSERT_EQ(skel->bss->uretprobe_byname_rc, trigger_ret, "check_uretprobe_byname_rc");
+	ASSERT_EQ(skel->bss->uretprobe_byname_ret, trigger_ret, "check_uretprobe_byname_ret");
 	ASSERT_EQ(skel->bss->uretprobe_byname_ran, 2, "check_uretprobe_byname_ran");
 	ASSERT_EQ(skel->bss->uprobe_byname2_parm1, malloc_sz, "check_uprobe_byname2_parm1");
 	ASSERT_EQ(skel->bss->uprobe_byname2_ran, 3, "check_uprobe_byname2_ran");
 	ASSERT_EQ(skel->bss->uretprobe_byname2_rc, mem, "check_uretprobe_byname2_rc");
 	ASSERT_EQ(skel->bss->uretprobe_byname2_ran, 4, "check_uretprobe_byname2_ran");
 
+	ASSERT_EQ(skel->bss->a[0], 1, "arg1");
+	ASSERT_EQ(skel->bss->a[1], 2, "arg2");
+	ASSERT_EQ(skel->bss->a[2], 3, "arg3");
+#if FUNC_REG_ARG_CNT > 3
+	ASSERT_EQ(skel->bss->a[3], 4, "arg4");
+#endif
+#if FUNC_REG_ARG_CNT > 4
+	ASSERT_EQ(skel->bss->a[4], 5, "arg5");
+#endif
+#if FUNC_REG_ARG_CNT > 5
+	ASSERT_EQ(skel->bss->a[5], 6, "arg6");
+#endif
+#if FUNC_REG_ARG_CNT > 6
+	ASSERT_EQ(skel->bss->a[6], 7, "arg7");
+#endif
+#if FUNC_REG_ARG_CNT > 7
+	ASSERT_EQ(skel->bss->a[7], 8, "arg8");
+#endif
+
 	free(mem);
 cleanup:
 	test_uprobe_autoattach__destroy(skel);
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index 2d7b89b447b2..14e28f991451 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -28,4 +28,29 @@
 #define SYS_PREFIX "__se_"
 #endif
 
+/* How many arguments are passed to function in register */
+#if defined(__TARGET_ARCH_x86) || defined(__x86_64__)
+#define FUNC_REG_ARG_CNT 6
+#elif defined(__i386__)
+#define FUNC_REG_ARG_CNT 3
+#elif defined(__TARGET_ARCH_s390) || defined(__s390x__)
+#define FUNC_REG_ARG_CNT 5
+#elif defined(__TARGET_ARCH_arm) || defined(__arm__)
+#define FUNC_REG_ARG_CNT 4
+#elif defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+#define FUNC_REG_ARG_CNT 8
+#elif defined(__TARGET_ARCH_mips) || defined(__mips__)
+#define FUNC_REG_ARG_CNT 8
+#elif defined(__TARGET_ARCH_powerpc) || defined(__powerpc__) || defined(__powerpc64__)
+#define FUNC_REG_ARG_CNT 8
+#elif defined(__TARGET_ARCH_sparc) || defined(__sparc__)
+#define FUNC_REG_ARG_CNT 6
+#elif defined(__TARGET_ARCH_riscv) || defined(__riscv__)
+#define FUNC_REG_ARG_CNT 8
+#else
+/* default to 5 for others */
+#define FUNC_REG_ARG_CNT 5
+#endif
+
+
 #endif
diff --git a/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c
index ab75522e2eeb..774ddeb45898 100644
--- a/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c
+++ b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c
@@ -6,10 +6,12 @@
 #include <bpf/bpf_core_read.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
 
 int uprobe_byname_parm1 = 0;
 int uprobe_byname_ran = 0;
 int uretprobe_byname_rc = 0;
+int uretprobe_byname_ret = 0;
 int uretprobe_byname_ran = 0;
 size_t uprobe_byname2_parm1 = 0;
 int uprobe_byname2_ran = 0;
@@ -18,6 +20,8 @@ int uretprobe_byname2_ran = 0;
 
 int test_pid;
 
+int a[8];
+
 /* This program cannot auto-attach, but that should not stop other
  * programs from attaching.
  */
@@ -28,18 +32,58 @@ int handle_uprobe_noautoattach(struct pt_regs *ctx)
 }
 
 SEC("uprobe//proc/self/exe:autoattach_trigger_func")
-int handle_uprobe_byname(struct pt_regs *ctx)
+int BPF_UPROBE(handle_uprobe_byname
+	       , int arg1
+	       , int arg2
+	       , int arg3
+#if FUNC_REG_ARG_CNT > 3
+	       , int arg4
+#endif
+#if FUNC_REG_ARG_CNT > 4
+	       , int arg5
+#endif
+#if FUNC_REG_ARG_CNT > 5
+	       , int arg6
+#endif
+#if FUNC_REG_ARG_CNT > 6
+	       , int arg7
+#endif
+#if FUNC_REG_ARG_CNT > 7
+	       , int arg8
+#endif
+)
 {
 	uprobe_byname_parm1 = PT_REGS_PARM1_CORE(ctx);
 	uprobe_byname_ran = 1;
+
+	a[0] = arg1;
+	a[1] = arg2;
+	a[2] = arg3;
+#if FUNC_REG_ARG_CNT > 3
+	a[3] = arg4;
+#endif
+#if FUNC_REG_ARG_CNT > 4
+	a[4] = arg5;
+#endif
+#if FUNC_REG_ARG_CNT > 5
+	a[5] = arg6;
+#endif
+#if FUNC_REG_ARG_CNT > 6
+	a[6] = arg7;
+#endif
+#if FUNC_REG_ARG_CNT > 7
+	a[7] = arg8;
+#endif
 	return 0;
 }
 
 SEC("uretprobe//proc/self/exe:autoattach_trigger_func")
-int handle_uretprobe_byname(struct pt_regs *ctx)
+int BPF_URETPROBE(handle_uretprobe_byname, int ret)
 {
 	uretprobe_byname_rc = PT_REGS_RC_CORE(ctx);
+	uretprobe_byname_ret = ret;
 	uretprobe_byname_ran = 2;
+
 	return 0;
 }
 
-- 
cgit 


From 8ccabeef913311eb54599abc52a6bcfc3e7bf1d8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:01 -0800
Subject: libbpf: Improve syscall tracing support in bpf_tracing.h

Set up generic support in bpf_tracing.h for up to 7 syscall arguments
tracing with BPF_KSYSCALL, which seems to be the limit according to
syscall(2) manpage. Also change the way that syscall convention is
specified to be more explicit. Subsequent patches will adjust and define
proper per-architecture syscall conventions.

__PT_PARM1_SYSCALL_REG through __PT_PARM6_SYSCALL_REG is added
temporarily to keep everything working before each architecture has
syscall reg tables defined. They will be removed afterwards.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com> # arm64
Link: https://lore.kernel.org/bpf/20230120200914.3008030-13-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 71 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 58 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 9c4246fe8799..e3bf510e1e55 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -367,6 +367,34 @@ struct pt_regs;
 #ifndef __PT_PARM8_REG
 #define __PT_PARM8_REG __unsupported__
 #endif
+/*
+ * Similarly, syscall-specific conventions might differ between function call
+ * conventions within each architecutre. All supported architectures pass
+ * either 6 or 7 syscall arguments in registers.
+ *
+ * See syscall(2) manpage for succinct table with information on each arch.
+ */
+#ifndef __PT_PARM1_SYSCALL_REG
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#endif
+#ifndef __PT_PARM2_SYSCALL_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#endif
+#ifndef __PT_PARM3_SYSCALL_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#endif
+#ifndef __PT_PARM4_SYSCALL_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#endif
+#ifndef __PT_PARM5_SYSCALL_REG
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#endif
+#ifndef __PT_PARM6_SYSCALL_REG
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
+#endif
+#ifndef __PT_PARM7_SYSCALL_REG
+#define __PT_PARM7_SYSCALL_REG __unsupported__
+#endif
 
 #define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG)
 #define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG)
@@ -416,24 +444,33 @@ struct pt_regs;
 #endif
 
 #ifndef PT_REGS_PARM1_SYSCALL
-#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1(x)
+#define PT_REGS_PARM1_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM1_SYSCALL_REG)
+#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_SYSCALL_REG)
+#endif
+#ifndef PT_REGS_PARM2_SYSCALL
+#define PT_REGS_PARM2_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM2_SYSCALL_REG)
+#define PT_REGS_PARM2_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_SYSCALL_REG)
+#endif
+#ifndef PT_REGS_PARM3_SYSCALL
+#define PT_REGS_PARM3_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM3_SYSCALL_REG)
+#define PT_REGS_PARM3_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_SYSCALL_REG)
 #endif
-#define PT_REGS_PARM2_SYSCALL(x) PT_REGS_PARM2(x)
-#define PT_REGS_PARM3_SYSCALL(x) PT_REGS_PARM3(x)
 #ifndef PT_REGS_PARM4_SYSCALL
-#define PT_REGS_PARM4_SYSCALL(x) PT_REGS_PARM4(x)
+#define PT_REGS_PARM4_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM4_SYSCALL_REG)
+#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_SYSCALL_REG)
 #endif
-#define PT_REGS_PARM5_SYSCALL(x) PT_REGS_PARM5(x)
-
-#ifndef PT_REGS_PARM1_CORE_SYSCALL
-#define PT_REGS_PARM1_CORE_SYSCALL(x) PT_REGS_PARM1_CORE(x)
+#ifndef PT_REGS_PARM5_SYSCALL
+#define PT_REGS_PARM5_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM5_SYSCALL_REG)
+#define PT_REGS_PARM5_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_SYSCALL_REG)
+#endif
+#ifndef PT_REGS_PARM6_SYSCALL
+#define PT_REGS_PARM6_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM6_SYSCALL_REG)
+#define PT_REGS_PARM6_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_SYSCALL_REG)
 #endif
-#define PT_REGS_PARM2_CORE_SYSCALL(x) PT_REGS_PARM2_CORE(x)
-#define PT_REGS_PARM3_CORE_SYSCALL(x) PT_REGS_PARM3_CORE(x)
-#ifndef PT_REGS_PARM4_CORE_SYSCALL
-#define PT_REGS_PARM4_CORE_SYSCALL(x) PT_REGS_PARM4_CORE(x)
+#ifndef PT_REGS_PARM7_SYSCALL
+#define PT_REGS_PARM7_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM7_SYSCALL_REG)
+#define PT_REGS_PARM7_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_SYSCALL_REG)
 #endif
-#define PT_REGS_PARM5_CORE_SYSCALL(x) PT_REGS_PARM5_CORE(x)
 
 #else /* defined(bpf_target_defined) */
 
@@ -473,12 +510,16 @@ struct pt_regs;
 #define PT_REGS_PARM3_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM4_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM5_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM6_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM7_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 
 #define PT_REGS_PARM1_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM2_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM3_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM4_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 #define PT_REGS_PARM5_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM6_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
+#define PT_REGS_PARM7_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; })
 
 #endif /* defined(bpf_target_defined) */
 
@@ -723,6 +764,8 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
 #define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs)
 #define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs)
 #define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs)
+#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs)
+#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs)
 #define ___bpf_syscall_args(args...)     ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args)
 
 /* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */
@@ -732,6 +775,8 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
 #define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs)
 #define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs)
 #define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs)
+#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs)
+#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs)
 #define ___bpf_syswrap_args(args...)     ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args)
 
 /*
-- 
cgit 


From d21fbceedd90ccc11775ca583889d21d3b66d88d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:02 -0800
Subject: libbpf: Define x86-64 syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.
Remove now unnecessary overrides of PT_REGS_PARM5_[CORE_]SYSCALL macros.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-14-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index e3bf510e1e55..5fd498821c29 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -90,14 +90,22 @@
 #define __PT_PARM4_REG cx
 #define __PT_PARM5_REG r8
 #define __PT_PARM6_REG r9
+/*
+ * Syscall uses r10 for PARM4. See arch/x86/entry/entry_64.S:entry_SYSCALL_64
+ * comments in Linux sources. And refer to syscall(2) manpage.
+ */
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG r10
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
+
 #define __PT_RET_REG sp
 #define __PT_FP_REG bp
 #define __PT_RC_REG ax
 #define __PT_SP_REG sp
 #define __PT_IP_REG ip
-/* syscall uses r10 for PARM4 */
-#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10)
-#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10)
 
 #else
 
@@ -121,14 +129,19 @@
 #define __PT_PARM4_REG rcx
 #define __PT_PARM5_REG r8
 #define __PT_PARM6_REG r9
+
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG r10
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
+
 #define __PT_RET_REG rsp
 #define __PT_FP_REG rbp
 #define __PT_RC_REG rax
 #define __PT_SP_REG rsp
 #define __PT_IP_REG rip
-/* syscall uses r10 for PARM4 */
-#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10)
-#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10)
 
 #endif /* __i386__ */
 
-- 
cgit 


From ff00f9cbd2ddee60cd1542e58631638a5996b05f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:03 -0800
Subject: libbpf: Define i386 syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-15-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 5fd498821c29..4d04c63fda27 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -115,6 +115,14 @@
 #define __PT_PARM1_REG eax
 #define __PT_PARM2_REG edx
 #define __PT_PARM3_REG ecx
+/* i386 syscall ABI is very different, refer to syscall(2) manpage */
+#define __PT_PARM1_SYSCALL_REG ebx
+#define __PT_PARM2_SYSCALL_REG ecx
+#define __PT_PARM3_SYSCALL_REG edx
+#define __PT_PARM4_SYSCALL_REG esi
+#define __PT_PARM5_SYSCALL_REG edi
+#define __PT_PARM6_SYSCALL_REG ebp
+
 #define __PT_RET_REG esp
 #define __PT_FP_REG ebp
 #define __PT_RC_REG eax
-- 
cgit 


From e82b96a3a99f9a04566305490e88efef80a80247 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:04 -0800
Subject: libbpf: Define s390x syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.
Note that we need custom overrides for PT_REGS_PARM1_[CORE_]SYSCALL
macros due to the need to use BPF CO-RE and custom local pt_regs
definitions to fetch orig_gpr2, storing 1st argument.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com> # s390x
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-16-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 4d04c63fda27..b63d538eb5f5 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -157,6 +157,10 @@
 
 #elif defined(bpf_target_s390)
 
+/*
+ * https://github.com/IBM/s390x-abi/releases/download/v1.6/lzsabi_s390x.pdf
+ */
+
 struct pt_regs___s390 {
 	unsigned long orig_gpr2;
 };
@@ -168,13 +172,22 @@ struct pt_regs___s390 {
 #define __PT_PARM3_REG gprs[4]
 #define __PT_PARM4_REG gprs[5]
 #define __PT_PARM5_REG gprs[6]
+
+#define __PT_PARM1_SYSCALL_REG orig_gpr2
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#define __PT_PARM6_SYSCALL_REG gprs[7]
+#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x)
+#define PT_REGS_PARM1_CORE_SYSCALL(x) \
+	BPF_CORE_READ((const struct pt_regs___s390 *)(x), __PT_PARM1_SYSCALL_REG)
+
 #define __PT_RET_REG gprs[14]
 #define __PT_FP_REG gprs[11]	/* Works only with CONFIG_FRAME_POINTER */
 #define __PT_RC_REG gprs[2]
 #define __PT_SP_REG gprs[15]
 #define __PT_IP_REG psw.addr
-#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x)
-#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___s390 *)(x), orig_gpr2)
 
 #elif defined(bpf_target_arm)
 
-- 
cgit 


From 3a95c42d65d5c8d836bc7f881b3ffd5229d1b73a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:05 -0800
Subject: libbpf: Define arm syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-17-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index b63d538eb5f5..edd85a0ab612 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -199,6 +199,14 @@ struct pt_regs___s390 {
 #define __PT_PARM2_REG uregs[1]
 #define __PT_PARM3_REG uregs[2]
 #define __PT_PARM4_REG uregs[3]
+
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM6_SYSCALL_REG uregs[5]
+#define __PT_PARM7_SYSCALL_REG uregs[6]
+
 #define __PT_RET_REG uregs[14]
 #define __PT_FP_REG uregs[11]	/* Works only with CONFIG_FRAME_POINTER */
 #define __PT_RC_REG uregs[0]
-- 
cgit 


From 3488ea0584bb3888e2fcbe2d2ff738ff173d91ab Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:06 -0800
Subject: libbpf: Define arm64 syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.
We need PT_REGS_PARM1_[CORE_]SYSCALL macros overrides, similarly to
s390x, due to orig_x0 not being present in UAPI's pt_regs, so we need to
utilize BPF CO-RE and custom pt_regs___arm64 definition.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com> # arm64
Link: https://lore.kernel.org/bpf/20230120200914.3008030-18-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index edd85a0ab612..f753346e9be6 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -233,13 +233,22 @@ struct pt_regs___arm64 {
 #define __PT_PARM6_REG regs[5]
 #define __PT_PARM7_REG regs[6]
 #define __PT_PARM8_REG regs[7]
+
+#define __PT_PARM1_SYSCALL_REG orig_x0
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
+#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x)
+#define PT_REGS_PARM1_CORE_SYSCALL(x) \
+	BPF_CORE_READ((const struct pt_regs___arm64 *)(x), __PT_PARM1_SYSCALL_REG)
+
 #define __PT_RET_REG regs[30]
 #define __PT_FP_REG regs[29]	/* Works only with CONFIG_FRAME_POINTER */
 #define __PT_RC_REG regs[0]
 #define __PT_SP_REG sp
 #define __PT_IP_REG pc
-#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x)
-#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___arm64 *)(x), orig_x0)
 
 #elif defined(bpf_target_mips)
 
-- 
cgit 


From cfd0bbe91536095f5491e9ac71fb11304dd87dd2 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:07 -0800
Subject: libbpf: Define mips syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-19-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index f753346e9be6..a3da63b74b0b 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -265,6 +265,14 @@ struct pt_regs___arm64 {
 #define __PT_PARM6_REG regs[9]
 #define __PT_PARM7_REG regs[10]
 #define __PT_PARM8_REG regs[11]
+
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG /* only N32/N64 */
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG /* only N32/N64 */
+
 #define __PT_RET_REG regs[31]
 #define __PT_FP_REG regs[30]	/* Works only with CONFIG_FRAME_POINTER */
 #define __PT_RC_REG regs[2]
-- 
cgit 


From c1cc01a2d1d1d94079fa87ba44c34bd7bdf9175d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:08 -0800
Subject: libbpf: Define powerpc syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.
Note that 7th arg is supported on 32-bit powerpc architecture, by not on
powerpc64.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-20-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index a3da63b74b0b..b3647c4a6c0c 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -294,13 +294,24 @@ struct pt_regs___arm64 {
 #define __PT_PARM6_REG gpr[8]
 #define __PT_PARM7_REG gpr[9]
 #define __PT_PARM8_REG gpr[10]
+
+/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. */
+#define PT_REGS_SYSCALL_REGS(ctx) ctx
+#define __PT_PARM1_SYSCALL_REG orig_gpr3
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
+#if !defined(__arch64__)
+#define __PT_PARM7_SYSCALL_REG __PT_PARM7_REG /* only powerpc (not powerpc64) */
+#endif
+
 #define __PT_RET_REG regs[31]
 #define __PT_FP_REG __unsupported__
 #define __PT_RC_REG gpr[3]
 #define __PT_SP_REG sp
 #define __PT_IP_REG nip
-/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. */
-#define PT_REGS_SYSCALL_REGS(ctx) ctx
 
 #elif defined(bpf_target_sparc)
 
-- 
cgit 


From 377c15b1a2cd6e38db86be7a721c8311d8a73836 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:09 -0800
Subject: libbpf: Define sparc syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-21-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index b3647c4a6c0c..a2db267a6bce 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -325,6 +325,14 @@ struct pt_regs___arm64 {
 #define __PT_PARM4_REG u_regs[UREG_I3]
 #define __PT_PARM5_REG u_regs[UREG_I4]
 #define __PT_PARM6_REG u_regs[UREG_I5]
+
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
+
 #define __PT_RET_REG u_regs[UREG_I7]
 #define __PT_FP_REG __unsupported__
 #define __PT_RC_REG u_regs[UREG_I0]
-- 
cgit 


From a0426216a32004bbb1c96dbffaeca7ac261960cf Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:10 -0800
Subject: libbpf: Define riscv syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Pu Lehui <pulehui@huawei.com> # RISC-V
Link: https://lore.kernel.org/bpf/20230120200914.3008030-22-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index a2db267a6bce..047dc84c7cc9 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -359,13 +359,21 @@ struct pt_regs___arm64 {
 #define __PT_PARM6_REG a5
 #define __PT_PARM7_REG a6
 #define __PT_PARM8_REG a7
+
+/* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */
+#define PT_REGS_SYSCALL_REGS(ctx) ctx
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
+
 #define __PT_RET_REG ra
 #define __PT_FP_REG s0
 #define __PT_RC_REG a0
 #define __PT_SP_REG sp
 #define __PT_IP_REG pc
-/* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */
-#define PT_REGS_SYSCALL_REGS(ctx) ctx
 
 #elif defined(bpf_target_arc)
 
-- 
cgit 


From 2cf802737fb9cbc258ec65b15c06bd226881ae88 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:11 -0800
Subject: libbpf: Define arc syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-23-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 047dc84c7cc9..87fd2c774e73 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -392,13 +392,21 @@ struct pt_regs___arm64 {
 #define __PT_PARM6_REG scratch.r5
 #define __PT_PARM7_REG scratch.r6
 #define __PT_PARM8_REG scratch.r7
+
+/* arc does not select ARCH_HAS_SYSCALL_WRAPPER. */
+#define PT_REGS_SYSCALL_REGS(ctx) ctx
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
+
 #define __PT_RET_REG scratch.blink
 #define __PT_FP_REG scratch.fp
 #define __PT_RC_REG scratch.r0
 #define __PT_SP_REG scratch.sp
 #define __PT_IP_REG scratch.ret
-/* arc does not select ARCH_HAS_SYSCALL_WRAPPER. */
-#define PT_REGS_SYSCALL_REGS(ctx) ctx
 
 #elif defined(bpf_target_loongarch)
 
-- 
cgit 


From 12a299f0b5c746c244bd06fe6a2b3672e9036e83 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:12 -0800
Subject: libbpf: Define loongarch syscall regs spec in bpf_tracing.h

Define explicit table of registers used for syscall argument passing.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-24-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 87fd2c774e73..b37d3c78b19a 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -423,13 +423,21 @@ struct pt_regs___arm64 {
 #define __PT_PARM6_REG regs[9]
 #define __PT_PARM7_REG regs[10]
 #define __PT_PARM8_REG regs[11]
+
+/* loongarch does not select ARCH_HAS_SYSCALL_WRAPPER. */
+#define PT_REGS_SYSCALL_REGS(ctx) ctx
+#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
+#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
+#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
+#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
+#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
+#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
+
 #define __PT_RET_REG regs[1]
 #define __PT_FP_REG regs[22]
 #define __PT_RC_REG regs[4]
 #define __PT_SP_REG regs[3]
 #define __PT_IP_REG csr_era
-/* loongarch does not select ARCH_HAS_SYSCALL_WRAPPER. */
-#define PT_REGS_SYSCALL_REGS(ctx) ctx
 
 #endif
 
-- 
cgit 


From 92dc5cdfc1131befa061cab8ed772c7bf8bfb2db Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:13 -0800
Subject: selftests/bpf: Add 6-argument syscall tracing test

Turns out splice() is one of the syscalls that's using current maximum
number of arguments (six). This is perfect for testing, so extend
bpf_syscall_macro selftest to also trace splice() syscall, using
BPF_KSYSCALL() macro. This makes sure all the syscall argument register
definitions are correct.

Suggested-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alan Maguire <alan.maguire@oracle.com> # arm64
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com> # s390x
Link: https://lore.kernel.org/bpf/20230120200914.3008030-25-andrii@kernel.org
---
 .../bpf/prog_tests/test_bpf_syscall_macro.c        | 17 ++++++++++++++
 .../selftests/bpf/progs/bpf_syscall_macro.c        | 26 ++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c b/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c
index c381faaae741..2900c5e9a016 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright 2022 Sony Group Corporation */
+#define _GNU_SOURCE
+#include <fcntl.h>
 #include <sys/prctl.h>
 #include <test_progs.h>
 #include "bpf_syscall_macro.skel.h"
@@ -13,6 +15,8 @@ void test_bpf_syscall_macro(void)
 	unsigned long exp_arg3 = 13;
 	unsigned long exp_arg4 = 14;
 	unsigned long exp_arg5 = 15;
+	loff_t off_in, off_out;
+	ssize_t r;
 
 	/* check whether it can open program */
 	skel = bpf_syscall_macro__open();
@@ -33,6 +37,7 @@ void test_bpf_syscall_macro(void)
 
 	/* check whether args of syscall are copied correctly */
 	prctl(exp_arg1, exp_arg2, exp_arg3, exp_arg4, exp_arg5);
+
 #if defined(__aarch64__) || defined(__s390__)
 	ASSERT_NEQ(skel->bss->arg1, exp_arg1, "syscall_arg1");
 #else
@@ -68,6 +73,18 @@ void test_bpf_syscall_macro(void)
 	ASSERT_EQ(skel->bss->arg4_syscall, exp_arg4, "BPF_KPROBE_SYSCALL_arg4");
 	ASSERT_EQ(skel->bss->arg5_syscall, exp_arg5, "BPF_KPROBE_SYSCALL_arg5");
 
+	r = splice(-42, &off_in, 42, &off_out, 0x12340000, SPLICE_F_NONBLOCK);
+	err = -errno;
+	ASSERT_EQ(r, -1, "splice_res");
+	ASSERT_EQ(err, -EBADF, "splice_err");
+
+	ASSERT_EQ(skel->bss->splice_fd_in, -42, "splice_arg1");
+	ASSERT_EQ(skel->bss->splice_off_in, (__u64)&off_in, "splice_arg2");
+	ASSERT_EQ(skel->bss->splice_fd_out, 42, "splice_arg3");
+	ASSERT_EQ(skel->bss->splice_off_out, (__u64)&off_out, "splice_arg4");
+	ASSERT_EQ(skel->bss->splice_len, 0x12340000, "splice_arg5");
+	ASSERT_EQ(skel->bss->splice_flags, SPLICE_F_NONBLOCK, "splice_arg6");
+
 cleanup:
 	bpf_syscall_macro__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c b/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c
index e1e11897e99b..1a476d8ed354 100644
--- a/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c
+++ b/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c
@@ -81,4 +81,30 @@ int BPF_KSYSCALL(prctl_enter, int option, unsigned long arg2,
 	return 0;
 }
 
+__u64 splice_fd_in;
+__u64 splice_off_in;
+__u64 splice_fd_out;
+__u64 splice_off_out;
+__u64 splice_len;
+__u64 splice_flags;
+
+SEC("ksyscall/splice")
+int BPF_KSYSCALL(splice_enter, int fd_in, loff_t *off_in, int fd_out,
+		 loff_t *off_out, size_t len, unsigned int flags)
+{
+	pid_t pid = bpf_get_current_pid_tgid() >> 32;
+
+	if (pid != filter_pid)
+		return 0;
+
+	splice_fd_in = fd_in;
+	splice_off_in = (__u64)off_in;
+	splice_fd_out = fd_out;
+	splice_off_out = (__u64)off_out;
+	splice_len = len;
+	splice_flags = flags;
+
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit 


From a4d325ae461c5d8e3c5e222082e0b5755b7b9b7f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 20 Jan 2023 12:09:14 -0800
Subject: libbpf: Clean up now not needed __PT_PARM{1-6}_SYSCALL_REG defaults

Each architecture supports at least 6 syscall argument registers, so now
that specs for each architecture is defined in bpf_tracing.h, remove
unnecessary macro overrides, which previously were required to keep
existing BPF_KSYSCALL() uses compiling and working.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230120200914.3008030-26-andrii@kernel.org
---
 tools/lib/bpf/bpf_tracing.h | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index b37d3c78b19a..6db88f41fa0d 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -476,24 +476,6 @@ struct pt_regs;
  *
  * See syscall(2) manpage for succinct table with information on each arch.
  */
-#ifndef __PT_PARM1_SYSCALL_REG
-#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG
-#endif
-#ifndef __PT_PARM2_SYSCALL_REG
-#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG
-#endif
-#ifndef __PT_PARM3_SYSCALL_REG
-#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG
-#endif
-#ifndef __PT_PARM4_SYSCALL_REG
-#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG
-#endif
-#ifndef __PT_PARM5_SYSCALL_REG
-#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG
-#endif
-#ifndef __PT_PARM6_SYSCALL_REG
-#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG
-#endif
 #ifndef __PT_PARM7_SYSCALL_REG
 #define __PT_PARM7_SYSCALL_REG __unsupported__
 #endif
-- 
cgit 


From 057fb03160a88925877548979ab4ab7f9223e118 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Jan 2023 18:11:36 +0000
Subject: selftests: net: tcp_mmap: populate pages in send path

In commit 72653ae5303c ("selftests: net: tcp_mmap:
Use huge pages in send path") I made a change to use hugepages
for the buffer used by the client (tx path)

Today, I understood that the cause for poor zerocopy
performance was that after a mmap() for a 512KB memory
zone, kernel uses a single zeropage, mapped 128 times.

This was really the reason for poor tx path performance
in zero copy mode, because this zero page refcount is
under high pressure, especially when TCP ACK packets
are processed on another cpu.

We need either to force a COW on all the memory range,
or use MAP_POPULATE so that a zero page is not abused.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20230120181136.3764521-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tcp_mmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c
index 00f837c9bc6c..46a02bbd31d0 100644
--- a/tools/testing/selftests/net/tcp_mmap.c
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -137,7 +137,8 @@ static void *mmap_large_buffer(size_t need, size_t *allocated)
 	if (buffer == (void *)-1) {
 		sz = need;
 		buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE,
-			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+			      MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE,
+			      -1, 0);
 		if (buffer != (void *)-1)
 			fprintf(stderr, "MAP_HUGETLB attempt failed, look at /sys/kernel/mm/hugepages for optimal performance\n");
 	}
-- 
cgit 


From ca22da2fbd693b54dc8e3b7b54ccc9f7e9ba3640 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Fri, 20 Jan 2023 18:01:40 +0100
Subject: act_mirred: use the backlog for nested calls to mirred ingress

William reports kernel soft-lockups on some OVS topologies when TC mirred
egress->ingress action is hit by local TCP traffic [1].
The same can also be reproduced with SCTP (thanks Xin for verifying), when
client and server reach themselves through mirred egress to ingress, and
one of the two peers sends a "heartbeat" packet (from within a timer).

Enqueueing to backlog proved to fix this soft lockup; however, as Cong
noticed [2], we should preserve - when possible - the current mirred
behavior that counts as "overlimits" any eventual packet drop subsequent to
the mirred forwarding action [3]. A compromise solution might use the
backlog only when tcf_mirred_act() has a nest level greater than one:
change tcf_mirred_forward() accordingly.

Also, add a kselftest that can reproduce the lockup and verifies TC mirred
ability to account for further packet drops after TC mirred egress->ingress
(when the nest level is 1).

 [1] https://lore.kernel.org/netdev/33dc43f587ec1388ba456b4915c75f02a8aae226.1663945716.git.dcaratti@redhat.com/
 [2] https://lore.kernel.org/netdev/Y0w%2FWWY60gqrtGLp@pop-os.localdomain/
 [3] such behavior is not guaranteed: for example, if RPS or skb RX
     timestamping is enabled on the mirred target device, the kernel
     can defer receiving the skb and return NET_RX_SUCCESS inside
     tcf_mirred_forward().

Reported-by: William Zhao <wizhao@redhat.com>
CC: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/sched/act_mirred.c                             |  7 ++++
 .../testing/selftests/net/forwarding/tc_actions.sh | 49 +++++++++++++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index c8abb5136491..8037ec9b1d31 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -206,12 +206,19 @@ release_idr:
 	return err;
 }
 
+static bool is_mirred_nested(void)
+{
+	return unlikely(__this_cpu_read(mirred_nest_level) > 1);
+}
+
 static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
 {
 	int err;
 
 	if (!want_ingress)
 		err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
+	else if (is_mirred_nested())
+		err = netif_rx(skb);
 	else
 		err = netif_receive_skb(skb);
 
diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index 1e0a62f638fe..919c0dd9fe4b 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -3,7 +3,8 @@
 
 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
 	mirred_egress_mirror_test matchall_mirred_egress_mirror_test \
-	gact_trap_test mirred_egress_to_ingress_test"
+	gact_trap_test mirred_egress_to_ingress_test \
+	mirred_egress_to_ingress_tcp_test"
 NUM_NETIFS=4
 source tc_common.sh
 source lib.sh
@@ -198,6 +199,52 @@ mirred_egress_to_ingress_test()
 	log_test "mirred_egress_to_ingress ($tcflags)"
 }
 
+mirred_egress_to_ingress_tcp_test()
+{
+	local tmpfile=$(mktemp) tmpfile1=$(mktemp)
+
+	RET=0
+	dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$tmpfile
+	tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
+		$tcflags ip_proto tcp src_ip 192.0.2.1 dst_ip 192.0.2.2 \
+			action ct commit nat src addr 192.0.2.2 pipe \
+			action ct clear pipe \
+			action ct commit nat dst addr 192.0.2.1 pipe \
+			action ct clear pipe \
+			action skbedit ptype host pipe \
+			action mirred ingress redirect dev $h1
+	tc filter add dev $h1 protocol ip pref 101 handle 101 egress flower \
+		$tcflags ip_proto icmp \
+			action mirred ingress redirect dev $h1
+	tc filter add dev $h1 protocol ip pref 102 handle 102 ingress flower \
+		ip_proto icmp \
+			action drop
+
+	ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $tmpfile1  &
+	local rpid=$!
+	ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$tmpfile
+	wait -n $rpid
+	cmp -s $tmpfile $tmpfile1
+	check_err $? "server output check failed"
+
+	$MZ $h1 -c 10 -p 64 -a $h1mac -b $h1mac -A 192.0.2.1 -B 192.0.2.1 \
+		-t icmp "ping,id=42,seq=5" -q
+	tc_check_packets "dev $h1 egress" 101 10
+	check_err $? "didn't mirred redirect ICMP"
+	tc_check_packets "dev $h1 ingress" 102 10
+	check_err $? "didn't drop mirred ICMP"
+	local overlimits=$(tc_rule_stats_get ${h1} 101 egress .overlimits)
+	test ${overlimits} = 10
+	check_err $? "wrong overlimits, expected 10 got ${overlimits}"
+
+	tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower
+	tc filter del dev $h1 egress protocol ip pref 101 handle 101 flower
+	tc filter del dev $h1 ingress protocol ip pref 102 handle 102 flower
+
+	rm -f $tmpfile $tmpfile1
+	log_test "mirred_egress_to_ingress_tcp ($tcflags)"
+}
+
 setup_prepare()
 {
 	h1=${NETIFS[p1]}
-- 
cgit 


From be5bea1cc0bf663aa0a45a2c0021dfcfb6cca976 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 20 Jan 2023 09:50:36 -0800
Subject: net: add basic C code generators for Netlink

Code generators to turn Netlink specs into C code.
I'm definitely not proud of it.

The main generator is in Python, there's a bash script
to regen all code-gen'ed files in tree after making
spec changes.

Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 MAINTAINERS                |    1 +
 tools/net/ynl/ynl-gen-c.py | 2373 ++++++++++++++++++++++++++++++++++++++++++++
 tools/net/ynl/ynl-regen.sh |   30 +
 3 files changed, 2404 insertions(+)
 create mode 100755 tools/net/ynl/ynl-gen-c.py
 create mode 100755 tools/net/ynl/ynl-regen.sh

(limited to 'tools')

diff --git a/MAINTAINERS b/MAINTAINERS
index dfb447c6b094..6e85524a7443 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14577,6 +14577,7 @@ F:	include/uapi/linux/netdevice.h
 F:	lib/net_utils.c
 F:	lib/random32.c
 F:	net/
+F:	tools/net/
 F:	tools/testing/selftests/net/
 
 NETWORKING [IPSEC]
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
new file mode 100755
index 000000000000..0c0f18540b7f
--- /dev/null
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -0,0 +1,2373 @@
+#!/usr/bin/env python
+
+import argparse
+import jsonschema
+import os
+import yaml
+
+
+def c_upper(name):
+    return name.upper().replace('-', '_')
+
+
+def c_lower(name):
+    return name.lower().replace('-', '_')
+
+
+class BaseNlLib:
+    def get_family_id(self):
+        return 'ys->family_id'
+
+    def parse_cb_run(self, cb, data, is_dump=False, indent=1):
+        ind = '\n\t\t' + '\t' * indent + ' '
+        if is_dump:
+            return f"mnl_cb_run2(ys->rx_buf, len, 0, 0, {cb}, {data},{ind}ynl_cb_array, NLMSG_MIN_TYPE)"
+        else:
+            return f"mnl_cb_run2(ys->rx_buf, len, ys->seq, ys->portid,{ind}{cb}, {data},{ind}" + \
+                   "ynl_cb_array, NLMSG_MIN_TYPE)"
+
+
+class Type:
+    def __init__(self, family, attr_set, attr):
+        self.family = family
+        self.attr = attr
+        self.value = attr['value']
+        self.name = c_lower(attr['name'])
+        self.type = attr['type']
+        self.checks = attr.get('checks', {})
+
+        if 'len' in attr:
+            self.len = attr['len']
+        if 'nested-attributes' in attr:
+            self.nested_attrs = attr['nested-attributes']
+            if self.nested_attrs == family.name:
+                self.nested_render_name = f"{family.name}"
+            else:
+                self.nested_render_name = f"{family.name}_{c_lower(self.nested_attrs)}"
+
+        self.enum_name = f"{attr_set.name_prefix}{self.name}"
+        self.enum_name = c_upper(self.enum_name)
+        self.c_name = c_lower(self.name)
+        if self.c_name in _C_KW:
+            self.c_name += '_'
+
+    def __getitem__(self, key):
+        return self.attr[key]
+
+    def __contains__(self, key):
+        return key in self.attr
+
+    def is_multi_val(self):
+        return None
+
+    def is_scalar(self):
+        return self.type in {'u8', 'u16', 'u32', 'u64', 's32', 's64'}
+
+    def presence_type(self):
+        return 'bit'
+
+    def presence_member(self, space, type_filter):
+        if self.presence_type() != type_filter:
+            return
+
+        if self.presence_type() == 'bit':
+            pfx = '__' if space == 'user' else ''
+            return f"{pfx}u32 {self.c_name}:1;"
+
+        if self.presence_type() == 'len':
+            pfx = '__' if space == 'user' else ''
+            return f"{pfx}u32 {self.c_name}_len;"
+
+    def _complex_member_type(self, ri):
+        return None
+
+    def free_needs_iter(self):
+        return False
+
+    def free(self, ri, var, ref):
+        if self.is_multi_val() or self.presence_type() == 'len':
+            ri.cw.p(f'free({var}->{ref}{self.c_name});')
+
+    def arg_member(self, ri):
+        member = self._complex_member_type(ri)
+        if member:
+            return [member + ' *' + self.c_name]
+        raise Exception(f"Struct member not implemented for class type {self.type}")
+
+    def struct_member(self, ri):
+        if self.is_multi_val():
+            ri.cw.p(f"unsigned int n_{self.c_name};")
+        member = self._complex_member_type(ri)
+        if member:
+            ptr = '*' if self.is_multi_val() else ''
+            ri.cw.p(f"{member} {ptr}{self.c_name};")
+            return
+        members = self.arg_member(ri)
+        for one in members:
+            ri.cw.p(one + ';')
+
+    def _attr_policy(self, policy):
+        return '{ .type = ' + policy + ', }'
+
+    def attr_policy(self, cw):
+        policy = c_upper('nla-' + self.attr['type'])
+
+        spec = self._attr_policy(policy)
+        cw.p(f"\t[{self.enum_name}] = {spec},")
+
+    def _attr_typol(self):
+        raise Exception(f"Type policy not implemented for class type {self.type}")
+
+    def attr_typol(self, cw):
+        typol = self._attr_typol()
+        cw.p(f'[{self.enum_name}] = {"{"} .name = "{self.name}", {typol}{"}"},')
+
+    def _attr_put_line(self, ri, var, line):
+        if self.presence_type() == 'bit':
+            ri.cw.p(f"if ({var}->_present.{self.c_name})")
+        elif self.presence_type() == 'len':
+            ri.cw.p(f"if ({var}->_present.{self.c_name}_len)")
+        ri.cw.p(f"{line};")
+
+    def _attr_put_simple(self, ri, var, put_type):
+        line = f"mnl_attr_put_{put_type}(nlh, {self.enum_name}, {var}->{self.c_name})"
+        self._attr_put_line(ri, var, line)
+
+    def attr_put(self, ri, var):
+        raise Exception(f"Put not implemented for class type {self.type}")
+
+    def _attr_get(self, ri, var):
+        raise Exception(f"Attr get not implemented for class type {self.type}")
+
+    def attr_get(self, ri, var, first):
+        lines, init_lines, local_vars = self._attr_get(ri, var)
+        if type(lines) is str:
+            lines = [lines]
+        if type(init_lines) is str:
+            init_lines = [init_lines]
+
+        kw = 'if' if first else 'else if'
+        ri.cw.block_start(line=f"{kw} (mnl_attr_get_type(attr) == {self.enum_name})")
+        if local_vars:
+            for local in local_vars:
+                ri.cw.p(local)
+            ri.cw.nl()
+
+        if not self.is_multi_val():
+            ri.cw.p("if (ynl_attr_validate(yarg, attr))")
+            ri.cw.p("return MNL_CB_ERROR;")
+            if self.presence_type() == 'bit':
+                ri.cw.p(f"{var}->_present.{self.c_name} = 1;")
+
+        if init_lines:
+            ri.cw.nl()
+            for line in init_lines:
+                ri.cw.p(line)
+
+        for line in lines:
+            ri.cw.p(line)
+        ri.cw.block_end()
+
+    def _setter_lines(self, ri, member, presence):
+        raise Exception(f"Setter not implemented for class type {self.type}")
+
+    def setter(self, ri, space, direction, deref=False, ref=None):
+        ref = (ref if ref else []) + [self.c_name]
+        var = "req"
+        member = f"{var}->{'.'.join(ref)}"
+
+        code = []
+        presence = ''
+        for i in range(0, len(ref)):
+            presence = f"{var}->{'.'.join(ref[:i] + [''])}_present.{ref[i]}"
+            if self.presence_type() == 'bit':
+                code.append(presence + ' = 1;')
+        code += self._setter_lines(ri, member, presence)
+
+        ri.cw.write_func('static inline void',
+                         f"{op_prefix(ri, direction, deref=deref)}_set_{'_'.join(ref)}",
+                         body=code,
+                         args=[f'{type_name(ri, direction, deref=deref)} *{var}'] + self.arg_member(ri))
+
+
+class TypeUnused(Type):
+    def presence_type(self):
+        return ''
+
+    def _attr_typol(self):
+        return '.type = YNL_PT_REJECT, '
+
+    def attr_policy(self, cw):
+        pass
+
+
+class TypePad(Type):
+    def presence_type(self):
+        return ''
+
+    def _attr_typol(self):
+        return '.type = YNL_PT_REJECT, '
+
+    def attr_policy(self, cw):
+        pass
+
+
+class TypeScalar(Type):
+    def __init__(self, family, attr_set, attr):
+        super().__init__(family, attr_set, attr)
+
+        self.is_bitfield = False
+        if 'enum' in self.attr:
+            self.is_bitfield = family.consts[self.attr['enum']]['type'] == 'flags'
+        if 'enum-as-flags' in self.attr and self.attr['enum-as-flags']:
+            self.is_bitfield = True
+
+        if 'enum' in self.attr and not self.is_bitfield:
+            self.type_name = f"enum {family.name}_{c_lower(self.attr['enum'])}"
+        else:
+            self.type_name = '__' + self.type
+
+        self.byte_order_comment = ''
+        if 'byte-order' in attr:
+            self.byte_order_comment = f" /* {attr['byte-order']} */"
+
+    def _mnl_type(self):
+        t = self.type
+        # mnl does not have a helper for signed types
+        if t[0] == 's':
+            t = 'u' + t[1:]
+        return t
+
+    def _attr_policy(self, policy):
+        if 'flags-mask' in self.checks or self.is_bitfield:
+            if self.is_bitfield:
+                mask = self.family.consts[self.attr['enum']].get_mask()
+            else:
+                flags = self.family.consts[self.checks['flags-mask']]
+                flag_cnt = len(flags['entries'])
+                mask = (1 << flag_cnt) - 1
+            return f"NLA_POLICY_MASK({policy}, 0x{mask:x})"
+        elif 'min' in self.checks:
+            return f"NLA_POLICY_MIN({policy}, {self.checks['min']})"
+        elif 'enum' in self.attr:
+            enum = self.family.consts[self.attr['enum']]
+            cnt = len(enum['entries'])
+            return f"NLA_POLICY_MAX({policy}, {cnt - 1})"
+        return super()._attr_policy(policy)
+
+    def _attr_typol(self):
+        return f'.type = YNL_PT_U{self.type[1:]}, '
+
+    def arg_member(self, ri):
+        return [f'{self.type_name} {self.c_name}{self.byte_order_comment}']
+
+    def attr_put(self, ri, var):
+        self._attr_put_simple(ri, var, self._mnl_type())
+
+    def _attr_get(self, ri, var):
+        return f"{var}->{self.c_name} = mnl_attr_get_{self._mnl_type()}(attr);", None, None
+
+    def _setter_lines(self, ri, member, presence):
+        return [f"{member} = {self.c_name};"]
+
+
+class TypeFlag(Type):
+    def arg_member(self, ri):
+        return []
+
+    def _attr_typol(self):
+        return '.type = YNL_PT_FLAG, '
+
+    def attr_put(self, ri, var):
+        self._attr_put_line(ri, var, f"mnl_attr_put(nlh, {self.enum_name}, 0, NULL)")
+
+    def _attr_get(self, ri, var):
+        return [], None, None
+
+    def _setter_lines(self, ri, member, presence):
+        return []
+
+
+class TypeString(Type):
+    def arg_member(self, ri):
+        return [f"const char *{self.c_name}"]
+
+    def presence_type(self):
+        return 'len'
+
+    def struct_member(self, ri):
+        ri.cw.p(f"char *{self.c_name};")
+
+    def _attr_typol(self):
+        return f'.type = YNL_PT_NUL_STR, '
+
+    def _attr_policy(self, policy):
+        mem = '{ .type = ' + policy
+        if 'max-len' in self.checks:
+            mem += ', .len = ' + str(self.checks['max-len'])
+        mem += ', }'
+        return mem
+
+    def attr_policy(self, cw):
+        if self.checks.get('unterminated-ok', False):
+            policy = 'NLA_STRING'
+        else:
+            policy = 'NLA_NUL_STRING'
+
+        spec = self._attr_policy(policy)
+        cw.p(f"\t[{self.enum_name}] = {spec},")
+
+    def attr_put(self, ri, var):
+        self._attr_put_simple(ri, var, 'strz')
+
+    def _attr_get(self, ri, var):
+        len_mem = var + '->_present.' + self.c_name + '_len'
+        return [f"{len_mem} = len;",
+                f"{var}->{self.c_name} = malloc(len + 1);",
+                f"memcpy({var}->{self.c_name}, mnl_attr_get_str(attr), len);",
+                f"{var}->{self.c_name}[len] = 0;"], \
+               ['len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr));'], \
+               ['unsigned int len;']
+
+    def _setter_lines(self, ri, member, presence):
+        return [f"free({member});",
+                f"{presence}_len = strlen({self.c_name});",
+                f"{member} = malloc({presence}_len + 1);",
+                f'memcpy({member}, {self.c_name}, {presence}_len);',
+                f'{member}[{presence}_len] = 0;']
+
+
+class TypeBinary(Type):
+    def arg_member(self, ri):
+        return [f"const void *{self.c_name}", 'size_t len']
+
+    def presence_type(self):
+        return 'len'
+
+    def struct_member(self, ri):
+        ri.cw.p(f"void *{self.c_name};")
+
+    def _attr_typol(self):
+        return f'.type = YNL_PT_BINARY,'
+
+    def _attr_policy(self, policy):
+        mem = '{ '
+        if len(self.checks) == 1 and 'min-len' in self.checks:
+            mem += '.len = ' + str(self.checks['min-len'])
+        elif len(self.checks) == 0:
+            mem += '.type = NLA_BINARY'
+        else:
+            raise Exception('One or more of binary type checks not implemented, yet')
+        mem += ', }'
+        return mem
+
+    def attr_put(self, ri, var):
+        self._attr_put_line(ri, var, f"mnl_attr_put(nlh, {self.enum_name}, " +
+                            f"{var}->_present.{self.c_name}_len, {var}->{self.c_name})")
+
+    def _attr_get(self, ri, var):
+        len_mem = var + '->_present.' + self.c_name + '_len'
+        return [f"{len_mem} = len;",
+                f"{var}->{self.c_name} = malloc(len);",
+                f"memcpy({var}->{self.c_name}, mnl_attr_get_payload(attr), len);"], \
+               ['len = mnl_attr_get_payload_len(attr);'], \
+               ['unsigned int len;']
+
+    def _setter_lines(self, ri, member, presence):
+        return [f"free({member});",
+                f"{member} = malloc({presence}_len);",
+                f'memcpy({member}, {self.c_name}, {presence}_len);']
+
+
+class TypeNest(Type):
+    def _complex_member_type(self, ri):
+        return f"struct {self.nested_render_name}"
+
+    def free(self, ri, var, ref):
+        ri.cw.p(f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name});')
+
+    def _attr_typol(self):
+        return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, '
+
+    def _attr_policy(self, policy):
+        return 'NLA_POLICY_NESTED(' + self.nested_render_name + '_nl_policy)'
+
+    def attr_put(self, ri, var):
+        self._attr_put_line(ri, var, f"{self.nested_render_name}_put(nlh, " +
+                            f"{self.enum_name}, &{var}->{self.c_name})")
+
+    def _attr_get(self, ri, var):
+        get_lines = [f"{self.nested_render_name}_parse(&parg, attr);"]
+        init_lines = [f"parg.rsp_policy = &{self.nested_render_name}_nest;",
+                      f"parg.data = &{var}->{self.c_name};"]
+        return get_lines, init_lines, None
+
+    def setter(self, ri, space, direction, deref=False, ref=None):
+        ref = (ref if ref else []) + [self.c_name]
+
+        for _, attr in ri.family.pure_nested_structs[self.nested_attrs].member_list():
+            attr.setter(ri, self.nested_attrs, direction, deref=deref, ref=ref)
+
+
+class TypeMultiAttr(Type):
+    def is_multi_val(self):
+        return True
+
+    def presence_type(self):
+        return 'count'
+
+    def _complex_member_type(self, ri):
+        if 'type' not in self.attr or self.attr['type'] == 'nest':
+            return f"struct {self.nested_render_name}"
+        elif self.attr['type'] in scalars:
+            scalar_pfx = '__' if ri.ku_space == 'user' else ''
+            return scalar_pfx + self.attr['type']
+        else:
+            raise Exception(f"Sub-type {self.attr['type']} not supported yet")
+
+    def free_needs_iter(self):
+        return 'type' not in self.attr or self.attr['type'] == 'nest'
+
+    def free(self, ri, var, ref):
+        if 'type' not in self.attr or self.attr['type'] == 'nest':
+            ri.cw.p(f"for (i = 0; i < {var}->{ref}n_{self.c_name}; i++)")
+            ri.cw.p(f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);')
+
+    def _attr_typol(self):
+        if 'type' not in self.attr or self.attr['type'] == 'nest':
+            return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, '
+        elif self.attr['type'] in scalars:
+            return f".type = YNL_PT_U{self.attr['type'][1:]}, "
+        else:
+            raise Exception(f"Sub-type {self.attr['type']} not supported yet")
+
+    def _attr_get(self, ri, var):
+        return f'{var}->n_{self.c_name}++;', None, None
+
+
+class TypeArrayNest(Type):
+    def is_multi_val(self):
+        return True
+
+    def presence_type(self):
+        return 'count'
+
+    def _complex_member_type(self, ri):
+        if 'sub-type' not in self.attr or self.attr['sub-type'] == 'nest':
+            return f"struct {self.nested_render_name}"
+        elif self.attr['sub-type'] in scalars:
+            scalar_pfx = '__' if ri.ku_space == 'user' else ''
+            return scalar_pfx + self.attr['sub-type']
+        else:
+            raise Exception(f"Sub-type {self.attr['sub-type']} not supported yet")
+
+    def _attr_typol(self):
+        return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, '
+
+    def _attr_get(self, ri, var):
+        local_vars = ['const struct nlattr *attr2;']
+        get_lines = [f'attr_{self.c_name} = attr;',
+                     'mnl_attr_for_each_nested(attr2, attr)',
+                     f'\t{var}->n_{self.c_name}++;']
+        return get_lines, None, local_vars
+
+
+class TypeNestTypeValue(Type):
+    def _complex_member_type(self, ri):
+        return f"struct {self.nested_render_name}"
+
+    def _attr_typol(self):
+        return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, '
+
+    def _attr_get(self, ri, var):
+        prev = 'attr'
+        tv_args = ''
+        get_lines = []
+        local_vars = []
+        init_lines = [f"parg.rsp_policy = &{self.nested_render_name}_nest;",
+                      f"parg.data = &{var}->{self.c_name};"]
+        if 'type-value' in self.attr:
+            tv_names = [c_lower(x) for x in self.attr["type-value"]]
+            local_vars += [f'const struct nlattr *attr_{", *attr_".join(tv_names)};']
+            local_vars += [f'__u32 {", ".join(tv_names)};']
+            for level in self.attr["type-value"]:
+                level = c_lower(level)
+                get_lines += [f'attr_{level} = mnl_attr_get_payload({prev});']
+                get_lines += [f'{level} = mnl_attr_get_type(attr_{level});']
+                prev = 'attr_' + level
+
+            tv_args = f", {', '.join(tv_names)}"
+
+        get_lines += [f"{self.nested_render_name}_parse(&parg, {prev}{tv_args});"]
+        return get_lines, init_lines, local_vars
+
+
+class Struct:
+    def __init__(self, family, space_name, type_list=None, inherited=None):
+        self.family = family
+        self.space_name = space_name
+        self.attr_set = family.attr_sets[space_name]
+        # Use list to catch comparisons with empty sets
+        self._inherited = inherited if inherited is not None else []
+        self.inherited = []
+
+        self.nested = type_list is None
+        if family.name == c_lower(space_name):
+            self.render_name = f"{family.name}"
+        else:
+            self.render_name = f"{family.name}_{c_lower(space_name)}"
+        self.struct_name = 'struct ' + self.render_name
+        self.ptr_name = self.struct_name + ' *'
+
+        self.request = False
+        self.reply = False
+
+        self.attr_list = []
+        self.attrs = dict()
+        if type_list:
+            for t in type_list:
+                self.attr_list.append((t, self.attr_set[t]),)
+        else:
+            for t in self.attr_set:
+                self.attr_list.append((t, self.attr_set[t]),)
+
+        max_val = 0
+        self.attr_max_val = None
+        for name, attr in self.attr_list:
+            if attr.value > max_val:
+                max_val = attr.value
+                self.attr_max_val = attr
+            self.attrs[name] = attr
+
+    def __iter__(self):
+        yield from self.attrs
+
+    def __getitem__(self, key):
+        return self.attrs[key]
+
+    def member_list(self):
+        return self.attr_list
+
+    def set_inherited(self, new_inherited):
+        if self._inherited != new_inherited:
+            raise Exception("Inheriting different members not supported")
+        self.inherited = [c_lower(x) for x in sorted(self._inherited)]
+
+
+class EnumEntry:
+    def __init__(self, enum_set, yaml, prev, value_start):
+        if isinstance(yaml, str):
+            self.name = yaml
+            yaml = {}
+            self.doc = ''
+        else:
+            self.name = yaml['name']
+            self.doc = yaml.get('doc', '')
+
+        self.yaml = yaml
+        self.c_name = c_upper(enum_set.value_pfx + self.name)
+
+        if 'value' in yaml:
+            self.value = yaml['value']
+            if prev:
+                self.value_change = (self.value != prev.value + 1)
+        elif prev:
+            self.value = prev.value + 1
+        else:
+            self.value = value_start
+            self.value_change = (self.value != 0)
+
+    def __getitem__(self, key):
+        return self.yaml[key]
+
+    def __contains__(self, key):
+        return key in self.yaml
+
+    def has_doc(self):
+        return bool(self.doc)
+
+
+class EnumSet:
+    def __init__(self, family, yaml):
+        self.yaml = yaml
+        self.family = family
+
+        self.render_name = c_lower(family.name + '-' + yaml['name'])
+        self.enum_name = 'enum ' + self.render_name
+
+        self.value_pfx = yaml.get('name-prefix', f"{family.name}-{yaml['name']}-")
+
+        self.type = yaml['type']
+
+        prev_entry = None
+        value_start = self.yaml.get('value-start', 0)
+        self.entries = {}
+        self.entry_list = []
+        for entry in self.yaml['entries']:
+            e = EnumEntry(self, entry, prev_entry, value_start)
+            self.entries[e.name] = e
+            self.entry_list.append(e)
+            prev_entry = e
+
+    def __getitem__(self, key):
+        return self.yaml[key]
+
+    def __contains__(self, key):
+        return key in self.yaml
+
+    def has_doc(self):
+        if 'doc' in self.yaml:
+            return True
+        for entry in self.entry_list:
+            if entry.has_doc():
+                return True
+        return False
+
+    def get_mask(self):
+        mask = 0
+        idx = self.yaml.get('value-start', 0)
+        for _ in self.entry_list:
+            mask |= 1 << idx
+            idx += 1
+        return mask
+
+
+class AttrSet:
+    def __init__(self, family, yaml):
+        self.yaml = yaml
+
+        self.attrs = dict()
+        self.name = self.yaml['name']
+        if 'subset-of' not in yaml:
+            self.subset_of = None
+            if 'name-prefix' in yaml:
+                pfx = yaml['name-prefix']
+            elif self.name == family.name:
+                pfx = family.name + '-a-'
+            else:
+                pfx = f"{family.name}-a-{self.name}-"
+            self.name_prefix = c_upper(pfx)
+            self.max_name = c_upper(self.yaml.get('attr-max-name', f"{self.name_prefix}max"))
+        else:
+            self.subset_of = self.yaml['subset-of']
+            self.name_prefix = family.attr_sets[self.subset_of].name_prefix
+            self.max_name = family.attr_sets[self.subset_of].max_name
+
+        self.c_name = c_lower(self.name)
+        if self.c_name in _C_KW:
+            self.c_name += '_'
+        if self.c_name == family.c_name:
+            self.c_name = ''
+
+        val = 0
+        for elem in self.yaml['attributes']:
+            if 'value' in elem:
+                val = elem['value']
+            else:
+                elem['value'] = val
+            val += 1
+
+            if 'multi-attr' in elem and elem['multi-attr']:
+                attr = TypeMultiAttr(family, self, elem)
+            elif elem['type'] in scalars:
+                attr = TypeScalar(family, self, elem)
+            elif elem['type'] == 'unused':
+                attr = TypeUnused(family, self, elem)
+            elif elem['type'] == 'pad':
+                attr = TypePad(family, self, elem)
+            elif elem['type'] == 'flag':
+                attr = TypeFlag(family, self, elem)
+            elif elem['type'] == 'string':
+                attr = TypeString(family, self, elem)
+            elif elem['type'] == 'binary':
+                attr = TypeBinary(family, self, elem)
+            elif elem['type'] == 'nest':
+                attr = TypeNest(family, self, elem)
+            elif elem['type'] == 'array-nest':
+                attr = TypeArrayNest(family, self, elem)
+            elif elem['type'] == 'nest-type-value':
+                attr = TypeNestTypeValue(family, self, elem)
+            else:
+                raise Exception(f"No typed class for type {elem['type']}")
+
+            self.attrs[elem['name']] = attr
+
+    def __getitem__(self, key):
+        return self.attrs[key]
+
+    def __contains__(self, key):
+        return key in self.yaml
+
+    def __iter__(self):
+        yield from self.attrs
+
+    def items(self):
+        return self.attrs.items()
+
+
+class Operation:
+    def __init__(self, family, yaml, value):
+        self.yaml = yaml
+        self.value = value
+
+        self.name = self.yaml['name']
+        self.render_name = family.name + '_' + c_lower(self.name)
+        self.is_async = 'notify' in yaml or 'event' in yaml
+        if not self.is_async:
+            self.enum_name = family.op_prefix + c_upper(self.name)
+        else:
+            self.enum_name = family.async_op_prefix + c_upper(self.name)
+
+        self.dual_policy = ('do' in yaml and 'request' in yaml['do']) and \
+                         ('dump' in yaml and 'request' in yaml['dump'])
+
+    def __getitem__(self, key):
+        return self.yaml[key]
+
+    def __contains__(self, key):
+        return key in self.yaml
+
+    def add_notification(self, op):
+        if 'notify' not in self.yaml:
+            self.yaml['notify'] = dict()
+            self.yaml['notify']['reply'] = self.yaml['do']['reply']
+            self.yaml['notify']['cmds'] = []
+        self.yaml['notify']['cmds'].append(op)
+
+
+class Family:
+    def __init__(self, file_name):
+        with open(file_name, "r") as stream:
+            self.yaml = yaml.safe_load(stream)
+
+        self.proto = self.yaml.get('protocol', 'genetlink')
+
+        with open(os.path.dirname(os.path.dirname(file_name)) +
+                  f'/{self.proto}.yaml', "r") as stream:
+            schema = yaml.safe_load(stream)
+
+        jsonschema.validate(self.yaml, schema)
+
+        if self.yaml.get('protocol', 'genetlink') not in {'genetlink', 'genetlink-c', 'genetlink-legacy'}:
+            raise Exception("Codegen only supported for genetlink")
+
+        self.fam_key = c_upper(self.yaml.get('c-family-name', self.yaml["name"] + '_FAMILY_NAME'))
+        self.ver_key = c_upper(self.yaml.get('c-version-name', self.yaml["name"] + '_FAMILY_VERSION'))
+
+        if 'definitions' not in self.yaml:
+            self.yaml['definitions'] = []
+
+        self.name = self.yaml['name']
+        self.c_name = c_lower(self.name)
+        if 'uapi-header' in self.yaml:
+            self.uapi_header = self.yaml['uapi-header']
+        else:
+            self.uapi_header = f"linux/{self.name}.h"
+        if 'name-prefix' in self.yaml['operations']:
+            self.op_prefix = c_upper(self.yaml['operations']['name-prefix'])
+        else:
+            self.op_prefix = c_upper(self.yaml['name'] + '-cmd-')
+        if 'async-prefix' in self.yaml['operations']:
+            self.async_op_prefix = c_upper(self.yaml['operations']['async-prefix'])
+        else:
+            self.async_op_prefix = self.op_prefix
+
+        self.mcgrps = self.yaml.get('mcast-groups', {'list': []})
+
+        self.consts = dict()
+        self.ops = dict()
+        self.ops_list = []
+        self.attr_sets = dict()
+        self.attr_sets_list = []
+
+        self.hooks = dict()
+        for when in ['pre', 'post']:
+            self.hooks[when] = dict()
+            for op_mode in ['do', 'dump']:
+                self.hooks[when][op_mode] = dict()
+                self.hooks[when][op_mode]['set'] = set()
+                self.hooks[when][op_mode]['list'] = []
+
+        # dict space-name -> 'request': set(attrs), 'reply': set(attrs)
+        self.root_sets = dict()
+        # dict space-name -> set('request', 'reply')
+        self.pure_nested_structs = dict()
+        self.all_notify = dict()
+
+        self._mock_up_events()
+
+        self._dictify()
+        self._load_root_sets()
+        self._load_nested_sets()
+        self._load_all_notify()
+        self._load_hooks()
+
+        self.kernel_policy = self.yaml.get('kernel-policy', 'split')
+        if self.kernel_policy == 'global':
+            self._load_global_policy()
+
+    def __getitem__(self, key):
+        return self.yaml[key]
+
+    def get(self, key, default=None):
+        return self.yaml.get(key, default)
+
+    # Fake a 'do' equivalent of all events, so that we can render their response parsing
+    def _mock_up_events(self):
+        for op in self.yaml['operations']['list']:
+            if 'event' in op:
+                op['do'] = {
+                    'reply': {
+                        'attributes': op['event']['attributes']
+                    }
+                }
+
+    def _dictify(self):
+        for elem in self.yaml['definitions']:
+            if elem['type'] == 'enum':
+                self.consts[elem['name']] = EnumSet(self, elem)
+            else:
+                self.consts[elem['name']] = elem
+
+        for elem in self.yaml['attribute-sets']:
+            attr_set = AttrSet(self, elem)
+            self.attr_sets[elem['name']] = attr_set
+            self.attr_sets_list.append((elem['name'], attr_set), )
+
+        ntf = []
+        val = 0
+        for elem in self.yaml['operations']['list']:
+            if 'value' in elem:
+                val = elem['value']
+
+            op = Operation(self, elem, val)
+            val += 1
+
+            self.ops_list.append((elem['name'], op),)
+            if 'notify' in elem:
+                ntf.append(op)
+                continue
+            if 'attribute-set' not in elem:
+                continue
+            self.ops[elem['name']] = op
+        for n in ntf:
+            self.ops[n['notify']].add_notification(n)
+
+    def _load_root_sets(self):
+        for op_name, op in self.ops.items():
+            if 'attribute-set' not in op:
+                continue
+
+            req_attrs = set()
+            rsp_attrs = set()
+            for op_mode in ['do', 'dump']:
+                if op_mode in op and 'request' in op[op_mode]:
+                    req_attrs.update(set(op[op_mode]['request']['attributes']))
+                if op_mode in op and 'reply' in op[op_mode]:
+                    rsp_attrs.update(set(op[op_mode]['reply']['attributes']))
+
+            if op['attribute-set'] not in self.root_sets:
+                self.root_sets[op['attribute-set']] = {'request': req_attrs, 'reply': rsp_attrs}
+            else:
+                self.root_sets[op['attribute-set']]['request'].update(req_attrs)
+                self.root_sets[op['attribute-set']]['reply'].update(rsp_attrs)
+
+    def _load_nested_sets(self):
+        for root_set, rs_members in self.root_sets.items():
+            for attr, spec in self.attr_sets[root_set].items():
+                if 'nested-attributes' in spec:
+                    inherit = set()
+                    nested = spec['nested-attributes']
+                    if nested not in self.root_sets:
+                        self.pure_nested_structs[nested] = Struct(self, nested, inherited=inherit)
+                    if attr in rs_members['request']:
+                        self.pure_nested_structs[nested].request = True
+                    if attr in rs_members['reply']:
+                        self.pure_nested_structs[nested].reply = True
+
+                    if 'type-value' in spec:
+                        if nested in self.root_sets:
+                            raise Exception("Inheriting members to a space used as root not supported")
+                        inherit.update(set(spec['type-value']))
+                    elif spec['type'] == 'array-nest':
+                        inherit.add('idx')
+                    self.pure_nested_structs[nested].set_inherited(inherit)
+
+    def _load_all_notify(self):
+        for op_name, op in self.ops.items():
+            if not op:
+                continue
+
+            if 'notify' in op:
+                self.all_notify[op_name] = op['notify']['cmds']
+
+    def _load_global_policy(self):
+        global_set = set()
+        attr_set_name = None
+        for op_name, op in self.ops.items():
+            if not op:
+                continue
+            if 'attribute-set' not in op:
+                continue
+
+            if attr_set_name is None:
+                attr_set_name = op['attribute-set']
+            if attr_set_name != op['attribute-set']:
+                raise Exception('For a global policy all ops must use the same set')
+
+            for op_mode in {'do', 'dump'}:
+                if op_mode in op:
+                    global_set.update(op[op_mode].get('request', []))
+
+        self.global_policy = []
+        self.global_policy_set = attr_set_name
+        for attr in self.attr_sets[attr_set_name]:
+            if attr in global_set:
+                self.global_policy.append(attr)
+
+    def _load_hooks(self):
+        for op in self.ops.values():
+            for op_mode in ['do', 'dump']:
+                if op_mode not in op:
+                    continue
+                for when in ['pre', 'post']:
+                    if when not in op[op_mode]:
+                        continue
+                    name = op[op_mode][when]
+                    if name in self.hooks[when][op_mode]['set']:
+                        continue
+                    self.hooks[when][op_mode]['set'].add(name)
+                    self.hooks[when][op_mode]['list'].append(name)
+
+
+class RenderInfo:
+    def __init__(self, cw, family, ku_space, op, op_name, op_mode, attr_set=None):
+        self.family = family
+        self.nl = cw.nlib
+        self.ku_space = ku_space
+        self.op = op
+        self.op_name = op_name
+        self.op_mode = op_mode
+
+        # 'do' and 'dump' response parsing is identical
+        if op_mode != 'do' and 'dump' in op and 'do' in op and 'reply' in op['do'] and \
+           op["do"]["reply"] == op["dump"]["reply"]:
+            self.type_consistent = True
+        else:
+            self.type_consistent = op_mode == 'event'
+
+        self.attr_set = attr_set
+        if not self.attr_set:
+            self.attr_set = op['attribute-set']
+
+        if op:
+            self.type_name = c_lower(op_name)
+        else:
+            self.type_name = c_lower(attr_set)
+
+        self.cw = cw
+
+        self.struct = dict()
+        for op_dir in ['request', 'reply']:
+            if op and op_dir in op[op_mode]:
+                self.struct[op_dir] = Struct(family, self.attr_set,
+                                             type_list=op[op_mode][op_dir]['attributes'])
+        if op_mode == 'event':
+            self.struct['reply'] = Struct(family, self.attr_set, type_list=op['event']['attributes'])
+
+
+class CodeWriter:
+    def __init__(self, nlib, out_file):
+        self.nlib = nlib
+
+        self._nl = False
+        self._silent_block = False
+        self._ind = 0
+        self._out = out_file
+
+    @classmethod
+    def _is_cond(cls, line):
+        return line.startswith('if') or line.startswith('while') or line.startswith('for')
+
+    def p(self, line, add_ind=0):
+        if self._nl:
+            self._out.write('\n')
+            self._nl = False
+        ind = self._ind
+        if line[-1] == ':':
+            ind -= 1
+        if self._silent_block:
+            ind += 1
+        self._silent_block = line.endswith(')') and CodeWriter._is_cond(line)
+        if add_ind:
+            ind += add_ind
+        self._out.write('\t' * ind + line + '\n')
+
+    def nl(self):
+        self._nl = True
+
+    def block_start(self, line=''):
+        if line:
+            line = line + ' '
+        self.p(line + '{')
+        self._ind += 1
+
+    def block_end(self, line=''):
+        if line and line[0] not in {';', ','}:
+            line = ' ' + line
+        self._ind -= 1
+        self.p('}' + line)
+
+    def write_doc_line(self, doc, indent=True):
+        words = doc.split()
+        line = ' *'
+        for word in words:
+            if len(line) + len(word) >= 79:
+                self.p(line)
+                line = ' *'
+                if indent:
+                    line += '  '
+            line += ' ' + word
+        self.p(line)
+
+    def write_func_prot(self, qual_ret, name, args=None, doc=None, suffix=''):
+        if not args:
+            args = ['void']
+
+        if doc:
+            self.p('/*')
+            self.p(' * ' + doc)
+            self.p(' */')
+
+        oneline = qual_ret
+        if qual_ret[-1] != '*':
+            oneline += ' '
+        oneline += f"{name}({', '.join(args)}){suffix}"
+
+        if len(oneline) < 80:
+            self.p(oneline)
+            return
+
+        v = qual_ret
+        if len(v) > 3:
+            self.p(v)
+            v = ''
+        elif qual_ret[-1] != '*':
+            v += ' '
+        v += name + '('
+        ind = '\t' * (len(v) // 8) + ' ' * (len(v) % 8)
+        delta_ind = len(v) - len(ind)
+        v += args[0]
+        i = 1
+        while i < len(args):
+            next_len = len(v) + len(args[i])
+            if v[0] == '\t':
+                next_len += delta_ind
+            if next_len > 76:
+                self.p(v + ',')
+                v = ind
+            else:
+                v += ', '
+            v += args[i]
+            i += 1
+        self.p(v + ')' + suffix)
+
+    def write_func_lvar(self, local_vars):
+        if not local_vars:
+            return
+
+        if type(local_vars) is str:
+            local_vars = [local_vars]
+
+        local_vars.sort(key=len, reverse=True)
+        for var in local_vars:
+            self.p(var)
+        self.nl()
+
+    def write_func(self, qual_ret, name, body, args=None, local_vars=None):
+        self.write_func_prot(qual_ret=qual_ret, name=name, args=args)
+        self.write_func_lvar(local_vars=local_vars)
+
+        self.block_start()
+        for line in body:
+            self.p(line)
+        self.block_end()
+
+    def writes_defines(self, defines):
+        longest = 0
+        for define in defines:
+            if len(define[0]) > longest:
+                longest = len(define[0])
+        longest = ((longest + 8) // 8) * 8
+        for define in defines:
+            line = '#define ' + define[0]
+            line += '\t' * ((longest - len(define[0]) + 7) // 8)
+            if type(define[1]) is int:
+                line += str(define[1])
+            elif type(define[1]) is str:
+                line += '"' + define[1] + '"'
+            self.p(line)
+
+    def write_struct_init(self, members):
+        longest = max([len(x[0]) for x in members])
+        longest += 1  # because we prepend a .
+        longest = ((longest + 8) // 8) * 8
+        for one in members:
+            line = '.' + one[0]
+            line += '\t' * ((longest - len(one[0]) - 1 + 7) // 8)
+            line += '= ' + one[1] + ','
+            self.p(line)
+
+
+scalars = {'u8', 'u16', 'u32', 'u64', 's32', 's64'}
+
+direction_to_suffix = {
+    'reply': '_rsp',
+    'request': '_req',
+    '': ''
+}
+
+op_mode_to_wrapper = {
+    'do': '',
+    'dump': '_list',
+    'notify': '_ntf',
+    'event': '',
+}
+
+_C_KW = {
+    'do'
+}
+
+
+def rdir(direction):
+    if direction == 'reply':
+        return 'request'
+    if direction == 'request':
+        return 'reply'
+    return direction
+
+
+def op_prefix(ri, direction, deref=False):
+    suffix = f"_{ri.type_name}"
+
+    if not ri.op_mode or ri.op_mode == 'do':
+        suffix += f"{direction_to_suffix[direction]}"
+    else:
+        if direction == 'request':
+            suffix += '_req_dump'
+        else:
+            if ri.type_consistent:
+                if deref:
+                    suffix += f"{direction_to_suffix[direction]}"
+                else:
+                    suffix += op_mode_to_wrapper[ri.op_mode]
+            else:
+                suffix += '_rsp'
+                suffix += '_dump' if deref else '_list'
+
+    return f"{ri.family['name']}{suffix}"
+
+
+def type_name(ri, direction, deref=False):
+    return f"struct {op_prefix(ri, direction, deref=deref)}"
+
+
+def print_prototype(ri, direction, terminate=True, doc=None):
+    suffix = ';' if terminate else ''
+
+    fname = ri.op.render_name
+    if ri.op_mode == 'dump':
+        fname += '_dump'
+
+    args = ['struct ynl_sock *ys']
+    if 'request' in ri.op[ri.op_mode]:
+        args.append(f"{type_name(ri, direction)} *" + f"{direction_to_suffix[direction][1:]}")
+
+    ret = 'int'
+    if 'reply' in ri.op[ri.op_mode]:
+        ret = f"{type_name(ri, rdir(direction))} *"
+
+    ri.cw.write_func_prot(ret, fname, args, doc=doc, suffix=suffix)
+
+
+def print_req_prototype(ri):
+    print_prototype(ri, "request", doc=ri.op['doc'])
+
+
+def print_dump_prototype(ri):
+    print_prototype(ri, "request")
+
+
+def put_typol_fwd(cw, struct):
+    cw.p(f'extern struct ynl_policy_nest {struct.render_name}_nest;')
+
+
+def put_typol(cw, struct):
+    type_max = struct.attr_set.max_name
+    cw.block_start(line=f'struct ynl_policy_attr {struct.render_name}_policy[{type_max} + 1] =')
+
+    for _, arg in struct.member_list():
+        arg.attr_typol(cw)
+
+    cw.block_end(line=';')
+    cw.nl()
+
+    cw.block_start(line=f'struct ynl_policy_nest {struct.render_name}_nest =')
+    cw.p(f'.max_attr = {type_max},')
+    cw.p(f'.table = {struct.render_name}_policy,')
+    cw.block_end(line=';')
+    cw.nl()
+
+
+def put_req_nested(ri, struct):
+    func_args = ['struct nlmsghdr *nlh',
+                 'unsigned int attr_type',
+                 f'{struct.ptr_name}obj']
+
+    ri.cw.write_func_prot('int', f'{struct.render_name}_put', func_args)
+    ri.cw.block_start()
+    ri.cw.write_func_lvar('struct nlattr *nest;')
+
+    ri.cw.p("nest = mnl_attr_nest_start(nlh, attr_type);")
+
+    for _, arg in struct.member_list():
+        arg.attr_put(ri, "obj")
+
+    ri.cw.p("mnl_attr_nest_end(nlh, nest);")
+
+    ri.cw.nl()
+    ri.cw.p('return 0;')
+    ri.cw.block_end()
+    ri.cw.nl()
+
+
+def _multi_parse(ri, struct, init_lines, local_vars):
+    if struct.nested:
+        iter_line = "mnl_attr_for_each_nested(attr, nested)"
+    else:
+        iter_line = "mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr))"
+
+    array_nests = set()
+    multi_attrs = set()
+    needs_parg = False
+    for arg, aspec in struct.member_list():
+        if aspec['type'] == 'array-nest':
+            local_vars.append(f'const struct nlattr *attr_{aspec.c_name};')
+            array_nests.add(arg)
+        if 'multi-attr' in aspec:
+            multi_attrs.add(arg)
+        needs_parg |= 'nested-attributes' in aspec
+    if array_nests or multi_attrs:
+        local_vars.append('int i;')
+    if needs_parg:
+        local_vars.append('struct ynl_parse_arg parg;')
+        init_lines.append('parg.ys = yarg->ys;')
+
+    ri.cw.block_start()
+    ri.cw.write_func_lvar(local_vars)
+
+    for line in init_lines:
+        ri.cw.p(line)
+    ri.cw.nl()
+
+    for arg in struct.inherited:
+        ri.cw.p(f'dst->{arg} = {arg};')
+
+    ri.cw.nl()
+    ri.cw.block_start(line=iter_line)
+
+    first = True
+    for _, arg in struct.member_list():
+        arg.attr_get(ri, 'dst', first=first)
+        first = False
+
+    ri.cw.block_end()
+    ri.cw.nl()
+
+    for anest in sorted(array_nests):
+        aspec = struct[anest]
+
+        ri.cw.block_start(line=f"if (dst->n_{aspec.c_name})")
+        ri.cw.p(f"dst->{aspec.c_name} = calloc(dst->n_{aspec.c_name}, sizeof(*dst->{aspec.c_name}));")
+        ri.cw.p('i = 0;')
+        ri.cw.p(f"parg.rsp_policy = &{aspec.nested_render_name}_nest;")
+        ri.cw.block_start(line=f"mnl_attr_for_each_nested(attr, attr_{aspec.c_name})")
+        ri.cw.p(f"parg.data = &dst->{aspec.c_name}[i];")
+        ri.cw.p(f"if ({aspec.nested_render_name}_parse(&parg, attr, mnl_attr_get_type(attr)))")
+        ri.cw.p('return MNL_CB_ERROR;')
+        ri.cw.p('i++;')
+        ri.cw.block_end()
+        ri.cw.block_end()
+    ri.cw.nl()
+
+    for anest in sorted(multi_attrs):
+        aspec = struct[anest]
+        ri.cw.block_start(line=f"if (dst->n_{aspec.c_name})")
+        ri.cw.p(f"dst->{aspec.c_name} = calloc(dst->n_{aspec.c_name}, sizeof(*dst->{aspec.c_name}));")
+        ri.cw.p('i = 0;')
+        if 'nested-attributes' in aspec:
+            ri.cw.p(f"parg.rsp_policy = &{aspec.nested_render_name}_nest;")
+        ri.cw.block_start(line=iter_line)
+        ri.cw.block_start(line=f"if (mnl_attr_get_type(attr) == {aspec.enum_name})")
+        if 'nested-attributes' in aspec:
+            ri.cw.p(f"parg.data = &dst->{aspec.c_name}[i];")
+            ri.cw.p(f"if ({aspec.nested_render_name}_parse(&parg, attr))")
+            ri.cw.p('return MNL_CB_ERROR;')
+        elif aspec['type'] in scalars:
+            t = aspec['type']
+            if t[0] == 's':
+                t = 'u' + t[1:]
+            ri.cw.p(f"dst->{aspec.c_name}[i] = mnl_attr_get_{t}(attr);")
+        else:
+            raise Exception('Nest parsing type not supported yet')
+        ri.cw.p('i++;')
+        ri.cw.block_end()
+        ri.cw.block_end()
+        ri.cw.block_end()
+    ri.cw.nl()
+
+    if struct.nested:
+        ri.cw.p('return 0;')
+    else:
+        ri.cw.p('return MNL_CB_OK;')
+    ri.cw.block_end()
+    ri.cw.nl()
+
+
+def parse_rsp_nested(ri, struct):
+    func_args = ['struct ynl_parse_arg *yarg',
+                 'const struct nlattr *nested']
+    for arg in struct.inherited:
+        func_args.append('__u32 ' + arg)
+
+    local_vars = ['const struct nlattr *attr;',
+                  f'{struct.ptr_name}dst = yarg->data;']
+    init_lines = []
+
+    ri.cw.write_func_prot('int', f'{struct.render_name}_parse', func_args)
+
+    _multi_parse(ri, struct, init_lines, local_vars)
+
+
+def parse_rsp_msg(ri, deref=False):
+    if 'reply' not in ri.op[ri.op_mode] and ri.op_mode != 'event':
+        return
+
+    func_args = ['const struct nlmsghdr *nlh',
+                 'void *data']
+
+    local_vars = [f'{type_name(ri, "reply", deref=deref)} *dst;',
+                  'struct ynl_parse_arg *yarg = data;',
+                  'const struct nlattr *attr;']
+    init_lines = ['dst = yarg->data;']
+
+    ri.cw.write_func_prot('int', f'{op_prefix(ri, "reply", deref=deref)}_parse', func_args)
+
+    _multi_parse(ri, ri.struct["reply"], init_lines, local_vars)
+
+
+def print_req(ri):
+    ret_ok = '0'
+    ret_err = '-1'
+    direction = "request"
+    local_vars = ['struct nlmsghdr *nlh;',
+                  'int len, err;']
+
+    if 'reply' in ri.op[ri.op_mode]:
+        ret_ok = 'rsp'
+        ret_err = 'NULL'
+        local_vars += [f'{type_name(ri, rdir(direction))} *rsp;',
+                       'struct ynl_parse_arg yarg = { .ys = ys, };']
+
+    print_prototype(ri, direction, terminate=False)
+    ri.cw.block_start()
+    ri.cw.write_func_lvar(local_vars)
+
+    ri.cw.p(f"nlh = ynl_gemsg_start_req(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);")
+
+    ri.cw.p(f"ys->req_policy = &{ri.struct['request'].render_name}_nest;")
+    if 'reply' in ri.op[ri.op_mode]:
+        ri.cw.p(f"yarg.rsp_policy = &{ri.struct['reply'].render_name}_nest;")
+    ri.cw.nl()
+    for _, attr in ri.struct["request"].member_list():
+        attr.attr_put(ri, "req")
+    ri.cw.nl()
+
+    ri.cw.p('err = mnl_socket_sendto(ys->sock, nlh, nlh->nlmsg_len);')
+    ri.cw.p('if (err < 0)')
+    ri.cw.p(f"return {ret_err};")
+    ri.cw.nl()
+    ri.cw.p('len = mnl_socket_recvfrom(ys->sock, ys->rx_buf, MNL_SOCKET_BUFFER_SIZE);')
+    ri.cw.p('if (len < 0)')
+    ri.cw.p(f"return {ret_err};")
+    ri.cw.nl()
+
+    if 'reply' in ri.op[ri.op_mode]:
+        ri.cw.p('rsp = calloc(1, sizeof(*rsp));')
+        ri.cw.p('yarg.data = rsp;')
+        ri.cw.nl()
+        ri.cw.p(f"err = {ri.nl.parse_cb_run(op_prefix(ri, 'reply') + '_parse', '&yarg', False)};")
+        ri.cw.p('if (err < 0)')
+        ri.cw.p('goto err_free;')
+        ri.cw.nl()
+
+    ri.cw.p('err = ynl_recv_ack(ys, err);')
+    ri.cw.p('if (err)')
+    ri.cw.p('goto err_free;')
+    ri.cw.nl()
+    ri.cw.p(f"return {ret_ok};")
+    ri.cw.nl()
+    ri.cw.p('err_free:')
+
+    if 'reply' in ri.op[ri.op_mode]:
+        ri.cw.p(f"{call_free(ri, rdir(direction), 'rsp')}")
+    ri.cw.p(f"return {ret_err};")
+    ri.cw.block_end()
+
+
+def print_dump(ri):
+    direction = "request"
+    print_prototype(ri, direction, terminate=False)
+    ri.cw.block_start()
+    local_vars = ['struct ynl_dump_state yds = {};',
+                  'struct nlmsghdr *nlh;',
+                  'int len, err;']
+
+    for var in local_vars:
+        ri.cw.p(f'{var}')
+    ri.cw.nl()
+
+    ri.cw.p('yds.ys = ys;')
+    ri.cw.p(f"yds.alloc_sz = sizeof({type_name(ri, rdir(direction))});")
+    ri.cw.p(f"yds.cb = {op_prefix(ri, 'reply', deref=True)}_parse;")
+    ri.cw.p(f"yds.rsp_policy = &{ri.struct['reply'].render_name}_nest;")
+    ri.cw.nl()
+    ri.cw.p(f"nlh = ynl_gemsg_start_dump(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);")
+
+    if "request" in ri.op[ri.op_mode]:
+        ri.cw.p(f"ys->req_policy = &{ri.struct['request'].render_name}_nest;")
+        ri.cw.nl()
+        for _, attr in ri.struct["request"].member_list():
+            attr.attr_put(ri, "req")
+    ri.cw.nl()
+
+    ri.cw.p('err = mnl_socket_sendto(ys->sock, nlh, nlh->nlmsg_len);')
+    ri.cw.p('if (err < 0)')
+    ri.cw.p('return NULL;')
+    ri.cw.nl()
+
+    ri.cw.block_start(line='do')
+    ri.cw.p('len = mnl_socket_recvfrom(ys->sock, ys->rx_buf, MNL_SOCKET_BUFFER_SIZE);')
+    ri.cw.p('if (len < 0)')
+    ri.cw.p('goto free_list;')
+    ri.cw.nl()
+    ri.cw.p(f"err = {ri.nl.parse_cb_run('ynl_dump_trampoline', '&yds', False, indent=2)};")
+    ri.cw.p('if (err < 0)')
+    ri.cw.p('goto free_list;')
+    ri.cw.block_end(line='while (err > 0);')
+    ri.cw.nl()
+
+    ri.cw.p('return yds.first;')
+    ri.cw.nl()
+    ri.cw.p('free_list:')
+    ri.cw.p(call_free(ri, rdir(direction), 'yds.first'))
+    ri.cw.p('return NULL;')
+    ri.cw.block_end()
+
+
+def call_free(ri, direction, var):
+    return f"{op_prefix(ri, direction)}_free({var});"
+
+
+def free_arg_name(direction):
+    if direction:
+        return direction_to_suffix[direction][1:]
+    return 'obj'
+
+
+def print_free_prototype(ri, direction, suffix=';'):
+    name = op_prefix(ri, direction)
+    arg = free_arg_name(direction)
+    ri.cw.write_func_prot('void', f"{name}_free", [f"struct {name} *{arg}"], suffix=suffix)
+
+
+def _print_type(ri, direction, struct):
+    suffix = f'_{ri.type_name}{direction_to_suffix[direction]}'
+
+    if ri.op_mode == 'dump':
+        suffix += '_dump'
+
+    ri.cw.block_start(line=f"struct {ri.family['name']}{suffix}")
+
+    meta_started = False
+    for _, attr in struct.member_list():
+        for type_filter in ['len', 'bit']:
+            line = attr.presence_member(ri.ku_space, type_filter)
+            if line:
+                if not meta_started:
+                    ri.cw.block_start(line=f"struct")
+                    meta_started = True
+                ri.cw.p(line)
+    if meta_started:
+        ri.cw.block_end(line='_present;')
+        ri.cw.nl()
+
+    for arg in struct.inherited:
+        ri.cw.p(f"__u32 {arg};")
+
+    for _, attr in struct.member_list():
+        attr.struct_member(ri)
+
+    ri.cw.block_end(line=';')
+    ri.cw.nl()
+
+
+def print_type(ri, direction):
+    _print_type(ri, direction, ri.struct[direction])
+
+
+def print_type_full(ri, struct):
+    _print_type(ri, "", struct)
+
+
+def print_type_helpers(ri, direction, deref=False):
+    print_free_prototype(ri, direction)
+
+    if ri.ku_space == 'user' and direction == 'request':
+        for _, attr in ri.struct[direction].member_list():
+            attr.setter(ri, ri.attr_set, direction, deref=deref)
+    ri.cw.nl()
+
+
+def print_req_type_helpers(ri):
+    print_type_helpers(ri, "request")
+
+
+def print_rsp_type_helpers(ri):
+    if 'reply' not in ri.op[ri.op_mode]:
+        return
+    print_type_helpers(ri, "reply")
+
+
+def print_parse_prototype(ri, direction, terminate=True):
+    suffix = "_rsp" if direction == "reply" else "_req"
+    term = ';' if terminate else ''
+
+    ri.cw.write_func_prot('void', f"{ri.op.render_name}{suffix}_parse",
+                          ['const struct nlattr **tb',
+                           f"struct {ri.op.render_name}{suffix} *req"],
+                          suffix=term)
+
+
+def print_req_type(ri):
+    print_type(ri, "request")
+
+
+def print_rsp_type(ri):
+    if (ri.op_mode == 'do' or ri.op_mode == 'dump') and 'reply' in ri.op[ri.op_mode]:
+        direction = 'reply'
+    elif ri.op_mode == 'event':
+        direction = 'reply'
+    else:
+        return
+    print_type(ri, direction)
+
+
+def print_wrapped_type(ri):
+    ri.cw.block_start(line=f"{type_name(ri, 'reply')}")
+    if ri.op_mode == 'dump':
+        ri.cw.p(f"{type_name(ri, 'reply')} *next;")
+    elif ri.op_mode == 'notify' or ri.op_mode == 'event':
+        ri.cw.p('__u16 family;')
+        ri.cw.p('__u8 cmd;')
+        ri.cw.p(f"void (*free)({type_name(ri, 'reply')} *ntf);")
+    ri.cw.p(f"{type_name(ri, 'reply', deref=True)} obj __attribute__ ((aligned (8)));")
+    ri.cw.block_end(line=';')
+    ri.cw.nl()
+    print_free_prototype(ri, 'reply')
+    ri.cw.nl()
+
+
+def _free_type_members_iter(ri, struct):
+    for _, attr in struct.member_list():
+        if attr.free_needs_iter():
+            ri.cw.p('unsigned int i;')
+            ri.cw.nl()
+            break
+
+
+def _free_type_members(ri, var, struct, ref=''):
+    for _, attr in struct.member_list():
+        attr.free(ri, var, ref)
+
+
+def _free_type(ri, direction, struct):
+    var = free_arg_name(direction)
+
+    print_free_prototype(ri, direction, suffix='')
+    ri.cw.block_start()
+    _free_type_members_iter(ri, struct)
+    _free_type_members(ri, var, struct)
+    if direction:
+        ri.cw.p(f'free({var});')
+    ri.cw.block_end()
+    ri.cw.nl()
+
+
+def free_rsp_nested(ri, struct):
+    _free_type(ri, "", struct)
+
+
+def print_rsp_free(ri):
+    if 'reply' not in ri.op[ri.op_mode]:
+        return
+    _free_type(ri, 'reply', ri.struct['reply'])
+
+
+def print_dump_type_free(ri):
+    sub_type = type_name(ri, 'reply')
+
+    print_free_prototype(ri, 'reply', suffix='')
+    ri.cw.block_start()
+    ri.cw.p(f"{sub_type} *next = rsp;")
+    ri.cw.nl()
+    ri.cw.block_start(line='while (next)')
+    _free_type_members_iter(ri, ri.struct['reply'])
+    ri.cw.p('rsp = next;')
+    ri.cw.p('next = rsp->next;')
+    ri.cw.nl()
+
+    _free_type_members(ri, 'rsp', ri.struct['reply'], ref='obj.')
+    ri.cw.p(f'free(rsp);')
+    ri.cw.block_end()
+    ri.cw.block_end()
+    ri.cw.nl()
+
+
+def print_ntf_type_free(ri):
+    print_free_prototype(ri, 'reply', suffix='')
+    ri.cw.block_start()
+    _free_type_members_iter(ri, ri.struct['reply'])
+    _free_type_members(ri, 'rsp', ri.struct['reply'], ref='obj.')
+    ri.cw.p(f'free(rsp);')
+    ri.cw.block_end()
+    ri.cw.nl()
+
+
+def print_ntf_parse_prototype(family, cw, suffix=';'):
+    cw.write_func_prot('struct ynl_ntf_base_type *', f"{family['name']}_ntf_parse",
+                       ['struct ynl_sock *ys'], suffix=suffix)
+
+
+def print_ntf_type_parse(family, cw, ku_mode):
+    print_ntf_parse_prototype(family, cw, suffix='')
+    cw.block_start()
+    cw.write_func_lvar(['struct genlmsghdr *genlh;',
+                        'struct nlmsghdr *nlh;',
+                        'struct ynl_parse_arg yarg = { .ys = ys, };',
+                        'struct ynl_ntf_base_type *rsp;',
+                        'int len, err;',
+                        'mnl_cb_t parse;'])
+    cw.p('len = mnl_socket_recvfrom(ys->sock, ys->rx_buf, MNL_SOCKET_BUFFER_SIZE);')
+    cw.p('if (len < (ssize_t)(sizeof(*nlh) + sizeof(*genlh)))')
+    cw.p('return NULL;')
+    cw.nl()
+    cw.p('nlh = (struct nlmsghdr *)ys->rx_buf;')
+    cw.p('genlh = mnl_nlmsg_get_payload(nlh);')
+    cw.nl()
+    cw.block_start(line='switch (genlh->cmd)')
+    for ntf_op in sorted(family.all_notify.keys()):
+        op = family.ops[ntf_op]
+        ri = RenderInfo(cw, family, ku_mode, op, ntf_op, "notify")
+        for ntf in op['notify']['cmds']:
+            cw.p(f"case {ntf.enum_name}:")
+        cw.p(f"rsp = calloc(1, sizeof({type_name(ri, 'notify')}));")
+        cw.p(f"parse = {op_prefix(ri, 'reply', deref=True)}_parse;")
+        cw.p(f"yarg.rsp_policy = &{ri.struct['reply'].render_name}_nest;")
+        cw.p(f"rsp->free = (void *){op_prefix(ri, 'notify')}_free;")
+        cw.p('break;')
+    for op_name, op in family.ops.items():
+        if 'event' not in op:
+            continue
+        ri = RenderInfo(cw, family, ku_mode, op, op_name, "event")
+        cw.p(f"case {op.enum_name}:")
+        cw.p(f"rsp = calloc(1, sizeof({type_name(ri, 'event')}));")
+        cw.p(f"parse = {op_prefix(ri, 'reply', deref=True)}_parse;")
+        cw.p(f"yarg.rsp_policy = &{ri.struct['reply'].render_name}_nest;")
+        cw.p(f"rsp->free = (void *){op_prefix(ri, 'notify')}_free;")
+        cw.p('break;')
+    cw.p('default:')
+    cw.p('ynl_error_unknown_notification(ys, genlh->cmd);')
+    cw.p('return NULL;')
+    cw.block_end()
+    cw.nl()
+    cw.p('yarg.data = rsp->data;')
+    cw.nl()
+    cw.p(f"err = {cw.nlib.parse_cb_run('parse', '&yarg', True)};")
+    cw.p('if (err < 0)')
+    cw.p('goto err_free;')
+    cw.nl()
+    cw.p('rsp->family = nlh->nlmsg_type;')
+    cw.p('rsp->cmd = genlh->cmd;')
+    cw.p('return rsp;')
+    cw.nl()
+    cw.p('err_free:')
+    cw.p('free(rsp);')
+    cw.p('return NULL;')
+    cw.block_end()
+    cw.nl()
+
+
+def print_req_policy_fwd(cw, struct, ri=None, terminate=True):
+    if terminate and ri and kernel_can_gen_family_struct(struct.family):
+        return
+
+    if terminate:
+        prefix = 'extern '
+    else:
+        if kernel_can_gen_family_struct(struct.family) and ri:
+            prefix = 'static '
+        else:
+            prefix = ''
+
+    suffix = ';' if terminate else ' = {'
+
+    max_attr = struct.attr_max_val
+    if ri:
+        name = ri.op.render_name
+        if ri.op.dual_policy:
+            name += '_' + ri.op_mode
+    else:
+        name = struct.render_name
+    cw.p(f"{prefix}const struct nla_policy {name}_nl_policy[{max_attr.enum_name} + 1]{suffix}")
+
+
+def print_req_policy(cw, struct, ri=None):
+    print_req_policy_fwd(cw, struct, ri=ri, terminate=False)
+    for _, arg in struct.member_list():
+        arg.attr_policy(cw)
+    cw.p("};")
+
+
+def kernel_can_gen_family_struct(family):
+    return family.proto == 'genetlink'
+
+
+def print_kernel_op_table_fwd(family, cw, terminate):
+    exported = not kernel_can_gen_family_struct(family)
+
+    if not terminate or exported:
+        cw.p(f"/* Ops table for {family.name} */")
+
+        pol_to_struct = {'global': 'genl_small_ops',
+                         'per-op': 'genl_ops',
+                         'split': 'genl_split_ops'}
+        struct_type = pol_to_struct[family.kernel_policy]
+
+        if family.kernel_policy == 'split':
+            cnt = 0
+            for op in family.ops.values():
+                if 'do' in op:
+                    cnt += 1
+                if 'dump' in op:
+                    cnt += 1
+        else:
+            cnt = len(family.ops)
+
+        qual = 'static const' if not exported else 'const'
+        line = f"{qual} struct {struct_type} {family.name}_nl_ops[{cnt}]"
+        if terminate:
+            cw.p(f"extern {line};")
+        else:
+            cw.block_start(line=line + ' =')
+
+    if not terminate:
+        return
+
+    cw.nl()
+    for name in family.hooks['pre']['do']['list']:
+        cw.write_func_prot('int', c_lower(name),
+                           ['const struct genl_split_ops *ops',
+                            'struct sk_buff *skb', 'struct genl_info *info'], suffix=';')
+    for name in family.hooks['post']['do']['list']:
+        cw.write_func_prot('void', c_lower(name),
+                           ['const struct genl_split_ops *ops',
+                            'struct sk_buff *skb', 'struct genl_info *info'], suffix=';')
+    for name in family.hooks['pre']['dump']['list']:
+        cw.write_func_prot('int', c_lower(name),
+                           ['struct netlink_callback *cb'], suffix=';')
+    for name in family.hooks['post']['dump']['list']:
+        cw.write_func_prot('int', c_lower(name),
+                           ['struct netlink_callback *cb'], suffix=';')
+
+    cw.nl()
+
+    for op_name, op in family.ops.items():
+        if op.is_async:
+            continue
+
+        if 'do' in op:
+            name = c_lower(f"{family.name}-nl-{op_name}-doit")
+            cw.write_func_prot('int', name,
+                               ['struct sk_buff *skb', 'struct genl_info *info'], suffix=';')
+
+        if 'dump' in op:
+            name = c_lower(f"{family.name}-nl-{op_name}-dumpit")
+            cw.write_func_prot('int', name,
+                               ['struct sk_buff *skb', 'struct netlink_callback *cb'], suffix=';')
+    cw.nl()
+
+
+def print_kernel_op_table_hdr(family, cw):
+    print_kernel_op_table_fwd(family, cw, terminate=True)
+
+
+def print_kernel_op_table(family, cw):
+    print_kernel_op_table_fwd(family, cw, terminate=False)
+    if family.kernel_policy == 'global' or family.kernel_policy == 'per-op':
+        for op_name, op in family.ops.items():
+            if op.is_async:
+                continue
+
+            cw.block_start()
+            members = [('cmd', op.enum_name)]
+            if 'dont-validate' in op:
+                members.append(('validate',
+                                ' | '.join([c_upper('genl-dont-validate-' + x)
+                                            for x in op['dont-validate']])), )
+            for op_mode in ['do', 'dump']:
+                if op_mode in op:
+                    name = c_lower(f"{family.name}-nl-{op_name}-{op_mode}it")
+                    members.append((op_mode + 'it', name))
+            if family.kernel_policy == 'per-op':
+                struct = Struct(family, op['attribute-set'],
+                                type_list=op['do']['request']['attributes'])
+
+                name = c_lower(f"{family.name}-{op_name}-nl-policy")
+                members.append(('policy', name))
+                members.append(('maxattr', struct.attr_max_val.enum_name))
+            if 'flags' in op:
+                members.append(('flags', ' | '.join([c_upper('genl-' + x) for x in op['flags']])))
+            cw.write_struct_init(members)
+            cw.block_end(line=',')
+    elif family.kernel_policy == 'split':
+        cb_names = {'do':   {'pre': 'pre_doit', 'post': 'post_doit'},
+                    'dump': {'pre': 'start', 'post': 'done'}}
+
+        for op_name, op in family.ops.items():
+            for op_mode in ['do', 'dump']:
+                if op.is_async or op_mode not in op:
+                    continue
+
+                cw.block_start()
+                members = [('cmd', op.enum_name)]
+                if 'dont-validate' in op:
+                    members.append(('validate',
+                                    ' | '.join([c_upper('genl-dont-validate-' + x)
+                                                for x in op['dont-validate']])), )
+                name = c_lower(f"{family.name}-nl-{op_name}-{op_mode}it")
+                if 'pre' in op[op_mode]:
+                    members.append((cb_names[op_mode]['pre'], c_lower(op[op_mode]['pre'])))
+                members.append((op_mode + 'it', name))
+                if 'post' in op[op_mode]:
+                    members.append((cb_names[op_mode]['post'], c_lower(op[op_mode]['post'])))
+                if 'request' in op[op_mode]:
+                    struct = Struct(family, op['attribute-set'],
+                                    type_list=op[op_mode]['request']['attributes'])
+
+                    if op.dual_policy:
+                        name = c_lower(f"{family.name}-{op_name}-{op_mode}-nl-policy")
+                    else:
+                        name = c_lower(f"{family.name}-{op_name}-nl-policy")
+                    members.append(('policy', name))
+                    members.append(('maxattr', struct.attr_max_val.enum_name))
+                flags = (op['flags'] if 'flags' in op else []) + ['cmd-cap-' + op_mode]
+                members.append(('flags', ' | '.join([c_upper('genl-' + x) for x in flags])))
+                cw.write_struct_init(members)
+                cw.block_end(line=',')
+
+    cw.block_end(line=';')
+    cw.nl()
+
+
+def print_kernel_mcgrp_hdr(family, cw):
+    if not family.mcgrps['list']:
+        return
+
+    cw.block_start('enum')
+    for grp in family.mcgrps['list']:
+        grp_id = c_upper(f"{family.name}-nlgrp-{grp['name']},")
+        cw.p(grp_id)
+    cw.block_end(';')
+    cw.nl()
+
+
+def print_kernel_mcgrp_src(family, cw):
+    if not family.mcgrps['list']:
+        return
+
+    cw.block_start('static const struct genl_multicast_group ' + family.name + '_nl_mcgrps[] =')
+    for grp in family.mcgrps['list']:
+        name = grp['name']
+        grp_id = c_upper(f"{family.name}-nlgrp-{name}")
+        cw.p('[' + grp_id + '] = { "' + name + '", },')
+    cw.block_end(';')
+    cw.nl()
+
+
+def print_kernel_family_struct_hdr(family, cw):
+    if not kernel_can_gen_family_struct(family):
+        return
+
+    cw.p(f"extern struct genl_family {family.name}_nl_family;")
+    cw.nl()
+
+
+def print_kernel_family_struct_src(family, cw):
+    if not kernel_can_gen_family_struct(family):
+        return
+
+    cw.block_start(f"struct genl_family {family.name}_nl_family __ro_after_init =")
+    cw.p('.name\t\t= ' + family.fam_key + ',')
+    cw.p('.version\t= ' + family.ver_key + ',')
+    cw.p('.netnsok\t= true,')
+    cw.p('.parallel_ops\t= true,')
+    cw.p('.module\t\t= THIS_MODULE,')
+    if family.kernel_policy == 'per-op':
+        cw.p(f'.ops\t\t= {family.name}_nl_ops,')
+        cw.p(f'.n_ops\t\t= ARRAY_SIZE({family.name}_nl_ops),')
+    elif family.kernel_policy == 'split':
+        cw.p(f'.split_ops\t= {family.name}_nl_ops,')
+        cw.p(f'.n_split_ops\t= ARRAY_SIZE({family.name}_nl_ops),')
+    if family.mcgrps['list']:
+        cw.p(f'.mcgrps\t\t= {family.name}_nl_mcgrps,')
+        cw.p(f'.n_mcgrps\t= ARRAY_SIZE({family.name}_nl_mcgrps),')
+    cw.block_end(';')
+
+
+def uapi_enum_start(family, cw, obj, ckey='', enum_name='enum-name'):
+    start_line = 'enum'
+    if enum_name in obj:
+        if obj[enum_name]:
+            start_line = 'enum ' + c_lower(obj[enum_name])
+    elif ckey and ckey in obj:
+        start_line = 'enum ' + family.name + '_' + c_lower(obj[ckey])
+    cw.block_start(line=start_line)
+
+
+def render_uapi(family, cw):
+    hdr_prot = f"_UAPI_LINUX_{family.name.upper()}_H"
+    cw.p('#ifndef ' + hdr_prot)
+    cw.p('#define ' + hdr_prot)
+    cw.nl()
+
+    defines = [(family.fam_key, family["name"]),
+               (family.ver_key, family.get('version', 1))]
+    cw.writes_defines(defines)
+    cw.nl()
+
+    defines = []
+    for const in family['definitions']:
+        if const['type'] != 'const':
+            cw.writes_defines(defines)
+            defines = []
+            cw.nl()
+
+        if const['type'] == 'enum':
+            enum = family.consts[const['name']]
+
+            if enum.has_doc():
+                cw.p('/**')
+                doc = ''
+                if 'doc' in enum:
+                    doc = ' - ' + enum['doc']
+                cw.write_doc_line(enum.enum_name + doc)
+                for entry in enum.entry_list:
+                    if entry.has_doc():
+                        doc = '@' + entry.c_name + ': ' + entry['doc']
+                        cw.write_doc_line(doc)
+                cw.p(' */')
+
+            uapi_enum_start(family, cw, const, 'name')
+            first = True
+            name_pfx = const.get('name-prefix', f"{family.name}-{const['name']}-")
+            for entry in enum.entry_list:
+                suffix = ','
+                if first and 'value-start' in const:
+                    suffix = f" = {const['value-start']}" + suffix
+                first = False
+                cw.p(entry.c_name + suffix)
+
+            if const.get('render-max', False):
+                cw.nl()
+                max_name = c_upper(name_pfx + 'max')
+                cw.p('__' + max_name + ',')
+                cw.p(max_name + ' = (__' + max_name + ' - 1)')
+            cw.block_end(line=';')
+            cw.nl()
+        elif const['type'] == 'flags':
+            uapi_enum_start(family, cw, const, 'name')
+            i = const.get('value-start', 0)
+            for item in const['entries']:
+                item_name = item
+                if 'name-prefix' in const:
+                    item_name = c_upper(const['name-prefix'] + item)
+                cw.p(f'{item_name} = {1 << i},')
+                i += 1
+            cw.block_end(line=';')
+            cw.nl()
+        elif const['type'] == 'const':
+            defines.append([c_upper(family.get('c-define-name',
+                                               f"{family.name}-{const['name']}")),
+                            const['value']])
+
+    if defines:
+        cw.writes_defines(defines)
+        cw.nl()
+
+    max_by_define = family.get('max-by-define', False)
+
+    for _, attr_set in family.attr_sets_list:
+        if attr_set.subset_of:
+            continue
+
+        cnt_name = c_upper(family.get('attr-cnt-name', f"__{attr_set.name_prefix}MAX"))
+        max_value = f"({cnt_name} - 1)"
+
+        val = 0
+        uapi_enum_start(family, cw, attr_set.yaml, 'enum-name')
+        for _, attr in attr_set.items():
+            suffix = ','
+            if attr['value'] != val:
+                suffix = f" = {attr['value']},"
+                val = attr['value']
+            val += 1
+            cw.p(attr.enum_name + suffix)
+        cw.nl()
+        cw.p(cnt_name + ('' if max_by_define else ','))
+        if not max_by_define:
+            cw.p(f"{attr_set.max_name} = {max_value}")
+        cw.block_end(line=';')
+        if max_by_define:
+            cw.p(f"#define {attr_set.max_name} {max_value}")
+        cw.nl()
+
+    # Commands
+    separate_ntf = 'async-prefix' in family['operations']
+
+    max_name = c_upper(family.get('cmd-max-name', f"{family.op_prefix}MAX"))
+    cnt_name = c_upper(family.get('cmd-cnt-name', f"__{family.op_prefix}MAX"))
+    max_value = f"({cnt_name} - 1)"
+
+    uapi_enum_start(family, cw, family['operations'], 'enum-name')
+    for _, op in family.ops_list:
+        if separate_ntf and ('notify' in op or 'event' in op):
+            continue
+
+        suffix = ','
+        if 'value' in op:
+            suffix = f" = {op['value']},"
+        cw.p(op.enum_name + suffix)
+    cw.nl()
+    cw.p(cnt_name + ('' if max_by_define else ','))
+    if not max_by_define:
+        cw.p(f"{max_name} = {max_value}")
+    cw.block_end(line=';')
+    if max_by_define:
+        cw.p(f"#define {max_name} {max_value}")
+    cw.nl()
+
+    if separate_ntf:
+        uapi_enum_start(family, cw, family['operations'], enum_name='async-enum')
+        for _, op in family.ops_list:
+            if separate_ntf and not ('notify' in op or 'event' in op):
+                continue
+
+            suffix = ','
+            if 'value' in op:
+                suffix = f" = {op['value']},"
+            cw.p(op.enum_name + suffix)
+        cw.block_end(line=';')
+        cw.nl()
+
+    # Multicast
+    defines = []
+    for grp in family.mcgrps['list']:
+        name = grp['name']
+        defines.append([c_upper(grp.get('c-define-name', f"{family.name}-mcgrp-{name}")),
+                        f'{name}'])
+    cw.nl()
+    if defines:
+        cw.writes_defines(defines)
+        cw.nl()
+
+    cw.p(f'#endif /* {hdr_prot} */')
+
+
+def find_kernel_root(full_path):
+    sub_path = ''
+    while True:
+        sub_path = os.path.join(os.path.basename(full_path), sub_path)
+        full_path = os.path.dirname(full_path)
+        maintainers = os.path.join(full_path, "MAINTAINERS")
+        if os.path.exists(maintainers):
+            return full_path, sub_path[:-1]
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Netlink simple parsing generator')
+    parser.add_argument('--mode', dest='mode', type=str, required=True)
+    parser.add_argument('--spec', dest='spec', type=str, required=True)
+    parser.add_argument('--header', dest='header', action='store_true', default=None)
+    parser.add_argument('--source', dest='header', action='store_false')
+    parser.add_argument('--user-header', nargs='+', default=[])
+    parser.add_argument('-o', dest='out_file', type=str)
+    args = parser.parse_args()
+
+    out_file = open(args.out_file, 'w+') if args.out_file else os.sys.stdout
+
+    if args.header is None:
+        parser.error("--header or --source is required")
+
+    try:
+        parsed = Family(args.spec)
+    except yaml.YAMLError as exc:
+        print(exc)
+        os.sys.exit(1)
+        return
+
+    cw = CodeWriter(BaseNlLib(), out_file)
+
+    _, spec_kernel = find_kernel_root(args.spec)
+    if args.mode == 'uapi':
+        cw.p('/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */')
+    else:
+        if args.header:
+            cw.p('/* SPDX-License-Identifier: BSD-3-Clause */')
+        else:
+            cw.p('// SPDX-License-Identifier: BSD-3-Clause')
+    cw.p("/* Do not edit directly, auto-generated from: */")
+    cw.p(f"/*\t{spec_kernel} */")
+    cw.p(f"/* YNL-GEN {args.mode} {'header' if args.header else 'source'} */")
+    cw.nl()
+
+    if args.mode == 'uapi':
+        render_uapi(parsed, cw)
+        return
+
+    hdr_prot = f"_LINUX_{parsed.name.upper()}_GEN_H"
+    if args.header:
+        cw.p('#ifndef ' + hdr_prot)
+        cw.p('#define ' + hdr_prot)
+        cw.nl()
+
+    if args.mode == 'kernel':
+        cw.p('#include <net/netlink.h>')
+        cw.p('#include <net/genetlink.h>')
+        cw.nl()
+        if not args.header:
+            if args.out_file:
+                cw.p(f'#include "{os.path.basename(args.out_file[:-2])}.h"')
+            cw.nl()
+    headers = [parsed.uapi_header]
+    for definition in parsed['definitions']:
+        if 'header' in definition:
+            headers.append(definition['header'])
+    for one in headers:
+        cw.p(f"#include <{one}>")
+    cw.nl()
+
+    if args.mode == "user":
+        if not args.header:
+            cw.p("#include <stdlib.h>")
+            cw.p("#include <stdio.h>")
+            cw.p("#include <string.h>")
+            cw.p("#include <libmnl/libmnl.h>")
+            cw.p("#include <linux/genetlink.h>")
+            cw.nl()
+            for one in args.user_header:
+                cw.p(f'#include "{one}"')
+        else:
+            cw.p('struct ynl_sock;')
+        cw.nl()
+
+    if args.mode == "kernel":
+        if args.header:
+            for _, struct in sorted(parsed.pure_nested_structs.items()):
+                if struct.request:
+                    cw.p('/* Common nested types */')
+                    break
+            for attr_set, struct in sorted(parsed.pure_nested_structs.items()):
+                if struct.request:
+                    print_req_policy_fwd(cw, struct)
+            cw.nl()
+
+            if parsed.kernel_policy == 'global':
+                cw.p(f"/* Global operation policy for {parsed.name} */")
+
+                struct = Struct(parsed, parsed.global_policy_set, type_list=parsed.global_policy)
+                print_req_policy_fwd(cw, struct)
+                cw.nl()
+
+            if parsed.kernel_policy in {'per-op', 'split'}:
+                for op_name, op in parsed.ops.items():
+                    if 'do' in op and 'event' not in op:
+                        ri = RenderInfo(cw, parsed, args.mode, op, op_name, "do")
+                        print_req_policy_fwd(cw, ri.struct['request'], ri=ri)
+                        cw.nl()
+
+            print_kernel_op_table_hdr(parsed, cw)
+            print_kernel_mcgrp_hdr(parsed, cw)
+            print_kernel_family_struct_hdr(parsed, cw)
+        else:
+            for _, struct in sorted(parsed.pure_nested_structs.items()):
+                if struct.request:
+                    cw.p('/* Common nested types */')
+                    break
+            for attr_set, struct in sorted(parsed.pure_nested_structs.items()):
+                if struct.request:
+                    print_req_policy(cw, struct)
+            cw.nl()
+
+            if parsed.kernel_policy == 'global':
+                cw.p(f"/* Global operation policy for {parsed.name} */")
+
+                struct = Struct(parsed, parsed.global_policy_set, type_list=parsed.global_policy)
+                print_req_policy(cw, struct)
+                cw.nl()
+
+            for op_name, op in parsed.ops.items():
+                if parsed.kernel_policy in {'per-op', 'split'}:
+                    for op_mode in {'do', 'dump'}:
+                        if op_mode in op and 'request' in op[op_mode]:
+                            cw.p(f"/* {op.enum_name} - {op_mode} */")
+                            ri = RenderInfo(cw, parsed, args.mode, op, op_name, op_mode)
+                            print_req_policy(cw, ri.struct['request'], ri=ri)
+                            cw.nl()
+
+            print_kernel_op_table(parsed, cw)
+            print_kernel_mcgrp_src(parsed, cw)
+            print_kernel_family_struct_src(parsed, cw)
+
+    if args.mode == "user":
+        has_ntf = False
+        if args.header:
+            cw.p('/* Common nested types */')
+            for attr_set, struct in sorted(parsed.pure_nested_structs.items()):
+                ri = RenderInfo(cw, parsed, args.mode, "", "", "", attr_set)
+                print_type_full(ri, struct)
+
+            for op_name, op in parsed.ops.items():
+                cw.p(f"/* ============== {op.enum_name} ============== */")
+
+                if 'do' in op and 'event' not in op:
+                    cw.p(f"/* {op.enum_name} - do */")
+                    ri = RenderInfo(cw, parsed, args.mode, op, op_name, "do")
+                    print_req_type(ri)
+                    print_req_type_helpers(ri)
+                    cw.nl()
+                    print_rsp_type(ri)
+                    print_rsp_type_helpers(ri)
+                    cw.nl()
+                    print_req_prototype(ri)
+                    cw.nl()
+
+                if 'dump' in op:
+                    cw.p(f"/* {op.enum_name} - dump */")
+                    ri = RenderInfo(cw, parsed, args.mode, op, op_name, 'dump')
+                    if 'request' in op['dump']:
+                        print_req_type(ri)
+                        print_req_type_helpers(ri)
+                    if not ri.type_consistent:
+                        print_rsp_type(ri)
+                    print_wrapped_type(ri)
+                    print_dump_prototype(ri)
+                    cw.nl()
+
+                if 'notify' in op:
+                    cw.p(f"/* {op.enum_name} - notify */")
+                    ri = RenderInfo(cw, parsed, args.mode, op, op_name, 'notify')
+                    has_ntf = True
+                    if not ri.type_consistent:
+                        raise Exception('Only notifications with consistent types supported')
+                    print_wrapped_type(ri)
+
+                if 'event' in op:
+                    ri = RenderInfo(cw, parsed, args.mode, op, op_name, 'event')
+                    cw.p(f"/* {op.enum_name} - event */")
+                    print_rsp_type(ri)
+                    cw.nl()
+                    print_wrapped_type(ri)
+
+            if has_ntf:
+                cw.p('/* --------------- Common notification parsing --------------- */')
+                print_ntf_parse_prototype(parsed, cw)
+            cw.nl()
+        else:
+            cw.p('/* Policies */')
+            for name, _ in parsed.attr_sets.items():
+                struct = Struct(parsed, name)
+                put_typol_fwd(cw, struct)
+            cw.nl()
+
+            for name, _ in parsed.attr_sets.items():
+                struct = Struct(parsed, name)
+                put_typol(cw, struct)
+
+            cw.p('/* Common nested types */')
+            for attr_set, struct in sorted(parsed.pure_nested_structs.items()):
+                ri = RenderInfo(cw, parsed, args.mode, "", "", "", attr_set)
+
+                free_rsp_nested(ri, struct)
+                if struct.request:
+                    put_req_nested(ri, struct)
+                if struct.reply:
+                    parse_rsp_nested(ri, struct)
+
+            for op_name, op in parsed.ops.items():
+                cw.p(f"/* ============== {op.enum_name} ============== */")
+                if 'do' in op and 'event' not in op:
+                    cw.p(f"/* {op.enum_name} - do */")
+                    ri = RenderInfo(cw, parsed, args.mode, op, op_name, "do")
+                    print_rsp_free(ri)
+                    parse_rsp_msg(ri)
+                    print_req(ri)
+                    cw.nl()
+
+                if 'dump' in op:
+                    cw.p(f"/* {op.enum_name} - dump */")
+                    ri = RenderInfo(cw, parsed, args.mode, op, op_name, "dump")
+                    if not ri.type_consistent:
+                        parse_rsp_msg(ri, deref=True)
+                    print_dump_type_free(ri)
+                    print_dump(ri)
+                    cw.nl()
+
+                if 'notify' in op:
+                    cw.p(f"/* {op.enum_name} - notify */")
+                    ri = RenderInfo(cw, parsed, args.mode, op, op_name, 'notify')
+                    has_ntf = True
+                    if not ri.type_consistent:
+                        raise Exception('Only notifications with consistent types supported')
+                    print_ntf_type_free(ri)
+
+                if 'event' in op:
+                    cw.p(f"/* {op.enum_name} - event */")
+                    has_ntf = True
+
+                    ri = RenderInfo(cw, parsed, args.mode, op, op_name, "do")
+                    parse_rsp_msg(ri)
+
+                    ri = RenderInfo(cw, parsed, args.mode, op, op_name, "event")
+                    print_ntf_type_free(ri)
+
+            if has_ntf:
+                cw.p('/* --------------- Common notification parsing --------------- */')
+                print_ntf_type_parse(parsed, cw, args.mode)
+
+    if args.header:
+        cw.p(f'#endif /* {hdr_prot} */')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/net/ynl/ynl-regen.sh b/tools/net/ynl/ynl-regen.sh
new file mode 100755
index 000000000000..43989ae48ed0
--- /dev/null
+++ b/tools/net/ynl/ynl-regen.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# SPDX-License-Identifier: BSD-3-Clause
+
+TOOL=$(dirname $(realpath $0))/ynl-gen-c.py
+
+force=
+
+while [ ! -z "$1" ]; do
+  case "$1" in
+    -f ) force=yes; shift ;;
+    * )  echo "Unrecognized option '$1'"; exit 1 ;;
+  esac
+done
+
+KDIR=$(dirname $(dirname $(dirname $(dirname $(realpath $0)))))
+
+files=$(git grep --files-with-matches '^/\* YNL-GEN \(kernel\|uapi\)')
+for f in $files; do
+    # params:     0       1      2     3
+    #         $YAML YNL-GEN kernel $mode
+    params=( $(git grep -B1 -h '/\* YNL-GEN' $f | sed 's@/\*\(.*\)\*/@\1@') )
+
+    if [ $f -nt ${params[0]} -a -z "$force" ]; then
+	echo -e "\tSKIP $f"
+	continue
+    fi
+
+    echo -e "\tGEN ${params[2]}\t$f"
+    $TOOL --mode ${params[2]} --${params[3]} --spec $KDIR/${params[0]} -o $f
+done
-- 
cgit 


From e4b48ed460d369070ce1d491800acf50ab9701d2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 20 Jan 2023 09:50:41 -0800
Subject: tools: ynl: add a completely generic client

Add a CLI sample which can take in arbitrary request
in JSON format, convert it to Netlink and do the inverse
for output.

It's meant as a development tool primarily and perhaps
for selftests which need to tickle netlink in a special way.

Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/net/ynl/samples/cli.py |  47 ++++
 tools/net/ynl/samples/ynl.py | 534 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 581 insertions(+)
 create mode 100755 tools/net/ynl/samples/cli.py
 create mode 100644 tools/net/ynl/samples/ynl.py

(limited to 'tools')

diff --git a/tools/net/ynl/samples/cli.py b/tools/net/ynl/samples/cli.py
new file mode 100755
index 000000000000..b27159c70710
--- /dev/null
+++ b/tools/net/ynl/samples/cli.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: BSD-3-Clause
+
+import argparse
+import json
+import pprint
+import time
+
+from ynl import YnlFamily
+
+
+def main():
+    parser = argparse.ArgumentParser(description='YNL CLI sample')
+    parser.add_argument('--spec', dest='spec', type=str, required=True)
+    parser.add_argument('--schema', dest='schema', type=str)
+    parser.add_argument('--json', dest='json_text', type=str)
+    parser.add_argument('--do', dest='do', type=str)
+    parser.add_argument('--dump', dest='dump', type=str)
+    parser.add_argument('--sleep', dest='sleep', type=int)
+    parser.add_argument('--subscribe', dest='ntf', type=str)
+    args = parser.parse_args()
+
+    attrs = {}
+    if args.json_text:
+        attrs = json.loads(args.json_text)
+
+    ynl = YnlFamily(args.spec, args.schema)
+
+    if args.ntf:
+        ynl.ntf_subscribe(args.ntf)
+
+    if args.sleep:
+        time.sleep(args.sleep)
+
+    if args.do or args.dump:
+        method = getattr(ynl, args.do if args.do else args.dump)
+
+        reply = method(attrs, dump=bool(args.dump))
+        pprint.PrettyPrinter().pprint(reply)
+
+    if args.ntf:
+        ynl.check_ntf()
+        pprint.PrettyPrinter().pprint(ynl.async_msg_queue)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/net/ynl/samples/ynl.py b/tools/net/ynl/samples/ynl.py
new file mode 100644
index 000000000000..b71523d71d46
--- /dev/null
+++ b/tools/net/ynl/samples/ynl.py
@@ -0,0 +1,534 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+import functools
+import jsonschema
+import os
+import random
+import socket
+import struct
+import yaml
+
+#
+# Generic Netlink code which should really be in some library, but I can't quickly find one.
+#
+
+
+class Netlink:
+    # Netlink socket
+    SOL_NETLINK = 270
+
+    NETLINK_ADD_MEMBERSHIP = 1
+    NETLINK_CAP_ACK = 10
+    NETLINK_EXT_ACK = 11
+
+    # Netlink message
+    NLMSG_ERROR = 2
+    NLMSG_DONE = 3
+
+    NLM_F_REQUEST = 1
+    NLM_F_ACK = 4
+    NLM_F_ROOT = 0x100
+    NLM_F_MATCH = 0x200
+    NLM_F_APPEND = 0x800
+
+    NLM_F_CAPPED = 0x100
+    NLM_F_ACK_TLVS = 0x200
+
+    NLM_F_DUMP = NLM_F_ROOT | NLM_F_MATCH
+
+    NLA_F_NESTED = 0x8000
+    NLA_F_NET_BYTEORDER = 0x4000
+
+    NLA_TYPE_MASK = NLA_F_NESTED | NLA_F_NET_BYTEORDER
+
+    # Genetlink defines
+    NETLINK_GENERIC = 16
+
+    GENL_ID_CTRL = 0x10
+
+    # nlctrl
+    CTRL_CMD_GETFAMILY = 3
+
+    CTRL_ATTR_FAMILY_ID = 1
+    CTRL_ATTR_FAMILY_NAME = 2
+    CTRL_ATTR_MAXATTR = 5
+    CTRL_ATTR_MCAST_GROUPS = 7
+
+    CTRL_ATTR_MCAST_GRP_NAME = 1
+    CTRL_ATTR_MCAST_GRP_ID = 2
+
+    # Extack types
+    NLMSGERR_ATTR_MSG = 1
+    NLMSGERR_ATTR_OFFS = 2
+    NLMSGERR_ATTR_COOKIE = 3
+    NLMSGERR_ATTR_POLICY = 4
+    NLMSGERR_ATTR_MISS_TYPE = 5
+    NLMSGERR_ATTR_MISS_NEST = 6
+
+
+class NlAttr:
+    def __init__(self, raw, offset):
+        self._len, self._type = struct.unpack("HH", raw[offset:offset + 4])
+        self.type = self._type & ~Netlink.NLA_TYPE_MASK
+        self.payload_len = self._len
+        self.full_len = (self.payload_len + 3) & ~3
+        self.raw = raw[offset + 4:offset + self.payload_len]
+
+    def as_u16(self):
+        return struct.unpack("H", self.raw)[0]
+
+    def as_u32(self):
+        return struct.unpack("I", self.raw)[0]
+
+    def as_u64(self):
+        return struct.unpack("Q", self.raw)[0]
+
+    def as_strz(self):
+        return self.raw.decode('ascii')[:-1]
+
+    def as_bin(self):
+        return self.raw
+
+    def __repr__(self):
+        return f"[type:{self.type} len:{self._len}] {self.raw}"
+
+
+class NlAttrs:
+    def __init__(self, msg):
+        self.attrs = []
+
+        offset = 0
+        while offset < len(msg):
+            attr = NlAttr(msg, offset)
+            offset += attr.full_len
+            self.attrs.append(attr)
+
+    def __iter__(self):
+        yield from self.attrs
+
+    def __repr__(self):
+        msg = ''
+        for a in self.attrs:
+            if msg:
+                msg += '\n'
+            msg += repr(a)
+        return msg
+
+
+class NlMsg:
+    def __init__(self, msg, offset, attr_space=None):
+        self.hdr = msg[offset:offset + 16]
+
+        self.nl_len, self.nl_type, self.nl_flags, self.nl_seq, self.nl_portid = \
+            struct.unpack("IHHII", self.hdr)
+
+        self.raw = msg[offset + 16:offset + self.nl_len]
+
+        self.error = 0
+        self.done = 0
+
+        extack_off = None
+        if self.nl_type == Netlink.NLMSG_ERROR:
+            self.error = struct.unpack("i", self.raw[0:4])[0]
+            self.done = 1
+            extack_off = 20
+        elif self.nl_type == Netlink.NLMSG_DONE:
+            self.done = 1
+            extack_off = 4
+
+        self.extack = None
+        if self.nl_flags & Netlink.NLM_F_ACK_TLVS and extack_off:
+            self.extack = dict()
+            extack_attrs = NlAttrs(self.raw[extack_off:])
+            for extack in extack_attrs:
+                if extack.type == Netlink.NLMSGERR_ATTR_MSG:
+                    self.extack['msg'] = extack.as_strz()
+                elif extack.type == Netlink.NLMSGERR_ATTR_MISS_TYPE:
+                    self.extack['miss-type'] = extack.as_u32()
+                elif extack.type == Netlink.NLMSGERR_ATTR_MISS_NEST:
+                    self.extack['miss-nest'] = extack.as_u32()
+                elif extack.type == Netlink.NLMSGERR_ATTR_OFFS:
+                    self.extack['bad-attr-offs'] = extack.as_u32()
+                else:
+                    if 'unknown' not in self.extack:
+                        self.extack['unknown'] = []
+                    self.extack['unknown'].append(extack)
+
+            if attr_space:
+                # We don't have the ability to parse nests yet, so only do global
+                if 'miss-type' in self.extack and 'miss-nest' not in self.extack:
+                    miss_type = self.extack['miss-type']
+                    if len(attr_space.attr_list) > miss_type:
+                        spec = attr_space.attr_list[miss_type]
+                        desc = spec['name']
+                        if 'doc' in spec:
+                            desc += f" ({spec['doc']})"
+                        self.extack['miss-type'] = desc
+
+    def __repr__(self):
+        msg = f"nl_len = {self.nl_len} ({len(self.raw)}) nl_flags = 0x{self.nl_flags:x} nl_type = {self.nl_type}\n"
+        if self.error:
+            msg += '\terror: ' + str(self.error)
+        if self.extack:
+            msg += '\textack: ' + repr(self.extack)
+        return msg
+
+
+class NlMsgs:
+    def __init__(self, data, attr_space=None):
+        self.msgs = []
+
+        offset = 0
+        while offset < len(data):
+            msg = NlMsg(data, offset, attr_space=attr_space)
+            offset += msg.nl_len
+            self.msgs.append(msg)
+
+    def __iter__(self):
+        yield from self.msgs
+
+
+genl_family_name_to_id = None
+
+
+def _genl_msg(nl_type, nl_flags, genl_cmd, genl_version, seq=None):
+    # we prepend length in _genl_msg_finalize()
+    if seq is None:
+        seq = random.randint(1, 1024)
+    nlmsg = struct.pack("HHII", nl_type, nl_flags, seq, 0)
+    genlmsg = struct.pack("bbH", genl_cmd, genl_version, 0)
+    return nlmsg + genlmsg
+
+
+def _genl_msg_finalize(msg):
+    return struct.pack("I", len(msg) + 4) + msg
+
+
+def _genl_load_families():
+    with socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC) as sock:
+        sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
+
+        msg = _genl_msg(Netlink.GENL_ID_CTRL,
+                        Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK | Netlink.NLM_F_DUMP,
+                        Netlink.CTRL_CMD_GETFAMILY, 1)
+        msg = _genl_msg_finalize(msg)
+
+        sock.send(msg, 0)
+
+        global genl_family_name_to_id
+        genl_family_name_to_id = dict()
+
+        while True:
+            reply = sock.recv(128 * 1024)
+            nms = NlMsgs(reply)
+            for nl_msg in nms:
+                if nl_msg.error:
+                    print("Netlink error:", nl_msg.error)
+                    return
+                if nl_msg.done:
+                    return
+
+                gm = GenlMsg(nl_msg)
+                fam = dict()
+                for attr in gm.raw_attrs:
+                    if attr.type == Netlink.CTRL_ATTR_FAMILY_ID:
+                        fam['id'] = attr.as_u16()
+                    elif attr.type == Netlink.CTRL_ATTR_FAMILY_NAME:
+                        fam['name'] = attr.as_strz()
+                    elif attr.type == Netlink.CTRL_ATTR_MAXATTR:
+                        fam['maxattr'] = attr.as_u32()
+                    elif attr.type == Netlink.CTRL_ATTR_MCAST_GROUPS:
+                        fam['mcast'] = dict()
+                        for entry in NlAttrs(attr.raw):
+                            mcast_name = None
+                            mcast_id = None
+                            for entry_attr in NlAttrs(entry.raw):
+                                if entry_attr.type == Netlink.CTRL_ATTR_MCAST_GRP_NAME:
+                                    mcast_name = entry_attr.as_strz()
+                                elif entry_attr.type == Netlink.CTRL_ATTR_MCAST_GRP_ID:
+                                    mcast_id = entry_attr.as_u32()
+                            if mcast_name and mcast_id is not None:
+                                fam['mcast'][mcast_name] = mcast_id
+                if 'name' in fam and 'id' in fam:
+                    genl_family_name_to_id[fam['name']] = fam
+
+
+class GenlMsg:
+    def __init__(self, nl_msg):
+        self.nl = nl_msg
+
+        self.hdr = nl_msg.raw[0:4]
+        self.raw = nl_msg.raw[4:]
+
+        self.genl_cmd, self.genl_version, _ = struct.unpack("bbH", self.hdr)
+
+        self.raw_attrs = NlAttrs(self.raw)
+
+    def __repr__(self):
+        msg = repr(self.nl)
+        msg += f"\tgenl_cmd = {self.genl_cmd} genl_ver = {self.genl_version}\n"
+        for a in self.raw_attrs:
+            msg += '\t\t' + repr(a) + '\n'
+        return msg
+
+
+class GenlFamily:
+    def __init__(self, family_name):
+        self.family_name = family_name
+
+        global genl_family_name_to_id
+        if genl_family_name_to_id is None:
+            _genl_load_families()
+
+        self.genl_family = genl_family_name_to_id[family_name]
+        self.family_id = genl_family_name_to_id[family_name]['id']
+
+
+#
+# YNL implementation details.
+#
+
+
+class YnlAttrSpace:
+    def __init__(self, family, yaml):
+        self.yaml = yaml
+
+        self.attrs = dict()
+        self.name = self.yaml['name']
+        self.subspace_of = self.yaml['subset-of'] if 'subspace-of' in self.yaml else None
+
+        val = 0
+        max_val = 0
+        for elem in self.yaml['attributes']:
+            if 'value' in elem:
+                val = elem['value']
+            else:
+                elem['value'] = val
+            if val > max_val:
+                max_val = val
+            val += 1
+
+            self.attrs[elem['name']] = elem
+
+        self.attr_list = [None] * (max_val + 1)
+        for elem in self.yaml['attributes']:
+            self.attr_list[elem['value']] = elem
+
+    def __getitem__(self, key):
+        return self.attrs[key]
+
+    def __contains__(self, key):
+        return key in self.yaml
+
+    def __iter__(self):
+        yield from self.attrs
+
+    def items(self):
+        return self.attrs.items()
+
+
+class YnlFamily:
+    def __init__(self, def_path, schema=None):
+        self.include_raw = False
+
+        with open(def_path, "r") as stream:
+            self.yaml = yaml.safe_load(stream)
+
+        if schema:
+            with open(schema, "r") as stream:
+                schema = yaml.safe_load(stream)
+
+            jsonschema.validate(self.yaml, schema)
+
+        self.sock = socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC)
+        self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
+        self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_EXT_ACK, 1)
+
+        self._ops = dict()
+        self._spaces = dict()
+        self._types = dict()
+
+        for elem in self.yaml['attribute-sets']:
+            self._spaces[elem['name']] = YnlAttrSpace(self, elem)
+
+        for elem in self.yaml['definitions']:
+            self._types[elem['name']] = elem
+
+        async_separation = 'async-prefix' in self.yaml['operations']
+        self.async_msg_ids = set()
+        self.async_msg_queue = []
+        val = 0
+        max_val = 0
+        for elem in self.yaml['operations']['list']:
+            if not (async_separation and ('notify' in elem or 'event' in elem)):
+                if 'value' in elem:
+                    val = elem['value']
+                else:
+                    elem['value'] = val
+                val += 1
+                max_val = max(val, max_val)
+
+            if 'notify' in elem or 'event' in elem:
+                self.async_msg_ids.add(elem['value'])
+
+            self._ops[elem['name']] = elem
+
+            op_name = elem['name'].replace('-', '_')
+
+            bound_f = functools.partial(self._op, elem['name'])
+            setattr(self, op_name, bound_f)
+
+        self._op_array = [None] * max_val
+        for _, op in self._ops.items():
+            self._op_array[op['value']] = op
+            if 'notify' in op:
+                op['attribute-set'] = self._ops[op['notify']]['attribute-set']
+
+        self.family = GenlFamily(self.yaml['name'])
+
+    def ntf_subscribe(self, mcast_name):
+        if mcast_name not in self.family.genl_family['mcast']:
+            raise Exception(f'Multicast group "{mcast_name}" not present in the family')
+
+        self.sock.bind((0, 0))
+        self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_ADD_MEMBERSHIP,
+                             self.family.genl_family['mcast'][mcast_name])
+
+    def _add_attr(self, space, name, value):
+        attr = self._spaces[space][name]
+        nl_type = attr['value']
+        if attr["type"] == 'nest':
+            nl_type |= Netlink.NLA_F_NESTED
+            attr_payload = b''
+            for subname, subvalue in value.items():
+                attr_payload += self._add_attr(attr['nested-attributes'], subname, subvalue)
+        elif attr["type"] == 'u32':
+            attr_payload = struct.pack("I", int(value))
+        elif attr["type"] == 'string':
+            attr_payload = str(value).encode('ascii') + b'\x00'
+        elif attr["type"] == 'binary':
+            attr_payload = value
+        else:
+            raise Exception(f'Unknown type at {space} {name} {value} {attr["type"]}')
+
+        pad = b'\x00' * ((4 - len(attr_payload) % 4) % 4)
+        return struct.pack('HH', len(attr_payload) + 4, nl_type) + attr_payload + pad
+
+    def _decode_enum(self, rsp, attr_spec):
+        raw = rsp[attr_spec['name']]
+        enum = self._types[attr_spec['enum']]
+        i = attr_spec.get('value-start', 0)
+        if 'enum-as-flags' in attr_spec and attr_spec['enum-as-flags']:
+            value = set()
+            while raw:
+                if raw & 1:
+                    value.add(enum['entries'][i])
+                raw >>= 1
+                i += 1
+        else:
+            value = enum['entries'][raw - i]
+        rsp[attr_spec['name']] = value
+
+    def _decode(self, attrs, space):
+        attr_space = self._spaces[space]
+        rsp = dict()
+        for attr in attrs:
+            attr_spec = attr_space.attr_list[attr.type]
+            if attr_spec["type"] == 'nest':
+                subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'])
+                rsp[attr_spec['name']] = subdict
+            elif attr_spec['type'] == 'u32':
+                rsp[attr_spec['name']] = attr.as_u32()
+            elif attr_spec['type'] == 'u64':
+                rsp[attr_spec['name']] = attr.as_u64()
+            elif attr_spec["type"] == 'string':
+                rsp[attr_spec['name']] = attr.as_strz()
+            elif attr_spec["type"] == 'binary':
+                rsp[attr_spec['name']] = attr.as_bin()
+            else:
+                raise Exception(f'Unknown {attr.type} {attr_spec["name"]} {attr_spec["type"]}')
+
+            if 'enum' in attr_spec:
+                self._decode_enum(rsp, attr_spec)
+        return rsp
+
+    def handle_ntf(self, nl_msg, genl_msg):
+        msg = dict()
+        if self.include_raw:
+            msg['nlmsg'] = nl_msg
+            msg['genlmsg'] = genl_msg
+        op = self._op_array[genl_msg.genl_cmd]
+        msg['name'] = op['name']
+        msg['msg'] = self._decode(genl_msg.raw_attrs, op['attribute-set'])
+        self.async_msg_queue.append(msg)
+
+    def check_ntf(self):
+        while True:
+            try:
+                reply = self.sock.recv(128 * 1024, socket.MSG_DONTWAIT)
+            except BlockingIOError:
+                return
+
+            nms = NlMsgs(reply)
+            for nl_msg in nms:
+                if nl_msg.error:
+                    print("Netlink error in ntf!?", os.strerror(-nl_msg.error))
+                    print(nl_msg)
+                    continue
+                if nl_msg.done:
+                    print("Netlink done while checking for ntf!?")
+                    continue
+
+                gm = GenlMsg(nl_msg)
+                if gm.genl_cmd not in self.async_msg_ids:
+                    print("Unexpected msg id done while checking for ntf", gm)
+                    continue
+
+                self.handle_ntf(nl_msg, gm)
+
+    def _op(self, method, vals, dump=False):
+        op = self._ops[method]
+
+        nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK
+        if dump:
+            nl_flags |= Netlink.NLM_F_DUMP
+
+        req_seq = random.randint(1024, 65535)
+        msg = _genl_msg(self.family.family_id, nl_flags, op['value'], 1, req_seq)
+        for name, value in vals.items():
+            msg += self._add_attr(op['attribute-set'], name, value)
+        msg = _genl_msg_finalize(msg)
+
+        self.sock.send(msg, 0)
+
+        done = False
+        rsp = []
+        while not done:
+            reply = self.sock.recv(128 * 1024)
+            nms = NlMsgs(reply, attr_space=self._spaces[op['attribute-set']])
+            for nl_msg in nms:
+                if nl_msg.error:
+                    print("Netlink error:", os.strerror(-nl_msg.error))
+                    print(nl_msg)
+                    return
+                if nl_msg.done:
+                    done = True
+                    break
+
+                gm = GenlMsg(nl_msg)
+                # Check if this is a reply to our request
+                if nl_msg.nl_seq != req_seq or gm.genl_cmd != op['value']:
+                    if gm.genl_cmd in self.async_msg_ids:
+                        self.handle_ntf(nl_msg, gm)
+                        continue
+                    else:
+                        print('Unexpected message: ' + repr(gm))
+                        continue
+
+                rsp.append(self._decode(gm.raw_attrs, op['attribute-set']))
+
+        if not rsp:
+            return None
+        if not dump and len(rsp) == 1:
+            return rsp[0]
+        return rsp
-- 
cgit 


From f1e06fa10e37341a4a642d93fbd0a9babe129655 Mon Sep 17 00:00:00 2001
From: Aaron Lewis <aaronlewis@google.com>
Date: Tue, 20 Dec 2022 16:12:34 +0000
Subject: KVM: selftests: Add flags when creating a pmu event filter

Now that the flags field can be non-zero, pass it in when creating a
pmu event filter.

This is needed in preparation for testing masked events.

No functional change intended.

Signed-off-by: Aaron Lewis <aaronlewis@google.com>
Link: https://lore.kernel.org/r/20221220161236.555143-6-aaronlewis@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index 2de98fce7edd..d50c8c160658 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -198,14 +198,15 @@ static struct kvm_pmu_event_filter *alloc_pmu_event_filter(uint32_t nevents)
 
 
 static struct kvm_pmu_event_filter *
-create_pmu_event_filter(const uint64_t event_list[],
-			int nevents, uint32_t action)
+create_pmu_event_filter(const uint64_t event_list[], int nevents,
+			uint32_t action, uint32_t flags)
 {
 	struct kvm_pmu_event_filter *f;
 	int i;
 
 	f = alloc_pmu_event_filter(nevents);
 	f->action = action;
+	f->flags = flags;
 	for (i = 0; i < nevents; i++)
 		f->events[i] = event_list[i];
 
@@ -216,7 +217,7 @@ static struct kvm_pmu_event_filter *event_filter(uint32_t action)
 {
 	return create_pmu_event_filter(event_list,
 				       ARRAY_SIZE(event_list),
-				       action);
+				       action, 0);
 }
 
 /*
@@ -263,7 +264,7 @@ static void test_amd_deny_list(struct kvm_vcpu *vcpu)
 	struct kvm_pmu_event_filter *f;
 	uint64_t count;
 
-	f = create_pmu_event_filter(&event, 1, KVM_PMU_EVENT_DENY);
+	f = create_pmu_event_filter(&event, 1, KVM_PMU_EVENT_DENY, 0);
 	count = test_with_filter(vcpu, f);
 
 	free(f);
-- 
cgit 


From 7b7027937d0155aaf583afbb9da4311fb5f28dd8 Mon Sep 17 00:00:00 2001
From: Aaron Lewis <aaronlewis@google.com>
Date: Tue, 20 Dec 2022 16:12:35 +0000
Subject: KVM: selftests: Add testing for KVM_SET_PMU_EVENT_FILTER

Test that masked events are not using invalid bits, and if they are,
ensure the pmu event filter is not accepted by KVM_SET_PMU_EVENT_FILTER.
The only valid bits that can be used for masked events are set when
using KVM_PMU_ENCODE_MASKED_ENTRY() with one exception: If any of the
high bits (35:32) of the event select are set when using Intel, the pmu
event filter will fail.

Also, because validation was not being done prior to the introduction
of masked events, only expect validation to fail when masked events
are used.  E.g. in the first test a filter event with all its bits set
is accepted by KVM_SET_PMU_EVENT_FILTER when flags = 0.

Signed-off-by: Aaron Lewis <aaronlewis@google.com>
Link: https://lore.kernel.org/r/20221220161236.555143-7-aaronlewis@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86_64/pmu_event_filter_test.c   | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index d50c8c160658..a96830243195 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -404,6 +404,39 @@ static bool use_amd_pmu(void)
 		 is_zen3(family, model));
 }
 
+static int run_filter_test(struct kvm_vcpu *vcpu, const uint64_t *events,
+			   int nevents, uint32_t flags)
+{
+	struct kvm_pmu_event_filter *f;
+	int r;
+
+	f = create_pmu_event_filter(events, nevents, KVM_PMU_EVENT_ALLOW, flags);
+	r = __vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f);
+	free(f);
+
+	return r;
+}
+
+static void test_filter_ioctl(struct kvm_vcpu *vcpu)
+{
+	uint64_t e = ~0ul;
+	int r;
+
+	/*
+	 * Unfortunately having invalid bits set in event data is expected to
+	 * pass when flags == 0 (bits other than eventsel+umask).
+	 */
+	r = run_filter_test(vcpu, &e, 1, 0);
+	TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing");
+
+	r = run_filter_test(vcpu, &e, 1, KVM_PMU_EVENT_FLAG_MASKED_EVENTS);
+	TEST_ASSERT(r != 0, "Invalid PMU Event Filter is expected to fail");
+
+	e = KVM_PMU_EVENT_ENCODE_MASKED_ENTRY(0xff, 0xff, 0xff, 0xf);
+	r = run_filter_test(vcpu, &e, 1, KVM_PMU_EVENT_FLAG_MASKED_EVENTS);
+	TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing");
+}
+
 int main(int argc, char *argv[])
 {
 	void (*guest_code)(void);
@@ -411,6 +444,7 @@ int main(int argc, char *argv[])
 	struct kvm_vm *vm;
 
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER));
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_MASKED_EVENTS));
 
 	TEST_REQUIRE(use_intel_pmu() || use_amd_pmu());
 	guest_code = use_intel_pmu() ? intel_guest_code : amd_guest_code;
@@ -431,6 +465,8 @@ int main(int argc, char *argv[])
 	test_not_member_deny_list(vcpu);
 	test_not_member_allow_list(vcpu);
 
+	test_filter_ioctl(vcpu);
+
 	kvm_vm_free(vm);
 
 	test_pmu_config_disable(guest_code);
-- 
cgit 


From 647ffac11ebbbd21e04dd11a7125decb99eeee65 Mon Sep 17 00:00:00 2001
From: Aaron Lewis <aaronlewis@google.com>
Date: Tue, 20 Dec 2022 16:12:36 +0000
Subject: KVM: selftests: Test masked events in PMU filter

Add testing to show that a pmu event can be filtered with a generalized
match on it's unit mask.

These tests set up test cases to demonstrate various ways of filtering
a pmu event that has multiple unit mask values.  It does this by
setting up the filter in KVM with the masked events provided, then
enabling three pmu counters in the guest.  The test then verifies that
the pmu counters agree with which counters should be counting and which
counters should be filtered for both a sparse filter list and a dense
filter list.

Signed-off-by: Aaron Lewis <aaronlewis@google.com>
Link: https://lore.kernel.org/r/20221220161236.555143-8-aaronlewis@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86_64/pmu_event_filter_test.c   | 338 ++++++++++++++++++++-
 1 file changed, 336 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index a96830243195..253e4304bbe3 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -404,6 +404,331 @@ static bool use_amd_pmu(void)
 		 is_zen3(family, model));
 }
 
+/*
+ * "MEM_INST_RETIRED.ALL_LOADS", "MEM_INST_RETIRED.ALL_STORES", and
+ * "MEM_INST_RETIRED.ANY" from https://perfmon-events.intel.com/
+ * supported on Intel Xeon processors:
+ *  - Sapphire Rapids, Ice Lake, Cascade Lake, Skylake.
+ */
+#define MEM_INST_RETIRED		0xD0
+#define MEM_INST_RETIRED_LOAD		EVENT(MEM_INST_RETIRED, 0x81)
+#define MEM_INST_RETIRED_STORE		EVENT(MEM_INST_RETIRED, 0x82)
+#define MEM_INST_RETIRED_LOAD_STORE	EVENT(MEM_INST_RETIRED, 0x83)
+
+static bool supports_event_mem_inst_retired(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+	if (x86_family(eax) == 0x6) {
+		switch (x86_model(eax)) {
+		/* Sapphire Rapids */
+		case 0x8F:
+		/* Ice Lake */
+		case 0x6A:
+		/* Skylake */
+		/* Cascade Lake */
+		case 0x55:
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * "LS Dispatch", from Processor Programming Reference
+ * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
+ * Preliminary Processor Programming Reference (PPR) for AMD Family
+ * 17h Model 31h, Revision B0 Processors, and Preliminary Processor
+ * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
+ * B1 Processors Volume 1 of 2.
+ */
+#define LS_DISPATCH		0x29
+#define LS_DISPATCH_LOAD	EVENT(LS_DISPATCH, BIT(0))
+#define LS_DISPATCH_STORE	EVENT(LS_DISPATCH, BIT(1))
+#define LS_DISPATCH_LOAD_STORE	EVENT(LS_DISPATCH, BIT(2))
+
+#define INCLUDE_MASKED_ENTRY(event_select, mask, match) \
+	KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, false)
+#define EXCLUDE_MASKED_ENTRY(event_select, mask, match) \
+	KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, true)
+
+struct perf_counter {
+	union {
+		uint64_t raw;
+		struct {
+			uint64_t loads:22;
+			uint64_t stores:22;
+			uint64_t loads_stores:20;
+		};
+	};
+};
+
+static uint64_t masked_events_guest_test(uint32_t msr_base)
+{
+	uint64_t ld0, ld1, st0, st1, ls0, ls1;
+	struct perf_counter c;
+	int val;
+
+	/*
+	 * The acutal value of the counters don't determine the outcome of
+	 * the test.  Only that they are zero or non-zero.
+	 */
+	ld0 = rdmsr(msr_base + 0);
+	st0 = rdmsr(msr_base + 1);
+	ls0 = rdmsr(msr_base + 2);
+
+	__asm__ __volatile__("movl $0, %[v];"
+			     "movl %[v], %%eax;"
+			     "incl %[v];"
+			     : [v]"+m"(val) :: "eax");
+
+	ld1 = rdmsr(msr_base + 0);
+	st1 = rdmsr(msr_base + 1);
+	ls1 = rdmsr(msr_base + 2);
+
+	c.loads = ld1 - ld0;
+	c.stores = st1 - st0;
+	c.loads_stores = ls1 - ls0;
+
+	return c.raw;
+}
+
+static void intel_masked_events_guest_code(void)
+{
+	uint64_t r;
+
+	for (;;) {
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+		wrmsr(MSR_P6_EVNTSEL0 + 0, ARCH_PERFMON_EVENTSEL_ENABLE |
+		      ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD);
+		wrmsr(MSR_P6_EVNTSEL0 + 1, ARCH_PERFMON_EVENTSEL_ENABLE |
+		      ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_STORE);
+		wrmsr(MSR_P6_EVNTSEL0 + 2, ARCH_PERFMON_EVENTSEL_ENABLE |
+		      ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD_STORE);
+
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x7);
+
+		r = masked_events_guest_test(MSR_IA32_PMC0);
+
+		GUEST_SYNC(r);
+	}
+}
+
+static void amd_masked_events_guest_code(void)
+{
+	uint64_t r;
+
+	for (;;) {
+		wrmsr(MSR_K7_EVNTSEL0, 0);
+		wrmsr(MSR_K7_EVNTSEL1, 0);
+		wrmsr(MSR_K7_EVNTSEL2, 0);
+
+		wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
+		      ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD);
+		wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE |
+		      ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_STORE);
+		wrmsr(MSR_K7_EVNTSEL2, ARCH_PERFMON_EVENTSEL_ENABLE |
+		      ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD_STORE);
+
+		r = masked_events_guest_test(MSR_K7_PERFCTR0);
+
+		GUEST_SYNC(r);
+	}
+}
+
+static struct perf_counter run_masked_events_test(struct kvm_vcpu *vcpu,
+						 const uint64_t masked_events[],
+						 const int nmasked_events)
+{
+	struct kvm_pmu_event_filter *f;
+	struct perf_counter r;
+
+	f = create_pmu_event_filter(masked_events, nmasked_events,
+				    KVM_PMU_EVENT_ALLOW,
+				    KVM_PMU_EVENT_FLAG_MASKED_EVENTS);
+	r.raw = test_with_filter(vcpu, f);
+	free(f);
+
+	return r;
+}
+
+/* Matches KVM_PMU_EVENT_FILTER_MAX_EVENTS in pmu.c */
+#define MAX_FILTER_EVENTS	300
+#define MAX_TEST_EVENTS		10
+
+#define ALLOW_LOADS		BIT(0)
+#define ALLOW_STORES		BIT(1)
+#define ALLOW_LOADS_STORES	BIT(2)
+
+struct masked_events_test {
+	uint64_t intel_events[MAX_TEST_EVENTS];
+	uint64_t intel_event_end;
+	uint64_t amd_events[MAX_TEST_EVENTS];
+	uint64_t amd_event_end;
+	const char *msg;
+	uint32_t flags;
+};
+
+/*
+ * These are the test cases for the masked events tests.
+ *
+ * For each test, the guest enables 3 PMU counters (loads, stores,
+ * loads + stores).  The filter is then set in KVM with the masked events
+ * provided.  The test then verifies that the counters agree with which
+ * ones should be counting and which ones should be filtered.
+ */
+const struct masked_events_test test_cases[] = {
+	{
+		.intel_events = {
+			INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x81),
+		},
+		.amd_events = {
+			INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)),
+		},
+		.msg = "Only allow loads.",
+		.flags = ALLOW_LOADS,
+	}, {
+		.intel_events = {
+			INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82),
+		},
+		.amd_events = {
+			INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)),
+		},
+		.msg = "Only allow stores.",
+		.flags = ALLOW_STORES,
+	}, {
+		.intel_events = {
+			INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83),
+		},
+		.amd_events = {
+			INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(2)),
+		},
+		.msg = "Only allow loads + stores.",
+		.flags = ALLOW_LOADS_STORES,
+	}, {
+		.intel_events = {
+			INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0),
+			EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83),
+		},
+		.amd_events = {
+			INCLUDE_MASKED_ENTRY(LS_DISPATCH, ~(BIT(0) | BIT(1)), 0),
+		},
+		.msg = "Only allow loads and stores.",
+		.flags = ALLOW_LOADS | ALLOW_STORES,
+	}, {
+		.intel_events = {
+			INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0),
+			EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82),
+		},
+		.amd_events = {
+			INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0),
+			EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)),
+		},
+		.msg = "Only allow loads and loads + stores.",
+		.flags = ALLOW_LOADS | ALLOW_LOADS_STORES
+	}, {
+		.intel_events = {
+			INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFE, 0x82),
+		},
+		.amd_events = {
+			INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0),
+			EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)),
+		},
+		.msg = "Only allow stores and loads + stores.",
+		.flags = ALLOW_STORES | ALLOW_LOADS_STORES
+	}, {
+		.intel_events = {
+			INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0),
+		},
+		.amd_events = {
+			INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0),
+		},
+		.msg = "Only allow loads, stores, and loads + stores.",
+		.flags = ALLOW_LOADS | ALLOW_STORES | ALLOW_LOADS_STORES
+	},
+};
+
+static int append_test_events(const struct masked_events_test *test,
+			      uint64_t *events, int nevents)
+{
+	const uint64_t *evts;
+	int i;
+
+	evts = use_intel_pmu() ? test->intel_events : test->amd_events;
+	for (i = 0; i < MAX_TEST_EVENTS; i++) {
+		if (evts[i] == 0)
+			break;
+
+		events[nevents + i] = evts[i];
+	}
+
+	return nevents + i;
+}
+
+static bool bool_eq(bool a, bool b)
+{
+	return a == b;
+}
+
+static void run_masked_events_tests(struct kvm_vcpu *vcpu, uint64_t *events,
+				    int nevents)
+{
+	int ntests = ARRAY_SIZE(test_cases);
+	struct perf_counter c;
+	int i, n;
+
+	for (i = 0; i < ntests; i++) {
+		const struct masked_events_test *test = &test_cases[i];
+
+		/* Do any test case events overflow MAX_TEST_EVENTS? */
+		assert(test->intel_event_end == 0);
+		assert(test->amd_event_end == 0);
+
+		n = append_test_events(test, events, nevents);
+
+		c = run_masked_events_test(vcpu, events, n);
+		TEST_ASSERT(bool_eq(c.loads, test->flags & ALLOW_LOADS) &&
+			    bool_eq(c.stores, test->flags & ALLOW_STORES) &&
+			    bool_eq(c.loads_stores,
+				    test->flags & ALLOW_LOADS_STORES),
+			    "%s  loads: %u, stores: %u, loads + stores: %u",
+			    test->msg, c.loads, c.stores, c.loads_stores);
+	}
+}
+
+static void add_dummy_events(uint64_t *events, int nevents)
+{
+	int i;
+
+	for (i = 0; i < nevents; i++) {
+		int event_select = i % 0xFF;
+		bool exclude = ((i % 4) == 0);
+
+		if (event_select == MEM_INST_RETIRED ||
+		    event_select == LS_DISPATCH)
+			event_select++;
+
+		events[i] = KVM_PMU_ENCODE_MASKED_ENTRY(event_select, 0,
+							0, exclude);
+	}
+}
+
+static void test_masked_events(struct kvm_vcpu *vcpu)
+{
+	int nevents = MAX_FILTER_EVENTS - MAX_TEST_EVENTS;
+	uint64_t events[MAX_FILTER_EVENTS];
+
+	/* Run the test cases against a sparse PMU event filter. */
+	run_masked_events_tests(vcpu, events, 0);
+
+	/* Run the test cases against a dense PMU event filter. */
+	add_dummy_events(events, MAX_FILTER_EVENTS);
+	run_masked_events_tests(vcpu, events, nevents);
+}
+
 static int run_filter_test(struct kvm_vcpu *vcpu, const uint64_t *events,
 			   int nevents, uint32_t flags)
 {
@@ -432,7 +757,7 @@ static void test_filter_ioctl(struct kvm_vcpu *vcpu)
 	r = run_filter_test(vcpu, &e, 1, KVM_PMU_EVENT_FLAG_MASKED_EVENTS);
 	TEST_ASSERT(r != 0, "Invalid PMU Event Filter is expected to fail");
 
-	e = KVM_PMU_EVENT_ENCODE_MASKED_ENTRY(0xff, 0xff, 0xff, 0xf);
+	e = KVM_PMU_ENCODE_MASKED_ENTRY(0xff, 0xff, 0xff, 0xf);
 	r = run_filter_test(vcpu, &e, 1, KVM_PMU_EVENT_FLAG_MASKED_EVENTS);
 	TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing");
 }
@@ -440,7 +765,7 @@ static void test_filter_ioctl(struct kvm_vcpu *vcpu)
 int main(int argc, char *argv[])
 {
 	void (*guest_code)(void);
-	struct kvm_vcpu *vcpu;
+	struct kvm_vcpu *vcpu, *vcpu2 = NULL;
 	struct kvm_vm *vm;
 
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER));
@@ -465,6 +790,15 @@ int main(int argc, char *argv[])
 	test_not_member_deny_list(vcpu);
 	test_not_member_allow_list(vcpu);
 
+	if (use_intel_pmu() &&
+	    supports_event_mem_inst_retired() &&
+	    kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) >= 3)
+		vcpu2 = vm_vcpu_add(vm, 2, intel_masked_events_guest_code);
+	else if (use_amd_pmu())
+		vcpu2 = vm_vcpu_add(vm, 2, amd_masked_events_guest_code);
+
+	if (vcpu2)
+		test_masked_events(vcpu2);
 	test_filter_ioctl(vcpu);
 
 	kvm_vm_free(vm);
-- 
cgit 


From bf10993313e909bbf928bc40fd03141ffe23c7ec Mon Sep 17 00:00:00 2001
From: Reiji Watanabe <reijiw@google.com>
Date: Tue, 20 Dec 2022 09:09:21 -0800
Subject: KVM: selftests: kvm_vm_elf_load() and elfhdr_get() should close fd

kvm_vm_elf_load() and elfhdr_get() open one file each, but they
never close the opened file descriptor.  If a test repeatedly
creates and destroys a VM with __vm_create(), which
(directly or indirectly) calls those two functions, the test
might end up getting a open failure with EMFILE.
Fix those two functions to close the file descriptor.

Signed-off-by: Reiji Watanabe <reijiw@google.com>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Reviewed-by: Andrew Jones <andrew.jones@linux.dev>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20221220170921.2499209-2-reijiw@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/elf.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c
index 820ac2d08c98..266f3876e10a 100644
--- a/tools/testing/selftests/kvm/lib/elf.c
+++ b/tools/testing/selftests/kvm/lib/elf.c
@@ -90,6 +90,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp)
 		"  hdrp->e_shentsize: %x\n"
 		"  expected: %zx",
 		hdrp->e_shentsize, sizeof(Elf64_Shdr));
+	close(fd);
 }
 
 /* VM ELF Load
@@ -190,4 +191,5 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename)
 				phdr.p_filesz);
 		}
 	}
+	close(fd);
 }
-- 
cgit 


From ca17899693518acde9ba1adcf46b95d76286106d Mon Sep 17 00:00:00 2001
From: Aaron Lewis <aaronlewis@google.com>
Date: Fri, 9 Dec 2022 20:13:27 +0000
Subject: KVM: selftests: Fix a typo in the vcpu_msrs_set assert

The assert incorrectly identifies the ioctl being called.  Switch it
from KVM_GET_MSRS to KVM_SET_MSRS.

Fixes: 6ebfef83f03f ("KVM: selftest: Add proper helpers for x86-specific save/restore ioctls")
Signed-off-by: Aaron Lewis <aaronlewis@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20221209201326.2781950-1-aaronlewis@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86_64/processor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 2a5f47d51388..bbe47e6707eb 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -717,7 +717,7 @@ static inline void vcpu_msrs_set(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs)
 	int r = __vcpu_ioctl(vcpu, KVM_SET_MSRS, msrs);
 
 	TEST_ASSERT(r == msrs->nmsrs,
-		    "KVM_GET_MSRS failed, r: %i (failed on MSR %x)",
+		    "KVM_SET_MSRS failed, r: %i (failed on MSR %x)",
 		    r, r < 0 || r >= msrs->nmsrs ? -1 : msrs->entries[r].index);
 }
 static inline void vcpu_debugregs_get(struct kvm_vcpu *vcpu,
-- 
cgit 


From e99b0d4cc2b679c5892d0994610b44e7b584d98b Mon Sep 17 00:00:00 2001
From: Vishal Annapurve <vannapurve@google.com>
Date: Wed, 11 Jan 2023 00:44:43 +0000
Subject: KVM: selftests: x86: Use "this_cpu" prefix for cpu vendor queries

Replace is_intel/amd_cpu helpers with this_cpu_* helpers to better
convey the intent of querying vendor of the current cpu.

Suggested-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Link: https://lore.kernel.org/r/20230111004445.416840-2-vannapurve@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/include/x86_64/processor.h       | 25 ++++++++++++++++++---
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 26 ++--------------------
 .../selftests/kvm/x86_64/fix_hypercall_test.c      |  4 ++--
 .../selftests/kvm/x86_64/mmio_warning_test.c       |  2 +-
 .../selftests/kvm/x86_64/pmu_event_filter_test.c   |  4 ++--
 .../vmx_exception_with_invalid_guest_state.c       |  2 +-
 6 files changed, 30 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index bbe47e6707eb..b5ea72f79540 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -555,6 +555,28 @@ static inline uint32_t this_cpu_model(void)
 	return x86_model(this_cpu_fms());
 }
 
+static inline bool this_cpu_vendor_string_is(const char *vendor)
+{
+	const uint32_t *chunk = (const uint32_t *)vendor;
+	uint32_t eax, ebx, ecx, edx;
+
+	cpuid(0, &eax, &ebx, &ecx, &edx);
+	return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
+}
+
+static inline bool this_cpu_is_intel(void)
+{
+	return this_cpu_vendor_string_is("GenuineIntel");
+}
+
+/*
+ * Exclude early K5 samples with a vendor string of "AMDisbetter!"
+ */
+static inline bool this_cpu_is_amd(void)
+{
+	return this_cpu_vendor_string_is("AuthenticAMD");
+}
+
 static inline uint32_t __this_cpu_has(uint32_t function, uint32_t index,
 				      uint8_t reg, uint8_t lo, uint8_t hi)
 {
@@ -691,9 +713,6 @@ static inline void cpu_relax(void)
 		"hlt\n"	\
 		)
 
-bool is_intel_cpu(void);
-bool is_amd_cpu(void);
-
 struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu);
 void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state);
 void kvm_x86_state_cleanup(struct kvm_x86_state *state);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index acfa1d01e7df..7d1768543b91 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -113,7 +113,7 @@ static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent)
 
 bool kvm_is_tdp_enabled(void)
 {
-	if (is_intel_cpu())
+	if (this_cpu_is_intel())
 		return get_kvm_intel_param_bool("ept");
 	else
 		return get_kvm_amd_param_bool("npt");
@@ -1006,28 +1006,6 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state)
 	free(state);
 }
 
-static bool cpu_vendor_string_is(const char *vendor)
-{
-	const uint32_t *chunk = (const uint32_t *)vendor;
-	uint32_t eax, ebx, ecx, edx;
-
-	cpuid(0, &eax, &ebx, &ecx, &edx);
-	return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
-}
-
-bool is_intel_cpu(void)
-{
-	return cpu_vendor_string_is("GenuineIntel");
-}
-
-/*
- * Exclude early K5 samples with a vendor string of "AMDisbetter!"
- */
-bool is_amd_cpu(void)
-{
-	return cpu_vendor_string_is("AuthenticAMD");
-}
-
 void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
 {
 	if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) {
@@ -1236,7 +1214,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
 	max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
 
 	/* Avoid reserved HyperTransport region on AMD processors.  */
-	if (!is_amd_cpu())
+	if (!this_cpu_is_amd())
 		return max_gfn;
 
 	/* On parts with <40 physical address bits, the area is fully hidden */
diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
index 32f7e09ef67c..5489c9836ec8 100644
--- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
+++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
@@ -48,10 +48,10 @@ static void guest_main(void)
 	const uint8_t *other_hypercall_insn;
 	uint64_t ret;
 
-	if (is_intel_cpu()) {
+	if (this_cpu_is_intel()) {
 		native_hypercall_insn = vmx_vmcall;
 		other_hypercall_insn  = svm_vmmcall;
-	} else if (is_amd_cpu()) {
+	} else if (this_cpu_is_amd()) {
 		native_hypercall_insn = svm_vmmcall;
 		other_hypercall_insn  = vmx_vmcall;
 	} else {
diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
index fb02581953a3..b0a2a0bae0f3 100644
--- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
+++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
@@ -93,7 +93,7 @@ int main(void)
 {
 	int warnings_before, warnings_after;
 
-	TEST_REQUIRE(is_intel_cpu());
+	TEST_REQUIRE(this_cpu_is_intel());
 
 	TEST_REQUIRE(!vm_is_unrestricted_guest(NULL));
 
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index 2de98fce7edd..c728822461b2 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -363,7 +363,7 @@ static void test_pmu_config_disable(void (*guest_code)(void))
  */
 static bool use_intel_pmu(void)
 {
-	return is_intel_cpu() &&
+	return this_cpu_is_intel() &&
 	       kvm_cpu_property(X86_PROPERTY_PMU_VERSION) &&
 	       kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) &&
 	       kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED);
@@ -397,7 +397,7 @@ static bool use_amd_pmu(void)
 	uint32_t family = kvm_cpu_family();
 	uint32_t model = kvm_cpu_model();
 
-	return is_amd_cpu() &&
+	return this_cpu_is_amd() &&
 		(is_zen1(family, model) ||
 		 is_zen2(family, model) ||
 		 is_zen3(family, model));
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
index 2641b286b4ed..53e1ef2fc774 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
@@ -111,7 +111,7 @@ int main(int argc, char *argv[])
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 
-	TEST_REQUIRE(is_intel_cpu());
+	TEST_REQUIRE(this_cpu_is_intel());
 	TEST_REQUIRE(!vm_is_unrestricted_guest(NULL));
 
 	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
-- 
cgit 


From e6df2ae3f57c6bfab29e767796ce8d4796a84ebb Mon Sep 17 00:00:00 2001
From: Vishal Annapurve <vannapurve@google.com>
Date: Wed, 11 Jan 2023 00:44:44 +0000
Subject: KVM: selftests: x86: Cache host CPU vendor (AMD vs. Intel)

Cache the host CPU vendor for userspace and share it with guest code.

All the current callers of this_cpu* actually care about host cpu so
they are updated to check host_cpu_is*.

Suggested-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Link: https://lore.kernel.org/r/20230111004445.416840-3-vannapurve@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86_64/processor.h     |  3 +++
 tools/testing/selftests/kvm/lib/x86_64/processor.c         | 14 ++++++++++++--
 tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c    |  4 ++--
 tools/testing/selftests/kvm/x86_64/mmio_warning_test.c     |  2 +-
 tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c |  4 ++--
 .../kvm/x86_64/vmx_exception_with_invalid_guest_state.c    |  2 +-
 6 files changed, 21 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index b5ea72f79540..53ffa43c90db 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -19,6 +19,9 @@
 
 #include "../kvm_util.h"
 
+extern bool host_cpu_is_intel;
+extern bool host_cpu_is_amd;
+
 #define NMI_VECTOR		0x02
 
 #define X86_EFLAGS_FIXED	 (1u << 1)
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 7d1768543b91..84915bc7d689 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -19,6 +19,8 @@
 #define MAX_NR_CPUID_ENTRIES 100
 
 vm_vaddr_t exception_handlers;
+bool host_cpu_is_amd;
+bool host_cpu_is_intel;
 
 static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent)
 {
@@ -113,7 +115,7 @@ static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent)
 
 bool kvm_is_tdp_enabled(void)
 {
-	if (this_cpu_is_intel())
+	if (host_cpu_is_intel)
 		return get_kvm_intel_param_bool("ept");
 	else
 		return get_kvm_amd_param_bool("npt");
@@ -555,6 +557,8 @@ static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
 void kvm_arch_vm_post_create(struct kvm_vm *vm)
 {
 	vm_create_irqchip(vm);
+	sync_global_to_guest(vm, host_cpu_is_intel);
+	sync_global_to_guest(vm, host_cpu_is_amd);
 }
 
 struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
@@ -1214,7 +1218,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
 	max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
 
 	/* Avoid reserved HyperTransport region on AMD processors.  */
-	if (!this_cpu_is_amd())
+	if (!host_cpu_is_amd)
 		return max_gfn;
 
 	/* On parts with <40 physical address bits, the area is fully hidden */
@@ -1254,3 +1258,9 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm)
 
 	return get_kvm_intel_param_bool("unrestricted_guest");
 }
+
+void kvm_selftest_arch_init(void)
+{
+	host_cpu_is_intel = this_cpu_is_intel();
+	host_cpu_is_amd = this_cpu_is_amd();
+}
diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
index 5489c9836ec8..0f728f05ea82 100644
--- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
+++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
@@ -48,10 +48,10 @@ static void guest_main(void)
 	const uint8_t *other_hypercall_insn;
 	uint64_t ret;
 
-	if (this_cpu_is_intel()) {
+	if (host_cpu_is_intel) {
 		native_hypercall_insn = vmx_vmcall;
 		other_hypercall_insn  = svm_vmmcall;
-	} else if (this_cpu_is_amd()) {
+	} else if (host_cpu_is_amd) {
 		native_hypercall_insn = svm_vmmcall;
 		other_hypercall_insn  = vmx_vmcall;
 	} else {
diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
index b0a2a0bae0f3..ce1ccc4c1503 100644
--- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
+++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
@@ -93,7 +93,7 @@ int main(void)
 {
 	int warnings_before, warnings_after;
 
-	TEST_REQUIRE(this_cpu_is_intel());
+	TEST_REQUIRE(host_cpu_is_intel);
 
 	TEST_REQUIRE(!vm_is_unrestricted_guest(NULL));
 
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index c728822461b2..4dbb454e1760 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -363,7 +363,7 @@ static void test_pmu_config_disable(void (*guest_code)(void))
  */
 static bool use_intel_pmu(void)
 {
-	return this_cpu_is_intel() &&
+	return host_cpu_is_intel &&
 	       kvm_cpu_property(X86_PROPERTY_PMU_VERSION) &&
 	       kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) &&
 	       kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED);
@@ -397,7 +397,7 @@ static bool use_amd_pmu(void)
 	uint32_t family = kvm_cpu_family();
 	uint32_t model = kvm_cpu_model();
 
-	return this_cpu_is_amd() &&
+	return host_cpu_is_amd &&
 		(is_zen1(family, model) ||
 		 is_zen2(family, model) ||
 		 is_zen3(family, model));
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
index 53e1ef2fc774..ccdfa5dc1a4d 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
@@ -111,7 +111,7 @@ int main(int argc, char *argv[])
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 
-	TEST_REQUIRE(this_cpu_is_intel());
+	TEST_REQUIRE(host_cpu_is_intel);
 	TEST_REQUIRE(!vm_is_unrestricted_guest(NULL));
 
 	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
-- 
cgit 


From ea25ace7710b918fe49647350d4dee363563908f Mon Sep 17 00:00:00 2001
From: Vishal Annapurve <vannapurve@google.com>
Date: Wed, 11 Jan 2023 00:44:45 +0000
Subject: KVM: selftests: x86: Use host's native hypercall instruction in
 kvm_hypercall()

Use the host CPU's native hypercall instruction, i.e. VMCALL vs. VMMCALL,
in kvm_hypercall(), as relying on KVM to patch in the native hypercall on
a #UD for the "wrong" hypercall requires KVM_X86_QUIRK_FIX_HYPERCALL_INSN
to be enabled and flat out doesn't work if guest memory is encrypted with
a private key, e.g. for SEV VMs.

Suggested-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Link: https://lore.kernel.org/r/20230111004445.416840-4-vannapurve@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 84915bc7d689..ae1e573d94ce 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1144,9 +1144,15 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
 {
 	uint64_t r;
 
-	asm volatile("vmcall"
+	asm volatile("test %[use_vmmcall], %[use_vmmcall]\n\t"
+		     "jnz 1f\n\t"
+		     "vmcall\n\t"
+		     "jmp 2f\n\t"
+		     "1: vmmcall\n\t"
+		     "2:"
 		     : "=a"(r)
-		     : "a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3));
+		     : "a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3),
+		       [use_vmmcall] "r" (host_cpu_is_amd));
 	return r;
 }
 
-- 
cgit 


From 96e78ebbe8e47d096c1ed74ec1d28a107126c26c Mon Sep 17 00:00:00 2001
From: zhang songyi <zhang.songyi@zte.com.cn>
Date: Mon, 19 Dec 2022 14:32:27 +0800
Subject: KVM: x86/xen: Remove unneeded semicolon

The semicolon after the "}" is unneeded.

Signed-off-by: zhang songyi <zhang.songyi@zte.com.cn>
Link: https://lore.kernel.org/r/202212191432274558936@zte.com.cn
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index dae510c263b4..6a838df69174 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -426,7 +426,7 @@ static void *juggle_shinfo_state(void *arg)
 		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_init);
 		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_destroy);
 		pthread_testcancel();
-	};
+	}
 
 	return NULL;
 }
-- 
cgit 


From 78332517a5dab54514ae719805eec218715de1fc Mon Sep 17 00:00:00 2001
From: Jing Zhang <jingzhangos@google.com>
Date: Tue, 17 Jan 2023 14:27:07 -0800
Subject: KVM: selftests: Stop assuming stats are contiguous in
 kvm_binary_stats_test

Remove the assumption from kvm_binary_stats_test that all stats are
laid out contiguously in memory. The current stats in KVM are
contiguously laid out in memory, but that may change in the future and
the ABI specifically allows holes in the stats data (since each stat
exposes its own offset).

While here drop the check that each stats' offset is less than
size_data, as that is now always true by construction.

Link: https://lore.kernel.org/kvm/20221208193857.4090582-9-dmatlack@google.com/
Fixes: 0b45d58738cd ("KVM: selftests: Add selftest for KVM statistics data binary interface")
Signed-off-by: Jing Zhang <jingzhangos@google.com>
[dmatlack: Re-worded the commit message.]
Signed-off-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20230117222707.3949974-1-dmatlack@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/kvm_binary_stats_test.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c
index 894417c96f70..a7001e29dc06 100644
--- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c
+++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c
@@ -134,7 +134,7 @@ static void stats_test(int stats_fd)
 				    "Bucket size of stats (%s) is not zero",
 				    pdesc->name);
 		}
-		size_data += pdesc->size * sizeof(*stats_data);
+		size_data = max(size_data, pdesc->offset + pdesc->size * sizeof(*stats_data));
 	}
 
 	/*
@@ -149,14 +149,6 @@ static void stats_test(int stats_fd)
 	TEST_ASSERT(size_data >= header.num_desc * sizeof(*stats_data),
 		    "Data size is not correct");
 
-	/* Check stats offset */
-	for (i = 0; i < header.num_desc; ++i) {
-		pdesc = get_stats_descriptor(stats_desc, i, &header);
-		TEST_ASSERT(pdesc->offset < size_data,
-			    "Invalid offset (%u) for stats: %s",
-			    pdesc->offset, pdesc->name);
-	}
-
 	/* Allocate memory for stats data */
 	stats_data = malloc(size_data);
 	TEST_ASSERT(stats_data, "Allocate memory for stats data");
-- 
cgit 


From f9b06695ba64bca310ada3df05264273106c5184 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Sat, 21 Jan 2023 09:53:19 +0100
Subject: selftests/nolibc: Support "x86_64" for arch name

Building the kernel with ARCH=x86_64 works fine, but nolibc-test
only supports "x86", which causes errors when reusing existing build
environment.  Let's permit this environment to be used as well by making
nolibc also accept ARCH=x86_64.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Tested-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 2bf613ee363d..423598045ff1 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -14,6 +14,7 @@ endif
 
 # kernel image names by architecture
 IMAGE_i386    = arch/x86/boot/bzImage
+IMAGE_x86_64  = arch/x86/boot/bzImage
 IMAGE_x86     = arch/x86/boot/bzImage
 IMAGE_arm64   = arch/arm64/boot/Image
 IMAGE_arm     = arch/arm/boot/zImage
@@ -25,6 +26,7 @@ IMAGE_NAME    = $(notdir $(IMAGE))
 
 # default kernel configurations that appear to be usable
 DEFCONFIG_i386    = defconfig
+DEFCONFIG_x86_64  = defconfig
 DEFCONFIG_x86     = defconfig
 DEFCONFIG_arm64   = defconfig
 DEFCONFIG_arm     = multi_v7_defconfig
@@ -38,6 +40,7 @@ TEST =
 
 # QEMU_ARCH: arch names used by qemu
 QEMU_ARCH_i386    = i386
+QEMU_ARCH_x86_64  = x86_64
 QEMU_ARCH_x86     = x86_64
 QEMU_ARCH_arm64   = aarch64
 QEMU_ARCH_arm     = arm
@@ -48,6 +51,7 @@ QEMU_ARCH         = $(QEMU_ARCH_$(ARCH))
 
 # QEMU_ARGS : some arch-specific args to pass to qemu
 QEMU_ARGS_i386    = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_x86_64  = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_x86     = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_arm64   = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_arm     = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
-- 
cgit 


From c54ba4178159e440bea3826d22d43a9d0d94b071 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Sat, 21 Jan 2023 09:53:20 +0100
Subject: selftests/nolibc: Add a "run-user" target to test the program in user
 land

When developing tests, it is much faster to use the QEMU Linux
emulator instead of the system emulator, which among other things avoids
kernel-build latencies.  Although use of the QEMU Linux emulator does have
its limitations (please see below), it is sufficient to test startup code,
stdlib code, and syscall calling conventions.

However, the current mainline Linux-kernel nolibc setup does not
support this.  Therefore, add a "run-user" target that immediately
executes the prebuilt executable.

Again, this approach does have its limitations.  For example, the
executable runs with the user's privilege level, which can cause some
false-positive failures due to insufficient permissions.  In addition,
if the underlying kernel is old enough to lack some features that
nolibc relies on, the result will be false-positive failures in the
corresponding tests.  However, for nolibc changes not affected by these
limittions, the result is a much faster code-compile-test-debug cycle.

With this patch, running a userland test is as simple as issuing:

  make ARCH=xxx CROSS_COMPILE=xxx run-user

Signed-off-by: Willy Tarreau <w@1wt.eu>
Tested-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/Makefile | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 423598045ff1..8fe61d3e3cce 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -80,6 +80,7 @@ help:
 	@echo "  help         this help"
 	@echo "  sysroot      create the nolibc sysroot here (uses \$$ARCH)"
 	@echo "  nolibc-test  build the executable (uses \$$CC and \$$CROSS_COMPILE)"
+	@echo "  run-user     runs the executable under QEMU (uses \$$ARCH, \$$TEST)"
 	@echo "  initramfs    prepare the initramfs with nolibc-test"
 	@echo "  defconfig    create a fresh new default config (uses \$$ARCH)"
 	@echo "  kernel       (re)build the kernel with the initramfs (uses \$$ARCH)"
@@ -113,6 +114,11 @@ nolibc-test: nolibc-test.c sysroot/$(ARCH)/include
 	$(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \
 	  -nostdlib -static -Isysroot/$(ARCH)/include $< -lgcc
 
+# qemu user-land test
+run-user: nolibc-test
+	$(Q)qemu-$(QEMU_ARCH) ./nolibc-test > "$(CURDIR)/run.out" || :
+	$(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed."
+
 initramfs: nolibc-test
 	$(QUIET_MKDIR)mkdir -p initramfs
 	$(call QUIET_INSTALL, initramfs/init)
-- 
cgit 


From 7f09d639b8c4959736ff112d0fb42780c37ff6db Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Mon, 19 Dec 2022 13:31:08 -0500
Subject: tracing/selftests: Add test for event filtering on function name

With the new filter logic of passing in the name of a function to match an
instruction pointer (or the address of the function), add a test to make
sure that it is functional.

This is also the first test to test plain filtering. The filtering has
been tested via the trigger logic, which uses the same code, but there was
nothing to test just the event filter, so this test is the first to add
such a case.

Link: https://lkml.kernel.org/r/20221219183214.075559302@goodmis.org

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Zheng Yejian <zhengyejian1@huawei.com>
Cc: linux-kselftest@vger.kernel.org
Suggested-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Ross Zwisler <zwisler@google.com>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 .../ftrace/test.d/filter/event-filter-function.tc  | 58 ++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc

(limited to 'tools')

diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
new file mode 100644
index 000000000000..e2ff3bf4df80
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
@@ -0,0 +1,58 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event filter function - test event filtering on functions
+# requires: set_event events/kmem/kmem_cache_free/filter
+# flags: instance
+
+fail() { #msg
+    echo $1
+    exit_fail
+}
+
+echo "Test event filter function name"
+echo 0 > tracing_on
+echo 0 > events/enable
+echo > trace
+echo 'call_site.function == exit_mmap' > events/kmem/kmem_cache_free/filter
+echo 1 > events/kmem/kmem_cache_free/enable
+echo 1 > tracing_on
+ls > /dev/null
+echo 0 > events/kmem/kmem_cache_free/enable
+
+hitcnt=`grep kmem_cache_free trace| grep exit_mmap | wc -l`
+misscnt=`grep kmem_cache_free trace| grep -v exit_mmap | wc -l`
+
+if [ $hitcnt -eq 0 ]; then
+	exit_fail
+fi
+
+if [ $misscnt -gt 0 ]; then
+	exit_fail
+fi
+
+address=`grep ' exit_mmap$' /proc/kallsyms | cut -d' ' -f1`
+
+echo "Test event filter function address"
+echo 0 > tracing_on
+echo 0 > events/enable
+echo > trace
+echo "call_site.function == 0x$address" > events/kmem/kmem_cache_free/filter
+echo 1 > events/kmem/kmem_cache_free/enable
+echo 1 > tracing_on
+sleep 1
+echo 0 > events/kmem/kmem_cache_free/enable
+
+hitcnt=`grep kmem_cache_free trace| grep exit_mmap | wc -l`
+misscnt=`grep kmem_cache_free trace| grep -v exit_mmap | wc -l`
+
+if [ $hitcnt -eq 0 ]; then
+	exit_fail
+fi
+
+if [ $misscnt -gt 0 ]; then
+	exit_fail
+fi
+
+reset_events_filter
+
+exit 0
-- 
cgit 


From b81a3a100cca1bd63d897efa339d047986615c9f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 17 Jan 2023 10:21:31 -0500
Subject: tracing/histogram: Add simple tests for stacktrace usage of synthetic
 events

Update the selftests to include a test of passing a stacktrace between the
events of a synthetic event.

Link: https://lkml.kernel.org/r/20230117152236.475439286@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Ross Zwisler <zwisler@google.com>
Cc: Ching-lin Yu <chinglinyu@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c                               |  2 +-
 .../inter-event/trigger-synthetic-event-stack.tc   | 24 ++++++++++++++++++++++
 .../inter-event/trigger-synthetic-event-syntax.tc  |  6 ++++++
 3 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc

(limited to 'tools')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 78ed5f1baa8c..b90eecd27dfc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5757,7 +5757,7 @@ static const char readme_msg[] =
 #ifdef CONFIG_SYNTH_EVENTS
 	"  events/synthetic_events\t- Create/append/remove/show synthetic events\n"
 	"\t  Write into this file to define/undefine new synthetic events.\n"
-	"\t     example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n"
+	"\t     example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n"
 #endif
 #endif
 ;
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc
new file mode 100644
index 000000000000..755dbe94ccf4
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc
@@ -0,0 +1,24 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event histogram trigger trace action with dynamic string param
+# requires: set_event synthetic_events events/sched/sched_process_exec/hist "long[]' >> synthetic_events":README
+
+fail() { #msg
+    echo $1
+    exit_fail
+}
+
+echo "Test create synthetic event with stack"
+
+
+echo 's:wake_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events
+echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace  if prev_state == 1||prev_state == 2' >> events/sched/sched_switch/trigger
+echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(wake_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger
+echo 1 > events/synthetic/wake_lat/enable
+sleep 1
+
+if ! grep -q "=>.*sched" trace; then
+    fail "Failed to create synthetic event with stack"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc
index 2968cdc7df30..366f1f3ad906 100644
--- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc
@@ -70,6 +70,12 @@ grep "myevent[[:space:]]unsigned long var" synthetic_events
 echo "myevent char var[10]" > synthetic_events
 grep "myevent[[:space:]]char\[10\] var" synthetic_events
 
+if grep -q 'long\[\]' README; then
+  # test stacktrace type
+  echo "myevent unsigned long[] var" > synthetic_events
+  grep "myevent[[:space:]]unsigned long\[\] var" synthetic_events
+fi
+
 do_reset
 
 exit 0
-- 
cgit 


From caf713c338bd95bf9ac003d8985d2c4e46d452dd Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 25 Jan 2023 08:38:10 -0600
Subject: bpf: Disallow NULLable pointers for trusted kfuncs

KF_TRUSTED_ARGS kfuncs currently have a subtle and insidious bug in
validating pointers to scalars. Say that you have a kfunc like the
following, which takes an array as the first argument:

bool bpf_cpumask_empty(const struct cpumask *cpumask)
{
	return cpumask_empty(cpumask);
}

...
BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS)
...

If a BPF program were to invoke the kfunc with a NULL argument, it would
crash the kernel. The reason is that struct cpumask is defined as a
bitmap, which is itself defined as an array, and is accessed as a memory
address by bitmap operations. So when the verifier analyzes the
register, it interprets it as a pointer to a scalar struct, which is an
array of size 8. check_mem_reg() then sees that the register is NULL and
returns 0, and the kfunc crashes when it passes it down to the cpumask
wrappers.

To fix this, this patch adds a check for KF_ARG_PTR_TO_MEM which
verifies that the register doesn't contain a possibly-NULL pointer if
the kfunc is KF_TRUSTED_ARGS.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230125143816.721952-2-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                                  | 6 ++++++
 tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c    | 4 ++--
 tools/testing/selftests/bpf/progs/task_kfunc_failure.c | 4 ++--
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 66ec577fcb8b..bb38b01b738f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9194,6 +9194,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			return -EINVAL;
 		}
 
+		if (is_kfunc_trusted_args(meta) &&
+		    (register_is_null(reg) || type_may_be_null(reg->type))) {
+			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+			return -EACCES;
+		}
+
 		if (reg->ref_obj_id) {
 			if (is_kfunc_release(meta) && meta->ref_obj_id) {
 				verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c
index 973f0c5af965..f3bb0e16e088 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c
@@ -93,11 +93,11 @@ static struct {
 	const char *prog_name;
 	const char *expected_err_msg;
 } failure_tests[] = {
-	{"cgrp_kfunc_acquire_untrusted", "R1 must be referenced or trusted"},
+	{"cgrp_kfunc_acquire_untrusted", "Possibly NULL pointer passed to trusted arg0"},
 	{"cgrp_kfunc_acquire_fp", "arg#0 pointer type STRUCT cgroup must point"},
 	{"cgrp_kfunc_acquire_unsafe_kretprobe", "reg type unsupported for arg#0 function"},
 	{"cgrp_kfunc_acquire_trusted_walked", "R1 must be referenced or trusted"},
-	{"cgrp_kfunc_acquire_null", "arg#0 pointer type STRUCT cgroup must point"},
+	{"cgrp_kfunc_acquire_null", "Possibly NULL pointer passed to trusted arg0"},
 	{"cgrp_kfunc_acquire_unreleased", "Unreleased reference"},
 	{"cgrp_kfunc_get_non_kptr_param", "arg#0 expected pointer to map value"},
 	{"cgrp_kfunc_get_non_kptr_acquired", "arg#0 expected pointer to map value"},
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index e6950d6a9cf0..f19d54eda4f1 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -28,7 +28,7 @@ static struct __tasks_kfunc_map_value *insert_lookup_task(struct task_struct *ta
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("R1 must be referenced or trusted")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -86,7 +86,7 @@ int BPF_PROG(task_kfunc_acquire_trusted_walked, struct task_struct *task, u64 cl
 
 
 SEC("tp_btf/task_newtask")
-__failure __msg("arg#0 pointer type STRUCT task_struct must point")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(task_kfunc_acquire_null, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
-- 
cgit 


From a6541f4d280465e3dff08a45734c0d4ac3f363a4 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 25 Jan 2023 08:38:12 -0600
Subject: selftests/bpf: Add nested trust selftests suite

Now that defining trusted fields in a struct is supported, we should add
selftests to verify the behavior. This patch adds a few such testcases.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230125143816.721952-4-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x         |  1 +
 .../selftests/bpf/prog_tests/nested_trust.c        | 12 ++++++++
 .../selftests/bpf/progs/nested_trust_common.h      | 12 ++++++++
 .../selftests/bpf/progs/nested_trust_failure.c     | 33 ++++++++++++++++++++++
 .../selftests/bpf/progs/nested_trust_success.c     | 19 +++++++++++++
 5 files changed, 77 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/nested_trust.c
 create mode 100644 tools/testing/selftests/bpf/progs/nested_trust_common.h
 create mode 100644 tools/testing/selftests/bpf/progs/nested_trust_failure.c
 create mode 100644 tools/testing/selftests/bpf/progs/nested_trust_success.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index b9ef9cc61d36..f18e367ca4b9 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -44,6 +44,7 @@ map_kptr                                 # failed to open_and_load program: -524
 modify_return                            # modify_return attach failed: -524                                           (trampoline)
 module_attach                            # skel_attach skeleton attach failed: -524                                    (trampoline)
 mptcp
+nested_trust                             # JIT does not support calling kernel function
 netcnt                                   # failed to load BPF skeleton 'netcnt_prog': -7                               (?)
 probe_user                               # check_kprobe_res wrong kprobe res from probe read                           (?)
 rcu_read_lock                            # failed to find kernel BTF type ID of '__x64_sys_getpgid': -3                (?)
diff --git a/tools/testing/selftests/bpf/prog_tests/nested_trust.c b/tools/testing/selftests/bpf/prog_tests/nested_trust.c
new file mode 100644
index 000000000000..39886f58924e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/nested_trust.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "nested_trust_failure.skel.h"
+#include "nested_trust_success.skel.h"
+
+void test_nested_trust(void)
+{
+	RUN_TESTS(nested_trust_success);
+	RUN_TESTS(nested_trust_failure);
+}
diff --git a/tools/testing/selftests/bpf/progs/nested_trust_common.h b/tools/testing/selftests/bpf/progs/nested_trust_common.h
new file mode 100644
index 000000000000..83d33931136e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/nested_trust_common.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#ifndef _NESTED_TRUST_COMMON_H
+#define _NESTED_TRUST_COMMON_H
+
+#include <stdbool.h>
+
+bool bpf_cpumask_test_cpu(unsigned int cpu, const struct cpumask *cpumask) __ksym;
+bool bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym;
+
+#endif /* _NESTED_TRUST_COMMON_H */
diff --git a/tools/testing/selftests/bpf/progs/nested_trust_failure.c b/tools/testing/selftests/bpf/progs/nested_trust_failure.c
new file mode 100644
index 000000000000..14aff7676436
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/nested_trust_failure.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+#include "nested_trust_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* Prototype for all of the program trace events below:
+ *
+ * TRACE_EVENT(task_newtask,
+ *         TP_PROTO(struct task_struct *p, u64 clone_flags)
+ */
+
+SEC("tp_btf/task_newtask")
+__failure __msg("R2 must be referenced or trusted")
+int BPF_PROG(test_invalid_nested_user_cpus, struct task_struct *task, u64 clone_flags)
+{
+	bpf_cpumask_test_cpu(0, task->user_cpus_ptr);
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+__failure __msg("R1 must have zero offset when passed to release func or trusted arg to kfunc")
+int BPF_PROG(test_invalid_nested_offset, struct task_struct *task, u64 clone_flags)
+{
+	bpf_cpumask_first_zero(&task->cpus_mask);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/nested_trust_success.c b/tools/testing/selftests/bpf/progs/nested_trust_success.c
new file mode 100644
index 000000000000..886ade4aa99d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/nested_trust_success.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+#include "nested_trust_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("tp_btf/task_newtask")
+__success
+int BPF_PROG(test_read_cpumask, struct task_struct *task, u64 clone_flags)
+{
+	bpf_cpumask_test_cpu(0, task->cpus_ptr);
+	return 0;
+}
-- 
cgit 


From 7b6abcfa15cd18de21e5bfb952df57268af2f041 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 25 Jan 2023 08:38:13 -0600
Subject: selftests/bpf: Add selftest suite for cpumask kfuncs

A recent patch added a new set of kfuncs for allocating, freeing,
manipulating, and querying cpumasks. This patch adds a new 'cpumask'
selftest suite which verifies their behavior.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230125143816.721952-5-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x         |   1 +
 tools/testing/selftests/bpf/prog_tests/cpumask.c   |  74 ++++
 tools/testing/selftests/bpf/progs/cpumask_common.h | 114 ++++++
 .../testing/selftests/bpf/progs/cpumask_failure.c  | 126 ++++++
 .../testing/selftests/bpf/progs/cpumask_success.c  | 426 +++++++++++++++++++++
 5 files changed, 741 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/cpumask.c
 create mode 100644 tools/testing/selftests/bpf/progs/cpumask_common.h
 create mode 100644 tools/testing/selftests/bpf/progs/cpumask_failure.c
 create mode 100644 tools/testing/selftests/bpf/progs/cpumask_success.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index f18e367ca4b9..50924611e5bb 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -13,6 +13,7 @@ cgroup_hierarchical_stats                # JIT does not support calling kernel f
 cgrp_kfunc                               # JIT does not support calling kernel function
 cgrp_local_storage                       # prog_attach unexpected error: -524                                          (trampoline)
 core_read_macros                         # unknown func bpf_probe_read#4                                               (overlapping)
+cpumask                                  # JIT does not support calling kernel function
 d_path                                   # failed to auto-attach program 'prog_stat': -524                             (trampoline)
 decap_sanity                             # JIT does not support calling kernel function                                (kfunc)
 deny_namespace                           # failed to attach: ERROR: strerror_r(-524)=22                                (trampoline)
diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask.c b/tools/testing/selftests/bpf/prog_tests/cpumask.c
new file mode 100644
index 000000000000..5fbe457c4ebe
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cpumask.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "cpumask_failure.skel.h"
+#include "cpumask_success.skel.h"
+
+static const char * const cpumask_success_testcases[] = {
+	"test_alloc_free_cpumask",
+	"test_set_clear_cpu",
+	"test_setall_clear_cpu",
+	"test_first_firstzero_cpu",
+	"test_test_and_set_clear",
+	"test_and_or_xor",
+	"test_intersects_subset",
+	"test_copy_any_anyand",
+	"test_insert_leave",
+	"test_insert_remove_release",
+	"test_insert_kptr_get_release",
+};
+
+static void verify_success(const char *prog_name)
+{
+	struct cpumask_success *skel;
+	struct bpf_program *prog;
+	struct bpf_link *link = NULL;
+	pid_t child_pid;
+	int status;
+
+	skel = cpumask_success__open();
+	if (!ASSERT_OK_PTR(skel, "cpumask_success__open"))
+		return;
+
+	skel->bss->pid = getpid();
+	skel->bss->nr_cpus = libbpf_num_possible_cpus();
+
+	cpumask_success__load(skel);
+	if (!ASSERT_OK_PTR(skel, "cpumask_success__load"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto cleanup;
+
+	link = bpf_program__attach(prog);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+		goto cleanup;
+
+	child_pid = fork();
+	if (!ASSERT_GT(child_pid, -1, "child_pid"))
+		goto cleanup;
+	if (child_pid == 0)
+		_exit(0);
+	waitpid(child_pid, &status, 0);
+	ASSERT_OK(skel->bss->err, "post_wait_err");
+
+cleanup:
+	bpf_link__destroy(link);
+	cpumask_success__destroy(skel);
+}
+
+void test_cpumask(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cpumask_success_testcases); i++) {
+		if (!test__start_subtest(cpumask_success_testcases[i]))
+			continue;
+
+		verify_success(cpumask_success_testcases[i]);
+	}
+
+	RUN_TESTS(cpumask_failure);
+}
diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h
new file mode 100644
index 000000000000..ad34f3b602be
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cpumask_common.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#ifndef _CPUMASK_COMMON_H
+#define _CPUMASK_COMMON_H
+
+#include "errno.h"
+#include <stdbool.h>
+
+int err;
+
+struct __cpumask_map_value {
+	struct bpf_cpumask __kptr_ref * cpumask;
+};
+
+struct array_map {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct __cpumask_map_value);
+	__uint(max_entries, 1);
+} __cpumask_map SEC(".maps");
+
+struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
+void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
+struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym;
+struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumask) __ksym;
+u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
+u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym;
+void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;
+bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym;
+bool bpf_cpumask_and(struct bpf_cpumask *cpumask,
+		     const struct cpumask *src1,
+		     const struct cpumask *src2) __ksym;
+void bpf_cpumask_or(struct bpf_cpumask *cpumask,
+		    const struct cpumask *src1,
+		    const struct cpumask *src2) __ksym;
+void bpf_cpumask_xor(struct bpf_cpumask *cpumask,
+		     const struct cpumask *src1,
+		     const struct cpumask *src2) __ksym;
+bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym;
+bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym;
+void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym;
+u32 bpf_cpumask_any(const struct cpumask *src) __ksym;
+u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+
+static inline const struct cpumask *cast(struct bpf_cpumask *cpumask)
+{
+	return (const struct cpumask *)cpumask;
+}
+
+static inline struct bpf_cpumask *create_cpumask(void)
+{
+	struct bpf_cpumask *cpumask;
+
+	cpumask = bpf_cpumask_create();
+	if (!cpumask) {
+		err = 1;
+		return NULL;
+	}
+
+	if (!bpf_cpumask_empty(cast(cpumask))) {
+		err = 2;
+		bpf_cpumask_release(cpumask);
+		return NULL;
+	}
+
+	return cpumask;
+}
+
+static inline struct __cpumask_map_value *cpumask_map_value_lookup(void)
+{
+	u32 key = 0;
+
+	return bpf_map_lookup_elem(&__cpumask_map, &key);
+}
+
+static inline int cpumask_map_insert(struct bpf_cpumask *mask)
+{
+	struct __cpumask_map_value local, *v;
+	long status;
+	struct bpf_cpumask *old;
+	u32 key = 0;
+
+	local.cpumask = NULL;
+	status = bpf_map_update_elem(&__cpumask_map, &key, &local, 0);
+	if (status) {
+		bpf_cpumask_release(mask);
+		return status;
+	}
+
+	v = bpf_map_lookup_elem(&__cpumask_map, &key);
+	if (!v) {
+		bpf_cpumask_release(mask);
+		return -ENOENT;
+	}
+
+	old = bpf_kptr_xchg(&v->cpumask, mask);
+	if (old) {
+		bpf_cpumask_release(old);
+		return -EEXIST;
+	}
+
+	return 0;
+}
+
+#endif /* _CPUMASK_COMMON_H */
diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c
new file mode 100644
index 000000000000..33e8e86dd090
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+#include "cpumask_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* Prototype for all of the program trace events below:
+ *
+ * TRACE_EVENT(task_newtask,
+ *         TP_PROTO(struct task_struct *p, u64 clone_flags)
+ */
+
+SEC("tp_btf/task_newtask")
+__failure __msg("Unreleased reference")
+int BPF_PROG(test_alloc_no_release, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+
+	cpumask = create_cpumask();
+
+	/* cpumask is never released. */
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+__failure __msg("NULL pointer passed to trusted arg0")
+int BPF_PROG(test_alloc_double_release, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+
+	cpumask = create_cpumask();
+
+	/* cpumask is released twice. */
+	bpf_cpumask_release(cpumask);
+	bpf_cpumask_release(cpumask);
+
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+__failure __msg("bpf_cpumask_acquire args#0 expected pointer to STRUCT bpf_cpumask")
+int BPF_PROG(test_acquire_wrong_cpumask, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+
+	/* Can't acquire a non-struct bpf_cpumask. */
+	cpumask = bpf_cpumask_acquire((struct bpf_cpumask *)task->cpus_ptr);
+
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+__failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask")
+int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+
+	/* Can't set the CPU of a non-struct bpf_cpumask. */
+	bpf_cpumask_set_cpu(0, (struct bpf_cpumask *)task->cpus_ptr);
+
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+__failure __msg("Unreleased reference")
+int BPF_PROG(test_insert_remove_no_release, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+	struct __cpumask_map_value *v;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	if (cpumask_map_insert(cpumask))
+		return 0;
+
+	v = cpumask_map_value_lookup();
+	if (!v)
+		return 0;
+
+	cpumask = bpf_kptr_xchg(&v->cpumask, NULL);
+
+	/* cpumask is never released. */
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+__failure __msg("Unreleased reference")
+int BPF_PROG(test_kptr_get_no_release, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+	struct __cpumask_map_value *v;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	if (cpumask_map_insert(cpumask))
+		return 0;
+
+	v = cpumask_map_value_lookup();
+	if (!v)
+		return 0;
+
+	cpumask = bpf_cpumask_kptr_get(&v->cpumask);
+
+	/* cpumask is never released. */
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+__failure __msg("NULL pointer passed to trusted arg0")
+int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags)
+{
+  /* NULL passed to KF_TRUSTED_ARGS kfunc. */
+	bpf_cpumask_empty(NULL);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c
new file mode 100644
index 000000000000..1d38bc65d4b0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cpumask_success.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+#include "cpumask_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+int pid, nr_cpus;
+
+static bool is_test_task(void)
+{
+	int cur_pid = bpf_get_current_pid_tgid() >> 32;
+
+	return pid == cur_pid;
+}
+
+static bool create_cpumask_set(struct bpf_cpumask **out1,
+			       struct bpf_cpumask **out2,
+			       struct bpf_cpumask **out3,
+			       struct bpf_cpumask **out4)
+{
+	struct bpf_cpumask *mask1, *mask2, *mask3, *mask4;
+
+	mask1 = create_cpumask();
+	if (!mask1)
+		return false;
+
+	mask2 = create_cpumask();
+	if (!mask2) {
+		bpf_cpumask_release(mask1);
+		err = 3;
+		return false;
+	}
+
+	mask3 = create_cpumask();
+	if (!mask3) {
+		bpf_cpumask_release(mask1);
+		bpf_cpumask_release(mask2);
+		err = 4;
+		return false;
+	}
+
+	mask4 = create_cpumask();
+	if (!mask4) {
+		bpf_cpumask_release(mask1);
+		bpf_cpumask_release(mask2);
+		bpf_cpumask_release(mask3);
+		err = 5;
+		return false;
+	}
+
+	*out1 = mask1;
+	*out2 = mask2;
+	*out3 = mask3;
+	*out4 = mask4;
+
+	return true;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_alloc_free_cpumask, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+
+	if (!is_test_task())
+		return 0;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	bpf_cpumask_release(cpumask);
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_set_clear_cpu, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+
+	if (!is_test_task())
+		return 0;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	bpf_cpumask_set_cpu(0, cpumask);
+	if (!bpf_cpumask_test_cpu(0, cast(cpumask))) {
+		err = 3;
+		goto release_exit;
+	}
+
+	bpf_cpumask_clear_cpu(0, cpumask);
+	if (bpf_cpumask_test_cpu(0, cast(cpumask))) {
+		err = 4;
+		goto release_exit;
+	}
+
+release_exit:
+	bpf_cpumask_release(cpumask);
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_setall_clear_cpu, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+
+	if (!is_test_task())
+		return 0;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	bpf_cpumask_setall(cpumask);
+	if (!bpf_cpumask_full(cast(cpumask))) {
+		err = 3;
+		goto release_exit;
+	}
+
+	bpf_cpumask_clear(cpumask);
+	if (!bpf_cpumask_empty(cast(cpumask))) {
+		err = 4;
+		goto release_exit;
+	}
+
+release_exit:
+	bpf_cpumask_release(cpumask);
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_first_firstzero_cpu, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+
+	if (!is_test_task())
+		return 0;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	if (bpf_cpumask_first(cast(cpumask)) < nr_cpus) {
+		err = 3;
+		goto release_exit;
+	}
+
+	if (bpf_cpumask_first_zero(cast(cpumask)) != 0) {
+		bpf_printk("first zero: %d", bpf_cpumask_first_zero(cast(cpumask)));
+		err = 4;
+		goto release_exit;
+	}
+
+	bpf_cpumask_set_cpu(0, cpumask);
+	if (bpf_cpumask_first(cast(cpumask)) != 0) {
+		err = 5;
+		goto release_exit;
+	}
+
+	if (bpf_cpumask_first_zero(cast(cpumask)) != 1) {
+		err = 6;
+		goto release_exit;
+	}
+
+release_exit:
+	bpf_cpumask_release(cpumask);
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_test_and_set_clear, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+
+	if (!is_test_task())
+		return 0;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	if (bpf_cpumask_test_and_set_cpu(0, cpumask)) {
+		err = 3;
+		goto release_exit;
+	}
+
+	if (!bpf_cpumask_test_and_set_cpu(0, cpumask)) {
+		err = 4;
+		goto release_exit;
+	}
+
+	if (!bpf_cpumask_test_and_clear_cpu(0, cpumask)) {
+		err = 5;
+		goto release_exit;
+	}
+
+release_exit:
+	bpf_cpumask_release(cpumask);
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_and_or_xor, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *mask1, *mask2, *dst1, *dst2;
+
+	if (!is_test_task())
+		return 0;
+
+	if (!create_cpumask_set(&mask1, &mask2, &dst1, &dst2))
+		return 0;
+
+	bpf_cpumask_set_cpu(0, mask1);
+	bpf_cpumask_set_cpu(1, mask2);
+
+	if (bpf_cpumask_and(dst1, cast(mask1), cast(mask2))) {
+		err = 6;
+		goto release_exit;
+	}
+	if (!bpf_cpumask_empty(cast(dst1))) {
+		err = 7;
+		goto release_exit;
+	}
+
+	bpf_cpumask_or(dst1, cast(mask1), cast(mask2));
+	if (!bpf_cpumask_test_cpu(0, cast(dst1))) {
+		err = 8;
+		goto release_exit;
+	}
+	if (!bpf_cpumask_test_cpu(1, cast(dst1))) {
+		err = 9;
+		goto release_exit;
+	}
+
+	bpf_cpumask_xor(dst2, cast(mask1), cast(mask2));
+	if (!bpf_cpumask_equal(cast(dst1), cast(dst2))) {
+		err = 10;
+		goto release_exit;
+	}
+
+release_exit:
+	bpf_cpumask_release(mask1);
+	bpf_cpumask_release(mask2);
+	bpf_cpumask_release(dst1);
+	bpf_cpumask_release(dst2);
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_intersects_subset, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *mask1, *mask2, *dst1, *dst2;
+
+	if (!is_test_task())
+		return 0;
+
+	if (!create_cpumask_set(&mask1, &mask2, &dst1, &dst2))
+		return 0;
+
+	bpf_cpumask_set_cpu(0, mask1);
+	bpf_cpumask_set_cpu(1, mask2);
+	if (bpf_cpumask_intersects(cast(mask1), cast(mask2))) {
+		err = 6;
+		goto release_exit;
+	}
+
+	bpf_cpumask_or(dst1, cast(mask1), cast(mask2));
+	if (!bpf_cpumask_subset(cast(mask1), cast(dst1))) {
+		err = 7;
+		goto release_exit;
+	}
+
+	if (!bpf_cpumask_subset(cast(mask2), cast(dst1))) {
+		err = 8;
+		goto release_exit;
+	}
+
+	if (bpf_cpumask_subset(cast(dst1), cast(mask1))) {
+		err = 9;
+		goto release_exit;
+	}
+
+release_exit:
+	bpf_cpumask_release(mask1);
+	bpf_cpumask_release(mask2);
+	bpf_cpumask_release(dst1);
+	bpf_cpumask_release(dst2);
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_copy_any_anyand, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *mask1, *mask2, *dst1, *dst2;
+	u32 cpu;
+
+	if (!is_test_task())
+		return 0;
+
+	if (!create_cpumask_set(&mask1, &mask2, &dst1, &dst2))
+		return 0;
+
+	bpf_cpumask_set_cpu(0, mask1);
+	bpf_cpumask_set_cpu(1, mask2);
+	bpf_cpumask_or(dst1, cast(mask1), cast(mask2));
+
+	cpu = bpf_cpumask_any(cast(mask1));
+	if (cpu != 0) {
+		err = 6;
+		goto release_exit;
+	}
+
+	cpu = bpf_cpumask_any(cast(dst2));
+	if (cpu < nr_cpus) {
+		err = 7;
+		goto release_exit;
+	}
+
+	bpf_cpumask_copy(dst2, cast(dst1));
+	if (!bpf_cpumask_equal(cast(dst1), cast(dst2))) {
+		err = 8;
+		goto release_exit;
+	}
+
+	cpu = bpf_cpumask_any(cast(dst2));
+	if (cpu > 1) {
+		err = 9;
+		goto release_exit;
+	}
+
+	cpu = bpf_cpumask_any_and(cast(mask1), cast(mask2));
+	if (cpu < nr_cpus) {
+		err = 10;
+		goto release_exit;
+	}
+
+release_exit:
+	bpf_cpumask_release(mask1);
+	bpf_cpumask_release(mask2);
+	bpf_cpumask_release(dst1);
+	bpf_cpumask_release(dst2);
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_insert_leave, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+	struct __cpumask_map_value *v;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	if (cpumask_map_insert(cpumask))
+		err = 3;
+
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_insert_remove_release, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+	struct __cpumask_map_value *v;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	if (cpumask_map_insert(cpumask)) {
+		err = 3;
+		return 0;
+	}
+
+	v = cpumask_map_value_lookup();
+	if (!v) {
+		err = 4;
+		return 0;
+	}
+
+	cpumask = bpf_kptr_xchg(&v->cpumask, NULL);
+	if (cpumask)
+		bpf_cpumask_release(cpumask);
+	else
+		err = 5;
+
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(test_insert_kptr_get_release, struct task_struct *task, u64 clone_flags)
+{
+	struct bpf_cpumask *cpumask;
+	struct __cpumask_map_value *v;
+
+	cpumask = create_cpumask();
+	if (!cpumask)
+		return 0;
+
+	if (cpumask_map_insert(cpumask)) {
+		err = 3;
+		return 0;
+	}
+
+	v = cpumask_map_value_lookup();
+	if (!v) {
+		err = 4;
+		return 0;
+	}
+
+	cpumask = bpf_cpumask_kptr_get(&v->cpumask);
+	if (cpumask)
+		bpf_cpumask_release(cpumask);
+	else
+		err = 5;
+
+	return 0;
+}
-- 
cgit 


From af03299d8536d62b49c7f3cb929349eb2d66bcd5 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 23 Jan 2023 22:43:23 -0800
Subject: tools/resolve_btfids: Install subcmd headers

Previously tools/lib/subcmd was added to the include path, switch to
installing the headers and then including from that directory. This
avoids dependencies on headers internal to tools/lib/subcmd. Add the
missing subcmd directory to the affected #include.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20230124064324.672022-1-irogers@google.com
---
 tools/bpf/resolve_btfids/Makefile | 19 ++++++++++++++-----
 tools/bpf/resolve_btfids/main.c   |  2 +-
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index f7375a119f54..1fe0082b2ecc 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -35,21 +35,29 @@ SUBCMD_SRC := $(srctree)/tools/lib/subcmd/
 BPFOBJ     := $(OUTPUT)/libbpf/libbpf.a
 LIBBPF_OUT := $(abspath $(dir $(BPFOBJ)))/
 SUBCMDOBJ  := $(OUTPUT)/libsubcmd/libsubcmd.a
+SUBCMD_OUT := $(abspath $(dir $(SUBCMDOBJ)))/
 
 LIBBPF_DESTDIR := $(LIBBPF_OUT)
 LIBBPF_INCLUDE := $(LIBBPF_DESTDIR)include
 
+SUBCMD_DESTDIR := $(SUBCMD_OUT)
+SUBCMD_INCLUDE := $(SUBCMD_DESTDIR)include
+
 BINARY     := $(OUTPUT)/resolve_btfids
 BINARY_IN  := $(BINARY)-in.o
 
 all: $(BINARY)
 
+prepare: $(BPFOBJ) $(SUBCMDOBJ)
+
 $(OUTPUT) $(OUTPUT)/libsubcmd $(LIBBPF_OUT):
 	$(call msg,MKDIR,,$@)
 	$(Q)mkdir -p $(@)
 
 $(SUBCMDOBJ): fixdep FORCE | $(OUTPUT)/libsubcmd
-	$(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
+	$(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(SUBCMD_OUT) \
+		    DESTDIR=$(SUBCMD_DESTDIR) prefix= \
+		    $(abspath $@) install_headers
 
 $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUT)
 	$(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(LIBBPF_OUT)    \
@@ -63,7 +71,7 @@ CFLAGS += -g \
           -I$(srctree)/tools/include \
           -I$(srctree)/tools/include/uapi \
           -I$(LIBBPF_INCLUDE) \
-          -I$(SUBCMD_SRC) \
+          -I$(SUBCMD_INCLUDE) \
           $(LIBELF_FLAGS)
 
 LIBS = $(LIBELF_LIBS) -lz
@@ -71,7 +79,7 @@ LIBS = $(LIBELF_LIBS) -lz
 export srctree OUTPUT CFLAGS Q
 include $(srctree)/tools/build/Makefile.include
 
-$(BINARY_IN): $(BPFOBJ) fixdep FORCE | $(OUTPUT)
+$(BINARY_IN): fixdep FORCE prepare | $(OUTPUT)
 	$(Q)$(MAKE) $(build)=resolve_btfids
 
 $(BINARY): $(BPFOBJ) $(SUBCMDOBJ) $(BINARY_IN)
@@ -83,7 +91,8 @@ clean_objects := $(wildcard $(OUTPUT)/*.o                \
                             $(OUTPUT)/.*.o.d             \
                             $(LIBBPF_OUT)                \
                             $(LIBBPF_DESTDIR)            \
-                            $(OUTPUT)/libsubcmd          \
+                            $(SUBCMD_OUT)                \
+                            $(SUBCMD_DESTDIR)            \
                             $(OUTPUT)/resolve_btfids)
 
 ifneq ($(clean_objects),)
@@ -100,4 +109,4 @@ tags:
 
 FORCE:
 
-.PHONY: all FORCE clean tags
+.PHONY: all FORCE clean tags prepare
diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 80cd7843c677..77058174082d 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -75,7 +75,7 @@
 #include <linux/err.h>
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
-#include <parse-options.h>
+#include <subcmd/parse-options.h>
 
 #define BTF_IDS_SECTION	".BTF_ids"
 #define BTF_ID		"__BTF_ID__"
-- 
cgit 


From 13e07691a16ff31b209fbfce25c01ff296b05e45 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 23 Jan 2023 22:43:24 -0800
Subject: tools/resolve_btfids: Alter how HOSTCC is forced

HOSTCC is always wanted when building. Setting CC to HOSTCC happens
after tools/scripts/Makefile.include is included, meaning flags are
set assuming say CC is gcc, but then it can be later set to HOSTCC
which may be clang. tools/scripts/Makefile.include is needed for host
set up and common macros in objtool's Makefile. Rather than override
CC to HOSTCC, just pass CC as HOSTCC to Makefile.build, the libsubcmd
builds and the linkage step. This means the Makefiles don't see things
like CC changing and tool flag determination, and similar, work
properly.

Also, clear the passed subdir as otherwise an outer build may break by
inadvertently passing an inappropriate value.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20230124064324.672022-2-irogers@google.com
---
 tools/bpf/resolve_btfids/Makefile | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index 1fe0082b2ecc..daed388aa5d7 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -18,14 +18,11 @@ else
 endif
 
 # always use the host compiler
-AR       = $(HOSTAR)
-CC       = $(HOSTCC)
-LD       = $(HOSTLD)
-ARCH     = $(HOSTARCH)
+HOST_OVERRIDES := AR="$(HOSTAR)" CC="$(HOSTCC)" LD="$(HOSTLD)" ARCH="$(HOSTARCH)" \
+		  EXTRA_CFLAGS="$(HOSTCFLAGS) $(KBUILD_HOSTCFLAGS)"
+
 RM      ?= rm
 CROSS_COMPILE =
-CFLAGS  := $(KBUILD_HOSTCFLAGS)
-LDFLAGS := $(KBUILD_HOSTLDFLAGS)
 
 OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/
 
@@ -56,12 +53,12 @@ $(OUTPUT) $(OUTPUT)/libsubcmd $(LIBBPF_OUT):
 
 $(SUBCMDOBJ): fixdep FORCE | $(OUTPUT)/libsubcmd
 	$(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(SUBCMD_OUT) \
-		    DESTDIR=$(SUBCMD_DESTDIR) prefix= \
+		    DESTDIR=$(SUBCMD_DESTDIR) $(HOST_OVERRIDES) prefix= subdir= \
 		    $(abspath $@) install_headers
 
 $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUT)
 	$(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(LIBBPF_OUT)    \
-		    DESTDIR=$(LIBBPF_DESTDIR) prefix= EXTRA_CFLAGS="$(CFLAGS)" \
+		    DESTDIR=$(LIBBPF_DESTDIR) $(HOST_OVERRIDES) prefix= subdir= \
 		    $(abspath $@) install_headers
 
 LIBELF_FLAGS := $(shell $(HOSTPKG_CONFIG) libelf --cflags 2>/dev/null)
@@ -80,11 +77,11 @@ export srctree OUTPUT CFLAGS Q
 include $(srctree)/tools/build/Makefile.include
 
 $(BINARY_IN): fixdep FORCE prepare | $(OUTPUT)
-	$(Q)$(MAKE) $(build)=resolve_btfids
+	$(Q)$(MAKE) $(build)=resolve_btfids $(HOST_OVERRIDES)
 
 $(BINARY): $(BPFOBJ) $(SUBCMDOBJ) $(BINARY_IN)
 	$(call msg,LINK,$@)
-	$(Q)$(CC) $(BINARY_IN) $(LDFLAGS) -o $@ $(BPFOBJ) $(SUBCMDOBJ) $(LIBS)
+	$(Q)$(HOSTCC) $(BINARY_IN) $(KBUILD_HOSTLDFLAGS) -o $@ $(BPFOBJ) $(SUBCMDOBJ) $(LIBS)
 
 clean_objects := $(wildcard $(OUTPUT)/*.o                \
                             $(OUTPUT)/.*.o.cmd           \
-- 
cgit 


From 2514a31241e1e9067d379e0fbdb60e4bc2bf4659 Mon Sep 17 00:00:00 2001
From: "Daniel T. Lee" <danieltimlee@gmail.com>
Date: Wed, 25 Jan 2023 19:04:40 +0900
Subject: selftests/bpf: Fix vmtest static compilation error

As stated in README.rst, in order to resolve errors with linker errors,
'LDLIBS=-static' should be used. Most problems will be solved by this
option, but in the case of urandom_read, this won't fix the problem. So
the Makefile is currently implemented to strip the 'static' option when
compiling the urandom_read. However, stripping this static option isn't
configured properly on $(LDLIBS) correctly, which is now causing errors
on static compilation.

    # LDLIBS=-static ./vmtest.sh
    ld.lld: error: attempted static link of dynamic object liburandom_read.so
    clang: error: linker command failed with exit code 1 (use -v to see invocation)
    make: *** [Makefile:190: /linux/tools/testing/selftests/bpf/urandom_read] Error 1
    make: *** Waiting for unfinished jobs....

This commit fixes this problem by configuring the strip with $(LDLIBS).

Fixes: 68084a136420 ("selftests/bpf: Fix building bpf selftests statically")
Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230125100440.21734-1-danieltimlee@gmail.com
---
 tools/testing/selftests/bpf/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a554b41fe872..53eae7be8dff 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -181,14 +181,15 @@ endif
 # do not fail. Static builds leave urandom_read relying on system-wide shared libraries.
 $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c
 	$(call msg,LIB,,$@)
-	$(Q)$(CLANG) $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $^ $(LDLIBS)   \
+	$(Q)$(CLANG) $(filter-out -static,$(CFLAGS) $(LDFLAGS))   \
+		     $^ $(filter-out -static,$(LDLIBS))	     \
 		     -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \
 		     -fPIC -shared -o $@
 
 $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_read.so
 	$(call msg,BINARY,,$@)
 	$(Q)$(CLANG) $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \
-		     liburandom_read.so $(LDLIBS)			       \
+		     liburandom_read.so $(filter-out -static,$(LDLIBS))	     \
 		     -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \
 		     -Wl,-rpath=. -o $@
 
-- 
cgit 


From 1e12d3ef47d228e4e7d30f9bc5e6744ede90319c Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 25 Jan 2023 10:47:32 -0600
Subject: bpf: Allow BPF_PROG_TYPE_STRUCT_OPS programs to be sleepable

BPF struct_ops programs currently cannot be marked as sleepable. This
need not be the case -- struct_ops programs can be sleepable, and e.g.
invoke kfuncs that export the KF_SLEEPABLE flag. So as to allow future
struct_ops programs to invoke such kfuncs, this patch updates the
verifier to allow struct_ops programs to be sleepable. A follow-on patch
will add support to libbpf for specifying struct_ops.s as a sleepable
struct_ops program, and then another patch will add testcases to the
dummy_st_ops selftest suite which test sleepable struct_ops behavior.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230125164735.785732-2-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                            | 5 +++--
 tools/testing/selftests/bpf/verifier/sleepable.c | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb38b01b738f..c8907df49f81 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17114,7 +17114,8 @@ static bool can_be_sleepable(struct bpf_prog *prog)
 		}
 	}
 	return prog->type == BPF_PROG_TYPE_LSM ||
-	       prog->type == BPF_PROG_TYPE_KPROBE; /* only for uprobes */
+	       prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
+	       prog->type == BPF_PROG_TYPE_STRUCT_OPS;
 }
 
 static int check_attach_btf_id(struct bpf_verifier_env *env)
@@ -17136,7 +17137,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	}
 
 	if (prog->aux->sleepable && !can_be_sleepable(prog)) {
-		verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter and uprobe programs can be sleepable\n");
+		verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
 		return -EINVAL;
 	}
 
diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c
index bea0daef908a..1f0d2bdc673f 100644
--- a/tools/testing/selftests/bpf/verifier/sleepable.c
+++ b/tools/testing/selftests/bpf/verifier/sleepable.c
@@ -85,7 +85,7 @@
 	.expected_attach_type = BPF_TRACE_RAW_TP,
 	.kfunc = "sched_switch",
 	.result = REJECT,
-	.errstr = "Only fentry/fexit/fmod_ret, lsm, iter and uprobe programs can be sleepable",
+	.errstr = "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable",
 	.flags = BPF_F_SLEEPABLE,
 	.runs = -1,
 },
-- 
cgit 


From 913b2255c3d876254e968f1e8e2c817cca283a29 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 25 Jan 2023 10:47:33 -0600
Subject: libbpf: Support sleepable struct_ops.s section

In a prior change, the verifier was updated to support sleepable
BPF_PROG_TYPE_STRUCT_OPS programs. A caller could set the program as
sleepable with bpf_program__set_flags(), but it would be more ergonomic
and more in-line with other sleepable program types if we supported
suffixing a struct_ops section name with .s to indicate that it's
sleepable.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230125164735.785732-3-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/libbpf.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 27d9faa80471..eed5cec6f510 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8605,6 +8605,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("cgroup/setsockopt",	CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE),
 	SEC_DEF("cgroup/dev",		CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT),
 	SEC_DEF("struct_ops+",		STRUCT_OPS, 0, SEC_NONE),
+	SEC_DEF("struct_ops.s+",	STRUCT_OPS, 0, SEC_SLEEPABLE),
 	SEC_DEF("sk_lookup",		SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
 };
 
-- 
cgit 


From 7dd880592a88799f3ef48fda507849a75f11cbf0 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 25 Jan 2023 10:47:35 -0600
Subject: bpf/selftests: Verify struct_ops prog sleepable behavior

In a set of prior changes, we added the ability for struct_ops programs
to be sleepable. This patch enhances the dummy_st_ops selftest suite to
validate this behavior by adding a new sleepable struct_ops entry to
dummy_st_ops.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230125164735.785732-5-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  1 +
 net/bpf/bpf_dummy_struct_ops.c                     | 18 ++++++++
 .../selftests/bpf/prog_tests/dummy_st_ops.c        | 52 ++++++++++++++++------
 tools/testing/selftests/bpf/progs/dummy_st_ops.c   | 50 ---------------------
 .../selftests/bpf/progs/dummy_st_ops_fail.c        | 27 +++++++++++
 .../selftests/bpf/progs/dummy_st_ops_success.c     | 47 +++++++++++++++++++
 6 files changed, 132 insertions(+), 63 deletions(-)
 delete mode 100644 tools/testing/selftests/bpf/progs/dummy_st_ops.c
 create mode 100644 tools/testing/selftests/bpf/progs/dummy_st_ops_fail.c
 create mode 100644 tools/testing/selftests/bpf/progs/dummy_st_ops_success.c

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0d868ef1b973..14a0264fac57 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1474,6 +1474,7 @@ struct bpf_dummy_ops {
 	int (*test_1)(struct bpf_dummy_ops_state *cb);
 	int (*test_2)(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2,
 		      char a3, unsigned long a4);
+	int (*test_sleepable)(struct bpf_dummy_ops_state *cb);
 };
 
 int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 1ac4467928a9..ff4f89a2b02a 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -154,6 +154,23 @@ static bool bpf_dummy_ops_is_valid_access(int off, int size,
 	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
 }
 
+static int bpf_dummy_ops_check_member(const struct btf_type *t,
+				      const struct btf_member *member,
+				      const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct bpf_dummy_ops, test_sleepable):
+		break;
+	default:
+		if (prog->aux->sleepable)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
 					   const struct bpf_reg_state *reg,
 					   int off, int size, enum bpf_access_type atype,
@@ -208,6 +225,7 @@ static void bpf_dummy_unreg(void *kdata)
 struct bpf_struct_ops bpf_bpf_dummy_ops = {
 	.verifier_ops = &bpf_dummy_verifier_ops,
 	.init = bpf_dummy_init,
+	.check_member = bpf_dummy_ops_check_member,
 	.init_member = bpf_dummy_init_member,
 	.reg = bpf_dummy_reg,
 	.unreg = bpf_dummy_unreg,
diff --git a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c
index c11832657d2b..f43fcb13d2c4 100644
--- a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c
+++ b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c
@@ -1,7 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (C) 2021. Huawei Technologies Co., Ltd */
 #include <test_progs.h>
-#include "dummy_st_ops.skel.h"
+#include "dummy_st_ops_success.skel.h"
+#include "dummy_st_ops_fail.skel.h"
 #include "trace_dummy_st_ops.skel.h"
 
 /* Need to keep consistent with definition in include/linux/bpf.h */
@@ -11,17 +12,17 @@ struct bpf_dummy_ops_state {
 
 static void test_dummy_st_ops_attach(void)
 {
-	struct dummy_st_ops *skel;
+	struct dummy_st_ops_success *skel;
 	struct bpf_link *link;
 
-	skel = dummy_st_ops__open_and_load();
+	skel = dummy_st_ops_success__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load"))
 		return;
 
 	link = bpf_map__attach_struct_ops(skel->maps.dummy_1);
 	ASSERT_EQ(libbpf_get_error(link), -EOPNOTSUPP, "dummy_st_ops_attach");
 
-	dummy_st_ops__destroy(skel);
+	dummy_st_ops_success__destroy(skel);
 }
 
 static void test_dummy_init_ret_value(void)
@@ -31,10 +32,10 @@ static void test_dummy_init_ret_value(void)
 		.ctx_in = args,
 		.ctx_size_in = sizeof(args),
 	);
-	struct dummy_st_ops *skel;
+	struct dummy_st_ops_success *skel;
 	int fd, err;
 
-	skel = dummy_st_ops__open_and_load();
+	skel = dummy_st_ops_success__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load"))
 		return;
 
@@ -43,7 +44,7 @@ static void test_dummy_init_ret_value(void)
 	ASSERT_OK(err, "test_run");
 	ASSERT_EQ(attr.retval, 0xf2f3f4f5, "test_ret");
 
-	dummy_st_ops__destroy(skel);
+	dummy_st_ops_success__destroy(skel);
 }
 
 static void test_dummy_init_ptr_arg(void)
@@ -58,10 +59,10 @@ static void test_dummy_init_ptr_arg(void)
 		.ctx_size_in = sizeof(args),
 	);
 	struct trace_dummy_st_ops *trace_skel;
-	struct dummy_st_ops *skel;
+	struct dummy_st_ops_success *skel;
 	int fd, err;
 
-	skel = dummy_st_ops__open_and_load();
+	skel = dummy_st_ops_success__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load"))
 		return;
 
@@ -91,7 +92,7 @@ static void test_dummy_init_ptr_arg(void)
 	ASSERT_EQ(trace_skel->bss->val, exp_retval, "fentry_val");
 
 done:
-	dummy_st_ops__destroy(skel);
+	dummy_st_ops_success__destroy(skel);
 	trace_dummy_st_ops__destroy(trace_skel);
 }
 
@@ -102,12 +103,12 @@ static void test_dummy_multiple_args(void)
 		.ctx_in = args,
 		.ctx_size_in = sizeof(args),
 	);
-	struct dummy_st_ops *skel;
+	struct dummy_st_ops_success *skel;
 	int fd, err;
 	size_t i;
 	char name[8];
 
-	skel = dummy_st_ops__open_and_load();
+	skel = dummy_st_ops_success__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load"))
 		return;
 
@@ -119,7 +120,28 @@ static void test_dummy_multiple_args(void)
 		ASSERT_EQ(skel->bss->test_2_args[i], args[i], name);
 	}
 
-	dummy_st_ops__destroy(skel);
+	dummy_st_ops_success__destroy(skel);
+}
+
+static void test_dummy_sleepable(void)
+{
+	__u64 args[1] = {0};
+	LIBBPF_OPTS(bpf_test_run_opts, attr,
+		.ctx_in = args,
+		.ctx_size_in = sizeof(args),
+	);
+	struct dummy_st_ops_success *skel;
+	int fd, err;
+
+	skel = dummy_st_ops_success__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load"))
+		return;
+
+	fd = bpf_program__fd(skel->progs.test_sleepable);
+	err = bpf_prog_test_run_opts(fd, &attr);
+	ASSERT_OK(err, "test_run");
+
+	dummy_st_ops_success__destroy(skel);
 }
 
 void test_dummy_st_ops(void)
@@ -132,4 +154,8 @@ void test_dummy_st_ops(void)
 		test_dummy_init_ptr_arg();
 	if (test__start_subtest("dummy_multiple_args"))
 		test_dummy_multiple_args();
+	if (test__start_subtest("dummy_sleepable"))
+		test_dummy_sleepable();
+
+	RUN_TESTS(dummy_st_ops_fail);
 }
diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops.c b/tools/testing/selftests/bpf/progs/dummy_st_ops.c
deleted file mode 100644
index ead87edb75e2..000000000000
--- a/tools/testing/selftests/bpf/progs/dummy_st_ops.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2021. Huawei Technologies Co., Ltd */
-#include <linux/bpf.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-struct bpf_dummy_ops_state {
-	int val;
-} __attribute__((preserve_access_index));
-
-struct bpf_dummy_ops {
-	int (*test_1)(struct bpf_dummy_ops_state *state);
-	int (*test_2)(struct bpf_dummy_ops_state *state, int a1, unsigned short a2,
-		      char a3, unsigned long a4);
-};
-
-char _license[] SEC("license") = "GPL";
-
-SEC("struct_ops/test_1")
-int BPF_PROG(test_1, struct bpf_dummy_ops_state *state)
-{
-	int ret;
-
-	if (!state)
-		return 0xf2f3f4f5;
-
-	ret = state->val;
-	state->val = 0x5a;
-	return ret;
-}
-
-__u64 test_2_args[5];
-
-SEC("struct_ops/test_2")
-int BPF_PROG(test_2, struct bpf_dummy_ops_state *state, int a1, unsigned short a2,
-	     char a3, unsigned long a4)
-{
-	test_2_args[0] = (unsigned long)state;
-	test_2_args[1] = a1;
-	test_2_args[2] = a2;
-	test_2_args[3] = a3;
-	test_2_args[4] = a4;
-	return 0;
-}
-
-SEC(".struct_ops")
-struct bpf_dummy_ops dummy_1 = {
-	.test_1 = (void *)test_1,
-	.test_2 = (void *)test_2,
-};
diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops_fail.c b/tools/testing/selftests/bpf/progs/dummy_st_ops_fail.c
new file mode 100644
index 000000000000..0bf969a0b5ed
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/dummy_st_ops_fail.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("struct_ops.s/test_2")
+__failure __msg("attach to unsupported member test_2 of struct bpf_dummy_ops")
+int BPF_PROG(test_unsupported_field_sleepable,
+	     struct bpf_dummy_ops_state *state, int a1, unsigned short a2,
+	     char a3, unsigned long a4)
+{
+	/* Tries to mark an unsleepable field in struct bpf_dummy_ops as sleepable. */
+	return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_dummy_ops dummy_1 = {
+	.test_1 = NULL,
+	.test_2 = (void *)test_unsupported_field_sleepable,
+	.test_sleepable = (void *)NULL,
+};
diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c
new file mode 100644
index 000000000000..1efa746c25dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021. Huawei Technologies Co., Ltd */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("struct_ops/test_1")
+int BPF_PROG(test_1, struct bpf_dummy_ops_state *state)
+{
+	int ret;
+
+	if (!state)
+		return 0xf2f3f4f5;
+
+	ret = state->val;
+	state->val = 0x5a;
+	return ret;
+}
+
+__u64 test_2_args[5];
+
+SEC("struct_ops/test_2")
+int BPF_PROG(test_2, struct bpf_dummy_ops_state *state, int a1, unsigned short a2,
+	     char a3, unsigned long a4)
+{
+	test_2_args[0] = (unsigned long)state;
+	test_2_args[1] = a1;
+	test_2_args[2] = a2;
+	test_2_args[3] = a3;
+	test_2_args[4] = a4;
+	return 0;
+}
+
+SEC("struct_ops.s/test_sleepable")
+int BPF_PROG(test_sleepable, struct bpf_dummy_ops_state *state)
+{
+	return 0;
+}
+
+SEC(".struct_ops")
+struct bpf_dummy_ops dummy_1 = {
+	.test_1 = (void *)test_1,
+	.test_2 = (void *)test_2,
+	.test_sleepable = (void *)test_sleepable,
+};
-- 
cgit 


From f45d63c1218636f77b9c3c53318c56ecd27dc8ec Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Mon, 19 Dec 2022 11:50:50 -0800
Subject: tools/testing/cxl: require 64-bit

size_t is limited to 32-bits and so the gen_pool_alloc() using
the size of SZ_64G would map to 0, triggering a low allocation
which is not expected. Force the dependency on 64-bit for cxl_test
as that is what it was designed for.

This issue was found by build test reports when converting this
driver as a proper upstream driver.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/20221219195050.325959-1-mcgrof@kernel.org
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/cxl/config_check.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/cxl/config_check.c b/tools/testing/cxl/config_check.c
index c4c457e59841..99b56b5f6edf 100644
--- a/tools/testing/cxl/config_check.c
+++ b/tools/testing/cxl/config_check.c
@@ -7,6 +7,7 @@ void check(void)
 	 * These kconfig symbols must be set to "m" for cxl_test to load
 	 * and operate.
 	 */
+	BUILD_BUG_ON(!IS_ENABLED(CONFIG_64BIT));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_CXL_BUS));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_CXL_ACPI));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_CXL_PMEM));
-- 
cgit 


From d1246f93602316e2dda1000f185e8d13dd611871 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 25 Jan 2023 12:16:08 -0800
Subject: selftests/bpf: Calls bpf_setsockopt() on a ktls enabled socket.

Ensures that whenever bpf_setsockopt() is called with the SOL_TCP
option on a ktls enabled socket, the call will be accepted by the
system. The provided test makes sure of this by performing an
examination when the server side socket is in the CLOSE_WAIT state. At
this stage, ktls is still enabled on the server socket and can be used
to test if bpf_setsockopt() works correctly with linux.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Link: https://lore.kernel.org/r/20230125201608.908230-3-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/setget_sockopt.c      | 73 ++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/setget_sockopt.c |  8 +++
 2 files changed, 81 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
index 018611e6b248..7d4a9b3d3722 100644
--- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
+++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
@@ -4,6 +4,7 @@
 #define _GNU_SOURCE
 #include <sched.h>
 #include <linux/socket.h>
+#include <linux/tls.h>
 #include <net/if.h>
 
 #include "test_progs.h"
@@ -83,6 +84,76 @@ static void test_udp(int family)
 	ASSERT_EQ(bss->nr_binddev, 1, "nr_bind");
 }
 
+static void test_ktls(int family)
+{
+	struct tls12_crypto_info_aes_gcm_128 aes128;
+	struct setget_sockopt__bss *bss = skel->bss;
+	int cfd = -1, sfd = -1, fd = -1, ret;
+	char buf;
+
+	memset(bss, 0, sizeof(*bss));
+
+	sfd = start_server(family, SOCK_STREAM,
+			   family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
+	if (!ASSERT_GE(sfd, 0, "start_server"))
+		return;
+	fd = connect_to_fd(sfd, 0);
+	if (!ASSERT_GE(fd, 0, "connect_to_fd"))
+		goto err_out;
+
+	cfd = accept(sfd, NULL, 0);
+	if (!ASSERT_GE(cfd, 0, "accept"))
+		goto err_out;
+
+	close(sfd);
+	sfd = -1;
+
+	/* Setup KTLS */
+	ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	if (!ASSERT_OK(ret, "setsockopt"))
+		goto err_out;
+	ret = setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	if (!ASSERT_OK(ret, "setsockopt"))
+		goto err_out;
+
+	memset(&aes128, 0, sizeof(aes128));
+	aes128.info.version = TLS_1_2_VERSION;
+	aes128.info.cipher_type = TLS_CIPHER_AES_GCM_128;
+
+	ret = setsockopt(fd, SOL_TLS, TLS_TX, &aes128, sizeof(aes128));
+	if (!ASSERT_OK(ret, "setsockopt"))
+		goto err_out;
+
+	ret = setsockopt(cfd, SOL_TLS, TLS_RX, &aes128, sizeof(aes128));
+	if (!ASSERT_OK(ret, "setsockopt"))
+		goto err_out;
+
+	/* KTLS is enabled */
+
+	close(fd);
+	/* At this point, the cfd socket is at the CLOSE_WAIT state
+	 * and still run TLS protocol.  The test for
+	 * BPF_TCP_CLOSE_WAIT should be run at this point.
+	 */
+	ret = read(cfd, &buf, sizeof(buf));
+	ASSERT_EQ(ret, 0, "read");
+	close(cfd);
+
+	ASSERT_EQ(bss->nr_listen, 1, "nr_listen");
+	ASSERT_EQ(bss->nr_connect, 1, "nr_connect");
+	ASSERT_EQ(bss->nr_active, 1, "nr_active");
+	ASSERT_EQ(bss->nr_passive, 1, "nr_passive");
+	ASSERT_EQ(bss->nr_socket_post_create, 2, "nr_socket_post_create");
+	ASSERT_EQ(bss->nr_binddev, 2, "nr_bind");
+	ASSERT_EQ(bss->nr_fin_wait1, 1, "nr_fin_wait1");
+	return;
+
+err_out:
+	close(fd);
+	close(cfd);
+	close(sfd);
+}
+
 void test_setget_sockopt(void)
 {
 	cg_fd = test__join_cgroup(CG_NAME);
@@ -118,6 +189,8 @@ void test_setget_sockopt(void)
 	test_tcp(AF_INET);
 	test_udp(AF_INET6);
 	test_udp(AF_INET);
+	test_ktls(AF_INET6);
+	test_ktls(AF_INET);
 
 done:
 	setget_sockopt__destroy(skel);
diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c
index 9523333b8905..7a438600ae98 100644
--- a/tools/testing/selftests/bpf/progs/setget_sockopt.c
+++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c
@@ -22,6 +22,7 @@ int nr_active;
 int nr_connect;
 int nr_binddev;
 int nr_socket_post_create;
+int nr_fin_wait1;
 
 struct sockopt_test {
 	int opt;
@@ -386,6 +387,13 @@ int skops_sockopt(struct bpf_sock_ops *skops)
 		nr_passive += !(bpf_test_sockopt(skops, sk) ||
 				test_tcp_maxseg(skops, sk) ||
 				test_tcp_saved_syn(skops, sk));
+		bpf_sock_ops_cb_flags_set(skops,
+					  skops->bpf_sock_ops_cb_flags |
+					  BPF_SOCK_OPS_STATE_CB_FLAG);
+		break;
+	case BPF_SOCK_OPS_STATE_CB:
+		if (skops->args[1] == BPF_TCP_CLOSE_WAIT)
+			nr_fin_wait1 += !bpf_test_sockopt(skops, sk);
 		break;
 	}
 
-- 
cgit 


From ae5439658ccec3dcef1754dcd2ea3d3529b3508d Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Tue, 24 Jan 2023 14:36:44 +0100
Subject: selftests/net: Cover the IP_LOCAL_PORT_RANGE socket option

Exercise IP_LOCAL_PORT_RANGE socket option in various scenarios:

1. pass invalid values to setsockopt
2. pass a range outside of the per-netns port range
3. configure a single-port range
4. exhaust a configured multi-port range
5. check interaction with late-bind (IP_BIND_ADDRESS_NO_PORT)
6. set then get the per-socket port range

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile               |   2 +
 tools/testing/selftests/net/ip_local_port_range.c  | 447 +++++++++++++++++++++
 tools/testing/selftests/net/ip_local_port_range.sh |   5 +
 3 files changed, 454 insertions(+)
 create mode 100644 tools/testing/selftests/net/ip_local_port_range.c
 create mode 100755 tools/testing/selftests/net/ip_local_port_range.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 47314f0b3006..951bd5342bc6 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -45,6 +45,7 @@ TEST_PROGS += arp_ndisc_untracked_subnets.sh
 TEST_PROGS += stress_reuseport_listen.sh
 TEST_PROGS += l2_tos_ttl_inherit.sh
 TEST_PROGS += bind_bhash.sh
+TEST_PROGS += ip_local_port_range.sh
 TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
 TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
 TEST_GEN_FILES =  socket nettest
@@ -76,6 +77,7 @@ TEST_PROGS += sctp_vrf.sh
 TEST_GEN_FILES += sctp_hello
 TEST_GEN_FILES += csum
 TEST_GEN_FILES += nat6to4.o
+TEST_GEN_FILES += ip_local_port_range
 
 TEST_FILES := settings
 
diff --git a/tools/testing/selftests/net/ip_local_port_range.c b/tools/testing/selftests/net/ip_local_port_range.c
new file mode 100644
index 000000000000..75e3fdacdf73
--- /dev/null
+++ b/tools/testing/selftests/net/ip_local_port_range.c
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2023 Cloudflare
+
+/* Test IP_LOCAL_PORT_RANGE socket option: IPv4 + IPv6, TCP + UDP.
+ *
+ * Tests assume that net.ipv4.ip_local_port_range is [40000, 49999].
+ * Don't run these directly but with ip_local_port_range.sh script.
+ */
+
+#include <fcntl.h>
+#include <netinet/ip.h>
+
+#include "../kselftest_harness.h"
+
+#ifndef IP_LOCAL_PORT_RANGE
+#define IP_LOCAL_PORT_RANGE 51
+#endif
+
+static __u32 pack_port_range(__u16 lo, __u16 hi)
+{
+	return (hi << 16) | (lo << 0);
+}
+
+static void unpack_port_range(__u32 range, __u16 *lo, __u16 *hi)
+{
+	*lo = range & 0xffff;
+	*hi = range >> 16;
+}
+
+static int get_so_domain(int fd)
+{
+	int domain, err;
+	socklen_t len;
+
+	len = sizeof(domain);
+	err = getsockopt(fd, SOL_SOCKET, SO_DOMAIN, &domain, &len);
+	if (err)
+		return -1;
+
+	return domain;
+}
+
+static int bind_to_loopback_any_port(int fd)
+{
+	union {
+		struct sockaddr sa;
+		struct sockaddr_in v4;
+		struct sockaddr_in6 v6;
+	} addr;
+	socklen_t addr_len;
+
+	memset(&addr, 0, sizeof(addr));
+	switch (get_so_domain(fd)) {
+	case AF_INET:
+		addr.v4.sin_family = AF_INET;
+		addr.v4.sin_port = htons(0);
+		addr.v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		addr_len = sizeof(addr.v4);
+		break;
+	case AF_INET6:
+		addr.v6.sin6_family = AF_INET6;
+		addr.v6.sin6_port = htons(0);
+		addr.v6.sin6_addr = in6addr_loopback;
+		addr_len = sizeof(addr.v6);
+		break;
+	default:
+		return -1;
+	}
+
+	return bind(fd, &addr.sa, addr_len);
+}
+
+static int get_sock_port(int fd)
+{
+	union {
+		struct sockaddr sa;
+		struct sockaddr_in v4;
+		struct sockaddr_in6 v6;
+	} addr;
+	socklen_t addr_len;
+	int err;
+
+	addr_len = sizeof(addr);
+	memset(&addr, 0, sizeof(addr));
+	err = getsockname(fd, &addr.sa, &addr_len);
+	if (err)
+		return -1;
+
+	switch (addr.sa.sa_family) {
+	case AF_INET:
+		return ntohs(addr.v4.sin_port);
+	case AF_INET6:
+		return ntohs(addr.v6.sin6_port);
+	default:
+		errno = EAFNOSUPPORT;
+		return -1;
+	}
+}
+
+static int get_ip_local_port_range(int fd, __u32 *range)
+{
+	socklen_t len;
+	__u32 val;
+	int err;
+
+	len = sizeof(val);
+	err = getsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &val, &len);
+	if (err)
+		return -1;
+
+	*range = val;
+	return 0;
+}
+
+FIXTURE(ip_local_port_range) {};
+
+FIXTURE_SETUP(ip_local_port_range)
+{
+}
+
+FIXTURE_TEARDOWN(ip_local_port_range)
+{
+}
+
+FIXTURE_VARIANT(ip_local_port_range) {
+	int so_domain;
+	int so_type;
+	int so_protocol;
+};
+
+FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_tcp) {
+	.so_domain	= AF_INET,
+	.so_type	= SOCK_STREAM,
+	.so_protocol	= 0,
+};
+
+FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_udp) {
+	.so_domain	= AF_INET,
+	.so_type	= SOCK_DGRAM,
+	.so_protocol	= 0,
+};
+
+FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_stcp) {
+	.so_domain	= AF_INET,
+	.so_type	= SOCK_STREAM,
+	.so_protocol	= IPPROTO_SCTP,
+};
+
+FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_tcp) {
+	.so_domain	= AF_INET6,
+	.so_type	= SOCK_STREAM,
+	.so_protocol	= 0,
+};
+
+FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_udp) {
+	.so_domain	= AF_INET6,
+	.so_type	= SOCK_DGRAM,
+	.so_protocol	= 0,
+};
+
+FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_stcp) {
+	.so_domain	= AF_INET6,
+	.so_type	= SOCK_STREAM,
+	.so_protocol	= IPPROTO_SCTP,
+};
+
+TEST_F(ip_local_port_range, invalid_option_value)
+{
+	__u16 val16;
+	__u32 val32;
+	__u64 val64;
+	int fd, err;
+
+	fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
+	ASSERT_GE(fd, 0) TH_LOG("socket failed");
+
+	/* Too few bytes */
+	val16 = 40000;
+	err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &val16, sizeof(val16));
+	EXPECT_TRUE(err) TH_LOG("expected setsockopt(IP_LOCAL_PORT_RANGE) to fail");
+	EXPECT_EQ(errno, EINVAL);
+
+	/* Empty range: low port > high port */
+	val32 = pack_port_range(40222, 40111);
+	err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &val32, sizeof(val32));
+	EXPECT_TRUE(err) TH_LOG("expected setsockopt(IP_LOCAL_PORT_RANGE) to fail");
+	EXPECT_EQ(errno, EINVAL);
+
+	/* Too many bytes */
+	val64 = pack_port_range(40333, 40444);
+	err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &val64, sizeof(val64));
+	EXPECT_TRUE(err) TH_LOG("expected setsockopt(IP_LOCAL_PORT_RANGE) to fail");
+	EXPECT_EQ(errno, EINVAL);
+
+	err = close(fd);
+	ASSERT_TRUE(!err) TH_LOG("close failed");
+}
+
+TEST_F(ip_local_port_range, port_range_out_of_netns_range)
+{
+	const struct test {
+		__u16 range_lo;
+		__u16 range_hi;
+	} tests[] = {
+		{ 30000, 39999 }, /* socket range below netns range */
+		{ 50000, 59999 }, /* socket range above netns range */
+	};
+	const struct test *t;
+
+	for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+		/* Bind a couple of sockets, not just one, to check
+		 * that the range wasn't clamped to a single port from
+		 * the netns range. That is [40000, 40000] or [49999,
+		 * 49999], respectively for each test case.
+		 */
+		int fds[2], i;
+
+		TH_LOG("lo %5hu, hi %5hu", t->range_lo, t->range_hi);
+
+		for (i = 0; i < ARRAY_SIZE(fds); i++) {
+			int fd, err, port;
+			__u32 range;
+
+			fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
+			ASSERT_GE(fd, 0) TH_LOG("#%d: socket failed", i);
+
+			range = pack_port_range(t->range_lo, t->range_hi);
+			err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
+			ASSERT_TRUE(!err) TH_LOG("#%d: setsockopt(IP_LOCAL_PORT_RANGE) failed", i);
+
+			err = bind_to_loopback_any_port(fd);
+			ASSERT_TRUE(!err) TH_LOG("#%d: bind failed", i);
+
+			/* Check that socket port range outside of ephemeral range is ignored */
+			port = get_sock_port(fd);
+			ASSERT_GE(port, 40000) TH_LOG("#%d: expected port within netns range", i);
+			ASSERT_LE(port, 49999) TH_LOG("#%d: expected port within netns range", i);
+
+			fds[i] = fd;
+		}
+
+		for (i = 0; i < ARRAY_SIZE(fds); i++)
+			ASSERT_TRUE(close(fds[i]) == 0) TH_LOG("#%d: close failed", i);
+	}
+}
+
+TEST_F(ip_local_port_range, single_port_range)
+{
+	const struct test {
+		__u16 range_lo;
+		__u16 range_hi;
+		__u16 expected;
+	} tests[] = {
+		/* single port range within ephemeral range */
+		{ 45000, 45000, 45000 },
+		/* first port in the ephemeral range (clamp from above) */
+		{ 0, 40000, 40000 },
+		/* last port in the ephemeral range (clamp from below)  */
+		{ 49999, 0, 49999 },
+	};
+	const struct test *t;
+
+	for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+		int fd, err, port;
+		__u32 range;
+
+		TH_LOG("lo %5hu, hi %5hu, expected %5hu",
+		       t->range_lo, t->range_hi, t->expected);
+
+		fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
+		ASSERT_GE(fd, 0) TH_LOG("socket failed");
+
+		range = pack_port_range(t->range_lo, t->range_hi);
+		err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
+		ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
+
+		err = bind_to_loopback_any_port(fd);
+		ASSERT_TRUE(!err) TH_LOG("bind failed");
+
+		port = get_sock_port(fd);
+		ASSERT_EQ(port, t->expected) TH_LOG("unexpected local port");
+
+		err = close(fd);
+		ASSERT_TRUE(!err) TH_LOG("close failed");
+	}
+}
+
+TEST_F(ip_local_port_range, exhaust_8_port_range)
+{
+	__u8 port_set = 0;
+	int i, fd, err;
+	__u32 range;
+	__u16 port;
+	int fds[8];
+
+	for (i = 0; i < ARRAY_SIZE(fds); i++) {
+		fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
+		ASSERT_GE(fd, 0) TH_LOG("socket failed");
+
+		range = pack_port_range(40000, 40007);
+		err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
+		ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
+
+		err = bind_to_loopback_any_port(fd);
+		ASSERT_TRUE(!err) TH_LOG("bind failed");
+
+		port = get_sock_port(fd);
+		ASSERT_GE(port, 40000) TH_LOG("expected port within sockopt range");
+		ASSERT_LE(port, 40007) TH_LOG("expected port within sockopt range");
+
+		port_set |= 1 << (port - 40000);
+		fds[i] = fd;
+	}
+
+	/* Check that all every port from the test range is in use */
+	ASSERT_EQ(port_set, 0xff) TH_LOG("expected all ports to be busy");
+
+	/* Check that bind() fails because the whole range is busy */
+	fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
+	ASSERT_GE(fd, 0) TH_LOG("socket failed");
+
+	range = pack_port_range(40000, 40007);
+	err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
+	ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
+
+	err = bind_to_loopback_any_port(fd);
+	ASSERT_TRUE(err) TH_LOG("expected bind to fail");
+	ASSERT_EQ(errno, EADDRINUSE);
+
+	err = close(fd);
+	ASSERT_TRUE(!err) TH_LOG("close failed");
+
+	for (i = 0; i < ARRAY_SIZE(fds); i++) {
+		err = close(fds[i]);
+		ASSERT_TRUE(!err) TH_LOG("close failed");
+	}
+}
+
+TEST_F(ip_local_port_range, late_bind)
+{
+	union {
+		struct sockaddr sa;
+		struct sockaddr_in v4;
+		struct sockaddr_in6 v6;
+	} addr;
+	socklen_t addr_len;
+	const int one = 1;
+	int fd, err;
+	__u32 range;
+	__u16 port;
+
+	if (variant->so_protocol == IPPROTO_SCTP)
+		SKIP(return, "SCTP doesn't support IP_BIND_ADDRESS_NO_PORT");
+
+	fd = socket(variant->so_domain, variant->so_type, 0);
+	ASSERT_GE(fd, 0) TH_LOG("socket failed");
+
+	range = pack_port_range(40100, 40199);
+	err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
+	ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
+
+	err = setsockopt(fd, SOL_IP, IP_BIND_ADDRESS_NO_PORT, &one, sizeof(one));
+	ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_BIND_ADDRESS_NO_PORT) failed");
+
+	err = bind_to_loopback_any_port(fd);
+	ASSERT_TRUE(!err) TH_LOG("bind failed");
+
+	port = get_sock_port(fd);
+	ASSERT_EQ(port, 0) TH_LOG("getsockname failed");
+
+	/* Invalid destination */
+	memset(&addr, 0, sizeof(addr));
+	switch (variant->so_domain) {
+	case AF_INET:
+		addr.v4.sin_family = AF_INET;
+		addr.v4.sin_port = htons(0);
+		addr.v4.sin_addr.s_addr = htonl(INADDR_ANY);
+		addr_len = sizeof(addr.v4);
+		break;
+	case AF_INET6:
+		addr.v6.sin6_family = AF_INET6;
+		addr.v6.sin6_port = htons(0);
+		addr.v6.sin6_addr = in6addr_any;
+		addr_len = sizeof(addr.v6);
+		break;
+	default:
+		ASSERT_TRUE(false) TH_LOG("unsupported socket domain");
+	}
+
+	/* connect() doesn't need to succeed for late bind to happen */
+	connect(fd, &addr.sa, addr_len);
+
+	port = get_sock_port(fd);
+	ASSERT_GE(port, 40100);
+	ASSERT_LE(port, 40199);
+
+	err = close(fd);
+	ASSERT_TRUE(!err) TH_LOG("close failed");
+}
+
+TEST_F(ip_local_port_range, get_port_range)
+{
+	__u16 lo, hi;
+	__u32 range;
+	int fd, err;
+
+	fd = socket(variant->so_domain, variant->so_type, variant->so_protocol);
+	ASSERT_GE(fd, 0) TH_LOG("socket failed");
+
+	/* Get range before it will be set */
+	err = get_ip_local_port_range(fd, &range);
+	ASSERT_TRUE(!err) TH_LOG("getsockopt(IP_LOCAL_PORT_RANGE) failed");
+
+	unpack_port_range(range, &lo, &hi);
+	ASSERT_EQ(lo, 0) TH_LOG("unexpected low port");
+	ASSERT_EQ(hi, 0) TH_LOG("unexpected high port");
+
+	range = pack_port_range(12345, 54321);
+	err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
+	ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
+
+	/* Get range after it has been set */
+	err = get_ip_local_port_range(fd, &range);
+	ASSERT_TRUE(!err) TH_LOG("getsockopt(IP_LOCAL_PORT_RANGE) failed");
+
+	unpack_port_range(range, &lo, &hi);
+	ASSERT_EQ(lo, 12345) TH_LOG("unexpected low port");
+	ASSERT_EQ(hi, 54321) TH_LOG("unexpected high port");
+
+	/* Unset the port range  */
+	range = pack_port_range(0, 0);
+	err = setsockopt(fd, SOL_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range));
+	ASSERT_TRUE(!err) TH_LOG("setsockopt(IP_LOCAL_PORT_RANGE) failed");
+
+	/* Get range after it has been unset */
+	err = get_ip_local_port_range(fd, &range);
+	ASSERT_TRUE(!err) TH_LOG("getsockopt(IP_LOCAL_PORT_RANGE) failed");
+
+	unpack_port_range(range, &lo, &hi);
+	ASSERT_EQ(lo, 0) TH_LOG("unexpected low port");
+	ASSERT_EQ(hi, 0) TH_LOG("unexpected high port");
+
+	err = close(fd);
+	ASSERT_TRUE(!err) TH_LOG("close failed");
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/ip_local_port_range.sh b/tools/testing/selftests/net/ip_local_port_range.sh
new file mode 100755
index 000000000000..6c6ad346eaa0
--- /dev/null
+++ b/tools/testing/selftests/net/ip_local_port_range.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+./in_netns.sh \
+  sh -c 'sysctl -q -w net.ipv4.ip_local_port_range="40000 49999" && ./ip_local_port_range'
-- 
cgit 


From ad3493746ebe9e9adf3d8c5b003fa0240f25242a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 25 Jan 2023 11:47:23 +0100
Subject: selftests: mptcp: add test-cases for mixed v4/v6 subflows

Note that we can't guess the listener family anymore based on the client
target address: always use IPv6.

The fullmesh flag with endpoints from different families is also
validated here.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 53 ++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index d11d3d566608..387abdcec011 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -774,24 +774,17 @@ do_transfer()
 		addr_nr_ns2=${addr_nr_ns2:9}
 	fi
 
-	local local_addr
-	if is_v6 "${connect_addr}"; then
-		local_addr="::"
-	else
-		local_addr="0.0.0.0"
-	fi
-
 	extra_srv_args="$extra_args $extra_srv_args"
 	if [ "$test_link_fail" -gt 1 ];then
 		timeout ${timeout_test} \
 			ip netns exec ${listener_ns} \
 				./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \
-					$extra_srv_args ${local_addr} < "$sinfail" > "$sout" &
+					$extra_srv_args "::" < "$sinfail" > "$sout" &
 	else
 		timeout ${timeout_test} \
 			ip netns exec ${listener_ns} \
 				./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \
-					$extra_srv_args ${local_addr} < "$sin" > "$sout" &
+					$extra_srv_args "::" < "$sin" > "$sout" &
 	fi
 	local spid=$!
 
@@ -2448,6 +2441,47 @@ v4mapped_tests()
 	fi
 }
 
+mixed_tests()
+{
+	if reset "IPv4 sockets do not use IPv6 addresses"; then
+		pm_nl_set_limits $ns1 0 1
+		pm_nl_set_limits $ns2 1 1
+		pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal
+		run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
+		chk_join_nr 0 0 0
+	fi
+
+	# Need an IPv6 mptcp socket to allow subflows of both families
+	if reset "simult IPv4 and IPv6 subflows"; then
+		pm_nl_set_limits $ns1 0 1
+		pm_nl_set_limits $ns2 1 1
+		pm_nl_add_endpoint $ns1 10.0.1.1 flags signal
+		run_tests $ns1 $ns2 dead:beef:2::1 0 0 0 slow
+		chk_join_nr 1 1 1
+	fi
+
+	# cross families subflows will not be created even in fullmesh mode
+	if reset "simult IPv4 and IPv6 subflows, fullmesh 1x1"; then
+		pm_nl_set_limits $ns1 0 4
+		pm_nl_set_limits $ns2 1 4
+		pm_nl_add_endpoint $ns2 dead:beef:2::2 flags subflow,fullmesh
+		pm_nl_add_endpoint $ns1 10.0.1.1 flags signal
+		run_tests $ns1 $ns2 dead:beef:2::1 0 0 0 slow
+		chk_join_nr 1 1 1
+	fi
+
+	# fullmesh still tries to create all the possibly subflows with
+	# matching family
+	if reset "simult IPv4 and IPv6 subflows, fullmesh 2x2"; then
+		pm_nl_set_limits $ns1 0 4
+		pm_nl_set_limits $ns2 2 4
+		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+		pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal
+		run_tests $ns1 $ns2 dead:beef:1::1 0 0 fullmesh_1 slow
+		chk_join_nr 4 4 4
+	fi
+}
+
 backup_tests()
 {
 	# single subflow, backup
@@ -3120,6 +3154,7 @@ all_tests_sorted=(
 	a@add_tests
 	6@ipv6_tests
 	4@v4mapped_tests
+	M@mixed_tests
 	b@backup_tests
 	p@add_addr_ports_tests
 	k@syncookies_tests
-- 
cgit 


From f790ae03db33e6a3efd18fe7aefe3c7897195cc2 Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts@tessares.net>
Date: Wed, 25 Jan 2023 11:47:25 +0100
Subject: selftests: mptcp: userspace: print titles

This script is running a few tests after having setup the environment.

Printing titles helps understand what is being tested.

Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/mptcp/userspace_pm.sh | 24 ++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh
index ab2d581f28a1..7b06d9d0aa46 100755
--- a/tools/testing/selftests/net/mptcp/userspace_pm.sh
+++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh
@@ -43,6 +43,11 @@ rndh=$(printf %x "$sec")-$(mktemp -u XXXXXX)
 ns1="ns1-$rndh"
 ns2="ns2-$rndh"
 
+print_title()
+{
+	stdbuf -o0 -e0 printf "INFO: %s\n" "${1}"
+}
+
 kill_wait()
 {
 	kill $1 > /dev/null 2>&1
@@ -51,7 +56,7 @@ kill_wait()
 
 cleanup()
 {
-	echo "cleanup"
+	print_title "Cleanup"
 
 	rm -rf $file $client_evts $server_evts
 
@@ -78,6 +83,8 @@ cleanup()
 	for netns in "$ns1" "$ns2" ;do
 		ip netns del "$netns"
 	done
+
+	stdbuf -o0 -e0 printf "Done\n"
 }
 
 trap cleanup EXIT
@@ -108,6 +115,7 @@ ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad
 ip -net "$ns2" addr add dead:beef:2::2/64 dev ns2eth1 nodad
 ip -net "$ns2" link set ns2eth1 up
 
+print_title "Init"
 stdbuf -o0 -e0 printf "Created network namespaces ns1, ns2         \t\t\t[OK]\n"
 
 make_file()
@@ -255,6 +263,8 @@ verify_announce_event()
 
 test_announce()
 {
+	print_title "Announce tests"
+
 	# Capture events on the network namespace running the server
 	:>"$server_evts"
 
@@ -359,6 +369,8 @@ verify_remove_event()
 
 test_remove()
 {
+	print_title "Remove tests"
+
 	# Capture events on the network namespace running the server
 	:>"$server_evts"
 
@@ -521,6 +533,8 @@ verify_subflow_events()
 
 test_subflows()
 {
+	print_title "Subflows v4 or v6 only tests"
+
 	# Capture events on the network namespace running the server
 	:>"$server_evts"
 
@@ -754,6 +768,8 @@ test_subflows()
 
 test_subflows_v4_v6_mix()
 {
+	print_title "Subflows v4 and v6 mix tests"
+
 	# Attempt to add a listener at 10.0.2.1:<subflow-port>
 	ip netns exec "$ns1" ./pm_nl_ctl listen 10.0.2.1\
 	   $app6_port > /dev/null 2>&1 &
@@ -800,6 +816,8 @@ test_subflows_v4_v6_mix()
 
 test_prio()
 {
+	print_title "Prio tests"
+
 	local count
 
 	# Send MP_PRIO signal from client to server machine
@@ -876,6 +894,8 @@ verify_listener_events()
 
 test_listener()
 {
+	print_title "Listener tests"
+
 	# Capture events on the network namespace running the client
 	:>$client_evts
 
@@ -902,8 +922,10 @@ test_listener()
 	verify_listener_events $client_evts $LISTENER_CLOSED $AF_INET 10.0.2.2 $client4_port
 }
 
+print_title "Make connections"
 make_connection
 make_connection "v6"
+
 test_announce
 test_remove
 test_subflows
-- 
cgit 


From 1c0b0ee2640bbbf225c9bdf2f7b121fcf4ee2059 Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts@tessares.net>
Date: Wed, 25 Jan 2023 11:47:26 +0100
Subject: selftests: mptcp: userspace: refactor asserts

Instead of having a long list of conditions to check, it is possible to
give a list of variable names to compare with their 'e_XXX' version.

This will ease the introduction of the following commit which will print
which condition has failed (if any).

Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/mptcp/userspace_pm.sh | 72 +++++++++++------------
 1 file changed, 36 insertions(+), 36 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh
index 7b06d9d0aa46..2f2a85a212b0 100755
--- a/tools/testing/selftests/net/mptcp/userspace_pm.sh
+++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh
@@ -225,6 +225,36 @@ make_connection()
 	fi
 }
 
+# $1: var name
+check_expected_one()
+{
+	local var="${1}"
+	local exp="e_${var}"
+
+	[ "${!var}" = "${!exp}" ]
+}
+
+# $@: all var names to check
+check_expected()
+{
+	local ret=0
+	local var
+
+	for var in "${@}"
+	do
+		check_expected_one "${var}" || ret=1
+	done
+
+	if [ ${ret} -eq 0 ]
+	then
+		stdbuf -o0 -e0 printf "[OK]\n"
+		return 0
+	fi
+
+	stdbuf -o0 -e0 printf "[FAIL]\n"
+	exit 1
+}
+
 verify_announce_event()
 {
 	local evt=$1
@@ -250,15 +280,8 @@ verify_announce_event()
 	fi
 	dport=$(sed --unbuffered -n 's/.*\(dport:\)\([[:digit:]]*\).*$/\2/p;q' "$evt")
 	id=$(sed --unbuffered -n 's/.*\(rem_id:\)\([[:digit:]]*\).*$/\2/p;q' "$evt")
-	if [ "$type" = "$e_type" ] && [ "$token" = "$e_token" ] &&
-		   [ "$addr" = "$e_addr" ] && [ "$dport" = "$e_dport" ] &&
-		   [ "$id" = "$e_id" ]
-	then
-		stdbuf -o0 -e0 printf "[OK]\n"
-		return 0
-	fi
-	stdbuf -o0 -e0 printf "[FAIL]\n"
-	exit 1
+
+	check_expected "type" "token" "addr" "dport" "id"
 }
 
 test_announce()
@@ -357,14 +380,8 @@ verify_remove_event()
 	type=$(sed --unbuffered -n 's/.*\(type:\)\([[:digit:]]*\).*$/\2/p;q' "$evt")
 	token=$(sed --unbuffered -n 's/.*\(token:\)\([[:digit:]]*\).*$/\2/p;q' "$evt")
 	id=$(sed --unbuffered -n 's/.*\(rem_id:\)\([[:digit:]]*\).*$/\2/p;q' "$evt")
-	if [ "$type" = "$e_type" ] && [ "$token" = "$e_token" ] &&
-		   [ "$id" = "$e_id" ]
-	then
-		stdbuf -o0 -e0 printf "[OK]\n"
-		return 0
-	fi
-	stdbuf -o0 -e0 printf "[FAIL]\n"
-	exit 1
+
+	check_expected "type" "token" "id"
 }
 
 test_remove()
@@ -519,16 +536,7 @@ verify_subflow_events()
 		daddr=$(sed --unbuffered -n 's/.*\(daddr4:\)\([0-9.]*\).*$/\2/p;q' "$evt")
 	fi
 
-	if [ "$type" = "$e_type" ] && [ "$token" = "$e_token" ] &&
-		   [ "$daddr" = "$e_daddr" ] && [ "$e_dport" = "$dport" ] &&
-		   [ "$family" = "$e_family" ] && [ "$saddr" = "$e_saddr" ] &&
-		   [ "$e_locid" = "$locid" ] && [ "$e_remid" = "$remid" ]
-	then
-		stdbuf -o0 -e0 printf "[OK]\n"
-		return 0
-	fi
-	stdbuf -o0 -e0 printf "[FAIL]\n"
-	exit 1
+	check_expected "type" "token" "daddr" "dport" "family" "saddr" "locid" "remid"
 }
 
 test_subflows()
@@ -881,15 +889,7 @@ verify_listener_events()
 			sed --unbuffered -n 's/.*\(saddr4:\)\([0-9.]*\).*$/\2/p;q')
 	fi
 
-	if [ $type ] && [ $type = $e_type ] &&
-	   [ $family ] && [ $family = $e_family ] &&
-	   [ $saddr ] && [ $saddr = $e_saddr ] &&
-	   [ $sport ] && [ $sport = $e_sport ]; then
-		stdbuf -o0 -e0 printf "[OK]\n"
-		return 0
-	fi
-	stdbuf -o0 -e0 printf "[FAIL]\n"
-	exit 1
+	check_expected "type" "family" "saddr" "sport"
 }
 
 test_listener()
-- 
cgit 


From 10d4273411be6ead4d4d8a50a0593da15154f43f Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts@tessares.net>
Date: Wed, 25 Jan 2023 11:47:27 +0100
Subject: selftests: mptcp: userspace: print error details if any

Before, only '[FAIL]' was printed in case of error during the validation
phase.

Now, in case of failure, the variable name, its value and expected one
are displayed to help understand what was wrong.

Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/mptcp/userspace_pm.sh | 33 +++++++++++++++++------
 1 file changed, 25 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh
index 2f2a85a212b0..259382ad552c 100755
--- a/tools/testing/selftests/net/mptcp/userspace_pm.sh
+++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh
@@ -201,11 +201,16 @@ make_connection()
 	server_serverside=$(grep "type:1," "$server_evts" |
 			    sed --unbuffered -n 's/.*\(server_side:\)\([[:digit:]]*\).*$/\2/p;q')
 
+	stdbuf -o0 -e0 printf "Established IP%s MPTCP Connection ns2 => ns1    \t\t" $is_v6
 	if [ "$client_token" != "" ] && [ "$server_token" != "" ] && [ "$client_serverside" = 0 ] &&
 		   [ "$server_serverside" = 1 ]
 	then
-		stdbuf -o0 -e0 printf "Established IP%s MPTCP Connection ns2 => ns1    \t\t[OK]\n" $is_v6
+		stdbuf -o0 -e0 printf "[OK]\n"
 	else
+		stdbuf -o0 -e0 printf "[FAIL]\n"
+		stdbuf -o0 -e0 printf "\tExpected tokens (c:%s - s:%s) and server (c:%d - s:%d)\n" \
+			"${client_token}" "${server_token}" \
+			"${client_serverside}" "${server_serverside}"
 		exit 1
 	fi
 
@@ -225,13 +230,26 @@ make_connection()
 	fi
 }
 
-# $1: var name
+# $1: var name ; $2: prev ret
 check_expected_one()
 {
 	local var="${1}"
 	local exp="e_${var}"
+	local prev_ret="${2}"
 
-	[ "${!var}" = "${!exp}" ]
+	if [ "${!var}" = "${!exp}" ]
+	then
+		return 0
+	fi
+
+	if [ "${prev_ret}" = "0" ]
+	then
+		stdbuf -o0 -e0 printf "[FAIL]\n"
+	fi
+
+	stdbuf -o0 -e0 printf "\tExpected value for '%s': '%s', got '%s'.\n" \
+		"${var}" "${!var}" "${!exp}"
+	return 1
 }
 
 # $@: all var names to check
@@ -242,7 +260,7 @@ check_expected()
 
 	for var in "${@}"
 	do
-		check_expected_one "${var}" || ret=1
+		check_expected_one "${var}" "${ret}" || ret=1
 	done
 
 	if [ ${ret} -eq 0 ]
@@ -251,7 +269,6 @@ check_expected()
 		return 0
 	fi
 
-	stdbuf -o0 -e0 printf "[FAIL]\n"
 	exit 1
 }
 
@@ -303,7 +320,7 @@ test_announce()
 	then
 		stdbuf -o0 -e0 printf "[OK]\n"
 	else
-		stdbuf -o0 -e0 printf "[FAIL]\n"
+		stdbuf -o0 -e0 printf "[FAIL]\n\ttype defined: %s\n" "${type}"
 		exit 1
 	fi
 
@@ -837,7 +854,7 @@ test_prio()
 	count=$(ip netns exec "$ns2" nstat -as | grep MPTcpExtMPPrioTx | awk '{print $2}')
 	[ -z "$count" ] && count=0
 	if [ $count != 1 ]; then
-		stdbuf -o0 -e0 printf "[FAIL]\n"
+		stdbuf -o0 -e0 printf "[FAIL]\n\tCount != 1: %d\n" "${count}"
 		exit 1
 	else
 		stdbuf -o0 -e0 printf "[OK]\n"
@@ -848,7 +865,7 @@ test_prio()
 	count=$(ip netns exec "$ns1" nstat -as | grep MPTcpExtMPPrioRx | awk '{print $2}')
 	[ -z "$count" ] && count=0
 	if [ $count != 1 ]; then
-		stdbuf -o0 -e0 printf "[FAIL]\n"
+		stdbuf -o0 -e0 printf "[FAIL]\n\tCount != 1: %d\n" "${count}"
 		exit 1
 	else
 		stdbuf -o0 -e0 printf "[OK]\n"
-- 
cgit 


From 8dbdf24f4e9e060e4c4a3ad282a4a57f80f48b1f Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts@tessares.net>
Date: Wed, 25 Jan 2023 11:47:28 +0100
Subject: selftests: mptcp: userspace: avoid read errors

During the cleanup phase, the server pids were killed with a SIGTERM
directly, not using a SIGUSR1 first to quit safely. As a result, this
test was often ending with two error messages:

  read: Connection reset by peer

While at it, use a for-loop to terminate all the PIDs the same way.

Also the different files are now removed after having killed the PIDs
using them. It makes more sense to do that in this order.

Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/mptcp/userspace_pm.sh | 32 +++++++++--------------
 1 file changed, 12 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh
index 259382ad552c..66c5be25c13d 100755
--- a/tools/testing/selftests/net/mptcp/userspace_pm.sh
+++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh
@@ -50,6 +50,9 @@ print_title()
 
 kill_wait()
 {
+	[ $1 -eq 0 ] && return 0
+
+	kill -SIGUSR1 $1 > /dev/null 2>&1
 	kill $1 > /dev/null 2>&1
 	wait $1 2>/dev/null
 }
@@ -58,32 +61,21 @@ cleanup()
 {
 	print_title "Cleanup"
 
-	rm -rf $file $client_evts $server_evts
-
 	# Terminate the MPTCP connection and related processes
-	if [ $client4_pid -ne 0 ]; then
-		kill -SIGUSR1 $client4_pid > /dev/null 2>&1
-	fi
-	if [ $server4_pid -ne 0 ]; then
-		kill_wait $server4_pid
-	fi
-	if [ $client6_pid -ne 0 ]; then
-		kill -SIGUSR1 $client6_pid > /dev/null 2>&1
-	fi
-	if [ $server6_pid -ne 0 ]; then
-		kill_wait $server6_pid
-	fi
-	if [ $server_evts_pid -ne 0 ]; then
-		kill_wait $server_evts_pid
-	fi
-	if [ $client_evts_pid -ne 0 ]; then
-		kill_wait $client_evts_pid
-	fi
+	local pid
+	for pid in $client4_pid $server4_pid $client6_pid $server6_pid\
+		   $server_evts_pid $client_evts_pid
+	do
+		kill_wait $pid
+	done
+
 	local netns
 	for netns in "$ns1" "$ns2" ;do
 		ip netns del "$netns"
 	done
 
+	rm -rf $file $client_evts $server_evts
+
 	stdbuf -o0 -e0 printf "Done\n"
 }
 
-- 
cgit 


From 65177e47d3035c083cff034ffbfc7ab750a4675c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 25 Jan 2023 15:13:56 -0800
Subject: testing: kselftest_harness: add filtering and enumerating tests

As the number of test cases and length of execution grows it's
useful to select only a subset of tests. In TLS for instance we
have a matrix of variants for different crypto protocols and
during development mostly care about testing a handful.
This is quicker and makes reading output easier.

This patch adds argument parsing to kselftest_harness.

It supports a couple of ways to filter things, I could not come
up with one way which will cover all cases.

The first and simplest switch is -r which takes the name of
a test to run (can be specified multiple times). For example:

  $ ./my_test -r some.test.name -r some.other.name

will run tests some.test.name and some.other.name (where "some"
is the fixture, "test" and "other" and "name is the test.)

Then there is a handful of group filtering options. f/v/t for
filtering by fixture/variant/test. They have both positive
(match -> run) and negative versions (match -> skip).
If user specifies any positive option we assume the default
is not to run the tests. If only negative options are set
we assume the tests are supposed to be run by default.

  Usage: ./tools/testing/selftests/net/tls [-h|-l] [-t|-T|-v|-V|-f|-F|-r name]
	-h       print help
	-l       list all tests

	-t name  include test
	-T name  exclude test
	-v name  include variant
	-V name  exclude variant
	-f name  include fixture
	-F name  exclude fixture
	-r name  run specified test

  Test filter options can be specified multiple times. The filtering stops
  at the first match. For example to include all tests from variant 'bla'
  but not test 'foo' specify '-T foo -v bla'.

Here we can request for example all tests from fixture "foo" to run:

 ./my_test -f foo

or to skip variants var1 and var2:

 ./my_test -V var1 -V var2

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/kselftest_harness.h | 142 +++++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index 25f4d54067c0..d8bff2005dfc 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -54,6 +54,7 @@
 #define _GNU_SOURCE
 #endif
 #include <asm/types.h>
+#include <ctype.h>
 #include <errno.h>
 #include <stdbool.h>
 #include <stdint.h>
@@ -985,6 +986,127 @@ void __wait_for_test(struct __test_metadata *t)
 	}
 }
 
+static void test_harness_list_tests(void)
+{
+	struct __fixture_variant_metadata *v;
+	struct __fixture_metadata *f;
+	struct __test_metadata *t;
+
+	for (f = __fixture_list; f; f = f->next) {
+		v = f->variant;
+		t = f->tests;
+
+		if (f == __fixture_list)
+			fprintf(stderr, "%-20s %-25s %s\n",
+				"# FIXTURE", "VARIANT", "TEST");
+		else
+			fprintf(stderr, "--------------------------------------------------------------------------------\n");
+
+		do {
+			fprintf(stderr, "%-20s %-25s %s\n",
+				t == f->tests ? f->name : "",
+				v ? v->name : "",
+				t ? t->name : "");
+
+			v = v ? v->next : NULL;
+			t = t ? t->next : NULL;
+		} while (v || t);
+	}
+}
+
+static int test_harness_argv_check(int argc, char **argv)
+{
+	int opt;
+
+	while ((opt = getopt(argc, argv, "hlF:f:V:v:t:T:r:")) != -1) {
+		switch (opt) {
+		case 'f':
+		case 'F':
+		case 'v':
+		case 'V':
+		case 't':
+		case 'T':
+		case 'r':
+			break;
+		case 'l':
+			test_harness_list_tests();
+			return KSFT_SKIP;
+		case 'h':
+		default:
+			fprintf(stderr,
+				"Usage: %s [-h|-l] [-t|-T|-v|-V|-f|-F|-r name]\n"
+				"\t-h       print help\n"
+				"\t-l       list all tests\n"
+				"\n"
+				"\t-t name  include test\n"
+				"\t-T name  exclude test\n"
+				"\t-v name  include variant\n"
+				"\t-V name  exclude variant\n"
+				"\t-f name  include fixture\n"
+				"\t-F name  exclude fixture\n"
+				"\t-r name  run specified test\n"
+				"\n"
+				"Test filter options can be specified "
+				"multiple times. The filtering stops\n"
+				"at the first match. For example to "
+				"include all tests from variant 'bla'\n"
+				"but not test 'foo' specify '-T foo -v bla'.\n"
+				"", argv[0]);
+			return opt == 'h' ? KSFT_SKIP : KSFT_FAIL;
+		}
+	}
+
+	return KSFT_PASS;
+}
+
+static bool test_enabled(int argc, char **argv,
+			 struct __fixture_metadata *f,
+			 struct __fixture_variant_metadata *v,
+			 struct __test_metadata *t)
+{
+	unsigned int flen = 0, vlen = 0, tlen = 0;
+	bool has_positive = false;
+	int opt;
+
+	optind = 1;
+	while ((opt = getopt(argc, argv, "F:f:V:v:t:T:r:")) != -1) {
+		has_positive |= islower(opt);
+
+		switch (tolower(opt)) {
+		case 't':
+			if (!strcmp(t->name, optarg))
+				return islower(opt);
+			break;
+		case 'f':
+			if (!strcmp(f->name, optarg))
+				return islower(opt);
+			break;
+		case 'v':
+			if (!strcmp(v->name, optarg))
+				return islower(opt);
+			break;
+		case 'r':
+			if (!tlen) {
+				flen = strlen(f->name);
+				vlen = strlen(v->name);
+				tlen = strlen(t->name);
+			}
+			if (strlen(optarg) == flen + 1 + vlen + !!vlen + tlen &&
+			    !strncmp(f->name, &optarg[0], flen) &&
+			    !strncmp(v->name, &optarg[flen + 1], vlen) &&
+			    !strncmp(t->name, &optarg[flen + 1 + vlen + !!vlen], tlen))
+				return true;
+			break;
+		}
+	}
+
+	/*
+	 * If there are no positive tests then we assume user just wants
+	 * exclusions and everything else is a pass.
+	 */
+	return !has_positive;
+}
+
 void __run_test(struct __fixture_metadata *f,
 		struct __fixture_variant_metadata *variant,
 		struct __test_metadata *t)
@@ -1032,24 +1154,32 @@ void __run_test(struct __fixture_metadata *f,
 			f->name, variant->name[0] ? "." : "", variant->name, t->name);
 }
 
-static int test_harness_run(int __attribute__((unused)) argc,
-			    char __attribute__((unused)) **argv)
+static int test_harness_run(int argc, char **argv)
 {
 	struct __fixture_variant_metadata no_variant = { .name = "", };
 	struct __fixture_variant_metadata *v;
 	struct __fixture_metadata *f;
 	struct __test_results *results;
 	struct __test_metadata *t;
-	int ret = 0;
+	int ret;
 	unsigned int case_count = 0, test_count = 0;
 	unsigned int count = 0;
 	unsigned int pass_count = 0;
 
+	ret = test_harness_argv_check(argc, argv);
+	if (ret != KSFT_PASS)
+		return ret;
+
 	for (f = __fixture_list; f; f = f->next) {
 		for (v = f->variant ?: &no_variant; v; v = v->next) {
-			case_count++;
+			unsigned int old_tests = test_count;
+
 			for (t = f->tests; t; t = t->next)
-				test_count++;
+				if (test_enabled(argc, argv, f, v, t))
+					test_count++;
+
+			if (old_tests != test_count)
+				case_count++;
 		}
 	}
 
@@ -1063,6 +1193,8 @@ static int test_harness_run(int __attribute__((unused)) argc,
 	for (f = __fixture_list; f; f = f->next) {
 		for (v = f->variant ?: &no_variant; v; v = v->next) {
 			for (t = f->tests; t; t = t->next) {
+				if (!test_enabled(argc, argv, f, v, t))
+					continue;
 				count++;
 				t->results = results;
 				__run_test(f, v, t);
-- 
cgit 


From 66f3cb7993c2729b72e20313f8dc6e0716416186 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Thu, 26 Jan 2023 09:05:55 -0800
Subject: tools/testing/cxl: Remove cxl_test module math loading message

Commit "tools/testing/cxl: Add XOR Math support to cxl_test" added
a module parameter to cxl_test for the interleave_arithmetic option.

In doing so, it also added this dev_dbg() message describing which
option cxl_test used during load:
"[  111.743246] (NULL device *): cxl_test loading modulo math option"
That "(NULL device *)" has raised needless user concern.

Remove the dev_dbg() message and make the module_param readable via
sysfs for users that need to know which math option is active.

Suggested-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Link: https://lore.kernel.org/r/20230126170555.701240-1-alison.schofield@intel.com
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/cxl/test/cxl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 920bd969c554..a65305218c90 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -1143,11 +1143,9 @@ static __init int cxl_test_init(void)
 	if (interleave_arithmetic == 1) {
 		cfmws_start = CFMWS_XOR_ARRAY_START;
 		cfmws_end = CFMWS_XOR_ARRAY_END;
-		dev_dbg(NULL, "cxl_test loading xor math option\n");
 	} else {
 		cfmws_start = CFMWS_MOD_ARRAY_START;
 		cfmws_end = CFMWS_MOD_ARRAY_END;
-		dev_dbg(NULL, "cxl_test loading modulo math option\n");
 	}
 
 	rc = populate_cedt();
@@ -1334,7 +1332,7 @@ static __exit void cxl_test_exit(void)
 	unregister_cxl_mock_ops(&cxl_mock_ops);
 }
 
-module_param(interleave_arithmetic, int, 0000);
+module_param(interleave_arithmetic, int, 0444);
 MODULE_PARM_DESC(interleave_arithmetic, "Modulo:0, XOR:1");
 module_init(cxl_test_init);
 module_exit(cxl_test_exit);
-- 
cgit 


From 66fa34b9c2a51baa2b8b6e529eddfe80c298f21a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 25 Jan 2023 16:02:33 -0800
Subject: tools: ynl: support kdocs for flags in code generation

Lorenzo reports that after switching from enum to flags netdev
family lost ability to render kdoc (and the enum contents got
generally garbled).

Combine the flags and enum handling in uAPI handling.

Reported-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/ynl-gen-c.py | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index 0c0f18540b7f..91df8eec86f9 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -565,6 +565,7 @@ class EnumEntry:
             self.doc = yaml.get('doc', '')
 
         self.yaml = yaml
+        self.enum_set = enum_set
         self.c_name = c_upper(enum_set.value_pfx + self.name)
 
         if 'value' in yaml:
@@ -572,11 +573,14 @@ class EnumEntry:
             if prev:
                 self.value_change = (self.value != prev.value + 1)
         elif prev:
+            self.value_change = False
             self.value = prev.value + 1
         else:
             self.value = value_start
             self.value_change = (self.value != 0)
 
+        self.value_change = self.value_change or self.enum_set['type'] == 'flags'
+
     def __getitem__(self, key):
         return self.yaml[key]
 
@@ -586,6 +590,17 @@ class EnumEntry:
     def has_doc(self):
         return bool(self.doc)
 
+    # raw value, i.e. the id in the enum, unlike user value which is a mask for flags
+    def raw_value(self):
+        return self.value
+
+    # user value, same as raw value for enums, for flags it's the mask
+    def user_value(self):
+        if self.enum_set['type'] == 'flags':
+            return 1 << self.value
+        else:
+            return self.value
+
 
 class EnumSet:
     def __init__(self, family, yaml):
@@ -824,7 +839,7 @@ class Family:
 
     def _dictify(self):
         for elem in self.yaml['definitions']:
-            if elem['type'] == 'enum':
+            if elem['type'] == 'enum' or elem['type'] == 'flags':
                 self.consts[elem['name']] = EnumSet(self, elem)
             else:
                 self.consts[elem['name']] = elem
@@ -1973,7 +1988,8 @@ def render_uapi(family, cw):
             defines = []
             cw.nl()
 
-        if const['type'] == 'enum':
+        # Write kdoc for enum and flags (one day maybe also structs)
+        if const['type'] == 'enum' or const['type'] == 'flags':
             enum = family.consts[const['name']]
 
             if enum.has_doc():
@@ -1989,13 +2005,11 @@ def render_uapi(family, cw):
                 cw.p(' */')
 
             uapi_enum_start(family, cw, const, 'name')
-            first = True
             name_pfx = const.get('name-prefix', f"{family.name}-{const['name']}-")
             for entry in enum.entry_list:
                 suffix = ','
-                if first and 'value-start' in const:
-                    suffix = f" = {const['value-start']}" + suffix
-                first = False
+                if entry.value_change:
+                    suffix = f" = {entry.user_value()}" + suffix
                 cw.p(entry.c_name + suffix)
 
             if const.get('render-max', False):
@@ -2005,17 +2019,6 @@ def render_uapi(family, cw):
                 cw.p(max_name + ' = (__' + max_name + ' - 1)')
             cw.block_end(line=';')
             cw.nl()
-        elif const['type'] == 'flags':
-            uapi_enum_start(family, cw, const, 'name')
-            i = const.get('value-start', 0)
-            for item in const['entries']:
-                item_name = item
-                if 'name-prefix' in const:
-                    item_name = c_upper(const['name-prefix'] + item)
-                cw.p(f'{item_name} = {1 << i},')
-                i += 1
-            cw.block_end(line=';')
-            cw.nl()
         elif const['type'] == 'const':
             defines.append([c_upper(family.get('c-define-name',
                                                f"{family.name}-{const['name']}")),
-- 
cgit 


From b49c34e217c629a9d282e84889dbe0128917b8c1 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 25 Jan 2023 16:02:34 -0800
Subject: tools: ynl: rename ops_list -> msg_list

ops_list contains all the operations, but the main iteration use
case is to walk only ops which define attrs. Rename ops_list to
msg_list, because now it looks like the contents are the same,
just the format is different. While at it convert from tuple
to just keys, none of the users care about the name of the op.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/ynl-gen-c.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index 91df8eec86f9..9297cfacbe06 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -790,8 +790,10 @@ class Family:
         self.mcgrps = self.yaml.get('mcast-groups', {'list': []})
 
         self.consts = dict()
+        # list of all operations
+        self.msg_list = []
+        # dict of operations which have their own message type (have attributes)
         self.ops = dict()
-        self.ops_list = []
         self.attr_sets = dict()
         self.attr_sets_list = []
 
@@ -858,7 +860,7 @@ class Family:
             op = Operation(self, elem, val)
             val += 1
 
-            self.ops_list.append((elem['name'], op),)
+            self.msg_list.append(op)
             if 'notify' in elem:
                 ntf.append(op)
                 continue
@@ -2063,7 +2065,7 @@ def render_uapi(family, cw):
     max_value = f"({cnt_name} - 1)"
 
     uapi_enum_start(family, cw, family['operations'], 'enum-name')
-    for _, op in family.ops_list:
+    for op in family.msg_list:
         if separate_ntf and ('notify' in op or 'event' in op):
             continue
 
@@ -2082,7 +2084,7 @@ def render_uapi(family, cw):
 
     if separate_ntf:
         uapi_enum_start(family, cw, family['operations'], enum_name='async-enum')
-        for _, op in family.ops_list:
+        for op in family.msg_list:
             if separate_ntf and not ('notify' in op or 'event' in op):
                 continue
 
-- 
cgit 


From 3a43ded081f862aa2f66a8f4f6630a45a9081e58 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 25 Jan 2023 16:02:35 -0800
Subject: tools: ynl: store ops in ordered dict to avoid random ordering

When rendering code we should walk the ops in the order in which
they are declared in the spec. This is both more intuitive and
prevents code from jumping around when hashing in the dict changes.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/ynl-gen-c.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index 9297cfacbe06..1aa872e582ab 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import argparse
+import collections
 import jsonschema
 import os
 import yaml
@@ -793,7 +794,7 @@ class Family:
         # list of all operations
         self.msg_list = []
         # dict of operations which have their own message type (have attributes)
-        self.ops = dict()
+        self.ops = collections.OrderedDict()
         self.attr_sets = dict()
         self.attr_sets_list = []
 
-- 
cgit 


From d1dca858f058f53f68aeacb6db0e1cb3568fa6ef Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 17 Jan 2023 21:53:41 -0800
Subject: cxl/test: Add generic mock events

Facilitate testing basic Get/Clear Event functionality by creating
multiple logs and generic events with made up UUID's.

Data is completely made up with data patterns which should be easy to
spot in trace output.

A single sysfs entry resets the event data and triggers collecting the
events for testing.

Test traces are easy to obtain with a small script such as this:

	#!/bin/bash -x

	devices=`find /sys/devices/platform -name cxl_mem*`

	# Turn on tracing
	echo "" > /sys/kernel/tracing/trace
	echo 1 > /sys/kernel/tracing/events/cxl/enable
	echo 1 > /sys/kernel/tracing/tracing_on

	# Generate fake interrupt
	for device in $devices; do
	        echo 1 > $device/event_trigger
	done

	# Turn off tracing and report events
	echo 0 > /sys/kernel/tracing/tracing_on
	cat /sys/kernel/tracing/trace

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20221216-cxl-ev-log-v7-6-2316a5c8f7d8@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/cxl/test/Kbuild |   2 +-
 tools/testing/cxl/test/mem.c  | 231 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 232 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/cxl/test/Kbuild b/tools/testing/cxl/test/Kbuild
index 4e59e2c911f6..61d5f7bcddf9 100644
--- a/tools/testing/cxl/test/Kbuild
+++ b/tools/testing/cxl/test/Kbuild
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-ccflags-y := -I$(srctree)/drivers/cxl/
+ccflags-y := -I$(srctree)/drivers/cxl/ -I$(srctree)/drivers/cxl/core
 
 obj-m += cxl_test.o
 obj-m += cxl_mock.o
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 5e4ecd93f1d2..90a463f83ae4 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -9,6 +9,8 @@
 #include <linux/bits.h>
 #include <cxlmem.h>
 
+#include "trace.h"
+
 #define LSA_SIZE SZ_128K
 #define DEV_SIZE SZ_2G
 #define EFFECT(x) (1U << x)
@@ -67,6 +69,24 @@ static struct {
 
 #define PASS_TRY_LIMIT 3
 
+#define CXL_TEST_EVENT_CNT_MAX 15
+
+/* Set a number of events to return at a time for simulation.  */
+#define CXL_TEST_EVENT_CNT 3
+
+struct mock_event_log {
+	u16 clear_idx;
+	u16 cur_idx;
+	u16 nr_events;
+	struct cxl_event_record_raw *events[CXL_TEST_EVENT_CNT_MAX];
+};
+
+struct mock_event_store {
+	struct cxl_dev_state *cxlds;
+	struct mock_event_log mock_logs[CXL_EVENT_TYPE_MAX];
+	u32 ev_status;
+};
+
 struct cxl_mockmem_data {
 	void *lsa;
 	u32 security_state;
@@ -74,9 +94,198 @@ struct cxl_mockmem_data {
 	u8 master_pass[NVDIMM_PASSPHRASE_LEN];
 	int user_limit;
 	int master_limit;
+	struct mock_event_store mes;
+	u8 event_buf[SZ_4K];
+};
+
+static struct mock_event_log *event_find_log(struct device *dev, int log_type)
+{
+	struct cxl_mockmem_data *mdata = dev_get_drvdata(dev);
+
+	if (log_type >= CXL_EVENT_TYPE_MAX)
+		return NULL;
+	return &mdata->mes.mock_logs[log_type];
+}
+
+static struct cxl_event_record_raw *event_get_current(struct mock_event_log *log)
+{
+	return log->events[log->cur_idx];
+}
+
+static void event_reset_log(struct mock_event_log *log)
+{
+	log->cur_idx = 0;
+	log->clear_idx = 0;
+}
+
+/* Handle can never be 0 use 1 based indexing for handle */
+static u16 event_get_clear_handle(struct mock_event_log *log)
+{
+	return log->clear_idx + 1;
+}
+
+/* Handle can never be 0 use 1 based indexing for handle */
+static __le16 event_get_cur_event_handle(struct mock_event_log *log)
+{
+	u16 cur_handle = log->cur_idx + 1;
+
+	return cpu_to_le16(cur_handle);
+}
+
+static bool event_log_empty(struct mock_event_log *log)
+{
+	return log->cur_idx == log->nr_events;
+}
+
+static void mes_add_event(struct mock_event_store *mes,
+			  enum cxl_event_log_type log_type,
+			  struct cxl_event_record_raw *event)
+{
+	struct mock_event_log *log;
+
+	if (WARN_ON(log_type >= CXL_EVENT_TYPE_MAX))
+		return;
+
+	log = &mes->mock_logs[log_type];
+	if (WARN_ON(log->nr_events >= CXL_TEST_EVENT_CNT_MAX))
+		return;
+
+	log->events[log->nr_events] = event;
+	log->nr_events++;
+}
+
+static int mock_get_event(struct cxl_dev_state *cxlds,
+			  struct cxl_mbox_cmd *cmd)
+{
+	struct cxl_get_event_payload *pl;
+	struct mock_event_log *log;
+	u8 log_type;
+	int i;
+
+	if (cmd->size_in != sizeof(log_type))
+		return -EINVAL;
+
+	if (cmd->size_out < struct_size(pl, records, CXL_TEST_EVENT_CNT))
+		return -EINVAL;
+
+	log_type = *((u8 *)cmd->payload_in);
+	if (log_type >= CXL_EVENT_TYPE_MAX)
+		return -EINVAL;
+
+	memset(cmd->payload_out, 0, cmd->size_out);
+
+	log = event_find_log(cxlds->dev, log_type);
+	if (!log || event_log_empty(log))
+		return 0;
+
+	pl = cmd->payload_out;
+
+	for (i = 0; i < CXL_TEST_EVENT_CNT && !event_log_empty(log); i++) {
+		memcpy(&pl->records[i], event_get_current(log),
+		       sizeof(pl->records[i]));
+		pl->records[i].hdr.handle = event_get_cur_event_handle(log);
+		log->cur_idx++;
+	}
+
+	pl->record_count = cpu_to_le16(i);
+	if (!event_log_empty(log))
+		pl->flags |= CXL_GET_EVENT_FLAG_MORE_RECORDS;
+
+	return 0;
+}
+
+static int mock_clear_event(struct cxl_dev_state *cxlds,
+			    struct cxl_mbox_cmd *cmd)
+{
+	struct cxl_mbox_clear_event_payload *pl = cmd->payload_in;
+	struct mock_event_log *log;
+	u8 log_type = pl->event_log;
+	u16 handle;
+	int nr;
+
+	if (log_type >= CXL_EVENT_TYPE_MAX)
+		return -EINVAL;
+
+	log = event_find_log(cxlds->dev, log_type);
+	if (!log)
+		return 0; /* No mock data in this log */
+
+	/*
+	 * This check is technically not invalid per the specification AFAICS.
+	 * (The host could 'guess' handles and clear them in order).
+	 * However, this is not good behavior for the host so test it.
+	 */
+	if (log->clear_idx + pl->nr_recs > log->cur_idx) {
+		dev_err(cxlds->dev,
+			"Attempting to clear more events than returned!\n");
+		return -EINVAL;
+	}
+
+	/* Check handle order prior to clearing events */
+	for (nr = 0, handle = event_get_clear_handle(log);
+	     nr < pl->nr_recs;
+	     nr++, handle++) {
+		if (handle != le16_to_cpu(pl->handles[nr])) {
+			dev_err(cxlds->dev, "Clearing events out of order\n");
+			return -EINVAL;
+		}
+	}
+
+	/* Clear events */
+	log->clear_idx += pl->nr_recs;
+	return 0;
+}
+
+static void cxl_mock_event_trigger(struct device *dev)
+{
+	struct cxl_mockmem_data *mdata = dev_get_drvdata(dev);
+	struct mock_event_store *mes = &mdata->mes;
+	int i;
+
+	for (i = CXL_EVENT_TYPE_INFO; i < CXL_EVENT_TYPE_MAX; i++) {
+		struct mock_event_log *log;
+
+		log = event_find_log(dev, i);
+		if (log)
+			event_reset_log(log);
+	}
+
+	cxl_mem_get_event_records(mes->cxlds, mes->ev_status);
+}
+
+struct cxl_event_record_raw maint_needed = {
+	.hdr = {
+		.id = UUID_INIT(0xBA5EBA11, 0xABCD, 0xEFEB,
+				0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5),
+		.length = sizeof(struct cxl_event_record_raw),
+		.flags[0] = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,
+		/* .handle = Set dynamically */
+		.related_handle = cpu_to_le16(0xa5b6),
+	},
+	.data = { 0xDE, 0xAD, 0xBE, 0xEF },
+};
 
+struct cxl_event_record_raw hardware_replace = {
+	.hdr = {
+		.id = UUID_INIT(0xABCDEFEB, 0xBA11, 0xBA5E,
+				0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5),
+		.length = sizeof(struct cxl_event_record_raw),
+		.flags[0] = CXL_EVENT_RECORD_FLAG_HW_REPLACE,
+		/* .handle = Set dynamically */
+		.related_handle = cpu_to_le16(0xb6a5),
+	},
+	.data = { 0xDE, 0xAD, 0xBE, 0xEF },
 };
 
+static void cxl_mock_add_event_logs(struct mock_event_store *mes)
+{
+	mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed);
+	mes->ev_status |= CXLDEV_EVENT_STATUS_INFO;
+
+	mes_add_event(mes, CXL_EVENT_TYPE_FATAL, &hardware_replace);
+	mes->ev_status |= CXLDEV_EVENT_STATUS_FATAL;
+}
+
 static int mock_gsl(struct cxl_mbox_cmd *cmd)
 {
 	if (cmd->size_out < sizeof(mock_gsl_payload))
@@ -582,6 +791,12 @@ static int cxl_mock_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *
 	case CXL_MBOX_OP_GET_PARTITION_INFO:
 		rc = mock_partition_info(cxlds, cmd);
 		break;
+	case CXL_MBOX_OP_GET_EVENT_RECORD:
+		rc = mock_get_event(cxlds, cmd);
+		break;
+	case CXL_MBOX_OP_CLEAR_EVENT_RECORD:
+		rc = mock_clear_event(cxlds, cmd);
+		break;
 	case CXL_MBOX_OP_SET_LSA:
 		rc = mock_set_lsa(cxlds, cmd);
 		break;
@@ -628,6 +843,15 @@ static bool is_rcd(struct platform_device *pdev)
 	return !!id->driver_data;
 }
 
+static ssize_t event_trigger_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	cxl_mock_event_trigger(dev);
+	return count;
+}
+static DEVICE_ATTR_WO(event_trigger);
+
 static int cxl_mock_mem_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -655,6 +879,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
 	cxlds->serial = pdev->id;
 	cxlds->mbox_send = cxl_mock_mbox_send;
 	cxlds->payload_size = SZ_4K;
+	cxlds->event.buf = (struct cxl_get_event_payload *) mdata->event_buf;
 	if (is_rcd(pdev)) {
 		cxlds->rcd = true;
 		cxlds->component_reg_phys = CXL_RESOURCE_NONE;
@@ -672,10 +897,15 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
 	if (rc)
 		return rc;
 
+	mdata->mes.cxlds = cxlds;
+	cxl_mock_add_event_logs(&mdata->mes);
+
 	cxlmd = devm_cxl_add_memdev(cxlds);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
 
+	cxl_mem_get_event_records(cxlds, CXLDEV_EVENT_STATUS_ALL);
+
 	return 0;
 }
 
@@ -714,6 +944,7 @@ static DEVICE_ATTR_RW(security_lock);
 
 static struct attribute *cxl_mock_mem_attrs[] = {
 	&dev_attr_security_lock.attr,
+	&dev_attr_event_trigger.attr,
 	NULL
 };
 ATTRIBUTE_GROUPS(cxl_mock_mem);
-- 
cgit 


From 0092f62acc31ada89af09fe84b65999b8f434dd9 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 17 Jan 2023 21:53:42 -0800
Subject: cxl/test: Add specific events

Each type of event has different trace point outputs.

Add mock General Media Event, DRAM event, and Memory Module Event
records to the mock list of events returned.

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20221216-cxl-ev-log-v7-7-2316a5c8f7d8@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/cxl/test/mem.c | 73 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 90a463f83ae4..00bf19a68604 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -277,12 +277,85 @@ struct cxl_event_record_raw hardware_replace = {
 	.data = { 0xDE, 0xAD, 0xBE, 0xEF },
 };
 
+struct cxl_event_gen_media gen_media = {
+	.hdr = {
+		.id = UUID_INIT(0xfbcd0a77, 0xc260, 0x417f,
+				0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6),
+		.length = sizeof(struct cxl_event_gen_media),
+		.flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT,
+		/* .handle = Set dynamically */
+		.related_handle = cpu_to_le16(0),
+	},
+	.phys_addr = cpu_to_le64(0x2000),
+	.descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT,
+	.type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR,
+	.transaction_type = CXL_GMER_TRANS_HOST_WRITE,
+	/* .validity_flags = <set below> */
+	.channel = 1,
+	.rank = 30
+};
+
+struct cxl_event_dram dram = {
+	.hdr = {
+		.id = UUID_INIT(0x601dcbb3, 0x9c06, 0x4eab,
+				0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24),
+		.length = sizeof(struct cxl_event_dram),
+		.flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,
+		/* .handle = Set dynamically */
+		.related_handle = cpu_to_le16(0),
+	},
+	.phys_addr = cpu_to_le64(0x8000),
+	.descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT,
+	.type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR,
+	.transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB,
+	/* .validity_flags = <set below> */
+	.channel = 1,
+	.bank_group = 5,
+	.bank = 2,
+	.column = {0xDE, 0xAD},
+};
+
+struct cxl_event_mem_module mem_module = {
+	.hdr = {
+		.id = UUID_INIT(0xfe927475, 0xdd59, 0x4339,
+				0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74),
+		.length = sizeof(struct cxl_event_mem_module),
+		/* .handle = Set dynamically */
+		.related_handle = cpu_to_le16(0),
+	},
+	.event_type = CXL_MMER_TEMP_CHANGE,
+	.info = {
+		.health_status = CXL_DHI_HS_PERFORMANCE_DEGRADED,
+		.media_status = CXL_DHI_MS_ALL_DATA_LOST,
+		.add_status = (CXL_DHI_AS_CRITICAL << 2) |
+			      (CXL_DHI_AS_WARNING << 4) |
+			      (CXL_DHI_AS_WARNING << 5),
+		.device_temp = { 0xDE, 0xAD},
+		.dirty_shutdown_cnt = { 0xde, 0xad, 0xbe, 0xef },
+		.cor_vol_err_cnt = { 0xde, 0xad, 0xbe, 0xef },
+		.cor_per_err_cnt = { 0xde, 0xad, 0xbe, 0xef },
+	}
+};
+
 static void cxl_mock_add_event_logs(struct mock_event_store *mes)
 {
+	put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK,
+			   &gen_media.validity_flags);
+
+	put_unaligned_le16(CXL_DER_VALID_CHANNEL | CXL_DER_VALID_BANK_GROUP |
+			   CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN,
+			   &dram.validity_flags);
+
 	mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed);
+	mes_add_event(mes, CXL_EVENT_TYPE_INFO,
+		      (struct cxl_event_record_raw *)&gen_media);
+	mes_add_event(mes, CXL_EVENT_TYPE_INFO,
+		      (struct cxl_event_record_raw *)&mem_module);
 	mes->ev_status |= CXLDEV_EVENT_STATUS_INFO;
 
 	mes_add_event(mes, CXL_EVENT_TYPE_FATAL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FATAL,
+		      (struct cxl_event_record_raw *)&dram);
 	mes->ev_status |= CXLDEV_EVENT_STATUS_FATAL;
 }
 
-- 
cgit 


From bab2a5e6fe7fddc00be0356bd538e38161bab085 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 17 Jan 2023 21:53:43 -0800
Subject: cxl/test: Simulate event log overflow

Log overflow is marked by a separate trace message.

Simulate a log with lots of messages and flag overflow until space is
cleared.

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20221216-cxl-ev-log-v7-8-2316a5c8f7d8@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/cxl/test/mem.c | 50 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 00bf19a68604..9263b04d35f7 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -78,6 +78,8 @@ struct mock_event_log {
 	u16 clear_idx;
 	u16 cur_idx;
 	u16 nr_events;
+	u16 nr_overflow;
+	u16 overflow_reset;
 	struct cxl_event_record_raw *events[CXL_TEST_EVENT_CNT_MAX];
 };
 
@@ -116,6 +118,7 @@ static void event_reset_log(struct mock_event_log *log)
 {
 	log->cur_idx = 0;
 	log->clear_idx = 0;
+	log->nr_overflow = log->overflow_reset;
 }
 
 /* Handle can never be 0 use 1 based indexing for handle */
@@ -147,8 +150,12 @@ static void mes_add_event(struct mock_event_store *mes,
 		return;
 
 	log = &mes->mock_logs[log_type];
-	if (WARN_ON(log->nr_events >= CXL_TEST_EVENT_CNT_MAX))
+
+	if ((log->nr_events + 1) > CXL_TEST_EVENT_CNT_MAX) {
+		log->nr_overflow++;
+		log->overflow_reset = log->nr_overflow;
 		return;
+	}
 
 	log->events[log->nr_events] = event;
 	log->nr_events++;
@@ -159,6 +166,7 @@ static int mock_get_event(struct cxl_dev_state *cxlds,
 {
 	struct cxl_get_event_payload *pl;
 	struct mock_event_log *log;
+	u16 nr_overflow;
 	u8 log_type;
 	int i;
 
@@ -191,6 +199,19 @@ static int mock_get_event(struct cxl_dev_state *cxlds,
 	if (!event_log_empty(log))
 		pl->flags |= CXL_GET_EVENT_FLAG_MORE_RECORDS;
 
+	if (log->nr_overflow) {
+		u64 ns;
+
+		pl->flags |= CXL_GET_EVENT_FLAG_OVERFLOW;
+		pl->overflow_err_count = cpu_to_le16(nr_overflow);
+		ns = ktime_get_real_ns();
+		ns -= 5000000000; /* 5s ago */
+		pl->first_overflow_timestamp = cpu_to_le64(ns);
+		ns = ktime_get_real_ns();
+		ns -= 1000000000; /* 1s ago */
+		pl->last_overflow_timestamp = cpu_to_le64(ns);
+	}
+
 	return 0;
 }
 
@@ -231,6 +252,9 @@ static int mock_clear_event(struct cxl_dev_state *cxlds,
 		}
 	}
 
+	if (log->nr_overflow)
+		log->nr_overflow = 0;
+
 	/* Clear events */
 	log->clear_idx += pl->nr_recs;
 	return 0;
@@ -353,6 +377,30 @@ static void cxl_mock_add_event_logs(struct mock_event_store *mes)
 		      (struct cxl_event_record_raw *)&mem_module);
 	mes->ev_status |= CXLDEV_EVENT_STATUS_INFO;
 
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &maint_needed);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL,
+		      (struct cxl_event_record_raw *)&dram);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL,
+		      (struct cxl_event_record_raw *)&gen_media);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL,
+		      (struct cxl_event_record_raw *)&mem_module);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL,
+		      (struct cxl_event_record_raw *)&dram);
+	/* Overflow this log */
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes_add_event(mes, CXL_EVENT_TYPE_FAIL, &hardware_replace);
+	mes->ev_status |= CXLDEV_EVENT_STATUS_FAIL;
+
 	mes_add_event(mes, CXL_EVENT_TYPE_FATAL, &hardware_replace);
 	mes_add_event(mes, CXL_EVENT_TYPE_FATAL,
 		      (struct cxl_event_record_raw *)&dram);
-- 
cgit 


From a5f3a3f7c17273f825e96d1b3369164a39345a35 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 26 Jan 2023 14:50:30 -0800
Subject: selftests/bpf: Properly enable hwtstamp in xdp_hw_metadata

The existing timestamping_enable() is a no-op because it applies
to the socket-related path that we are not verifying here
anymore. (but still leaving the code around hoping we can
have xdp->skb path verified here as well)

  poll: 1 (0)
  xsk_ring_cons__peek: 1
  0xf64788: rx_desc[0]->addr=100000000008000 addr=8100 comp_addr=8000
  rx_hash: 3697961069
  rx_timestamp:  1674657672142214773 (sec:1674657672.1422)
  XDP RX-time:   1674657709561774876 (sec:1674657709.5618) delta sec:37.4196
  AF_XDP time:   1674657709561871034 (sec:1674657709.5619) delta
sec:0.0001 (96.158 usec)
  0xf64788: complete idx=8 addr=8000

Also, maybe something to archive here, see [0] for Jesper's note
about NIC vs host clock delta.

0: https://lore.kernel.org/bpf/f3a116dc-1b14-3432-ad20-a36179ef0608@redhat.com/

v2:
- Restore original value (Martin)

Fixes: 297a3f124155 ("selftests/bpf: Simple program to dump XDP RX metadata")
Reported-by: Jesper Dangaard Brouer <jbrouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <jbrouer@redhat.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230126225030.510629-1-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/xdp_hw_metadata.c | 45 ++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 0008f0f239e8..3823b1c499cc 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -24,6 +24,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/udp.h>
 #include <linux/sockios.h>
+#include <linux/net_tstamp.h>
 #include <sys/mman.h>
 #include <net/if.h>
 #include <poll.h>
@@ -278,13 +279,53 @@ static int rxq_num(const char *ifname)
 
 	ret = ioctl(fd, SIOCETHTOOL, &ifr);
 	if (ret < 0)
-		error(-1, errno, "socket");
+		error(-1, errno, "ioctl(SIOCETHTOOL)");
 
 	close(fd);
 
 	return ch.rx_count + ch.combined_count;
 }
 
+static void hwtstamp_ioctl(int op, const char *ifname, struct hwtstamp_config *cfg)
+{
+	struct ifreq ifr = {
+		.ifr_data = (void *)cfg,
+	};
+	strcpy(ifr.ifr_name, ifname);
+	int fd, ret;
+
+	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (fd < 0)
+		error(-1, errno, "socket");
+
+	ret = ioctl(fd, op, &ifr);
+	if (ret < 0)
+		error(-1, errno, "ioctl(%d)", op);
+
+	close(fd);
+}
+
+static struct hwtstamp_config saved_hwtstamp_cfg;
+static const char *saved_hwtstamp_ifname;
+
+static void hwtstamp_restore(void)
+{
+	hwtstamp_ioctl(SIOCSHWTSTAMP, saved_hwtstamp_ifname, &saved_hwtstamp_cfg);
+}
+
+static void hwtstamp_enable(const char *ifname)
+{
+	struct hwtstamp_config cfg = {
+		.rx_filter = HWTSTAMP_FILTER_ALL,
+	};
+
+	hwtstamp_ioctl(SIOCGHWTSTAMP, ifname, &saved_hwtstamp_cfg);
+	saved_hwtstamp_ifname = strdup(ifname);
+	atexit(hwtstamp_restore);
+
+	hwtstamp_ioctl(SIOCSHWTSTAMP, ifname, &cfg);
+}
+
 static void cleanup(void)
 {
 	LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
@@ -341,6 +382,8 @@ int main(int argc, char *argv[])
 
 	printf("rxq: %d\n", rxq);
 
+	hwtstamp_enable(ifname);
+
 	rx_xsk = malloc(sizeof(struct xsk) * rxq);
 	if (!rx_xsk)
 		error(-1, ENOMEM, "malloc");
-- 
cgit 


From 8677e555f17f51321d0730b945aeb7d4b95f998f Mon Sep 17 00:00:00 2001
From: Jeff Xu <jeffxu@google.com>
Date: Sat, 14 Jan 2023 02:03:06 +0000
Subject: selftests/landlock: Test ptrace as much as possible with Yama
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update ptrace tests according to all potential Yama security policies.
This is required to make such tests pass even if Yama is enabled.

Tests are not skipped but they now check both Landlock and Yama boundary
restrictions at run time to keep a maximum test coverage (i.e. positive
and negative testing).

Signed-off-by: Jeff Xu <jeffxu@google.com>
Link: https://lore.kernel.org/r/20230114020306.1407195-2-jeffxu@google.com
Cc: stable@vger.kernel.org
[mic: Add curly braces around EXPECT_EQ() to make it build, and improve
commit message]
Co-developed-by: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/ptrace_test.c | 113 +++++++++++++++++++++----
 1 file changed, 96 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/ptrace_test.c b/tools/testing/selftests/landlock/ptrace_test.c
index c28ef98ff3ac..55e7871631a1 100644
--- a/tools/testing/selftests/landlock/ptrace_test.c
+++ b/tools/testing/selftests/landlock/ptrace_test.c
@@ -19,6 +19,12 @@
 
 #include "common.h"
 
+/* Copied from security/yama/yama_lsm.c */
+#define YAMA_SCOPE_DISABLED 0
+#define YAMA_SCOPE_RELATIONAL 1
+#define YAMA_SCOPE_CAPABILITY 2
+#define YAMA_SCOPE_NO_ATTACH 3
+
 static void create_domain(struct __test_metadata *const _metadata)
 {
 	int ruleset_fd;
@@ -60,6 +66,25 @@ static int test_ptrace_read(const pid_t pid)
 	return 0;
 }
 
+static int get_yama_ptrace_scope(void)
+{
+	int ret;
+	char buf[2] = {};
+	const int fd = open("/proc/sys/kernel/yama/ptrace_scope", O_RDONLY);
+
+	if (fd < 0)
+		return 0;
+
+	if (read(fd, buf, 1) < 0) {
+		close(fd);
+		return -1;
+	}
+
+	ret = atoi(buf);
+	close(fd);
+	return ret;
+}
+
 /* clang-format off */
 FIXTURE(hierarchy) {};
 /* clang-format on */
@@ -232,8 +257,51 @@ TEST_F(hierarchy, trace)
 	pid_t child, parent;
 	int status, err_proc_read;
 	int pipe_child[2], pipe_parent[2];
+	int yama_ptrace_scope;
 	char buf_parent;
 	long ret;
+	bool can_read_child, can_trace_child, can_read_parent, can_trace_parent;
+
+	yama_ptrace_scope = get_yama_ptrace_scope();
+	ASSERT_LE(0, yama_ptrace_scope);
+
+	if (yama_ptrace_scope > YAMA_SCOPE_DISABLED)
+		TH_LOG("Incomplete tests due to Yama restrictions (scope %d)",
+		       yama_ptrace_scope);
+
+	/*
+	 * can_read_child is true if a parent process can read its child
+	 * process, which is only the case when the parent process is not
+	 * isolated from the child with a dedicated Landlock domain.
+	 */
+	can_read_child = !variant->domain_parent;
+
+	/*
+	 * can_trace_child is true if a parent process can trace its child
+	 * process.  This depends on two conditions:
+	 * - The parent process is not isolated from the child with a dedicated
+	 *   Landlock domain.
+	 * - Yama allows tracing children (up to YAMA_SCOPE_RELATIONAL).
+	 */
+	can_trace_child = can_read_child &&
+			  yama_ptrace_scope <= YAMA_SCOPE_RELATIONAL;
+
+	/*
+	 * can_read_parent is true if a child process can read its parent
+	 * process, which is only the case when the child process is not
+	 * isolated from the parent with a dedicated Landlock domain.
+	 */
+	can_read_parent = !variant->domain_child;
+
+	/*
+	 * can_trace_parent is true if a child process can trace its parent
+	 * process.  This depends on two conditions:
+	 * - The child process is not isolated from the parent with a dedicated
+	 *   Landlock domain.
+	 * - Yama is disabled (YAMA_SCOPE_DISABLED).
+	 */
+	can_trace_parent = can_read_parent &&
+			   yama_ptrace_scope <= YAMA_SCOPE_DISABLED;
 
 	/*
 	 * Removes all effective and permitted capabilities to not interfere
@@ -264,16 +332,21 @@ TEST_F(hierarchy, trace)
 		/* Waits for the parent to be in a domain, if any. */
 		ASSERT_EQ(1, read(pipe_parent[0], &buf_child, 1));
 
-		/* Tests PTRACE_ATTACH and PTRACE_MODE_READ on the parent. */
+		/* Tests PTRACE_MODE_READ on the parent. */
 		err_proc_read = test_ptrace_read(parent);
+		if (can_read_parent) {
+			EXPECT_EQ(0, err_proc_read);
+		} else {
+			EXPECT_EQ(EACCES, err_proc_read);
+		}
+
+		/* Tests PTRACE_ATTACH on the parent. */
 		ret = ptrace(PTRACE_ATTACH, parent, NULL, 0);
-		if (variant->domain_child) {
+		if (can_trace_parent) {
+			EXPECT_EQ(0, ret);
+		} else {
 			EXPECT_EQ(-1, ret);
 			EXPECT_EQ(EPERM, errno);
-			EXPECT_EQ(EACCES, err_proc_read);
-		} else {
-			EXPECT_EQ(0, ret);
-			EXPECT_EQ(0, err_proc_read);
 		}
 		if (ret == 0) {
 			ASSERT_EQ(parent, waitpid(parent, &status, 0));
@@ -283,11 +356,11 @@ TEST_F(hierarchy, trace)
 
 		/* Tests child PTRACE_TRACEME. */
 		ret = ptrace(PTRACE_TRACEME);
-		if (variant->domain_parent) {
+		if (can_trace_child) {
+			EXPECT_EQ(0, ret);
+		} else {
 			EXPECT_EQ(-1, ret);
 			EXPECT_EQ(EPERM, errno);
-		} else {
-			EXPECT_EQ(0, ret);
 		}
 
 		/*
@@ -296,7 +369,7 @@ TEST_F(hierarchy, trace)
 		 */
 		ASSERT_EQ(1, write(pipe_child[1], ".", 1));
 
-		if (!variant->domain_parent) {
+		if (can_trace_child) {
 			ASSERT_EQ(0, raise(SIGSTOP));
 		}
 
@@ -321,7 +394,7 @@ TEST_F(hierarchy, trace)
 	ASSERT_EQ(1, read(pipe_child[0], &buf_parent, 1));
 
 	/* Tests child PTRACE_TRACEME. */
-	if (!variant->domain_parent) {
+	if (can_trace_child) {
 		ASSERT_EQ(child, waitpid(child, &status, 0));
 		ASSERT_EQ(1, WIFSTOPPED(status));
 		ASSERT_EQ(0, ptrace(PTRACE_DETACH, child, NULL, 0));
@@ -331,17 +404,23 @@ TEST_F(hierarchy, trace)
 		EXPECT_EQ(ESRCH, errno);
 	}
 
-	/* Tests PTRACE_ATTACH and PTRACE_MODE_READ on the child. */
+	/* Tests PTRACE_MODE_READ on the child. */
 	err_proc_read = test_ptrace_read(child);
+	if (can_read_child) {
+		EXPECT_EQ(0, err_proc_read);
+	} else {
+		EXPECT_EQ(EACCES, err_proc_read);
+	}
+
+	/* Tests PTRACE_ATTACH on the child. */
 	ret = ptrace(PTRACE_ATTACH, child, NULL, 0);
-	if (variant->domain_parent) {
+	if (can_trace_child) {
+		EXPECT_EQ(0, ret);
+	} else {
 		EXPECT_EQ(-1, ret);
 		EXPECT_EQ(EPERM, errno);
-		EXPECT_EQ(EACCES, err_proc_read);
-	} else {
-		EXPECT_EQ(0, ret);
-		EXPECT_EQ(0, err_proc_read);
 	}
+
 	if (ret == 0) {
 		ASSERT_EQ(child, waitpid(child, &status, 0));
 		ASSERT_EQ(1, WIFSTOPPED(status));
-- 
cgit 


From 139df64d26fd5bb83cd04b330f7a849eefa1a889 Mon Sep 17 00:00:00 2001
From: Grant Seltzer <grantseltzer@gmail.com>
Date: Wed, 25 Jan 2023 21:47:49 -0500
Subject: libbpf: Fix malformed documentation formatting

This fixes the doxygen format documentation above the
user_ring_buffer__* APIs. There has to be a newline
before the @brief, otherwise doxygen won't render them
for libbpf.readthedocs.org.

Signed-off-by: Grant Seltzer <grantseltzer@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230126024749.522278-1-grantseltzer@gmail.com
---
 tools/lib/bpf/libbpf.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 898db26e42e9..830ad38e9f51 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1064,7 +1064,8 @@ struct user_ring_buffer_opts {
 
 #define user_ring_buffer_opts__last_field sz
 
-/* @brief **user_ring_buffer__new()** creates a new instance of a user ring
+/**
+ * @brief **user_ring_buffer__new()** creates a new instance of a user ring
  * buffer.
  *
  * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map.
@@ -1075,7 +1076,8 @@ struct user_ring_buffer_opts {
 LIBBPF_API struct user_ring_buffer *
 user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts);
 
-/* @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the
+/**
+ * @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the
  * user ring buffer.
  * @param rb A pointer to a user ring buffer.
  * @param size The size of the sample, in bytes.
@@ -1095,7 +1097,8 @@ user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts);
  */
 LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size);
 
-/* @brief **user_ring_buffer__reserve_blocking()** reserves a record in the
+/**
+ * @brief **user_ring_buffer__reserve_blocking()** reserves a record in the
  * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes
  * available.
  * @param rb The user ring buffer.
@@ -1139,7 +1142,8 @@ LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb,
 						    __u32 size,
 						    int timeout_ms);
 
-/* @brief **user_ring_buffer__submit()** submits a previously reserved sample
+/**
+ * @brief **user_ring_buffer__submit()** submits a previously reserved sample
  * into the ring buffer.
  * @param rb The user ring buffer.
  * @param sample A reserved sample.
@@ -1149,7 +1153,8 @@ LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb,
  */
 LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample);
 
-/* @brief **user_ring_buffer__discard()** discards a previously reserved sample.
+/**
+ * @brief **user_ring_buffer__discard()** discards a previously reserved sample.
  * @param rb The user ring buffer.
  * @param sample A reserved sample.
  *
@@ -1158,7 +1163,8 @@ LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *samp
  */
 LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample);
 
-/* @brief **user_ring_buffer__free()** frees a ring buffer that was previously
+/**
+ * @brief **user_ring_buffer__free()** frees a ring buffer that was previously
  * created with **user_ring_buffer__new()**.
  * @param rb The user ring buffer being freed.
  */
-- 
cgit 


From e4ce876f10a21fb4e550d1743468791021a430f5 Mon Sep 17 00:00:00 2001
From: Grant Seltzer <grantseltzer@gmail.com>
Date: Wed, 25 Jan 2023 21:42:25 -0500
Subject: libbpf: Add documentation to map pinning API functions

This adds documentation for the following API functions:

- bpf_map__set_pin_path()
- bpf_map__pin_path()
- bpf_map__is_pinned()
- bpf_map__pin()
- bpf_map__unpin()
- bpf_object__pin_maps()
- bpf_object__unpin_maps()

Signed-off-by: Grant Seltzer <grantseltzer@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230126024225.520685-1-grantseltzer@gmail.com
---
 tools/lib/bpf/libbpf.h | 72 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 69 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 830ad38e9f51..8777ff21ea1d 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -233,11 +233,30 @@ LIBBPF_API int bpf_object__load(struct bpf_object *obj);
  */
 LIBBPF_API void bpf_object__close(struct bpf_object *obj);
 
-/* pin_maps and unpin_maps can both be called with a NULL path, in which case
- * they will use the pin_path attribute of each map (and ignore all maps that
- * don't have a pin_path set).
+/**
+ * @brief **bpf_object__pin_maps()** pins each map contained within
+ * the BPF object at the passed directory.
+ * @param obj Pointer to a valid BPF object
+ * @param path A directory where maps should be pinned.
+ * @return 0, on success; negative error code, otherwise
+ *
+ * If `path` is NULL `bpf_map__pin` (which is being used on each map)
+ * will use the pin_path attribute of each map. In this case, maps that
+ * don't have a pin_path set will be ignored.
  */
 LIBBPF_API int bpf_object__pin_maps(struct bpf_object *obj, const char *path);
+
+/**
+ * @brief **bpf_object__unpin_maps()** unpins each map contained within
+ * the BPF object found in the passed directory.
+ * @param obj Pointer to a valid BPF object
+ * @param path A directory where pinned maps should be searched for.
+ * @return 0, on success; negative error code, otherwise
+ *
+ * If `path` is NULL `bpf_map__unpin` (which is being used on each map)
+ * will use the pin_path attribute of each map. In this case, maps that
+ * don't have a pin_path set will be ignored.
+ */
 LIBBPF_API int bpf_object__unpin_maps(struct bpf_object *obj,
 				      const char *path);
 LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj,
@@ -848,10 +867,57 @@ LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize
  * @return true, if the map is an internal map; false, otherwise
  */
 LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map);
+
+/**
+ * @brief **bpf_map__set_pin_path()** sets the path attribute that tells where the
+ * BPF map should be pinned. This does not actually create the 'pin'.
+ * @param map The bpf_map
+ * @param path The path
+ * @return 0, on success; negative error, otherwise
+ */
 LIBBPF_API int bpf_map__set_pin_path(struct bpf_map *map, const char *path);
+
+/**
+ * @brief **bpf_map__pin_path()** gets the path attribute that tells where the
+ * BPF map should be pinned.
+ * @param map The bpf_map
+ * @return The path string; which can be NULL
+ */
 LIBBPF_API const char *bpf_map__pin_path(const struct bpf_map *map);
+
+/**
+ * @brief **bpf_map__is_pinned()** tells the caller whether or not the
+ * passed map has been pinned via a 'pin' file.
+ * @param map The bpf_map
+ * @return true, if the map is pinned; false, otherwise
+ */
 LIBBPF_API bool bpf_map__is_pinned(const struct bpf_map *map);
+
+/**
+ * @brief **bpf_map__pin()** creates a file that serves as a 'pin'
+ * for the BPF map. This increments the reference count on the
+ * BPF map which will keep the BPF map loaded even after the
+ * userspace process which loaded it has exited.
+ * @param map The bpf_map to pin
+ * @param path A file path for the 'pin'
+ * @return 0, on success; negative error, otherwise
+ *
+ * If `path` is NULL the maps `pin_path` attribute will be used. If this is
+ * also NULL, an error will be returned and the map will not be pinned.
+ */
 LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path);
+
+/**
+ * @brief **bpf_map__unpin()** removes the file that serves as a
+ * 'pin' for the BPF map.
+ * @param map The bpf_map to unpin
+ * @param path A file path for the 'pin'
+ * @return 0, on success; negative error, otherwise
+ *
+ * The `path` parameter can be NULL, in which case the `pin_path`
+ * map attribute is unpinned. If both the `path` parameter and
+ * `pin_path` map attribute are set, they must be equal.
+ */
 LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path);
 
 LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd);
-- 
cgit 


From 16809afdcbad5fa45f34622f62873c7d7114cde5 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Fri, 27 Jan 2023 13:57:05 -0800
Subject: selftest/bpf: Make crashes more debuggable in test_progs

Reset stdio before printing verbose log of the SIGSEGV'ed test.
Otherwise, it's hard to understand what's going on in the cases like [0].

With the following patch applied:

	--- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
	+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
	@@ -392,6 +392,11 @@ void test_xdp_metadata(void)
	 		       "generate freplace packet"))
	 		goto out;

	+
	+	ASSERT_EQ(1, 2, "oops");
	+	int *x = 0;
	+	*x = 1; /* die */
	+
	 	while (!retries--) {
	 		if (bpf_obj2->bss->called)
	 			break;

Before:

 #281     xdp_metadata:FAIL
Caught signal #11!
Stack trace:
./test_progs(crash_handler+0x1f)[0x55c919d98bcf]
/lib/x86_64-linux-gnu/libc.so.6(+0x3bf90)[0x7f36aea5df90]
./test_progs(test_xdp_metadata+0x1db0)[0x55c919d8c6d0]
./test_progs(+0x23b438)[0x55c919d9a438]
./test_progs(main+0x534)[0x55c919d99454]
/lib/x86_64-linux-gnu/libc.so.6(+0x2718a)[0x7f36aea4918a]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x85)[0x7f36aea49245]
./test_progs(_start+0x21)[0x55c919b82ef1]

After:

test_xdp_metadata:PASS:ip netns add xdp_metadata 0 nsec
open_netns:PASS:malloc token 0 nsec
open_netns:PASS:open /proc/self/ns/net 0 nsec
open_netns:PASS:open netns fd 0 nsec
open_netns:PASS:setns 0 nsec
..
test_xdp_metadata:FAIL:oops unexpected oops: actual 1 != expected 2
 #281     xdp_metadata:FAIL
Caught signal #11!
Stack trace:
./test_progs(crash_handler+0x1f)[0x562714a76bcf]
/lib/x86_64-linux-gnu/libc.so.6(+0x3bf90)[0x7fa663f9cf90]
./test_progs(test_xdp_metadata+0x1db0)[0x562714a6a6d0]
./test_progs(+0x23b438)[0x562714a78438]
./test_progs(main+0x534)[0x562714a77454]
/lib/x86_64-linux-gnu/libc.so.6(+0x2718a)[0x7fa663f8818a]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x85)[0x7fa663f88245]
./test_progs(_start+0x21)[0x562714860ef1]

0: https://github.com/kernel-patches/bpf/actions/runs/4019879316/jobs/6907358876

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230127215705.1254316-1-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/test_progs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 4716e38e153a..c5f852163246 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -975,12 +975,12 @@ void crash_handler(int signum)
 
 	sz = backtrace(bt, ARRAY_SIZE(bt));
 
+	if (env.stdout)
+		stdio_restore();
 	if (env.test) {
 		env.test_state->error_cnt++;
 		dump_test_log(env.test, env.test_state, true, false);
 	}
-	if (env.stdout)
-		stdio_restore();
 	if (env.worker_id != -1)
 		fprintf(stderr, "[%d]: ", env.worker_id);
 	fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum);
-- 
cgit 


From 8fb9fb2f1728a56a2a6dba79de6f17975def658d Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:22 +0100
Subject: selftests/bpf: Query BPF_MAX_TRAMP_LINKS using BTF

Do not hard-code the value, since for s390x it will be smaller than
for x86.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-4-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/fexit_stress.c        | 22 ++++++++-----
 .../selftests/bpf/prog_tests/trampoline_count.c    | 16 ++++++---
 tools/testing/selftests/bpf/test_progs.c           | 38 ++++++++++++++++++++++
 tools/testing/selftests/bpf/test_progs.h           |  2 ++
 4 files changed, 65 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
index 5a7e6011f6bf..596536def43d 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
@@ -2,14 +2,19 @@
 /* Copyright (c) 2019 Facebook */
 #include <test_progs.h>
 
-/* that's kernel internal BPF_MAX_TRAMP_PROGS define */
-#define CNT 38
-
 void serial_test_fexit_stress(void)
 {
-	int fexit_fd[CNT] = {};
-	int link_fd[CNT] = {};
-	int err, i;
+	int bpf_max_tramp_links, err, i;
+	int *fd, *fexit_fd, *link_fd;
+
+	bpf_max_tramp_links = get_bpf_max_tramp_links();
+	if (!ASSERT_GE(bpf_max_tramp_links, 1, "bpf_max_tramp_links"))
+		return;
+	fd = calloc(bpf_max_tramp_links * 2, sizeof(*fd));
+	if (!ASSERT_OK_PTR(fd, "fd"))
+		return;
+	fexit_fd = fd;
+	link_fd = fd + bpf_max_tramp_links;
 
 	const struct bpf_insn trace_program[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 0),
@@ -28,7 +33,7 @@ void serial_test_fexit_stress(void)
 		goto out;
 	trace_opts.attach_btf_id = err;
 
-	for (i = 0; i < CNT; i++) {
+	for (i = 0; i < bpf_max_tramp_links; i++) {
 		fexit_fd[i] = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL",
 					    trace_program,
 					    sizeof(trace_program) / sizeof(struct bpf_insn),
@@ -44,10 +49,11 @@ void serial_test_fexit_stress(void)
 	ASSERT_OK(err, "bpf_prog_test_run_opts");
 
 out:
-	for (i = 0; i < CNT; i++) {
+	for (i = 0; i < bpf_max_tramp_links; i++) {
 		if (link_fd[i])
 			close(link_fd[i]);
 		if (fexit_fd[i])
 			close(fexit_fd[i]);
 	}
+	free(fd);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
index 564b75bc087f..8fd4c0d78089 100644
--- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
+++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
@@ -2,8 +2,6 @@
 #define _GNU_SOURCE
 #include <test_progs.h>
 
-#define MAX_TRAMP_PROGS 38
-
 struct inst {
 	struct bpf_object *obj;
 	struct bpf_link   *link;
@@ -37,14 +35,21 @@ void serial_test_trampoline_count(void)
 {
 	char *file = "test_trampoline_count.bpf.o";
 	char *const progs[] = { "fentry_test", "fmod_ret_test", "fexit_test" };
-	struct inst inst[MAX_TRAMP_PROGS + 1] = {};
+	int bpf_max_tramp_links, err, i, prog_fd;
 	struct bpf_program *prog;
 	struct bpf_link *link;
-	int prog_fd, err, i;
+	struct inst *inst;
 	LIBBPF_OPTS(bpf_test_run_opts, opts);
 
+	bpf_max_tramp_links = get_bpf_max_tramp_links();
+	if (!ASSERT_GE(bpf_max_tramp_links, 1, "bpf_max_tramp_links"))
+		return;
+	inst = calloc(bpf_max_tramp_links + 1, sizeof(*inst));
+	if (!ASSERT_OK_PTR(inst, "inst"))
+		return;
+
 	/* attach 'allowed' trampoline programs */
-	for (i = 0; i < MAX_TRAMP_PROGS; i++) {
+	for (i = 0; i < bpf_max_tramp_links; i++) {
 		prog = load_prog(file, progs[i % ARRAY_SIZE(progs)], &inst[i]);
 		if (!prog)
 			goto cleanup;
@@ -91,4 +96,5 @@ cleanup:
 		bpf_link__destroy(inst[i].link);
 		bpf_object__close(inst[i].obj);
 	}
+	free(inst);
 }
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index c5f852163246..6d5e3022c75f 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -17,6 +17,7 @@
 #include <sys/select.h>
 #include <sys/socket.h>
 #include <sys/un.h>
+#include <bpf/btf.h>
 
 static bool verbose(void)
 {
@@ -967,6 +968,43 @@ int write_sysctl(const char *sysctl, const char *value)
 	return 0;
 }
 
+int get_bpf_max_tramp_links_from(struct btf *btf)
+{
+	const struct btf_enum *e;
+	const struct btf_type *t;
+	__u32 i, type_cnt;
+	const char *name;
+	__u16 j, vlen;
+
+	for (i = 1, type_cnt = btf__type_cnt(btf); i < type_cnt; i++) {
+		t = btf__type_by_id(btf, i);
+		if (!t || !btf_is_enum(t) || t->name_off)
+			continue;
+		e = btf_enum(t);
+		for (j = 0, vlen = btf_vlen(t); j < vlen; j++, e++) {
+			name = btf__str_by_offset(btf, e->name_off);
+			if (name && !strcmp(name, "BPF_MAX_TRAMP_LINKS"))
+				return e->val;
+		}
+	}
+
+	return -1;
+}
+
+int get_bpf_max_tramp_links(void)
+{
+	struct btf *vmlinux_btf;
+	int ret;
+
+	vmlinux_btf = btf__load_vmlinux_btf();
+	if (!ASSERT_OK_PTR(vmlinux_btf, "vmlinux btf"))
+		return -1;
+	ret = get_bpf_max_tramp_links_from(vmlinux_btf);
+	btf__free(vmlinux_btf);
+
+	return ret;
+}
+
 #define MAX_BACKTRACE_SZ 128
 void crash_handler(int signum)
 {
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index 3f058dfadbaf..d5d51ec97ec8 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -394,6 +394,8 @@ int kern_sync_rcu(void);
 int trigger_module_test_read(int read_sz);
 int trigger_module_test_write(int write_sz);
 int write_sysctl(const char *sysctl, const char *value);
+int get_bpf_max_tramp_links_from(struct btf *btf);
+int get_bpf_max_tramp_links(void);
 
 #ifdef __x86_64__
 #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep"
-- 
cgit 


From b14b01f281f728f465d2440ce6ed0491df735169 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:23 +0100
Subject: selftests/bpf: Fix liburandom_read.so linker error

When building with O=, the following linker error occurs:

    clang: error: no such file or directory: 'liburandom_read.so'

Fix by adding $(OUTPUT) to the linker search path.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-5-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 53eae7be8dff..02d41af05fe1 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -189,7 +189,7 @@ $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c
 $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_read.so
 	$(call msg,BINARY,,$@)
 	$(Q)$(CLANG) $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \
-		     liburandom_read.so $(filter-out -static,$(LDLIBS))	     \
+		     -lurandom_read $(filter-out -static,$(LDLIBS)) -L$(OUTPUT)  \
 		     -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \
 		     -Wl,-rpath=. -o $@
 
-- 
cgit 


From 6eab2370d142cdfe853d168881b0d4abd3c6f7a6 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:24 +0100
Subject: selftests/bpf: Fix symlink creation error

When building with O=, the following error occurs:

    ln: failed to create symbolic link 'no_alu32/bpftool': No such file or directory

Adjust the code to account for $(OUTPUT).

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-6-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 02d41af05fe1..e79039726173 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -519,7 +519,8 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS)			\
 	$$(call msg,BINARY,,$$@)
 	$(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@
 	$(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@
-	$(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/bootstrap/bpftool $(if $2,$2/)bpftool
+	$(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/bootstrap/bpftool \
+		   $(OUTPUT)/$(if $2,$2/)bpftool
 
 endef
 
-- 
cgit 


From 31da9be64a1136e5699a1fc59a45001d6b98acdc Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:25 +0100
Subject: selftests/bpf: Fix kfree_skb on s390x

h_proto is big-endian; use htons() in order to make comparison work on
both little- and big-endian machines.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-7-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/kfree_skb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
index 73579370bfbd..c07991544a78 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
@@ -36,7 +36,7 @@ static void on_sample(void *ctx, int cpu, void *data, __u32 size)
 		  "cb32_0 %x != %x\n",
 		  meta->cb32_0, cb.cb32[0]))
 		return;
-	if (CHECK(pkt_v6->eth.h_proto != 0xdd86, "check_eth",
+	if (CHECK(pkt_v6->eth.h_proto != htons(ETH_P_IPV6), "check_eth",
 		  "h_proto %x\n", pkt_v6->eth.h_proto))
 		return;
 	if (CHECK(pkt_v6->iph.nexthdr != 6, "check_ip",
-- 
cgit 


From 804acdd251e837bb1a588074752d69698ac36b5f Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:26 +0100
Subject: selftests/bpf: Set errno when urand_spawn() fails

The result of urand_spawn() is checked with ASSERT_OK_PTR, which treats
NULL as success if errno == 0.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-8-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/usdt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c
index 9ad9da0f215e..56ed1eb9b527 100644
--- a/tools/testing/selftests/bpf/prog_tests/usdt.c
+++ b/tools/testing/selftests/bpf/prog_tests/usdt.c
@@ -314,6 +314,7 @@ static FILE *urand_spawn(int *pid)
 
 	if (fscanf(f, "%d", pid) != 1) {
 		pclose(f);
+		errno = EINVAL;
 		return NULL;
 	}
 
-- 
cgit 


From 98e13848cf43b66b0f32f10011aa398c2bff5ae6 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:27 +0100
Subject: selftests/bpf: Fix decap_sanity_ns cleanup

decap_sanity prints the following on the 1st run:

    decap_sanity: sh: 1: Syntax error: Bad fd number

and the following on the 2nd run:

    Cannot create namespace file "/run/netns/decap_sanity_ns": File exists

The problem is that the cleanup command has a typo and does nothing.
Fix the typo.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-9-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/decap_sanity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c
index 0b2f73b88c53..2853883b7cbb 100644
--- a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c
+++ b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c
@@ -80,6 +80,6 @@ fail:
 		bpf_tc_hook_destroy(&qdisc_hook);
 		close_netns(nstoken);
 	}
-	system("ip netns del " NS_TEST " >& /dev/null");
+	system("ip netns del " NS_TEST " &> /dev/null");
 	decap_sanity__destroy(skel);
 }
-- 
cgit 


From 56e1a50483194b2e0ac54849e94cc2b80480895e Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:28 +0100
Subject: selftests/bpf: Fix verify_pkcs7_sig on s390x

Use bpf_probe_read_kernel() instead of bpf_probe_read(), which is not
defined on all architectures.

While at it, improve the error handling: do not hide the verifier log,
and check the return values of bpf_probe_read_kernel() and
bpf_copy_from_user().

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-10-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c |  3 +++
 tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c | 12 ++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c b/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c
index 579d6ee83ce0..dd7f2bc70048 100644
--- a/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c
+++ b/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c
@@ -61,6 +61,9 @@ static bool kfunc_not_supported;
 static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt,
 			   va_list args)
 {
+	if (level == LIBBPF_WARN)
+		vprintf(fmt, args);
+
 	if (strcmp(fmt, "libbpf: extern (func ksym) '%s': not found in kernel or module BTFs\n"))
 		return 0;
 
diff --git a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c
index ce419304ff1f..7748cc23de8a 100644
--- a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c
+++ b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c
@@ -59,10 +59,14 @@ int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size)
 	if (!data_val)
 		return 0;
 
-	bpf_probe_read(&value, sizeof(value), &attr->value);
-
-	bpf_copy_from_user(data_val, sizeof(struct data),
-			   (void *)(unsigned long)value);
+	ret = bpf_probe_read_kernel(&value, sizeof(value), &attr->value);
+	if (ret)
+		return ret;
+
+	ret = bpf_copy_from_user(data_val, sizeof(struct data),
+				 (void *)(unsigned long)value);
+	if (ret)
+		return ret;
 
 	if (data_val->data_len > sizeof(data_val->data))
 		return -EINVAL;
-- 
cgit 


From 06c1865b0b0c7820ea53af2394dd7aff31100295 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:29 +0100
Subject: selftests/bpf: Fix xdp_do_redirect on s390x

s390x cache line size is 256 bytes, so skb_shared_info must be aligned
on a much larger boundary than for x86. This makes the maximum packet
size smaller.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-11-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
index a50971c6cf4a..ac70e871d62f 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
@@ -65,7 +65,11 @@ static int attach_tc_prog(struct bpf_tc_hook *hook, int fd)
 /* The maximum permissible size is: PAGE_SIZE - sizeof(struct xdp_page_head) -
  * sizeof(struct skb_shared_info) - XDP_PACKET_HEADROOM = 3368 bytes
  */
+#if defined(__s390x__)
+#define MAX_PKT_SIZE 3176
+#else
 #define MAX_PKT_SIZE 3368
+#endif
 static void test_max_pkt_size(int fd)
 {
 	char data[MAX_PKT_SIZE + 1] = {};
-- 
cgit 


From 06cea99e683c66e16cdf04ef91d2a008b34d7d3d Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:30 +0100
Subject: selftests/bpf: Fix cgrp_local_storage on s390x

Sync the definition of socket_cookie between the eBPF program and the
test. Currently the test works by accident, since on little-endian it
is sometimes acceptable to access u64 as u32.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-12-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
index 33a2776737e7..2cc759956e3b 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
@@ -16,7 +16,7 @@
 
 struct socket_cookie {
 	__u64 cookie_key;
-	__u32 cookie_value;
+	__u64 cookie_value;
 };
 
 static void test_tp_btf(int cgroup_fd)
-- 
cgit 


From 2934565f04fd21319fd2dfcc68bebebff503ceb2 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:31 +0100
Subject: selftests/bpf: Check stack_mprotect() return value

If stack_mprotect() succeeds, errno is not changed. This can produce
misleading error messages, that show stale errno.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-13-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/bpf_cookie.c | 6 ++++--
 tools/testing/selftests/bpf/prog_tests/test_lsm.c   | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
index 2be2d61954bc..26b2d1bffdfd 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
@@ -472,6 +472,7 @@ static void lsm_subtest(struct test_bpf_cookie *skel)
 	int prog_fd;
 	int lsm_fd = -1;
 	LIBBPF_OPTS(bpf_link_create_opts, link_opts);
+	int err;
 
 	skel->bss->lsm_res = 0;
 
@@ -482,8 +483,9 @@ static void lsm_subtest(struct test_bpf_cookie *skel)
 	if (!ASSERT_GE(lsm_fd, 0, "lsm.link_create"))
 		goto cleanup;
 
-	stack_mprotect();
-	if (!ASSERT_EQ(errno, EPERM, "stack_mprotect"))
+	err = stack_mprotect();
+	if (!ASSERT_EQ(err, -1, "stack_mprotect") ||
+	    !ASSERT_EQ(errno, EPERM, "stack_mprotect"))
 		goto cleanup;
 
 	usleep(1);
diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
index 244c01125126..16175d579bc7 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
@@ -75,7 +75,8 @@ static int test_lsm(struct lsm *skel)
 	skel->bss->monitored_pid = getpid();
 
 	err = stack_mprotect();
-	if (!ASSERT_EQ(errno, EPERM, "stack_mprotect"))
+	if (!ASSERT_EQ(err, -1, "stack_mprotect") ||
+	    !ASSERT_EQ(errno, EPERM, "stack_mprotect"))
 		return err;
 
 	ASSERT_EQ(skel->bss->mprotect_count, 1, "mprotect_count");
-- 
cgit 


From 80a611904eef65e8c60e0c8c8f50fa98a0bd0c69 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:32 +0100
Subject: selftests/bpf: Increase SIZEOF_BPF_LOCAL_STORAGE_ELEM on s390x

sizeof(struct bpf_local_storage_elem) is 512 on s390x:

    struct bpf_local_storage_elem {
            struct hlist_node          map_node;             /*     0    16 */
            struct hlist_node          snode;                /*    16    16 */
            struct bpf_local_storage * local_storage;        /*    32     8 */
            struct callback_head       rcu __attribute__((__aligned__(8))); /*    40    16 */

            /* XXX 200 bytes hole, try to pack */

            /* --- cacheline 1 boundary (256 bytes) --- */
            struct bpf_local_storage_data sdata __attribute__((__aligned__(256))); /*   256     8 */

            /* size: 512, cachelines: 2, members: 5 */
            /* sum members: 64, holes: 1, sum holes: 200 */
            /* padding: 248 */
            /* forced alignments: 2, forced holes: 1, sum forced holes: 200 */
    } __attribute__((__aligned__(256)));

As the existing comment suggests, use a larger number in order to be
future-proof.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-14-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/netcnt_common.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/netcnt_common.h b/tools/testing/selftests/bpf/netcnt_common.h
index 0ab1c88041cd..2d4a58e4e39c 100644
--- a/tools/testing/selftests/bpf/netcnt_common.h
+++ b/tools/testing/selftests/bpf/netcnt_common.h
@@ -8,11 +8,11 @@
 
 /* sizeof(struct bpf_local_storage_elem):
  *
- * It really is about 128 bytes on x86_64, but allocate more to account for
- * possible layout changes, different architectures, etc.
+ * It is about 128 bytes on x86_64 and 512 bytes on s390x, but allocate more to
+ * account for possible layout changes, different architectures, etc.
  * The kernel will wrap up to PAGE_SIZE internally anyway.
  */
-#define SIZEOF_BPF_LOCAL_STORAGE_ELEM		256
+#define SIZEOF_BPF_LOCAL_STORAGE_ELEM		768
 
 /* Try to estimate kernel's BPF_LOCAL_STORAGE_MAX_VALUE_SIZE: */
 #define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE	(0xFFFF - \
-- 
cgit 


From be6b5c10ecc4014446e5c807d6a69c5a7cc1c497 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:33 +0100
Subject: selftests/bpf: Add a sign-extension test for kfuncs

s390x ABI requires the caller to zero- or sign-extend the arguments.
eBPF already deals with zero-extension (by definition of its ABI), but
not with sign-extension.

Add a test to cover that potentially problematic area.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-15-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/bpf/test_run.c                                  |  9 +++++++++
 tools/testing/selftests/bpf/prog_tests/kfunc_call.c |  1 +
 tools/testing/selftests/bpf/progs/kfunc_call_test.c | 18 ++++++++++++++++++
 3 files changed, 28 insertions(+)

(limited to 'tools')

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 8da0d73b368e..7dbefa4fd2eb 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -550,6 +550,14 @@ struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
 	return sk;
 }
 
+long noinline bpf_kfunc_call_test4(signed char a, short b, int c, long d)
+{
+	/* Provoke the compiler to assume that the caller has sign-extended a,
+	 * b and c on platforms where this is required (e.g. s390x).
+	 */
+	return (long)a + (long)b + (long)c + d;
+}
+
 struct prog_test_member1 {
 	int a;
 };
@@ -746,6 +754,7 @@ BTF_SET8_START(test_sk_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test2)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test3)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test4)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE)
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
index 5af1ee8f0e6e..bb4cd82a788a 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -72,6 +72,7 @@ static struct kfunc_test_params kfunc_tests[] = {
 	/* success cases */
 	TC_TEST(kfunc_call_test1, 12),
 	TC_TEST(kfunc_call_test2, 3),
+	TC_TEST(kfunc_call_test4, -1234),
 	TC_TEST(kfunc_call_test_ref_btf_id, 0),
 	TC_TEST(kfunc_call_test_get_mem, 42),
 	SYSCALL_TEST(kfunc_syscall_test, 0),
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
index f636e50be259..d91c58d06d38 100644
--- a/tools/testing/selftests/bpf/progs/kfunc_call_test.c
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
@@ -3,6 +3,7 @@
 #include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
+extern long bpf_kfunc_call_test4(signed char a, short b, int c, long d) __ksym;
 extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym;
 extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
 				  __u32 c, __u64 d) __ksym;
@@ -17,6 +18,23 @@ extern void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym;
 extern int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) __ksym;
 extern int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym;
 
+SEC("tc")
+int kfunc_call_test4(struct __sk_buff *skb)
+{
+	struct bpf_sock *sk = skb->sk;
+	long tmp;
+
+	if (!sk)
+		return -1;
+
+	sk = bpf_sk_fullsock(sk);
+	if (!sk)
+		return -1;
+
+	tmp = bpf_kfunc_call_test4(-3, -30, -200, -1000);
+	return (tmp >> 32) + tmp;
+}
+
 SEC("tc")
 int kfunc_call_test2(struct __sk_buff *skb)
 {
-- 
cgit 


From 207612eb12b912cad40c0299fefcabf3d6cc1f3f Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:34 +0100
Subject: selftests/bpf: Fix test_lsm on s390x

Use syscall macros to access the setdomainname() arguments; currently
the code uses gprs[2] instead of orig_gpr2 for the first argument.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-16-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/lsm.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c
index d8d8af623bc2..dc93887ed34c 100644
--- a/tools/testing/selftests/bpf/progs/lsm.c
+++ b/tools/testing/selftests/bpf/progs/lsm.c
@@ -6,9 +6,10 @@
 
 #include "bpf_misc.h"
 #include "vmlinux.h"
+#include <bpf/bpf_core_read.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
-#include  <errno.h>
+#include <errno.h>
 
 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
@@ -164,8 +165,8 @@ int copy_test = 0;
 SEC("fentry.s/" SYS_PREFIX "sys_setdomainname")
 int BPF_PROG(test_sys_setdomainname, struct pt_regs *regs)
 {
-	void *ptr = (void *)PT_REGS_PARM1(regs);
-	int len = PT_REGS_PARM2(regs);
+	void *ptr = (void *)PT_REGS_PARM1_SYSCALL(regs);
+	int len = PT_REGS_PARM2_SYSCALL(regs);
 	int buf = 0;
 	long ret;
 
-- 
cgit 


From 26e8a014947948387790a029b310276ac083971b Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:35 +0100
Subject: selftests/bpf: Fix test_xdp_adjust_tail_grow2 on s390x

s390x cache line size is 256 bytes, so skb_shared_info must be aligned
on a much larger boundary than for x86.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-17-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c      | 7 ++++++-
 tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c | 8 +++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
index 39973ea1ce43..f09505f8b038 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
@@ -76,10 +76,15 @@ static void test_xdp_adjust_tail_grow2(void)
 {
 	const char *file = "./test_xdp_adjust_tail_grow.bpf.o";
 	char buf[4096]; /* avoid segfault: large buf to hold grow results */
-	int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/;
 	struct bpf_object *obj;
 	int err, cnt, i;
 	int max_grow, prog_fd;
+	/* SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) */
+#if defined(__s390x__)
+	int tailroom = 512;
+#else
+	int tailroom = 320;
+#endif
 
 	LIBBPF_OPTS(bpf_test_run_opts, tattr,
 		.repeat		= 1,
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c
index 53b64c999450..297c260fc364 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c
@@ -9,6 +9,12 @@ int _xdp_adjust_tail_grow(struct xdp_md *xdp)
 	void *data = (void *)(long)xdp->data;
 	int data_len = bpf_xdp_get_buff_len(xdp);
 	int offset = 0;
+	/* SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) */
+#if defined(__TARGET_ARCH_s390)
+	int tailroom = 512;
+#else
+	int tailroom = 320;
+#endif
 
 	/* Data length determine test case */
 
@@ -20,7 +26,7 @@ int _xdp_adjust_tail_grow(struct xdp_md *xdp)
 		offset = 128;
 	} else if (data_len == 128) {
 		/* Max tail grow 3520 */
-		offset = 4096 - 256 - 320 - data_len;
+		offset = 4096 - 256 - tailroom - data_len;
 	} else if (data_len == 9000) {
 		offset = 10;
 	} else if (data_len == 9001) {
-- 
cgit 


From d504270a233d2710cf9203b9ecec820736d0e042 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:36 +0100
Subject: selftests/bpf: Fix vmlinux test on s390x

Use a syscall macro to access the nanosleep()'s first argument;
currently the code uses gprs[2] instead of orig_gpr2.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-18-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/test_vmlinux.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c b/tools/testing/selftests/bpf/progs/test_vmlinux.c
index e9dfa0313d1b..4b8e37f7fd06 100644
--- a/tools/testing/selftests/bpf/progs/test_vmlinux.c
+++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c
@@ -42,7 +42,7 @@ int BPF_PROG(handle__raw_tp, struct pt_regs *regs, long id)
 	if (id != __NR_nanosleep)
 		return 0;
 
-	ts = (void *)PT_REGS_PARM1_CORE(regs);
+	ts = (void *)PT_REGS_PARM1_CORE_SYSCALL(regs);
 	if (bpf_probe_read_user(&tv_nsec, sizeof(ts->tv_nsec), &ts->tv_nsec) ||
 	    tv_nsec != MY_TV_NSEC)
 		return 0;
@@ -60,7 +60,7 @@ int BPF_PROG(handle__tp_btf, struct pt_regs *regs, long id)
 	if (id != __NR_nanosleep)
 		return 0;
 
-	ts = (void *)PT_REGS_PARM1_CORE(regs);
+	ts = (void *)PT_REGS_PARM1_CORE_SYSCALL(regs);
 	if (bpf_probe_read_user(&tv_nsec, sizeof(ts->tv_nsec), &ts->tv_nsec) ||
 	    tv_nsec != MY_TV_NSEC)
 		return 0;
-- 
cgit 


From 438a2edf26b7384d3c781ff98015d58135b848d5 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:38 +0100
Subject: selftests/bpf: Fix xdp_synproxy/tc on s390x

Use the correct datatype for the values map values; currently the test
works by accident, since on little-endian machines it is sometimes
acceptable to access u64 as u32.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-20-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
index 736686e903f6..07d786329105 100644
--- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
+++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
@@ -310,7 +310,7 @@ static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
 static __always_inline void values_inc_synacks(void)
 {
 	__u32 key = 1;
-	__u32 *value;
+	__u64 *value;
 
 	value = bpf_map_lookup_elem(&values, &key);
 	if (value)
-- 
cgit 


From 1b5e385325810b23e4e8d46f97d0e2e838e26802 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:39 +0100
Subject: selftests/bpf: Fix profiler on s390x

Use bpf_probe_read_kernel() and bpf_probe_read_kernel_str() instead
of bpf_probe_read() and bpf_probe_read_kernel().

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-21-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/profiler.inc.h | 62 +++++++++++++++---------
 1 file changed, 38 insertions(+), 24 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h
index 92331053dba3..68a3fd7387a4 100644
--- a/tools/testing/selftests/bpf/progs/profiler.inc.h
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -156,10 +156,10 @@ probe_read_lim(void* dst, void* src, unsigned long len, unsigned long max)
 {
 	len = len < max ? len : max;
 	if (len > 1) {
-		if (bpf_probe_read(dst, len, src))
+		if (bpf_probe_read_kernel(dst, len, src))
 			return 0;
 	} else if (len == 1) {
-		if (bpf_probe_read(dst, 1, src))
+		if (bpf_probe_read_kernel(dst, 1, src))
 			return 0;
 	}
 	return len;
@@ -216,7 +216,8 @@ static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node,
 #endif
 	for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) {
 		filepart_length =
-			bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(cgroup_node, name));
+			bpf_probe_read_kernel_str(payload, MAX_PATH,
+						  BPF_CORE_READ(cgroup_node, name));
 		if (!cgroup_node)
 			return payload;
 		if (cgroup_node == cgroup_root_node)
@@ -303,7 +304,8 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data,
 	cgroup_data->cgroup_full_length = 0;
 
 	size_t cgroup_root_length =
-		bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(root_kernfs, name));
+		bpf_probe_read_kernel_str(payload, MAX_PATH,
+					  BPF_CORE_READ(root_kernfs, name));
 	barrier_var(cgroup_root_length);
 	if (cgroup_root_length <= MAX_PATH) {
 		barrier_var(cgroup_root_length);
@@ -312,7 +314,8 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data,
 	}
 
 	size_t cgroup_proc_length =
-		bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(proc_kernfs, name));
+		bpf_probe_read_kernel_str(payload, MAX_PATH,
+					  BPF_CORE_READ(proc_kernfs, name));
 	barrier_var(cgroup_proc_length);
 	if (cgroup_proc_length <= MAX_PATH) {
 		barrier_var(cgroup_proc_length);
@@ -395,7 +398,8 @@ static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig)
 		arr_struct = bpf_map_lookup_elem(&data_heap, &zero);
 		if (arr_struct == NULL)
 			return 0;
-		bpf_probe_read(&arr_struct->array[0], sizeof(arr_struct->array[0]), kill_data);
+		bpf_probe_read_kernel(&arr_struct->array[0],
+				      sizeof(arr_struct->array[0]), kill_data);
 	} else {
 		int index = get_var_spid_index(arr_struct, spid);
 
@@ -409,8 +413,9 @@ static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig)
 #endif
 			for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
 				if (arr_struct->array[i].meta.pid == 0) {
-					bpf_probe_read(&arr_struct->array[i],
-						       sizeof(arr_struct->array[i]), kill_data);
+					bpf_probe_read_kernel(&arr_struct->array[i],
+							      sizeof(arr_struct->array[i]),
+							      kill_data);
 					bpf_map_update_elem(&var_tpid_to_data, &tpid,
 							    arr_struct, 0);
 
@@ -427,17 +432,17 @@ static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig)
 		if (delta_sec < STALE_INFO) {
 			kill_data->kill_count++;
 			kill_data->last_kill_time = bpf_ktime_get_ns();
-			bpf_probe_read(&arr_struct->array[index],
-				       sizeof(arr_struct->array[index]),
-				       kill_data);
+			bpf_probe_read_kernel(&arr_struct->array[index],
+					      sizeof(arr_struct->array[index]),
+					      kill_data);
 		} else {
 			struct var_kill_data_t* kill_data =
 				get_var_kill_data(ctx, spid, tpid, sig);
 			if (kill_data == NULL)
 				return 0;
-			bpf_probe_read(&arr_struct->array[index],
-				       sizeof(arr_struct->array[index]),
-				       kill_data);
+			bpf_probe_read_kernel(&arr_struct->array[index],
+					      sizeof(arr_struct->array[index]),
+					      kill_data);
 		}
 	}
 	bpf_map_update_elem(&var_tpid_to_data, &tpid, arr_struct, 0);
@@ -487,8 +492,9 @@ read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload)
 #pragma unroll
 #endif
 	for (int i = 0; i < MAX_PATH_DEPTH; i++) {
-		filepart_length = bpf_probe_read_str(payload, MAX_PATH,
-						     BPF_CORE_READ(filp_dentry, d_name.name));
+		filepart_length =
+			bpf_probe_read_kernel_str(payload, MAX_PATH,
+						  BPF_CORE_READ(filp_dentry, d_name.name));
 		barrier_var(filepart_length);
 		if (filepart_length > MAX_PATH)
 			break;
@@ -572,7 +578,8 @@ ssize_t BPF_KPROBE(kprobe__proc_sys_write,
 	sysctl_data->sysctl_val_length = 0;
 	sysctl_data->sysctl_path_length = 0;
 
-	size_t sysctl_val_length = bpf_probe_read_str(payload, CTL_MAXNAME, buf);
+	size_t sysctl_val_length = bpf_probe_read_kernel_str(payload,
+							     CTL_MAXNAME, buf);
 	barrier_var(sysctl_val_length);
 	if (sysctl_val_length <= CTL_MAXNAME) {
 		barrier_var(sysctl_val_length);
@@ -580,8 +587,10 @@ ssize_t BPF_KPROBE(kprobe__proc_sys_write,
 		payload += sysctl_val_length;
 	}
 
-	size_t sysctl_path_length = bpf_probe_read_str(payload, MAX_PATH,
-						       BPF_CORE_READ(filp, f_path.dentry, d_name.name));
+	size_t sysctl_path_length =
+		bpf_probe_read_kernel_str(payload, MAX_PATH,
+					  BPF_CORE_READ(filp, f_path.dentry,
+							d_name.name));
 	barrier_var(sysctl_path_length);
 	if (sysctl_path_length <= MAX_PATH) {
 		barrier_var(sysctl_path_length);
@@ -638,7 +647,8 @@ int raw_tracepoint__sched_process_exit(void* ctx)
 		struct var_kill_data_t* past_kill_data = &arr_struct->array[i];
 
 		if (past_kill_data != NULL && past_kill_data->kill_target_pid == tpid) {
-			bpf_probe_read(kill_data, sizeof(*past_kill_data), past_kill_data);
+			bpf_probe_read_kernel(kill_data, sizeof(*past_kill_data),
+					      past_kill_data);
 			void* payload = kill_data->payload;
 			size_t offset = kill_data->payload_length;
 			if (offset >= MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN)
@@ -656,8 +666,10 @@ int raw_tracepoint__sched_process_exit(void* ctx)
 				payload += comm_length;
 			}
 
-			size_t cgroup_proc_length = bpf_probe_read_str(payload, KILL_TARGET_LEN,
-								       BPF_CORE_READ(proc_kernfs, name));
+			size_t cgroup_proc_length =
+				bpf_probe_read_kernel_str(payload,
+							  KILL_TARGET_LEN,
+							  BPF_CORE_READ(proc_kernfs, name));
 			barrier_var(cgroup_proc_length);
 			if (cgroup_proc_length <= KILL_TARGET_LEN) {
 				barrier_var(cgroup_proc_length);
@@ -718,7 +730,8 @@ int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx)
 	proc_exec_data->parent_start_time = BPF_CORE_READ(parent_task, start_time);
 
 	const char* filename = BPF_CORE_READ(bprm, filename);
-	size_t bin_path_length = bpf_probe_read_str(payload, MAX_FILENAME_LEN, filename);
+	size_t bin_path_length =
+		bpf_probe_read_kernel_str(payload, MAX_FILENAME_LEN, filename);
 	barrier_var(bin_path_length);
 	if (bin_path_length <= MAX_FILENAME_LEN) {
 		barrier_var(bin_path_length);
@@ -922,7 +935,8 @@ int BPF_KPROBE(kprobe__vfs_symlink, struct inode* dir, struct dentry* dentry,
 					      filemod_data->payload);
 	payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
 
-	size_t len = bpf_probe_read_str(payload, MAX_FILEPATH_LENGTH, oldname);
+	size_t len = bpf_probe_read_kernel_str(payload, MAX_FILEPATH_LENGTH,
+					       oldname);
 	barrier_var(len);
 	if (len <= MAX_FILEPATH_LENGTH) {
 		barrier_var(len);
-- 
cgit 


From e85465e420be1e408f9465f8b6fd9e2f7b17aea1 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:40 +0100
Subject: libbpf: Simplify barrier_var()

Use a single "+r" constraint instead of the separate "=r" and "0".

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-22-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf_helpers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index d37c4fe2849d..5ec1871acb2f 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -109,7 +109,7 @@
  * This is a variable-specific variant of more global barrier().
  */
 #ifndef barrier_var
-#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))
+#define barrier_var(var) asm volatile("" : "+r"(var))
 #endif
 
 /*
-- 
cgit 


From 25c76ed428219127aba385d40756f6d5814e96e9 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:41 +0100
Subject: libbpf: Fix unbounded memory access in bpf_usdt_arg()

Loading programs that use bpf_usdt_arg() on s390x fails with:

    ; if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
    128: (79) r1 = *(u64 *)(r10 -24)      ; frame1: R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0
    129: (25) if r1 > 0xb goto pc+83      ; frame1: R1_w=scalar(umax=11,var_off=(0x0; 0xf))
    ...
    ; arg_spec = &spec->args[arg_num];
    135: (79) r1 = *(u64 *)(r10 -24)      ; frame1: R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0
    ...
    ; switch (arg_spec->arg_type) {
    139: (61) r1 = *(u32 *)(r2 +8)
    R2 unbounded memory access, make sure to bounds check any such access

The reason is that, even though the C code enforces that
arg_num < BPF_USDT_MAX_ARG_CNT, the verifier cannot propagate this
constraint to the arg_spec assignment yet. Help it by forcing r1 back
to stack after comparison.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-23-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/usdt.bpf.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h
index fdfd235e52c4..0bd4c135acc2 100644
--- a/tools/lib/bpf/usdt.bpf.h
+++ b/tools/lib/bpf/usdt.bpf.h
@@ -130,7 +130,10 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
 	if (!spec)
 		return -ESRCH;
 
-	if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
+	if (arg_num >= BPF_USDT_MAX_ARG_CNT)
+		return -ENOENT;
+	barrier_var(arg_num);
+	if (arg_num >= spec->arg_cnt)
 		return -ENOENT;
 
 	arg_spec = &spec->args[arg_num];
-- 
cgit 


From 42fae973c2b168d527c8b4f6a02828ecd1e19626 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sat, 28 Jan 2023 01:06:42 +0100
Subject: libbpf: Fix BPF_PROBE_READ{_STR}_INTO() on s390x

BPF_PROBE_READ_INTO() and BPF_PROBE_READ_STR_INTO() should map to
bpf_probe_read() and bpf_probe_read_str() respectively in order to work
correctly on architectures with !ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230128000650.1516334-24-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf_core_read.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
index 496e6a8ee0dc..1ac57bb7ac55 100644
--- a/tools/lib/bpf/bpf_core_read.h
+++ b/tools/lib/bpf/bpf_core_read.h
@@ -364,7 +364,7 @@ enum bpf_enum_value_kind {
 
 /* Non-CO-RE variant of BPF_CORE_READ_INTO() */
 #define BPF_PROBE_READ_INTO(dst, src, a, ...) ({			    \
-	___core_read(bpf_probe_read, bpf_probe_read,			    \
+	___core_read(bpf_probe_read_kernel, bpf_probe_read_kernel,	    \
 		     dst, (src), a, ##__VA_ARGS__)			    \
 })
 
@@ -400,7 +400,7 @@ enum bpf_enum_value_kind {
 
 /* Non-CO-RE variant of BPF_CORE_READ_STR_INTO() */
 #define BPF_PROBE_READ_STR_INTO(dst, src, a, ...) ({			    \
-	___core_read(bpf_probe_read_str, bpf_probe_read,		    \
+	___core_read(bpf_probe_read_kernel_str, bpf_probe_read_kernel,	    \
 		     dst, (src), a, ##__VA_ARGS__)			    \
 })
 
-- 
cgit 


From 7ce878ca81bca7811e669db4c394b86780e0dbe4 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sun, 29 Jan 2023 20:04:54 +0100
Subject: selftests/bpf: Fix sk_assign on s390x

sk_assign is failing on an s390x machine running Debian "bookworm" for
2 reasons: legacy server_map definition and uninitialized addrlen in
recvfrom() call.

Fix by adding a new-style server_map definition and dropping addrlen
(recvfrom() allows NULL values for src_addr and addrlen).

Since the test should support tc built without libbpf, build the prog
twice: with the old-style definition and with the new-style definition,
then select the right one at runtime. This could be done at compile
time too, but this would not be cross-compilation friendly.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230129190501.1624747-2-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/sk_assign.c | 25 ++++++++++++++++------
 tools/testing/selftests/bpf/progs/test_sk_assign.c | 11 ++++++++++
 .../selftests/bpf/progs/test_sk_assign_libbpf.c    |  3 +++
 3 files changed, 33 insertions(+), 6 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
index 3e190ed63976..1374b626a985 100644
--- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
+++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
@@ -29,7 +29,23 @@ static int stop, duration;
 static bool
 configure_stack(void)
 {
+	char tc_version[128];
 	char tc_cmd[BUFSIZ];
+	char *prog;
+	FILE *tc;
+
+	/* Check whether tc is built with libbpf. */
+	tc = popen("tc -V", "r");
+	if (CHECK_FAIL(!tc))
+		return false;
+	if (CHECK_FAIL(!fgets(tc_version, sizeof(tc_version), tc)))
+		return false;
+	if (strstr(tc_version, ", libbpf "))
+		prog = "test_sk_assign_libbpf.bpf.o";
+	else
+		prog = "test_sk_assign.bpf.o";
+	if (CHECK_FAIL(pclose(tc)))
+		return false;
 
 	/* Move to a new networking namespace */
 	if (CHECK_FAIL(unshare(CLONE_NEWNET)))
@@ -46,8 +62,8 @@ configure_stack(void)
 	/* Load qdisc, BPF program */
 	if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
 		return false;
-	sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf",
-		       "direct-action object-file ./test_sk_assign.bpf.o",
+	sprintf(tc_cmd, "%s %s %s %s %s", "tc filter add dev lo ingress bpf",
+		       "direct-action object-file", prog,
 		       "section tc",
 		       (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : "verbose");
 	if (CHECK(system(tc_cmd), "BPF load failed;",
@@ -129,15 +145,12 @@ get_port(int fd)
 static ssize_t
 rcv_msg(int srv_client, int type)
 {
-	struct sockaddr_storage ss;
 	char buf[BUFSIZ];
-	socklen_t slen;
 
 	if (type == SOCK_STREAM)
 		return read(srv_client, &buf, sizeof(buf));
 	else
-		return recvfrom(srv_client, &buf, sizeof(buf), 0,
-				(struct sockaddr *)&ss, &slen);
+		return recvfrom(srv_client, &buf, sizeof(buf), 0, NULL, NULL);
 }
 
 static int
diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
index 98c6493d9b91..21b19b758c4e 100644
--- a/tools/testing/selftests/bpf/progs/test_sk_assign.c
+++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
@@ -16,6 +16,16 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
+#if defined(IPROUTE2_HAVE_LIBBPF)
+/* Use a new-style map definition. */
+struct {
+	__uint(type, BPF_MAP_TYPE_SOCKMAP);
+	__type(key, int);
+	__type(value, __u64);
+	__uint(pinning, LIBBPF_PIN_BY_NAME);
+	__uint(max_entries, 1);
+} server_map SEC(".maps");
+#else
 /* Pin map under /sys/fs/bpf/tc/globals/<map name> */
 #define PIN_GLOBAL_NS 2
 
@@ -35,6 +45,7 @@ struct {
 	.max_elem = 1,
 	.pinning = PIN_GLOBAL_NS,
 };
+#endif
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c b/tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
new file mode 100644
index 000000000000..dcf46adfda04
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#define IPROUTE2_HAVE_LIBBPF
+#include "test_sk_assign.c"
-- 
cgit 


From af320fb7ddb091a401a407bb256a9abd85587624 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sun, 29 Jan 2023 20:05:00 +0100
Subject: selftests/bpf: Fix s390x vmlinux path

After commit edd4a8667355 ("s390/boot: get rid of startup archive")
there is no more compressed/ subdirectory.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230129190501.1624747-8-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/vmtest.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
index 316a56d680f2..685034528018 100755
--- a/tools/testing/selftests/bpf/vmtest.sh
+++ b/tools/testing/selftests/bpf/vmtest.sh
@@ -13,7 +13,7 @@ s390x)
 	QEMU_BINARY=qemu-system-s390x
 	QEMU_CONSOLE="ttyS1"
 	QEMU_FLAGS=(-smp 2)
-	BZIMAGE="arch/s390/boot/compressed/vmlinux"
+	BZIMAGE="arch/s390/boot/vmlinux"
 	;;
 x86_64)
 	QEMU_BINARY=qemu-system-x86_64
-- 
cgit 


From ee105d5a50d4075be81b2d924bfc630e0d7cdb90 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Sun, 29 Jan 2023 20:05:01 +0100
Subject: selftests/bpf: Trim DENYLIST.s390x

Now that trampoline is implemented, enable a number of tests on s390x.
18 of the remaining failures have to do with either lack of rethook
(fixed by [1]) or syscall symbols missing from BTF (fixed by [2]).

Do not re-classify the remaining failures for now; wait until the
s390/for-next fixes are merged and re-classify only the remaining few.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=for-next&id=1a280f48c0e403903cf0b4231c95b948e664f25a
[2] https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=for-next&id=2213d44e140f979f4b60c3c0f8dd56d151cc8692

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230129190501.1624747-9-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x | 69 ------------------------------
 1 file changed, 69 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 50924611e5bb..b89eb87034e4 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -1,93 +1,24 @@
 # TEMPORARY
 # Alphabetical order
-atomics                                  # attach(add): actual -524 <= expected 0                                      (trampoline)
 bloom_filter_map                         # failed to find kernel BTF type ID of '__x64_sys_getpgid': -3                (?)
 bpf_cookie                               # failed to open_and_load program: -524 (trampoline)
-bpf_iter_setsockopt                      # JIT does not support calling kernel function                                (kfunc)
 bpf_loop                                 # attaches to __x64_sys_nanosleep
-bpf_mod_race                             # BPF trampoline
-bpf_nf                                   # JIT does not support calling kernel function
-bpf_tcp_ca                               # JIT does not support calling kernel function                                (kfunc)
-cb_refs                                  # expected error message unexpected error: -524                               (trampoline)
-cgroup_hierarchical_stats                # JIT does not support calling kernel function                                (kfunc)
-cgrp_kfunc                               # JIT does not support calling kernel function
 cgrp_local_storage                       # prog_attach unexpected error: -524                                          (trampoline)
-core_read_macros                         # unknown func bpf_probe_read#4                                               (overlapping)
-cpumask                                  # JIT does not support calling kernel function
-d_path                                   # failed to auto-attach program 'prog_stat': -524                             (trampoline)
-decap_sanity                             # JIT does not support calling kernel function                                (kfunc)
-deny_namespace                           # failed to attach: ERROR: strerror_r(-524)=22                                (trampoline)
-dummy_st_ops                             # test_run unexpected error: -524 (errno 524)                                 (trampoline)
-fentry_fexit                             # fentry attach failed: -524                                                  (trampoline)
-fentry_test                              # fentry_first_attach unexpected error: -524                                  (trampoline)
-fexit_bpf2bpf                            # freplace_attach_trace unexpected error: -524                                (trampoline)
 fexit_sleep                              # fexit_skel_load fexit skeleton failed                                       (trampoline)
-fexit_stress                             # fexit attach failed prog 0 failed: -524                                     (trampoline)
-fexit_test                               # fexit_first_attach unexpected error: -524                                   (trampoline)
-get_func_args_test	                 # trampoline
-get_func_ip_test                         # get_func_ip_test__attach unexpected error: -524                             (trampoline)
 get_stack_raw_tp                         # user_stack corrupted user stack                                             (no backchain userspace)
-htab_update                              # failed to attach: ERROR: strerror_r(-524)=22                                (trampoline)
-jit_probe_mem                            # jit_probe_mem__open_and_load unexpected error: -524                         (kfunc)
-kfree_skb                                # attach fentry unexpected error: -524                                        (trampoline)
-kfunc_call                               # 'bpf_prog_active': not found in kernel BTF                                  (?)
-kfunc_dynptr_param                       # JIT does not support calling kernel function                                (kfunc)
 kprobe_multi_bench_attach                # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test                        # relies on fentry
 ksyms_module                             # test_ksyms_module__open_and_load unexpected error: -9                       (?)
 ksyms_module_libbpf                      # JIT does not support calling kernel function                                (kfunc)
 ksyms_module_lskel                       # test_ksyms_module_lskel__open_and_load unexpected error: -9                 (?)
-libbpf_get_fd_by_id_opts                 # failed to attach: ERROR: strerror_r(-524)=22                                (trampoline)
-linked_list				 # JIT does not support calling kernel function                                (kfunc)
-lookup_key                               # JIT does not support calling kernel function                                (kfunc)
-lru_bug                                  # prog 'printk': failed to auto-attach: -524
-map_kptr                                 # failed to open_and_load program: -524 (trampoline)
-modify_return                            # modify_return attach failed: -524                                           (trampoline)
 module_attach                            # skel_attach skeleton attach failed: -524                                    (trampoline)
-mptcp
-nested_trust                             # JIT does not support calling kernel function
-netcnt                                   # failed to load BPF skeleton 'netcnt_prog': -7                               (?)
-probe_user                               # check_kprobe_res wrong kprobe res from probe read                           (?)
-rcu_read_lock                            # failed to find kernel BTF type ID of '__x64_sys_getpgid': -3                (?)
-recursion                                # skel_attach unexpected error: -524                                          (trampoline)
 ringbuf                                  # skel_load skeleton load failed                                              (?)
-select_reuseport                         # intermittently fails on new s390x setup
-send_signal                              # intermittently fails to receive signal
-setget_sockopt                           # attach unexpected error: -524                                               (trampoline)
-sk_assign                                # Can't read on server: Invalid argument                                      (?)
-sk_lookup                                # endianness problem
-sk_storage_tracing                       # test_sk_storage_tracing__attach unexpected error: -524                      (trampoline)
-skc_to_unix_sock                         # could not attach BPF object unexpected error: -524                          (trampoline)
-socket_cookie                            # prog_attach unexpected error: -524                                          (trampoline)
 stacktrace_build_id                      # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2                   (?)
-tailcalls                                # tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls      (?)
-task_kfunc                               # JIT does not support calling kernel function
-task_local_storage                       # failed to auto-attach program 'trace_exit_creds': -524                      (trampoline)
-test_bpffs                               # bpffs test  failed 255                                                      (iterator)
-test_bprm_opts                           # failed to auto-attach program 'secure_exec': -524                           (trampoline)
-test_ima                                 # failed to auto-attach program 'ima': -524                                   (trampoline)
-test_local_storage                       # failed to auto-attach program 'unlink_hook': -524                           (trampoline)
 test_lsm                                 # attach unexpected error: -524                                               (trampoline)
-test_overhead                            # attach_fentry unexpected error: -524                                        (trampoline)
-test_profiler                            # unknown func bpf_probe_read_str#45                                          (overlapping)
-timer                                    # failed to auto-attach program 'test1': -524                                 (trampoline)
-timer_crash                              # trampoline
-timer_mim                                # failed to auto-attach program 'test1': -524                                 (trampoline)
-trace_ext                                # failed to auto-attach program 'test_pkt_md_access_new': -524                (trampoline)
 trace_printk                             # trace_printk__load unexpected error: -2 (errno 2)                           (?)
 trace_vprintk                            # trace_vprintk__open_and_load unexpected error: -9                           (?)
-tracing_struct                           # failed to auto-attach: -524                                                 (trampoline)
-trampoline_count                         # prog 'prog1': failed to attach: ERROR: strerror_r(-524)=22                  (trampoline)
-type_cast                                # JIT does not support calling kernel function
 unpriv_bpf_disabled                      # fentry
 user_ringbuf                             # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3                (?)
 verif_stats                              # trace_vprintk__open_and_load unexpected error: -9                           (?)
-verify_pkcs7_sig                         # JIT does not support calling kernel function                                (kfunc)
-vmlinux                                  # failed to auto-attach program 'handle__fentry': -524                        (trampoline)
-xdp_adjust_tail                          # case-128 err 0 errno 28 retval 1 size 128 expect-size 3520                  (?)
 xdp_bonding                              # failed to auto-attach program 'trace_on_entry': -524                        (trampoline)
-xdp_bpf2bpf                              # failed to auto-attach program 'trace_on_entry': -524                        (trampoline)
-xdp_do_redirect                          # prog_run_max_size unexpected error: -22 (errno 22)
 xdp_metadata                             # JIT does not support calling kernel function                                (kfunc)
-xdp_synproxy                             # JIT does not support calling kernel function                                (kfunc)
-xfrm_info                                # JIT does not support calling kernel function                                (kfunc)
-- 
cgit 


From 68efe8f7a1c5168be2228bfb806ddc05475b7205 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 6 Jan 2023 19:24:19 +0000
Subject: KVM: selftests: Fix build of rseq test

The KVM rseq test is failing to build in -next due to a commit merged
from the tip tree which adds a wrapper for sys_getcpu() to the rseq
kselftests, conflicting with the wrapper already included in the KVM
selftest:

rseq_test.c:48:13: error: conflicting types for 'sys_getcpu'
   48 | static void sys_getcpu(unsigned *cpu)
          |             ^~~~~~~~~~
In file included from rseq_test.c:23:
../rseq/rseq.c:82:12: note: previous definition of 'sys_getcpu' was here
   82 | static int sys_getcpu(unsigned *cpu, unsigned *node)
          |            ^~~~~~~~~~

Fix this by removing the local wrapper and moving the result check up to
the caller.

Fixes: 99babd04b250 ("selftests/rseq: Implement rseq numa node id field selftest")
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/r/20230106-fix-kvm-rseq-build-v1-1-b704d9831d02@kernel.org
---
 tools/testing/selftests/kvm/rseq_test.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c
index 3045fdf9bdf5..f74e76d03b7e 100644
--- a/tools/testing/selftests/kvm/rseq_test.c
+++ b/tools/testing/selftests/kvm/rseq_test.c
@@ -41,18 +41,6 @@ static void guest_code(void)
 		GUEST_SYNC(0);
 }
 
-/*
- * We have to perform direct system call for getcpu() because it's
- * not available until glic 2.29.
- */
-static void sys_getcpu(unsigned *cpu)
-{
-	int r;
-
-	r = syscall(__NR_getcpu, cpu, NULL, NULL);
-	TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)", errno, strerror(errno));
-}
-
 static int next_cpu(int cpu)
 {
 	/*
@@ -249,7 +237,9 @@ int main(int argc, char *argv[])
 			 * across the seq_cnt reads.
 			 */
 			smp_rmb();
-			sys_getcpu(&cpu);
+			r = sys_getcpu(&cpu, NULL);
+			TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)",
+				    errno, strerror(errno));
 			rseq_cpu = rseq_current_cpu_raw();
 			smp_rmb();
 		} while (snapshot != atomic_read(&seq_cnt));
-- 
cgit 


From be6c50d315f92071e481eacd1c0260cb4f7c325f Mon Sep 17 00:00:00 2001
From: Michael Schmitz <schmitzmic@gmail.com>
Date: Thu, 12 Jan 2023 16:55:29 +1300
Subject: selftests/seccomp: Add m68k support

Add m68k seccomp definitions to seccomp_bpf self test code.

Tested on ARAnyM.

Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230112035529.13521-4-schmitzmic@gmail.com
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 9c2f448bb3a9..61386e499b77 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -138,6 +138,8 @@ struct seccomp_data {
 #  define __NR_seccomp 337
 # elif defined(__sh__)
 #  define __NR_seccomp 372
+# elif defined(__mc68000__)
+#  define __NR_seccomp 380
 # else
 #  warning "seccomp syscall number unknown for this architecture"
 #  define __NR_seccomp 0xffff
@@ -1838,6 +1840,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
 # define ARCH_REGS		struct pt_regs
 # define SYSCALL_NUM(_regs)	(_regs).regs[3]
 # define SYSCALL_RET(_regs)	(_regs).regs[0]
+#elif defined(__mc68000__)
+# define ARCH_REGS		struct user_regs_struct
+# define SYSCALL_NUM(_regs)	(_regs).orig_d0
+# define SYSCALL_RET(_regs)	(_regs).d0
 #else
 # error "Do not know how to find your architecture's registers and syscalls"
 #endif
@@ -1902,7 +1908,7 @@ const bool ptrace_entry_set_syscall_ret =
  * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
  * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
  */
-#if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
+#if defined(__x86_64__) || defined(__i386__) || defined(__mips__) || defined(__mc68000__)
 # define ARCH_GETREGS(_regs)	ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
 # define ARCH_SETREGS(_regs)	ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
 #else
-- 
cgit 


From 7482c19173b7eb044d476b3444d7ee55bc669d03 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:22 -0500
Subject: selftests: arm64: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/arm64/fp/Makefile   | 2 +-
 tools/testing/selftests/arm64/tags/Makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile
index 36db61358ed5..932ec8792316 100644
--- a/tools/testing/selftests/arm64/fp/Makefile
+++ b/tools/testing/selftests/arm64/fp/Makefile
@@ -3,7 +3,7 @@
 # A proper top_srcdir is needed by KSFT(lib.mk)
 top_srcdir = $(realpath ../../../../../)
 
-CFLAGS += -I$(top_srcdir)/usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS := fp-stress \
 	sve-ptrace sve-probe-vls \
diff --git a/tools/testing/selftests/arm64/tags/Makefile b/tools/testing/selftests/arm64/tags/Makefile
index 41cb75070511..6d29cfde43a2 100644
--- a/tools/testing/selftests/arm64/tags/Makefile
+++ b/tools/testing/selftests/arm64/tags/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-CFLAGS += -I../../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 TEST_GEN_PROGS := tags_test
 TEST_PROGS := run_tags_test.sh
 
-- 
cgit 


From 612cf4d283414a5ee2733db6608d917deb45fa46 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:24 -0500
Subject: selftests: clone3: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/clone3/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile
index 79b19a2863a0..84832c369a2e 100644
--- a/tools/testing/selftests/clone3/Makefile
+++ b/tools/testing/selftests/clone3/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-CFLAGS += -g -std=gnu99 -I../../../../usr/include/
+CFLAGS += -g -std=gnu99 $(KHDR_INCLUDES)
 LDLIBS += -lcap
 
 TEST_GEN_PROGS := clone3 clone3_clear_sighand clone3_set_tid \
-- 
cgit 


From 145df2fdc38f24b3e52e4c2a59b02d874a074fbd Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:25 -0500
Subject: selftests: core: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org> # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/core/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/core/Makefile b/tools/testing/selftests/core/Makefile
index f6f2d6f473c6..ce262d097269 100644
--- a/tools/testing/selftests/core/Makefile
+++ b/tools/testing/selftests/core/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -g -I../../../../usr/include/
+CFLAGS += -g $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS := close_range_test
 
-- 
cgit 


From bdb8bf7d56afd1d22c12c61455d732d3baff2bde Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Jan 2023 11:06:04 -0800
Subject: objtool: Install libsubcmd in build

Including from tools/lib can create inadvertent dependencies. Install
libsubcmd in the objtool build and then include the headers from
there.

Signed-off-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20230126190606.40739-2-irogers@google.com
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/.gitignore |  1 +
 tools/objtool/Build      |  2 --
 tools/objtool/Makefile   | 31 +++++++++++++++++++++++--------
 3 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore
index 14236db3677f..4faa4dd72f35 100644
--- a/tools/objtool/.gitignore
+++ b/tools/objtool/.gitignore
@@ -2,3 +2,4 @@
 arch/x86/lib/inat-tables.c
 /objtool
 fixdep
+libsubcmd/
diff --git a/tools/objtool/Build b/tools/objtool/Build
index 33f2ee5a46d3..a3cdf8af6635 100644
--- a/tools/objtool/Build
+++ b/tools/objtool/Build
@@ -16,8 +16,6 @@ objtool-y += libctype.o
 objtool-y += str_error_r.o
 objtool-y += librbtree.o
 
-CFLAGS += -I$(srctree)/tools/lib
-
 $(OUTPUT)libstring.o: ../lib/string.c FORCE
 	$(call rule_mkdir)
 	$(call if_changed_dep,cc_o_c)
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index a3a9cc24e0e3..3505ae4b0e36 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -12,9 +12,13 @@ srctree := $(patsubst %/,%,$(dir $(CURDIR)))
 srctree := $(patsubst %/,%,$(dir $(srctree)))
 endif
 
-SUBCMD_SRCDIR		= $(srctree)/tools/lib/subcmd/
-LIBSUBCMD_OUTPUT	= $(or $(OUTPUT),$(CURDIR)/)
-LIBSUBCMD		= $(LIBSUBCMD_OUTPUT)libsubcmd.a
+LIBSUBCMD_DIR = $(srctree)/tools/lib/subcmd/
+ifneq ($(OUTPUT),)
+  LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd
+else
+  LIBSUBCMD_OUTPUT = $(CURDIR)/libsubcmd
+endif
+LIBSUBCMD = $(LIBSUBCMD_OUTPUT)/libsubcmd.a
 
 OBJTOOL    := $(OUTPUT)objtool
 OBJTOOL_IN := $(OBJTOOL)-in.o
@@ -28,7 +32,8 @@ INCLUDES := -I$(srctree)/tools/include \
 	    -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \
 	    -I$(srctree)/tools/arch/$(SRCARCH)/include	\
 	    -I$(srctree)/tools/objtool/include \
-	    -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include
+	    -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include \
+	    -I$(LIBSUBCMD_OUTPUT)/include
 WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs
 CFLAGS   := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS)
 LDFLAGS  += $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS)
@@ -38,6 +43,7 @@ elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(CC) $(CFLAGS) -x c -E -
 CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED)
 
 AWK = awk
+MKDIR = mkdir
 
 BUILD_ORC := n
 
@@ -57,13 +63,22 @@ $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN)
 	$(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@
 
 
-$(LIBSUBCMD): fixdep FORCE
-	$(Q)$(MAKE) -C $(SUBCMD_SRCDIR) OUTPUT=$(LIBSUBCMD_OUTPUT)
+$(LIBSUBCMD_OUTPUT):
+	@$(MKDIR) -p $@
+
+$(LIBSUBCMD): fixdep $(LIBSUBCMD_OUTPUT) FORCE
+	@$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \
+		DESTDIR=$(LIBSUBCMD_OUTPUT) prefix= subdir= \
+		$@ install_headers
+
+$(LIBSUBCMD)-clean:
+	$(call QUIET_CLEAN, libsubcmd)
+	$(Q)$(RM) -r -- $(LIBSUBCMD_OUTPUT)
 
-clean:
+clean: $(LIBSUBCMD)-clean
 	$(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL)
 	$(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
-	$(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep $(LIBSUBCMD)
+	$(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep
 
 FORCE:
 
-- 
cgit 


From 8c4526ca6a45e7ff915c2b33b54db6b773291fac Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Jan 2023 11:06:05 -0800
Subject: objtool: Properly support make V=1

The Q variable was being used but never correctly set up. Add the
setting up and use in place of @.

Signed-off-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20230126190606.40739-3-irogers@google.com
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/Makefile | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 3505ae4b0e36..d54b66986627 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -45,6 +45,12 @@ CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED)
 AWK = awk
 MKDIR = mkdir
 
+ifeq ($(V),1)
+  Q =
+else
+  Q = @
+endif
+
 BUILD_ORC := n
 
 ifeq ($(SRCARCH),x86)
@@ -56,18 +62,18 @@ export srctree OUTPUT CFLAGS SRCARCH AWK
 include $(srctree)/tools/build/Makefile.include
 
 $(OBJTOOL_IN): fixdep FORCE
-	@$(CONFIG_SHELL) ./sync-check.sh
-	@$(MAKE) $(build)=objtool
+	$(Q)$(CONFIG_SHELL) ./sync-check.sh
+	$(Q)$(MAKE) $(build)=objtool
 
 $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN)
 	$(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@
 
 
 $(LIBSUBCMD_OUTPUT):
-	@$(MKDIR) -p $@
+	$(Q)$(MKDIR) -p $@
 
 $(LIBSUBCMD): fixdep $(LIBSUBCMD_OUTPUT) FORCE
-	@$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \
+	$(Q)$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \
 		DESTDIR=$(LIBSUBCMD_OUTPUT) prefix= subdir= \
 		$@ install_headers
 
-- 
cgit 


From 89ff30b9b72079a1c677500218bdc511eac246d4 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 11 Jan 2023 15:02:41 +0000
Subject: kselftest/arm64: Limit the maximum VL we try to set via ptrace

When SVE was initially merged we chose to export the maximum VQ in the ABI
as being 512, rather more than the architecturally supported maximum of 16.
For the ptrace tests this results in us generating a lot of test cases and
hence log output which are redundant since a system couldn't possibly
support them. Instead only check values up to the current architectural
limit, plus one more so that we're covering the constraining of higher
vector lengths.

This makes no practical difference to our test coverage, speeds things up
on slower consoles and makes the output much more managable.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230111-arm64-kselftest-ptrace-max-vl-v1-1-8167f41d1ad8@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/sve-ptrace.c | 14 ++++++++++++--
 tools/testing/selftests/arm64/fp/za-ptrace.c  | 14 ++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index 8c4847977583..6d61992fe8a0 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -30,6 +30,16 @@
 #define NT_ARM_SSVE 0x40b
 #endif
 
+/*
+ * The architecture defines the maximum VQ as 16 but for extensibility
+ * the kernel specifies the SVE_VQ_MAX as 512 resulting in us running
+ * a *lot* more tests than are useful if we use it.  Until the
+ * architecture is extended let's limit our coverage to what is
+ * currently allowed, plus one extra to ensure we cover constraining
+ * the VL as expected.
+ */
+#define TEST_VQ_MAX 17
+
 struct vec_type {
 	const char *name;
 	unsigned long hwcap_type;
@@ -55,7 +65,7 @@ static const struct vec_type vec_types[] = {
 	},
 };
 
-#define VL_TESTS (((SVE_VQ_MAX - SVE_VQ_MIN) + 1) * 4)
+#define VL_TESTS (((TEST_VQ_MAX - SVE_VQ_MIN) + 1) * 4)
 #define FLAG_TESTS 2
 #define FPSIMD_TESTS 2
 
@@ -689,7 +699,7 @@ static int do_parent(pid_t child)
 		}
 
 		/* Step through every possible VQ */
-		for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
+		for (vq = SVE_VQ_MIN; vq <= TEST_VQ_MAX; vq++) {
 			vl = sve_vl_from_vq(vq);
 
 			/* First, try to set this vector length */
diff --git a/tools/testing/selftests/arm64/fp/za-ptrace.c b/tools/testing/selftests/arm64/fp/za-ptrace.c
index bf6158654056..ac27d87396fc 100644
--- a/tools/testing/selftests/arm64/fp/za-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/za-ptrace.c
@@ -25,7 +25,17 @@
 #define NT_ARM_ZA 0x40c
 #endif
 
-#define EXPECTED_TESTS (((SVE_VQ_MAX - SVE_VQ_MIN) + 1) * 3)
+/*
+ * The architecture defines the maximum VQ as 16 but for extensibility
+ * the kernel specifies the SVE_VQ_MAX as 512 resulting in us running
+ * a *lot* more tests than are useful if we use it.  Until the
+ * architecture is extended let's limit our coverage to what is
+ * currently allowed, plus one extra to ensure we cover constraining
+ * the VL as expected.
+ */
+#define TEST_VQ_MAX 17
+
+#define EXPECTED_TESTS (((TEST_VQ_MAX - SVE_VQ_MIN) + 1) * 3)
 
 static void fill_buf(char *buf, size_t size)
 {
@@ -301,7 +311,7 @@ static int do_parent(pid_t child)
 	ksft_print_msg("Parent is %d, child is %d\n", getpid(), child);
 
 	/* Step through every possible VQ */
-	for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
+	for (vq = SVE_VQ_MIN; vq <= TEST_VQ_MAX; vq++) {
 		vl = sve_vl_from_vq(vq);
 
 		/* First, try to set this vector length */
-- 
cgit 


From b2ab432bcf65e6fa3ec3fef6dd08796404b009d0 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 30 Jan 2023 23:45:57 +0000
Subject: kselftest/arm64: Remove redundant _start labels from zt-test

The newly added zt-test program copied the pattern from the other FP
stress test programs of having a redundant _start label which is
rejected by clang, as we did in a parallel series for the other tests
remove the label so we can build with clang.

No functional change.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230130-arm64-fix-sme2-clang-v1-1-3ce81d99ea8f@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/zt-test.S | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S
index 7ec90976cf5e..d63286397638 100644
--- a/tools/testing/selftests/arm64/fp/zt-test.S
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -200,7 +200,6 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
-_start:
 	mov	x23, #0		// signal count
 
 	mov	w0, #SIGINT
-- 
cgit 


From a37380ef8b54b18717e908e173179bf78c143ed2 Mon Sep 17 00:00:00 2001
From: zhang songyi <zhang.songyi@zte.com.cn>
Date: Mon, 19 Dec 2022 14:31:05 +0800
Subject: tools/rv: Remove unneeded semicolon

The semicolon after the "}" is unneeded.

Link: https://lore.kernel.org/linux-trace-devel/202212191431057948891@zte.com.cn

Signed-off-by: zhang songyi <zhang.songyi@zte.com.cn>
Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/verification/rv/src/in_kernel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/verification/rv/src/in_kernel.c b/tools/verification/rv/src/in_kernel.c
index 50848d79b38b..ad28582bcf2b 100644
--- a/tools/verification/rv/src/in_kernel.c
+++ b/tools/verification/rv/src/in_kernel.c
@@ -519,7 +519,7 @@ static void ikm_usage_print_reactors(void)
 
 		start = ++end;
 		end = strstr(start, "\n");
-	};
+	}
 
 	fprintf(stderr, "\n");
 }
-- 
cgit 


From fe137a4fe0e77eb95396cfc5c3dd7df404421aa4 Mon Sep 17 00:00:00 2001
From: Andreas Ziegler <br015@umbiko.net>
Date: Tue, 3 Jan 2023 11:33:59 +0100
Subject: tools/tracing/rtla: osnoise_hist: use total duration for average
 calculation

Sampled durations must be weighted by observed quantity, to arrive at a correct
average duration value.

Perform calculation of total duration by summing (duration * count).

Link: https://lkml.kernel.org/r/20230103103400.275566-2-br015@umbiko.net

Fixes: 829a6c0b5698 ("rtla/osnoise: Add the hist mode")

Signed-off-by: Andreas Ziegler <br015@umbiko.net>
Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/tracing/rtla/src/osnoise_hist.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 5d7ea479ac89..fe34452fc4ec 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -121,6 +121,7 @@ static void osnoise_hist_update_multiple(struct osnoise_tool *tool, int cpu,
 {
 	struct osnoise_hist_params *params = tool->params;
 	struct osnoise_hist_data *data = tool->data;
+	unsigned long long total_duration;
 	int entries = data->entries;
 	int bucket;
 	int *hist;
@@ -131,10 +132,12 @@ static void osnoise_hist_update_multiple(struct osnoise_tool *tool, int cpu,
 	if (data->bucket_size)
 		bucket = duration / data->bucket_size;
 
+	total_duration = duration * count;
+
 	hist = data->hist[cpu].samples;
 	data->hist[cpu].count += count;
 	update_min(&data->hist[cpu].min_sample, &duration);
-	update_sum(&data->hist[cpu].sum_sample, &duration);
+	update_sum(&data->hist[cpu].sum_sample, &total_duration);
 	update_max(&data->hist[cpu].max_sample, &duration);
 
 	if (bucket < entries)
-- 
cgit 


From 1fab1469b66baf7847298b205e5c4aff47c2ae8a Mon Sep 17 00:00:00 2001
From: Andreas Ziegler <br015@umbiko.net>
Date: Tue, 3 Jan 2023 11:34:00 +0100
Subject: tools/tracing/rtla: osnoise_hist: display average with two-digit
 precision

Calculate average value in osnoise-hist summary with two-digit
precision to avoid displaying too optimitic results.

Link: https://lkml.kernel.org/r/20230103103400.275566-3-br015@umbiko.net

Signed-off-by: Andreas Ziegler <br015@umbiko.net>
Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/tracing/rtla/src/osnoise_hist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index fe34452fc4ec..13e1233690bb 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -335,8 +335,8 @@ osnoise_print_summary(struct osnoise_hist_params *params,
 			continue;
 
 		if (data->hist[cpu].count)
-			trace_seq_printf(trace->seq, "%9llu ",
-					data->hist[cpu].sum_sample / data->hist[cpu].count);
+			trace_seq_printf(trace->seq, "%9.2f ",
+				((double) data->hist[cpu].sum_sample) / data->hist[cpu].count);
 		else
 			trace_seq_printf(trace->seq, "        - ");
 	}
-- 
cgit 


From eaf317e7d2bbb04486c9842aea9be1e94bd416ed Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:41 -0800
Subject: tools: ynl-gen: prevent do / dump reordering

An earlier fix tried to address generated code jumping around
one code-gen run to another. Turns out dict()s are already
ordered since Python 3.7, the problem is that we iterate over
operation modes using a set(). Sets are unordered in Python.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/ynl-gen-c.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index 1aa872e582ab..e5002d420961 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -933,7 +933,7 @@ class Family:
             if attr_set_name != op['attribute-set']:
                 raise Exception('For a global policy all ops must use the same set')
 
-            for op_mode in {'do', 'dump'}:
+            for op_mode in ['do', 'dump']:
                 if op_mode in op:
                     global_set.update(op[op_mode].get('request', []))
 
@@ -2244,7 +2244,7 @@ def main():
 
             for op_name, op in parsed.ops.items():
                 if parsed.kernel_policy in {'per-op', 'split'}:
-                    for op_mode in {'do', 'dump'}:
+                    for op_mode in ['do', 'dump']:
                         if op_mode in op and 'request' in op[op_mode]:
                             cw.p(f"/* {op.enum_name} - {op_mode} */")
                             ri = RenderInfo(cw, parsed, args.mode, op, op_name, op_mode)
-- 
cgit 


From 4e4480e89c47b52b3f4fbc1ddf07a7ce541f0839 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:42 -0800
Subject: tools: ynl: move the cli and netlink code around

Move the CLI code out of samples/ and the library part
of it into tools/net/ynl/lib/. This way we can start
sharing some code with the code gen.

Initially I thought that code gen is too C-specific to
share anything but basic stuff like calculating values
for enums can easily be shared.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/cli.py          |  47 ++++
 tools/net/ynl/lib/__init__.py |   5 +
 tools/net/ynl/lib/ynl.py      | 534 ++++++++++++++++++++++++++++++++++++++++++
 tools/net/ynl/samples/cli.py  |  47 ----
 tools/net/ynl/samples/ynl.py  | 534 ------------------------------------------
 5 files changed, 586 insertions(+), 581 deletions(-)
 create mode 100755 tools/net/ynl/cli.py
 create mode 100644 tools/net/ynl/lib/__init__.py
 create mode 100644 tools/net/ynl/lib/ynl.py
 delete mode 100755 tools/net/ynl/samples/cli.py
 delete mode 100644 tools/net/ynl/samples/ynl.py

(limited to 'tools')

diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py
new file mode 100755
index 000000000000..5c4eb5a68514
--- /dev/null
+++ b/tools/net/ynl/cli.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: BSD-3-Clause
+
+import argparse
+import json
+import pprint
+import time
+
+from lib import YnlFamily
+
+
+def main():
+    parser = argparse.ArgumentParser(description='YNL CLI sample')
+    parser.add_argument('--spec', dest='spec', type=str, required=True)
+    parser.add_argument('--schema', dest='schema', type=str)
+    parser.add_argument('--json', dest='json_text', type=str)
+    parser.add_argument('--do', dest='do', type=str)
+    parser.add_argument('--dump', dest='dump', type=str)
+    parser.add_argument('--sleep', dest='sleep', type=int)
+    parser.add_argument('--subscribe', dest='ntf', type=str)
+    args = parser.parse_args()
+
+    attrs = {}
+    if args.json_text:
+        attrs = json.loads(args.json_text)
+
+    ynl = YnlFamily(args.spec, args.schema)
+
+    if args.ntf:
+        ynl.ntf_subscribe(args.ntf)
+
+    if args.sleep:
+        time.sleep(args.sleep)
+
+    if args.do or args.dump:
+        method = getattr(ynl, args.do if args.do else args.dump)
+
+        reply = method(attrs, dump=bool(args.dump))
+        pprint.PrettyPrinter().pprint(reply)
+
+    if args.ntf:
+        ynl.check_ntf()
+        pprint.PrettyPrinter().pprint(ynl.async_msg_queue)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/net/ynl/lib/__init__.py b/tools/net/ynl/lib/__init__.py
new file mode 100644
index 000000000000..0a6102758ebe
--- /dev/null
+++ b/tools/net/ynl/lib/__init__.py
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+from .ynl import YnlFamily
+
+__all__ = ["YnlFamily"]
diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
new file mode 100644
index 000000000000..b71523d71d46
--- /dev/null
+++ b/tools/net/ynl/lib/ynl.py
@@ -0,0 +1,534 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+import functools
+import jsonschema
+import os
+import random
+import socket
+import struct
+import yaml
+
+#
+# Generic Netlink code which should really be in some library, but I can't quickly find one.
+#
+
+
+class Netlink:
+    # Netlink socket
+    SOL_NETLINK = 270
+
+    NETLINK_ADD_MEMBERSHIP = 1
+    NETLINK_CAP_ACK = 10
+    NETLINK_EXT_ACK = 11
+
+    # Netlink message
+    NLMSG_ERROR = 2
+    NLMSG_DONE = 3
+
+    NLM_F_REQUEST = 1
+    NLM_F_ACK = 4
+    NLM_F_ROOT = 0x100
+    NLM_F_MATCH = 0x200
+    NLM_F_APPEND = 0x800
+
+    NLM_F_CAPPED = 0x100
+    NLM_F_ACK_TLVS = 0x200
+
+    NLM_F_DUMP = NLM_F_ROOT | NLM_F_MATCH
+
+    NLA_F_NESTED = 0x8000
+    NLA_F_NET_BYTEORDER = 0x4000
+
+    NLA_TYPE_MASK = NLA_F_NESTED | NLA_F_NET_BYTEORDER
+
+    # Genetlink defines
+    NETLINK_GENERIC = 16
+
+    GENL_ID_CTRL = 0x10
+
+    # nlctrl
+    CTRL_CMD_GETFAMILY = 3
+
+    CTRL_ATTR_FAMILY_ID = 1
+    CTRL_ATTR_FAMILY_NAME = 2
+    CTRL_ATTR_MAXATTR = 5
+    CTRL_ATTR_MCAST_GROUPS = 7
+
+    CTRL_ATTR_MCAST_GRP_NAME = 1
+    CTRL_ATTR_MCAST_GRP_ID = 2
+
+    # Extack types
+    NLMSGERR_ATTR_MSG = 1
+    NLMSGERR_ATTR_OFFS = 2
+    NLMSGERR_ATTR_COOKIE = 3
+    NLMSGERR_ATTR_POLICY = 4
+    NLMSGERR_ATTR_MISS_TYPE = 5
+    NLMSGERR_ATTR_MISS_NEST = 6
+
+
+class NlAttr:
+    def __init__(self, raw, offset):
+        self._len, self._type = struct.unpack("HH", raw[offset:offset + 4])
+        self.type = self._type & ~Netlink.NLA_TYPE_MASK
+        self.payload_len = self._len
+        self.full_len = (self.payload_len + 3) & ~3
+        self.raw = raw[offset + 4:offset + self.payload_len]
+
+    def as_u16(self):
+        return struct.unpack("H", self.raw)[0]
+
+    def as_u32(self):
+        return struct.unpack("I", self.raw)[0]
+
+    def as_u64(self):
+        return struct.unpack("Q", self.raw)[0]
+
+    def as_strz(self):
+        return self.raw.decode('ascii')[:-1]
+
+    def as_bin(self):
+        return self.raw
+
+    def __repr__(self):
+        return f"[type:{self.type} len:{self._len}] {self.raw}"
+
+
+class NlAttrs:
+    def __init__(self, msg):
+        self.attrs = []
+
+        offset = 0
+        while offset < len(msg):
+            attr = NlAttr(msg, offset)
+            offset += attr.full_len
+            self.attrs.append(attr)
+
+    def __iter__(self):
+        yield from self.attrs
+
+    def __repr__(self):
+        msg = ''
+        for a in self.attrs:
+            if msg:
+                msg += '\n'
+            msg += repr(a)
+        return msg
+
+
+class NlMsg:
+    def __init__(self, msg, offset, attr_space=None):
+        self.hdr = msg[offset:offset + 16]
+
+        self.nl_len, self.nl_type, self.nl_flags, self.nl_seq, self.nl_portid = \
+            struct.unpack("IHHII", self.hdr)
+
+        self.raw = msg[offset + 16:offset + self.nl_len]
+
+        self.error = 0
+        self.done = 0
+
+        extack_off = None
+        if self.nl_type == Netlink.NLMSG_ERROR:
+            self.error = struct.unpack("i", self.raw[0:4])[0]
+            self.done = 1
+            extack_off = 20
+        elif self.nl_type == Netlink.NLMSG_DONE:
+            self.done = 1
+            extack_off = 4
+
+        self.extack = None
+        if self.nl_flags & Netlink.NLM_F_ACK_TLVS and extack_off:
+            self.extack = dict()
+            extack_attrs = NlAttrs(self.raw[extack_off:])
+            for extack in extack_attrs:
+                if extack.type == Netlink.NLMSGERR_ATTR_MSG:
+                    self.extack['msg'] = extack.as_strz()
+                elif extack.type == Netlink.NLMSGERR_ATTR_MISS_TYPE:
+                    self.extack['miss-type'] = extack.as_u32()
+                elif extack.type == Netlink.NLMSGERR_ATTR_MISS_NEST:
+                    self.extack['miss-nest'] = extack.as_u32()
+                elif extack.type == Netlink.NLMSGERR_ATTR_OFFS:
+                    self.extack['bad-attr-offs'] = extack.as_u32()
+                else:
+                    if 'unknown' not in self.extack:
+                        self.extack['unknown'] = []
+                    self.extack['unknown'].append(extack)
+
+            if attr_space:
+                # We don't have the ability to parse nests yet, so only do global
+                if 'miss-type' in self.extack and 'miss-nest' not in self.extack:
+                    miss_type = self.extack['miss-type']
+                    if len(attr_space.attr_list) > miss_type:
+                        spec = attr_space.attr_list[miss_type]
+                        desc = spec['name']
+                        if 'doc' in spec:
+                            desc += f" ({spec['doc']})"
+                        self.extack['miss-type'] = desc
+
+    def __repr__(self):
+        msg = f"nl_len = {self.nl_len} ({len(self.raw)}) nl_flags = 0x{self.nl_flags:x} nl_type = {self.nl_type}\n"
+        if self.error:
+            msg += '\terror: ' + str(self.error)
+        if self.extack:
+            msg += '\textack: ' + repr(self.extack)
+        return msg
+
+
+class NlMsgs:
+    def __init__(self, data, attr_space=None):
+        self.msgs = []
+
+        offset = 0
+        while offset < len(data):
+            msg = NlMsg(data, offset, attr_space=attr_space)
+            offset += msg.nl_len
+            self.msgs.append(msg)
+
+    def __iter__(self):
+        yield from self.msgs
+
+
+genl_family_name_to_id = None
+
+
+def _genl_msg(nl_type, nl_flags, genl_cmd, genl_version, seq=None):
+    # we prepend length in _genl_msg_finalize()
+    if seq is None:
+        seq = random.randint(1, 1024)
+    nlmsg = struct.pack("HHII", nl_type, nl_flags, seq, 0)
+    genlmsg = struct.pack("bbH", genl_cmd, genl_version, 0)
+    return nlmsg + genlmsg
+
+
+def _genl_msg_finalize(msg):
+    return struct.pack("I", len(msg) + 4) + msg
+
+
+def _genl_load_families():
+    with socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC) as sock:
+        sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
+
+        msg = _genl_msg(Netlink.GENL_ID_CTRL,
+                        Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK | Netlink.NLM_F_DUMP,
+                        Netlink.CTRL_CMD_GETFAMILY, 1)
+        msg = _genl_msg_finalize(msg)
+
+        sock.send(msg, 0)
+
+        global genl_family_name_to_id
+        genl_family_name_to_id = dict()
+
+        while True:
+            reply = sock.recv(128 * 1024)
+            nms = NlMsgs(reply)
+            for nl_msg in nms:
+                if nl_msg.error:
+                    print("Netlink error:", nl_msg.error)
+                    return
+                if nl_msg.done:
+                    return
+
+                gm = GenlMsg(nl_msg)
+                fam = dict()
+                for attr in gm.raw_attrs:
+                    if attr.type == Netlink.CTRL_ATTR_FAMILY_ID:
+                        fam['id'] = attr.as_u16()
+                    elif attr.type == Netlink.CTRL_ATTR_FAMILY_NAME:
+                        fam['name'] = attr.as_strz()
+                    elif attr.type == Netlink.CTRL_ATTR_MAXATTR:
+                        fam['maxattr'] = attr.as_u32()
+                    elif attr.type == Netlink.CTRL_ATTR_MCAST_GROUPS:
+                        fam['mcast'] = dict()
+                        for entry in NlAttrs(attr.raw):
+                            mcast_name = None
+                            mcast_id = None
+                            for entry_attr in NlAttrs(entry.raw):
+                                if entry_attr.type == Netlink.CTRL_ATTR_MCAST_GRP_NAME:
+                                    mcast_name = entry_attr.as_strz()
+                                elif entry_attr.type == Netlink.CTRL_ATTR_MCAST_GRP_ID:
+                                    mcast_id = entry_attr.as_u32()
+                            if mcast_name and mcast_id is not None:
+                                fam['mcast'][mcast_name] = mcast_id
+                if 'name' in fam and 'id' in fam:
+                    genl_family_name_to_id[fam['name']] = fam
+
+
+class GenlMsg:
+    def __init__(self, nl_msg):
+        self.nl = nl_msg
+
+        self.hdr = nl_msg.raw[0:4]
+        self.raw = nl_msg.raw[4:]
+
+        self.genl_cmd, self.genl_version, _ = struct.unpack("bbH", self.hdr)
+
+        self.raw_attrs = NlAttrs(self.raw)
+
+    def __repr__(self):
+        msg = repr(self.nl)
+        msg += f"\tgenl_cmd = {self.genl_cmd} genl_ver = {self.genl_version}\n"
+        for a in self.raw_attrs:
+            msg += '\t\t' + repr(a) + '\n'
+        return msg
+
+
+class GenlFamily:
+    def __init__(self, family_name):
+        self.family_name = family_name
+
+        global genl_family_name_to_id
+        if genl_family_name_to_id is None:
+            _genl_load_families()
+
+        self.genl_family = genl_family_name_to_id[family_name]
+        self.family_id = genl_family_name_to_id[family_name]['id']
+
+
+#
+# YNL implementation details.
+#
+
+
+class YnlAttrSpace:
+    def __init__(self, family, yaml):
+        self.yaml = yaml
+
+        self.attrs = dict()
+        self.name = self.yaml['name']
+        self.subspace_of = self.yaml['subset-of'] if 'subspace-of' in self.yaml else None
+
+        val = 0
+        max_val = 0
+        for elem in self.yaml['attributes']:
+            if 'value' in elem:
+                val = elem['value']
+            else:
+                elem['value'] = val
+            if val > max_val:
+                max_val = val
+            val += 1
+
+            self.attrs[elem['name']] = elem
+
+        self.attr_list = [None] * (max_val + 1)
+        for elem in self.yaml['attributes']:
+            self.attr_list[elem['value']] = elem
+
+    def __getitem__(self, key):
+        return self.attrs[key]
+
+    def __contains__(self, key):
+        return key in self.yaml
+
+    def __iter__(self):
+        yield from self.attrs
+
+    def items(self):
+        return self.attrs.items()
+
+
+class YnlFamily:
+    def __init__(self, def_path, schema=None):
+        self.include_raw = False
+
+        with open(def_path, "r") as stream:
+            self.yaml = yaml.safe_load(stream)
+
+        if schema:
+            with open(schema, "r") as stream:
+                schema = yaml.safe_load(stream)
+
+            jsonschema.validate(self.yaml, schema)
+
+        self.sock = socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC)
+        self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
+        self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_EXT_ACK, 1)
+
+        self._ops = dict()
+        self._spaces = dict()
+        self._types = dict()
+
+        for elem in self.yaml['attribute-sets']:
+            self._spaces[elem['name']] = YnlAttrSpace(self, elem)
+
+        for elem in self.yaml['definitions']:
+            self._types[elem['name']] = elem
+
+        async_separation = 'async-prefix' in self.yaml['operations']
+        self.async_msg_ids = set()
+        self.async_msg_queue = []
+        val = 0
+        max_val = 0
+        for elem in self.yaml['operations']['list']:
+            if not (async_separation and ('notify' in elem or 'event' in elem)):
+                if 'value' in elem:
+                    val = elem['value']
+                else:
+                    elem['value'] = val
+                val += 1
+                max_val = max(val, max_val)
+
+            if 'notify' in elem or 'event' in elem:
+                self.async_msg_ids.add(elem['value'])
+
+            self._ops[elem['name']] = elem
+
+            op_name = elem['name'].replace('-', '_')
+
+            bound_f = functools.partial(self._op, elem['name'])
+            setattr(self, op_name, bound_f)
+
+        self._op_array = [None] * max_val
+        for _, op in self._ops.items():
+            self._op_array[op['value']] = op
+            if 'notify' in op:
+                op['attribute-set'] = self._ops[op['notify']]['attribute-set']
+
+        self.family = GenlFamily(self.yaml['name'])
+
+    def ntf_subscribe(self, mcast_name):
+        if mcast_name not in self.family.genl_family['mcast']:
+            raise Exception(f'Multicast group "{mcast_name}" not present in the family')
+
+        self.sock.bind((0, 0))
+        self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_ADD_MEMBERSHIP,
+                             self.family.genl_family['mcast'][mcast_name])
+
+    def _add_attr(self, space, name, value):
+        attr = self._spaces[space][name]
+        nl_type = attr['value']
+        if attr["type"] == 'nest':
+            nl_type |= Netlink.NLA_F_NESTED
+            attr_payload = b''
+            for subname, subvalue in value.items():
+                attr_payload += self._add_attr(attr['nested-attributes'], subname, subvalue)
+        elif attr["type"] == 'u32':
+            attr_payload = struct.pack("I", int(value))
+        elif attr["type"] == 'string':
+            attr_payload = str(value).encode('ascii') + b'\x00'
+        elif attr["type"] == 'binary':
+            attr_payload = value
+        else:
+            raise Exception(f'Unknown type at {space} {name} {value} {attr["type"]}')
+
+        pad = b'\x00' * ((4 - len(attr_payload) % 4) % 4)
+        return struct.pack('HH', len(attr_payload) + 4, nl_type) + attr_payload + pad
+
+    def _decode_enum(self, rsp, attr_spec):
+        raw = rsp[attr_spec['name']]
+        enum = self._types[attr_spec['enum']]
+        i = attr_spec.get('value-start', 0)
+        if 'enum-as-flags' in attr_spec and attr_spec['enum-as-flags']:
+            value = set()
+            while raw:
+                if raw & 1:
+                    value.add(enum['entries'][i])
+                raw >>= 1
+                i += 1
+        else:
+            value = enum['entries'][raw - i]
+        rsp[attr_spec['name']] = value
+
+    def _decode(self, attrs, space):
+        attr_space = self._spaces[space]
+        rsp = dict()
+        for attr in attrs:
+            attr_spec = attr_space.attr_list[attr.type]
+            if attr_spec["type"] == 'nest':
+                subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'])
+                rsp[attr_spec['name']] = subdict
+            elif attr_spec['type'] == 'u32':
+                rsp[attr_spec['name']] = attr.as_u32()
+            elif attr_spec['type'] == 'u64':
+                rsp[attr_spec['name']] = attr.as_u64()
+            elif attr_spec["type"] == 'string':
+                rsp[attr_spec['name']] = attr.as_strz()
+            elif attr_spec["type"] == 'binary':
+                rsp[attr_spec['name']] = attr.as_bin()
+            else:
+                raise Exception(f'Unknown {attr.type} {attr_spec["name"]} {attr_spec["type"]}')
+
+            if 'enum' in attr_spec:
+                self._decode_enum(rsp, attr_spec)
+        return rsp
+
+    def handle_ntf(self, nl_msg, genl_msg):
+        msg = dict()
+        if self.include_raw:
+            msg['nlmsg'] = nl_msg
+            msg['genlmsg'] = genl_msg
+        op = self._op_array[genl_msg.genl_cmd]
+        msg['name'] = op['name']
+        msg['msg'] = self._decode(genl_msg.raw_attrs, op['attribute-set'])
+        self.async_msg_queue.append(msg)
+
+    def check_ntf(self):
+        while True:
+            try:
+                reply = self.sock.recv(128 * 1024, socket.MSG_DONTWAIT)
+            except BlockingIOError:
+                return
+
+            nms = NlMsgs(reply)
+            for nl_msg in nms:
+                if nl_msg.error:
+                    print("Netlink error in ntf!?", os.strerror(-nl_msg.error))
+                    print(nl_msg)
+                    continue
+                if nl_msg.done:
+                    print("Netlink done while checking for ntf!?")
+                    continue
+
+                gm = GenlMsg(nl_msg)
+                if gm.genl_cmd not in self.async_msg_ids:
+                    print("Unexpected msg id done while checking for ntf", gm)
+                    continue
+
+                self.handle_ntf(nl_msg, gm)
+
+    def _op(self, method, vals, dump=False):
+        op = self._ops[method]
+
+        nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK
+        if dump:
+            nl_flags |= Netlink.NLM_F_DUMP
+
+        req_seq = random.randint(1024, 65535)
+        msg = _genl_msg(self.family.family_id, nl_flags, op['value'], 1, req_seq)
+        for name, value in vals.items():
+            msg += self._add_attr(op['attribute-set'], name, value)
+        msg = _genl_msg_finalize(msg)
+
+        self.sock.send(msg, 0)
+
+        done = False
+        rsp = []
+        while not done:
+            reply = self.sock.recv(128 * 1024)
+            nms = NlMsgs(reply, attr_space=self._spaces[op['attribute-set']])
+            for nl_msg in nms:
+                if nl_msg.error:
+                    print("Netlink error:", os.strerror(-nl_msg.error))
+                    print(nl_msg)
+                    return
+                if nl_msg.done:
+                    done = True
+                    break
+
+                gm = GenlMsg(nl_msg)
+                # Check if this is a reply to our request
+                if nl_msg.nl_seq != req_seq or gm.genl_cmd != op['value']:
+                    if gm.genl_cmd in self.async_msg_ids:
+                        self.handle_ntf(nl_msg, gm)
+                        continue
+                    else:
+                        print('Unexpected message: ' + repr(gm))
+                        continue
+
+                rsp.append(self._decode(gm.raw_attrs, op['attribute-set']))
+
+        if not rsp:
+            return None
+        if not dump and len(rsp) == 1:
+            return rsp[0]
+        return rsp
diff --git a/tools/net/ynl/samples/cli.py b/tools/net/ynl/samples/cli.py
deleted file mode 100755
index b27159c70710..000000000000
--- a/tools/net/ynl/samples/cli.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python
-# SPDX-License-Identifier: BSD-3-Clause
-
-import argparse
-import json
-import pprint
-import time
-
-from ynl import YnlFamily
-
-
-def main():
-    parser = argparse.ArgumentParser(description='YNL CLI sample')
-    parser.add_argument('--spec', dest='spec', type=str, required=True)
-    parser.add_argument('--schema', dest='schema', type=str)
-    parser.add_argument('--json', dest='json_text', type=str)
-    parser.add_argument('--do', dest='do', type=str)
-    parser.add_argument('--dump', dest='dump', type=str)
-    parser.add_argument('--sleep', dest='sleep', type=int)
-    parser.add_argument('--subscribe', dest='ntf', type=str)
-    args = parser.parse_args()
-
-    attrs = {}
-    if args.json_text:
-        attrs = json.loads(args.json_text)
-
-    ynl = YnlFamily(args.spec, args.schema)
-
-    if args.ntf:
-        ynl.ntf_subscribe(args.ntf)
-
-    if args.sleep:
-        time.sleep(args.sleep)
-
-    if args.do or args.dump:
-        method = getattr(ynl, args.do if args.do else args.dump)
-
-        reply = method(attrs, dump=bool(args.dump))
-        pprint.PrettyPrinter().pprint(reply)
-
-    if args.ntf:
-        ynl.check_ntf()
-        pprint.PrettyPrinter().pprint(ynl.async_msg_queue)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/net/ynl/samples/ynl.py b/tools/net/ynl/samples/ynl.py
deleted file mode 100644
index b71523d71d46..000000000000
--- a/tools/net/ynl/samples/ynl.py
+++ /dev/null
@@ -1,534 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-
-import functools
-import jsonschema
-import os
-import random
-import socket
-import struct
-import yaml
-
-#
-# Generic Netlink code which should really be in some library, but I can't quickly find one.
-#
-
-
-class Netlink:
-    # Netlink socket
-    SOL_NETLINK = 270
-
-    NETLINK_ADD_MEMBERSHIP = 1
-    NETLINK_CAP_ACK = 10
-    NETLINK_EXT_ACK = 11
-
-    # Netlink message
-    NLMSG_ERROR = 2
-    NLMSG_DONE = 3
-
-    NLM_F_REQUEST = 1
-    NLM_F_ACK = 4
-    NLM_F_ROOT = 0x100
-    NLM_F_MATCH = 0x200
-    NLM_F_APPEND = 0x800
-
-    NLM_F_CAPPED = 0x100
-    NLM_F_ACK_TLVS = 0x200
-
-    NLM_F_DUMP = NLM_F_ROOT | NLM_F_MATCH
-
-    NLA_F_NESTED = 0x8000
-    NLA_F_NET_BYTEORDER = 0x4000
-
-    NLA_TYPE_MASK = NLA_F_NESTED | NLA_F_NET_BYTEORDER
-
-    # Genetlink defines
-    NETLINK_GENERIC = 16
-
-    GENL_ID_CTRL = 0x10
-
-    # nlctrl
-    CTRL_CMD_GETFAMILY = 3
-
-    CTRL_ATTR_FAMILY_ID = 1
-    CTRL_ATTR_FAMILY_NAME = 2
-    CTRL_ATTR_MAXATTR = 5
-    CTRL_ATTR_MCAST_GROUPS = 7
-
-    CTRL_ATTR_MCAST_GRP_NAME = 1
-    CTRL_ATTR_MCAST_GRP_ID = 2
-
-    # Extack types
-    NLMSGERR_ATTR_MSG = 1
-    NLMSGERR_ATTR_OFFS = 2
-    NLMSGERR_ATTR_COOKIE = 3
-    NLMSGERR_ATTR_POLICY = 4
-    NLMSGERR_ATTR_MISS_TYPE = 5
-    NLMSGERR_ATTR_MISS_NEST = 6
-
-
-class NlAttr:
-    def __init__(self, raw, offset):
-        self._len, self._type = struct.unpack("HH", raw[offset:offset + 4])
-        self.type = self._type & ~Netlink.NLA_TYPE_MASK
-        self.payload_len = self._len
-        self.full_len = (self.payload_len + 3) & ~3
-        self.raw = raw[offset + 4:offset + self.payload_len]
-
-    def as_u16(self):
-        return struct.unpack("H", self.raw)[0]
-
-    def as_u32(self):
-        return struct.unpack("I", self.raw)[0]
-
-    def as_u64(self):
-        return struct.unpack("Q", self.raw)[0]
-
-    def as_strz(self):
-        return self.raw.decode('ascii')[:-1]
-
-    def as_bin(self):
-        return self.raw
-
-    def __repr__(self):
-        return f"[type:{self.type} len:{self._len}] {self.raw}"
-
-
-class NlAttrs:
-    def __init__(self, msg):
-        self.attrs = []
-
-        offset = 0
-        while offset < len(msg):
-            attr = NlAttr(msg, offset)
-            offset += attr.full_len
-            self.attrs.append(attr)
-
-    def __iter__(self):
-        yield from self.attrs
-
-    def __repr__(self):
-        msg = ''
-        for a in self.attrs:
-            if msg:
-                msg += '\n'
-            msg += repr(a)
-        return msg
-
-
-class NlMsg:
-    def __init__(self, msg, offset, attr_space=None):
-        self.hdr = msg[offset:offset + 16]
-
-        self.nl_len, self.nl_type, self.nl_flags, self.nl_seq, self.nl_portid = \
-            struct.unpack("IHHII", self.hdr)
-
-        self.raw = msg[offset + 16:offset + self.nl_len]
-
-        self.error = 0
-        self.done = 0
-
-        extack_off = None
-        if self.nl_type == Netlink.NLMSG_ERROR:
-            self.error = struct.unpack("i", self.raw[0:4])[0]
-            self.done = 1
-            extack_off = 20
-        elif self.nl_type == Netlink.NLMSG_DONE:
-            self.done = 1
-            extack_off = 4
-
-        self.extack = None
-        if self.nl_flags & Netlink.NLM_F_ACK_TLVS and extack_off:
-            self.extack = dict()
-            extack_attrs = NlAttrs(self.raw[extack_off:])
-            for extack in extack_attrs:
-                if extack.type == Netlink.NLMSGERR_ATTR_MSG:
-                    self.extack['msg'] = extack.as_strz()
-                elif extack.type == Netlink.NLMSGERR_ATTR_MISS_TYPE:
-                    self.extack['miss-type'] = extack.as_u32()
-                elif extack.type == Netlink.NLMSGERR_ATTR_MISS_NEST:
-                    self.extack['miss-nest'] = extack.as_u32()
-                elif extack.type == Netlink.NLMSGERR_ATTR_OFFS:
-                    self.extack['bad-attr-offs'] = extack.as_u32()
-                else:
-                    if 'unknown' not in self.extack:
-                        self.extack['unknown'] = []
-                    self.extack['unknown'].append(extack)
-
-            if attr_space:
-                # We don't have the ability to parse nests yet, so only do global
-                if 'miss-type' in self.extack and 'miss-nest' not in self.extack:
-                    miss_type = self.extack['miss-type']
-                    if len(attr_space.attr_list) > miss_type:
-                        spec = attr_space.attr_list[miss_type]
-                        desc = spec['name']
-                        if 'doc' in spec:
-                            desc += f" ({spec['doc']})"
-                        self.extack['miss-type'] = desc
-
-    def __repr__(self):
-        msg = f"nl_len = {self.nl_len} ({len(self.raw)}) nl_flags = 0x{self.nl_flags:x} nl_type = {self.nl_type}\n"
-        if self.error:
-            msg += '\terror: ' + str(self.error)
-        if self.extack:
-            msg += '\textack: ' + repr(self.extack)
-        return msg
-
-
-class NlMsgs:
-    def __init__(self, data, attr_space=None):
-        self.msgs = []
-
-        offset = 0
-        while offset < len(data):
-            msg = NlMsg(data, offset, attr_space=attr_space)
-            offset += msg.nl_len
-            self.msgs.append(msg)
-
-    def __iter__(self):
-        yield from self.msgs
-
-
-genl_family_name_to_id = None
-
-
-def _genl_msg(nl_type, nl_flags, genl_cmd, genl_version, seq=None):
-    # we prepend length in _genl_msg_finalize()
-    if seq is None:
-        seq = random.randint(1, 1024)
-    nlmsg = struct.pack("HHII", nl_type, nl_flags, seq, 0)
-    genlmsg = struct.pack("bbH", genl_cmd, genl_version, 0)
-    return nlmsg + genlmsg
-
-
-def _genl_msg_finalize(msg):
-    return struct.pack("I", len(msg) + 4) + msg
-
-
-def _genl_load_families():
-    with socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC) as sock:
-        sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
-
-        msg = _genl_msg(Netlink.GENL_ID_CTRL,
-                        Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK | Netlink.NLM_F_DUMP,
-                        Netlink.CTRL_CMD_GETFAMILY, 1)
-        msg = _genl_msg_finalize(msg)
-
-        sock.send(msg, 0)
-
-        global genl_family_name_to_id
-        genl_family_name_to_id = dict()
-
-        while True:
-            reply = sock.recv(128 * 1024)
-            nms = NlMsgs(reply)
-            for nl_msg in nms:
-                if nl_msg.error:
-                    print("Netlink error:", nl_msg.error)
-                    return
-                if nl_msg.done:
-                    return
-
-                gm = GenlMsg(nl_msg)
-                fam = dict()
-                for attr in gm.raw_attrs:
-                    if attr.type == Netlink.CTRL_ATTR_FAMILY_ID:
-                        fam['id'] = attr.as_u16()
-                    elif attr.type == Netlink.CTRL_ATTR_FAMILY_NAME:
-                        fam['name'] = attr.as_strz()
-                    elif attr.type == Netlink.CTRL_ATTR_MAXATTR:
-                        fam['maxattr'] = attr.as_u32()
-                    elif attr.type == Netlink.CTRL_ATTR_MCAST_GROUPS:
-                        fam['mcast'] = dict()
-                        for entry in NlAttrs(attr.raw):
-                            mcast_name = None
-                            mcast_id = None
-                            for entry_attr in NlAttrs(entry.raw):
-                                if entry_attr.type == Netlink.CTRL_ATTR_MCAST_GRP_NAME:
-                                    mcast_name = entry_attr.as_strz()
-                                elif entry_attr.type == Netlink.CTRL_ATTR_MCAST_GRP_ID:
-                                    mcast_id = entry_attr.as_u32()
-                            if mcast_name and mcast_id is not None:
-                                fam['mcast'][mcast_name] = mcast_id
-                if 'name' in fam and 'id' in fam:
-                    genl_family_name_to_id[fam['name']] = fam
-
-
-class GenlMsg:
-    def __init__(self, nl_msg):
-        self.nl = nl_msg
-
-        self.hdr = nl_msg.raw[0:4]
-        self.raw = nl_msg.raw[4:]
-
-        self.genl_cmd, self.genl_version, _ = struct.unpack("bbH", self.hdr)
-
-        self.raw_attrs = NlAttrs(self.raw)
-
-    def __repr__(self):
-        msg = repr(self.nl)
-        msg += f"\tgenl_cmd = {self.genl_cmd} genl_ver = {self.genl_version}\n"
-        for a in self.raw_attrs:
-            msg += '\t\t' + repr(a) + '\n'
-        return msg
-
-
-class GenlFamily:
-    def __init__(self, family_name):
-        self.family_name = family_name
-
-        global genl_family_name_to_id
-        if genl_family_name_to_id is None:
-            _genl_load_families()
-
-        self.genl_family = genl_family_name_to_id[family_name]
-        self.family_id = genl_family_name_to_id[family_name]['id']
-
-
-#
-# YNL implementation details.
-#
-
-
-class YnlAttrSpace:
-    def __init__(self, family, yaml):
-        self.yaml = yaml
-
-        self.attrs = dict()
-        self.name = self.yaml['name']
-        self.subspace_of = self.yaml['subset-of'] if 'subspace-of' in self.yaml else None
-
-        val = 0
-        max_val = 0
-        for elem in self.yaml['attributes']:
-            if 'value' in elem:
-                val = elem['value']
-            else:
-                elem['value'] = val
-            if val > max_val:
-                max_val = val
-            val += 1
-
-            self.attrs[elem['name']] = elem
-
-        self.attr_list = [None] * (max_val + 1)
-        for elem in self.yaml['attributes']:
-            self.attr_list[elem['value']] = elem
-
-    def __getitem__(self, key):
-        return self.attrs[key]
-
-    def __contains__(self, key):
-        return key in self.yaml
-
-    def __iter__(self):
-        yield from self.attrs
-
-    def items(self):
-        return self.attrs.items()
-
-
-class YnlFamily:
-    def __init__(self, def_path, schema=None):
-        self.include_raw = False
-
-        with open(def_path, "r") as stream:
-            self.yaml = yaml.safe_load(stream)
-
-        if schema:
-            with open(schema, "r") as stream:
-                schema = yaml.safe_load(stream)
-
-            jsonschema.validate(self.yaml, schema)
-
-        self.sock = socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC)
-        self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
-        self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_EXT_ACK, 1)
-
-        self._ops = dict()
-        self._spaces = dict()
-        self._types = dict()
-
-        for elem in self.yaml['attribute-sets']:
-            self._spaces[elem['name']] = YnlAttrSpace(self, elem)
-
-        for elem in self.yaml['definitions']:
-            self._types[elem['name']] = elem
-
-        async_separation = 'async-prefix' in self.yaml['operations']
-        self.async_msg_ids = set()
-        self.async_msg_queue = []
-        val = 0
-        max_val = 0
-        for elem in self.yaml['operations']['list']:
-            if not (async_separation and ('notify' in elem or 'event' in elem)):
-                if 'value' in elem:
-                    val = elem['value']
-                else:
-                    elem['value'] = val
-                val += 1
-                max_val = max(val, max_val)
-
-            if 'notify' in elem or 'event' in elem:
-                self.async_msg_ids.add(elem['value'])
-
-            self._ops[elem['name']] = elem
-
-            op_name = elem['name'].replace('-', '_')
-
-            bound_f = functools.partial(self._op, elem['name'])
-            setattr(self, op_name, bound_f)
-
-        self._op_array = [None] * max_val
-        for _, op in self._ops.items():
-            self._op_array[op['value']] = op
-            if 'notify' in op:
-                op['attribute-set'] = self._ops[op['notify']]['attribute-set']
-
-        self.family = GenlFamily(self.yaml['name'])
-
-    def ntf_subscribe(self, mcast_name):
-        if mcast_name not in self.family.genl_family['mcast']:
-            raise Exception(f'Multicast group "{mcast_name}" not present in the family')
-
-        self.sock.bind((0, 0))
-        self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_ADD_MEMBERSHIP,
-                             self.family.genl_family['mcast'][mcast_name])
-
-    def _add_attr(self, space, name, value):
-        attr = self._spaces[space][name]
-        nl_type = attr['value']
-        if attr["type"] == 'nest':
-            nl_type |= Netlink.NLA_F_NESTED
-            attr_payload = b''
-            for subname, subvalue in value.items():
-                attr_payload += self._add_attr(attr['nested-attributes'], subname, subvalue)
-        elif attr["type"] == 'u32':
-            attr_payload = struct.pack("I", int(value))
-        elif attr["type"] == 'string':
-            attr_payload = str(value).encode('ascii') + b'\x00'
-        elif attr["type"] == 'binary':
-            attr_payload = value
-        else:
-            raise Exception(f'Unknown type at {space} {name} {value} {attr["type"]}')
-
-        pad = b'\x00' * ((4 - len(attr_payload) % 4) % 4)
-        return struct.pack('HH', len(attr_payload) + 4, nl_type) + attr_payload + pad
-
-    def _decode_enum(self, rsp, attr_spec):
-        raw = rsp[attr_spec['name']]
-        enum = self._types[attr_spec['enum']]
-        i = attr_spec.get('value-start', 0)
-        if 'enum-as-flags' in attr_spec and attr_spec['enum-as-flags']:
-            value = set()
-            while raw:
-                if raw & 1:
-                    value.add(enum['entries'][i])
-                raw >>= 1
-                i += 1
-        else:
-            value = enum['entries'][raw - i]
-        rsp[attr_spec['name']] = value
-
-    def _decode(self, attrs, space):
-        attr_space = self._spaces[space]
-        rsp = dict()
-        for attr in attrs:
-            attr_spec = attr_space.attr_list[attr.type]
-            if attr_spec["type"] == 'nest':
-                subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'])
-                rsp[attr_spec['name']] = subdict
-            elif attr_spec['type'] == 'u32':
-                rsp[attr_spec['name']] = attr.as_u32()
-            elif attr_spec['type'] == 'u64':
-                rsp[attr_spec['name']] = attr.as_u64()
-            elif attr_spec["type"] == 'string':
-                rsp[attr_spec['name']] = attr.as_strz()
-            elif attr_spec["type"] == 'binary':
-                rsp[attr_spec['name']] = attr.as_bin()
-            else:
-                raise Exception(f'Unknown {attr.type} {attr_spec["name"]} {attr_spec["type"]}')
-
-            if 'enum' in attr_spec:
-                self._decode_enum(rsp, attr_spec)
-        return rsp
-
-    def handle_ntf(self, nl_msg, genl_msg):
-        msg = dict()
-        if self.include_raw:
-            msg['nlmsg'] = nl_msg
-            msg['genlmsg'] = genl_msg
-        op = self._op_array[genl_msg.genl_cmd]
-        msg['name'] = op['name']
-        msg['msg'] = self._decode(genl_msg.raw_attrs, op['attribute-set'])
-        self.async_msg_queue.append(msg)
-
-    def check_ntf(self):
-        while True:
-            try:
-                reply = self.sock.recv(128 * 1024, socket.MSG_DONTWAIT)
-            except BlockingIOError:
-                return
-
-            nms = NlMsgs(reply)
-            for nl_msg in nms:
-                if nl_msg.error:
-                    print("Netlink error in ntf!?", os.strerror(-nl_msg.error))
-                    print(nl_msg)
-                    continue
-                if nl_msg.done:
-                    print("Netlink done while checking for ntf!?")
-                    continue
-
-                gm = GenlMsg(nl_msg)
-                if gm.genl_cmd not in self.async_msg_ids:
-                    print("Unexpected msg id done while checking for ntf", gm)
-                    continue
-
-                self.handle_ntf(nl_msg, gm)
-
-    def _op(self, method, vals, dump=False):
-        op = self._ops[method]
-
-        nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK
-        if dump:
-            nl_flags |= Netlink.NLM_F_DUMP
-
-        req_seq = random.randint(1024, 65535)
-        msg = _genl_msg(self.family.family_id, nl_flags, op['value'], 1, req_seq)
-        for name, value in vals.items():
-            msg += self._add_attr(op['attribute-set'], name, value)
-        msg = _genl_msg_finalize(msg)
-
-        self.sock.send(msg, 0)
-
-        done = False
-        rsp = []
-        while not done:
-            reply = self.sock.recv(128 * 1024)
-            nms = NlMsgs(reply, attr_space=self._spaces[op['attribute-set']])
-            for nl_msg in nms:
-                if nl_msg.error:
-                    print("Netlink error:", os.strerror(-nl_msg.error))
-                    print(nl_msg)
-                    return
-                if nl_msg.done:
-                    done = True
-                    break
-
-                gm = GenlMsg(nl_msg)
-                # Check if this is a reply to our request
-                if nl_msg.nl_seq != req_seq or gm.genl_cmd != op['value']:
-                    if gm.genl_cmd in self.async_msg_ids:
-                        self.handle_ntf(nl_msg, gm)
-                        continue
-                    else:
-                        print('Unexpected message: ' + repr(gm))
-                        continue
-
-                rsp.append(self._decode(gm.raw_attrs, op['attribute-set']))
-
-        if not rsp:
-            return None
-        if not dump and len(rsp) == 1:
-            return rsp[0]
-        return rsp
-- 
cgit 


From 3aacf8281336ac57fe4a4e85fa55a68218e90b5c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:43 -0800
Subject: tools: ynl: add an object hierarchy to represent parsed spec

There's a lot of copy and pasting going on between the "cli"
and code gen when it comes to representing the parsed spec.
Create a library which both can use.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/__init__.py |   4 +-
 tools/net/ynl/lib/nlspec.py   | 301 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 304 insertions(+), 1 deletion(-)
 create mode 100644 tools/net/ynl/lib/nlspec.py

(limited to 'tools')

diff --git a/tools/net/ynl/lib/__init__.py b/tools/net/ynl/lib/__init__.py
index 0a6102758ebe..3c73f59eabab 100644
--- a/tools/net/ynl/lib/__init__.py
+++ b/tools/net/ynl/lib/__init__.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
+from .nlspec import SpecAttr, SpecAttrSet, SpecFamily, SpecOperation
 from .ynl import YnlFamily
 
-__all__ = ["YnlFamily"]
+__all__ = ["SpecAttr", "SpecAttrSet", "SpecFamily", "SpecOperation",
+           "YnlFamily"]
diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
new file mode 100644
index 000000000000..4aa3b1ad97f0
--- /dev/null
+++ b/tools/net/ynl/lib/nlspec.py
@@ -0,0 +1,301 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+import collections
+import jsonschema
+import os
+import traceback
+import yaml
+
+
+class SpecElement:
+    """Netlink spec element.
+
+    Abstract element of the Netlink spec. Implements the dictionary interface
+    for access to the raw spec. Supports iterative resolution of dependencies
+    across elements and class inheritance levels. The elements of the spec
+    may refer to each other, and although loops should be very rare, having
+    to maintain correct ordering of instantiation is painful, so the resolve()
+    method should be used to perform parts of init which require access to
+    other parts of the spec.
+
+    Attributes:
+        yaml        raw spec as loaded from the spec file
+        family      back reference to the full family
+
+        name        name of the entity as listed in the spec (optional)
+        ident_name  name which can be safely used as identifier in code (optional)
+    """
+    def __init__(self, family, yaml):
+        self.yaml = yaml
+        self.family = family
+
+        if 'name' in self.yaml:
+            self.name = self.yaml['name']
+            self.ident_name = self.name.replace('-', '_')
+
+        self._super_resolved = False
+        family.add_unresolved(self)
+
+    def __getitem__(self, key):
+        return self.yaml[key]
+
+    def __contains__(self, key):
+        return key in self.yaml
+
+    def get(self, key, default=None):
+        return self.yaml.get(key, default)
+
+    def resolve_up(self, up):
+        if not self._super_resolved:
+            up.resolve()
+            self._super_resolved = True
+
+    def resolve(self):
+        pass
+
+
+class SpecAttr(SpecElement):
+    """ Single Netlink atttribute type
+
+    Represents a single attribute type within an attr space.
+
+    Attributes:
+        value      numerical ID when serialized
+        attr_set   Attribute Set containing this attr
+    """
+    def __init__(self, family, attr_set, yaml, value):
+        super().__init__(family, yaml)
+
+        self.value = value
+        self.attr_set = attr_set
+        self.is_multi = yaml.get('multi-attr', False)
+
+
+class SpecAttrSet(SpecElement):
+    """ Netlink Attribute Set class.
+
+    Represents a ID space of attributes within Netlink.
+
+    Note that unlike other elements, which expose contents of the raw spec
+    via the dictionary interface Attribute Set exposes attributes by name.
+
+    Attributes:
+        attrs      ordered dict of all attributes (indexed by name)
+        attrs_by_val  ordered dict of all attributes (indexed by value)
+        subset_of  parent set if this is a subset, otherwise None
+    """
+    def __init__(self, family, yaml):
+        super().__init__(family, yaml)
+
+        self.subset_of = self.yaml.get('subset-of', None)
+
+        self.attrs = collections.OrderedDict()
+        self.attrs_by_val = collections.OrderedDict()
+
+        val = 0
+        for elem in self.yaml['attributes']:
+            if 'value' in elem:
+                val = elem['value']
+
+            attr = self.new_attr(elem, val)
+            self.attrs[attr.name] = attr
+            self.attrs_by_val[attr.value] = attr
+            val += 1
+
+    def new_attr(self, elem, value):
+        return SpecAttr(self.family, self, elem, value)
+
+    def __getitem__(self, key):
+        return self.attrs[key]
+
+    def __contains__(self, key):
+        return key in self.attrs
+
+    def __iter__(self):
+        yield from self.attrs
+
+    def items(self):
+        return self.attrs.items()
+
+
+class SpecOperation(SpecElement):
+    """Netlink Operation
+
+    Information about a single Netlink operation.
+
+    Attributes:
+        value       numerical ID when serialized, None if req/rsp values differ
+
+        req_value   numerical ID when serialized, user -> kernel
+        rsp_value   numerical ID when serialized, user <- kernel
+        is_call     bool, whether the operation is a call
+        is_async    bool, whether the operation is a notification
+        is_resv     bool, whether the operation does not exist (it's just a reserved ID)
+        attr_set    attribute set name
+
+        yaml        raw spec as loaded from the spec file
+    """
+    def __init__(self, family, yaml, req_value, rsp_value):
+        super().__init__(family, yaml)
+
+        self.value = req_value if req_value == rsp_value else None
+        self.req_value = req_value
+        self.rsp_value = rsp_value
+
+        self.is_call = 'do' in yaml or 'dump' in yaml
+        self.is_async = 'notify' in yaml or 'event' in yaml
+        self.is_resv = not self.is_async and not self.is_call
+
+        # Added by resolve:
+        self.attr_set = None
+        delattr(self, "attr_set")
+
+    def resolve(self):
+        self.resolve_up(super())
+
+        if 'attribute-set' in self.yaml:
+            attr_set_name = self.yaml['attribute-set']
+        elif 'notify' in self.yaml:
+            msg = self.family.msgs[self.yaml['notify']]
+            attr_set_name = msg['attribute-set']
+        elif self.is_resv:
+            attr_set_name = ''
+        else:
+            raise Exception(f"Can't resolve attribute set for op '{self.name}'")
+        if attr_set_name:
+            self.attr_set = self.family.attr_sets[attr_set_name]
+
+
+class SpecFamily(SpecElement):
+    """ Netlink Family Spec class.
+
+    Netlink family information loaded from a spec (e.g. in YAML).
+    Takes care of unfolding implicit information which can be skipped
+    in the spec itself for brevity.
+
+    The class can be used like a dictionary to access the raw spec
+    elements but that's usually a bad idea.
+
+    Attributes:
+        proto     protocol type (e.g. genetlink)
+
+        attr_sets  dict of attribute sets
+        msgs       dict of all messages (index by name)
+        msgs_by_value  dict of all messages (indexed by name)
+        ops        dict of all valid requests / responses
+    """
+    def __init__(self, spec_path, schema_path=None):
+        with open(spec_path, "r") as stream:
+            spec = yaml.safe_load(stream)
+
+        self._resolution_list = []
+
+        super().__init__(self, spec)
+
+        self.proto = self.yaml.get('protocol', 'genetlink')
+
+        if schema_path is None:
+            schema_path = os.path.dirname(os.path.dirname(spec_path)) + f'/{self.proto}.yaml'
+        if schema_path:
+            with open(schema_path, "r") as stream:
+                schema = yaml.safe_load(stream)
+
+            jsonschema.validate(self.yaml, schema)
+
+        self.attr_sets = collections.OrderedDict()
+        self.msgs = collections.OrderedDict()
+        self.req_by_value = collections.OrderedDict()
+        self.rsp_by_value = collections.OrderedDict()
+        self.ops = collections.OrderedDict()
+
+        last_exception = None
+        while len(self._resolution_list) > 0:
+            resolved = []
+            unresolved = self._resolution_list
+            self._resolution_list = []
+
+            for elem in unresolved:
+                try:
+                    elem.resolve()
+                except (KeyError, AttributeError) as e:
+                    self._resolution_list.append(elem)
+                    last_exception = e
+                    continue
+
+                resolved.append(elem)
+
+            if len(resolved) == 0:
+                traceback.print_exception(last_exception)
+                raise Exception("Could not resolve any spec element, infinite loop?")
+
+    def new_attr_set(self, elem):
+        return SpecAttrSet(self, elem)
+
+    def new_operation(self, elem, req_val, rsp_val):
+        return SpecOperation(self, elem, req_val, rsp_val)
+
+    def add_unresolved(self, elem):
+        self._resolution_list.append(elem)
+
+    def _dictify_ops_unified(self):
+        val = 0
+        for elem in self.yaml['operations']['list']:
+            if 'value' in elem:
+                val = elem['value']
+
+            op = self.new_operation(elem, val, val)
+            val += 1
+
+            self.msgs[op.name] = op
+
+    def _dictify_ops_directional(self):
+        req_val = rsp_val = 0
+        for elem in self.yaml['operations']['list']:
+            if 'notify' in elem:
+                if 'value' in elem:
+                    rsp_val = elem['value']
+                req_val_next = req_val
+                rsp_val_next = rsp_val + 1
+                req_val = None
+            elif 'do' in elem or 'dump' in elem:
+                mode = elem['do'] if 'do' in elem else elem['dump']
+
+                v = mode.get('request', {}).get('value', None)
+                if v:
+                    req_val = v
+                v = mode.get('reply', {}).get('value', None)
+                if v:
+                    rsp_val = v
+
+                rsp_inc = 1 if 'reply' in mode else 0
+                req_val_next = req_val + 1
+                rsp_val_next = rsp_val + rsp_inc
+            else:
+                raise Exception("Can't parse directional ops")
+
+            op = self.new_operation(elem, req_val, rsp_val)
+            req_val = req_val_next
+            rsp_val = rsp_val_next
+
+            self.msgs[op.name] = op
+
+    def resolve(self):
+        self.resolve_up(super())
+
+        for elem in self.yaml['attribute-sets']:
+            attr_set = self.new_attr_set(elem)
+            self.attr_sets[elem['name']] = attr_set
+
+        msg_id_model = self.yaml['operations'].get('enum-model', 'unified')
+        if msg_id_model == 'unified':
+            self._dictify_ops_unified()
+        elif msg_id_model == 'directional':
+            self._dictify_ops_directional()
+
+        for op in self.msgs.values():
+            if op.req_value is not None:
+                self.req_by_value[op.req_value] = op
+            if op.rsp_value is not None:
+                self.rsp_by_value[op.rsp_value] = op
+            if not op.is_async and 'attribute-set' in op:
+                self.ops[op.name] = op
-- 
cgit 


From 30a5c6c8104f34ed02c50041b82f3ef80924de6d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:44 -0800
Subject: tools: ynl: use the common YAML loading and validation code

Adapt the common object hierarchy in code gen and CLI.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/ynl.py   | 118 +++++----------------
 tools/net/ynl/ynl-gen-c.py | 256 +++++++++++++++++++++------------------------
 2 files changed, 142 insertions(+), 232 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
index b71523d71d46..0ceb627ba686 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/lib/ynl.py
@@ -1,13 +1,14 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import functools
-import jsonschema
 import os
 import random
 import socket
 import struct
 import yaml
 
+from .nlspec import SpecFamily
+
 #
 # Generic Netlink code which should really be in some library, but I can't quickly find one.
 #
@@ -158,8 +159,8 @@ class NlMsg:
                 # We don't have the ability to parse nests yet, so only do global
                 if 'miss-type' in self.extack and 'miss-nest' not in self.extack:
                     miss_type = self.extack['miss-type']
-                    if len(attr_space.attr_list) > miss_type:
-                        spec = attr_space.attr_list[miss_type]
+                    if miss_type in attr_space.attrs_by_val:
+                        spec = attr_space.attrs_by_val[miss_type]
                         desc = spec['name']
                         if 'doc' in spec:
                             desc += f" ({spec['doc']})"
@@ -289,100 +290,31 @@ class GenlFamily:
 #
 
 
-class YnlAttrSpace:
-    def __init__(self, family, yaml):
-        self.yaml = yaml
-
-        self.attrs = dict()
-        self.name = self.yaml['name']
-        self.subspace_of = self.yaml['subset-of'] if 'subspace-of' in self.yaml else None
-
-        val = 0
-        max_val = 0
-        for elem in self.yaml['attributes']:
-            if 'value' in elem:
-                val = elem['value']
-            else:
-                elem['value'] = val
-            if val > max_val:
-                max_val = val
-            val += 1
-
-            self.attrs[elem['name']] = elem
-
-        self.attr_list = [None] * (max_val + 1)
-        for elem in self.yaml['attributes']:
-            self.attr_list[elem['value']] = elem
-
-    def __getitem__(self, key):
-        return self.attrs[key]
-
-    def __contains__(self, key):
-        return key in self.yaml
-
-    def __iter__(self):
-        yield from self.attrs
-
-    def items(self):
-        return self.attrs.items()
-
-
-class YnlFamily:
+class YnlFamily(SpecFamily):
     def __init__(self, def_path, schema=None):
-        self.include_raw = False
+        super().__init__(def_path, schema)
 
-        with open(def_path, "r") as stream:
-            self.yaml = yaml.safe_load(stream)
-
-        if schema:
-            with open(schema, "r") as stream:
-                schema = yaml.safe_load(stream)
-
-            jsonschema.validate(self.yaml, schema)
+        self.include_raw = False
 
         self.sock = socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC)
         self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
         self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_EXT_ACK, 1)
 
-        self._ops = dict()
-        self._spaces = dict()
         self._types = dict()
 
-        for elem in self.yaml['attribute-sets']:
-            self._spaces[elem['name']] = YnlAttrSpace(self, elem)
-
         for elem in self.yaml['definitions']:
             self._types[elem['name']] = elem
 
-        async_separation = 'async-prefix' in self.yaml['operations']
         self.async_msg_ids = set()
         self.async_msg_queue = []
-        val = 0
-        max_val = 0
-        for elem in self.yaml['operations']['list']:
-            if not (async_separation and ('notify' in elem or 'event' in elem)):
-                if 'value' in elem:
-                    val = elem['value']
-                else:
-                    elem['value'] = val
-                val += 1
-                max_val = max(val, max_val)
-
-            if 'notify' in elem or 'event' in elem:
-                self.async_msg_ids.add(elem['value'])
-
-            self._ops[elem['name']] = elem
-
-            op_name = elem['name'].replace('-', '_')
 
-            bound_f = functools.partial(self._op, elem['name'])
-            setattr(self, op_name, bound_f)
+        for msg in self.msgs.values():
+            if msg.is_async:
+                self.async_msg_ids.add(msg.value)
 
-        self._op_array = [None] * max_val
-        for _, op in self._ops.items():
-            self._op_array[op['value']] = op
-            if 'notify' in op:
-                op['attribute-set'] = self._ops[op['notify']]['attribute-set']
+        for op_name, op in self.ops.items():
+            bound_f = functools.partial(self._op, op_name)
+            setattr(self, op.ident_name, bound_f)
 
         self.family = GenlFamily(self.yaml['name'])
 
@@ -395,8 +327,8 @@ class YnlFamily:
                              self.family.genl_family['mcast'][mcast_name])
 
     def _add_attr(self, space, name, value):
-        attr = self._spaces[space][name]
-        nl_type = attr['value']
+        attr = self.attr_sets[space][name]
+        nl_type = attr.value
         if attr["type"] == 'nest':
             nl_type |= Netlink.NLA_F_NESTED
             attr_payload = b''
@@ -430,10 +362,10 @@ class YnlFamily:
         rsp[attr_spec['name']] = value
 
     def _decode(self, attrs, space):
-        attr_space = self._spaces[space]
+        attr_space = self.attr_sets[space]
         rsp = dict()
         for attr in attrs:
-            attr_spec = attr_space.attr_list[attr.type]
+            attr_spec = attr_space.attrs_by_val[attr.type]
             if attr_spec["type"] == 'nest':
                 subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'])
                 rsp[attr_spec['name']] = subdict
@@ -457,9 +389,9 @@ class YnlFamily:
         if self.include_raw:
             msg['nlmsg'] = nl_msg
             msg['genlmsg'] = genl_msg
-        op = self._op_array[genl_msg.genl_cmd]
+        op = self.msgs_by_value[genl_msg.genl_cmd]
         msg['name'] = op['name']
-        msg['msg'] = self._decode(genl_msg.raw_attrs, op['attribute-set'])
+        msg['msg'] = self._decode(genl_msg.raw_attrs, op.attr_set.name)
         self.async_msg_queue.append(msg)
 
     def check_ntf(self):
@@ -487,16 +419,16 @@ class YnlFamily:
                 self.handle_ntf(nl_msg, gm)
 
     def _op(self, method, vals, dump=False):
-        op = self._ops[method]
+        op = self.ops[method]
 
         nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK
         if dump:
             nl_flags |= Netlink.NLM_F_DUMP
 
         req_seq = random.randint(1024, 65535)
-        msg = _genl_msg(self.family.family_id, nl_flags, op['value'], 1, req_seq)
+        msg = _genl_msg(self.family.family_id, nl_flags, op.value, 1, req_seq)
         for name, value in vals.items():
-            msg += self._add_attr(op['attribute-set'], name, value)
+            msg += self._add_attr(op.attr_set.name, name, value)
         msg = _genl_msg_finalize(msg)
 
         self.sock.send(msg, 0)
@@ -505,7 +437,7 @@ class YnlFamily:
         rsp = []
         while not done:
             reply = self.sock.recv(128 * 1024)
-            nms = NlMsgs(reply, attr_space=self._spaces[op['attribute-set']])
+            nms = NlMsgs(reply, attr_space=op.attr_set)
             for nl_msg in nms:
                 if nl_msg.error:
                     print("Netlink error:", os.strerror(-nl_msg.error))
@@ -517,7 +449,7 @@ class YnlFamily:
 
                 gm = GenlMsg(nl_msg)
                 # Check if this is a reply to our request
-                if nl_msg.nl_seq != req_seq or gm.genl_cmd != op['value']:
+                if nl_msg.nl_seq != req_seq or gm.genl_cmd != op.value:
                     if gm.genl_cmd in self.async_msg_ids:
                         self.handle_ntf(nl_msg, gm)
                         continue
@@ -525,7 +457,7 @@ class YnlFamily:
                         print('Unexpected message: ' + repr(gm))
                         continue
 
-                rsp.append(self._decode(gm.raw_attrs, op['attribute-set']))
+                rsp.append(self._decode(gm.raw_attrs, op.attr_set.name))
 
         if not rsp:
             return None
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index e5002d420961..dc14da634e8e 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -2,10 +2,11 @@
 
 import argparse
 import collections
-import jsonschema
 import os
 import yaml
 
+from lib import SpecFamily, SpecAttrSet, SpecAttr, SpecOperation
+
 
 def c_upper(name):
     return name.upper().replace('-', '_')
@@ -28,12 +29,12 @@ class BaseNlLib:
                    "ynl_cb_array, NLMSG_MIN_TYPE)"
 
 
-class Type:
-    def __init__(self, family, attr_set, attr):
-        self.family = family
+class Type(SpecAttr):
+    def __init__(self, family, attr_set, attr, value):
+        super().__init__(family, attr_set, attr, value)
+
         self.attr = attr
-        self.value = attr['value']
-        self.name = c_lower(attr['name'])
+        self.attr_set = attr_set
         self.type = attr['type']
         self.checks = attr.get('checks', {})
 
@@ -46,17 +47,17 @@ class Type:
             else:
                 self.nested_render_name = f"{family.name}_{c_lower(self.nested_attrs)}"
 
-        self.enum_name = f"{attr_set.name_prefix}{self.name}"
-        self.enum_name = c_upper(self.enum_name)
         self.c_name = c_lower(self.name)
         if self.c_name in _C_KW:
             self.c_name += '_'
 
-    def __getitem__(self, key):
-        return self.attr[key]
+        # Added by resolve():
+        self.enum_name = None
+        delattr(self, "enum_name")
 
-    def __contains__(self, key):
-        return key in self.attr
+    def resolve(self):
+        self.enum_name = f"{self.attr_set.name_prefix}{self.name}"
+        self.enum_name = c_upper(self.enum_name)
 
     def is_multi_val(self):
         return None
@@ -214,24 +215,34 @@ class TypePad(Type):
 
 
 class TypeScalar(Type):
-    def __init__(self, family, attr_set, attr):
-        super().__init__(family, attr_set, attr)
+    def __init__(self, family, attr_set, attr, value):
+        super().__init__(family, attr_set, attr, value)
+
+        self.byte_order_comment = ''
+        if 'byte-order' in attr:
+            self.byte_order_comment = f" /* {attr['byte-order']} */"
+
+        # Added by resolve():
+        self.is_bitfield = None
+        delattr(self, "is_bitfield")
+        self.type_name = None
+        delattr(self, "type_name")
+
+    def resolve(self):
+        self.resolve_up(super())
 
-        self.is_bitfield = False
-        if 'enum' in self.attr:
-            self.is_bitfield = family.consts[self.attr['enum']]['type'] == 'flags'
         if 'enum-as-flags' in self.attr and self.attr['enum-as-flags']:
             self.is_bitfield = True
+        elif 'enum' in self.attr:
+            self.is_bitfield = self.family.consts[self.attr['enum']]['type'] == 'flags'
+        else:
+            self.is_bitfield = False
 
         if 'enum' in self.attr and not self.is_bitfield:
-            self.type_name = f"enum {family.name}_{c_lower(self.attr['enum'])}"
+            self.type_name = f"enum {self.family.name}_{c_lower(self.attr['enum'])}"
         else:
             self.type_name = '__' + self.type
 
-        self.byte_order_comment = ''
-        if 'byte-order' in attr:
-            self.byte_order_comment = f" /* {attr['byte-order']} */"
-
     def _mnl_type(self):
         t = self.type
         # mnl does not have a helper for signed types
@@ -648,14 +659,11 @@ class EnumSet:
         return mask
 
 
-class AttrSet:
+class AttrSet(SpecAttrSet):
     def __init__(self, family, yaml):
-        self.yaml = yaml
+        super().__init__(family, yaml)
 
-        self.attrs = dict()
-        self.name = self.yaml['name']
-        if 'subset-of' not in yaml:
-            self.subset_of = None
+        if self.subset_of is None:
             if 'name-prefix' in yaml:
                 pfx = yaml['name-prefix']
             elif self.name == family.name:
@@ -665,83 +673,68 @@ class AttrSet:
             self.name_prefix = c_upper(pfx)
             self.max_name = c_upper(self.yaml.get('attr-max-name', f"{self.name_prefix}max"))
         else:
-            self.subset_of = self.yaml['subset-of']
             self.name_prefix = family.attr_sets[self.subset_of].name_prefix
             self.max_name = family.attr_sets[self.subset_of].max_name
 
+        # Added by resolve:
+        self.c_name = None
+        delattr(self, "c_name")
+
+    def resolve(self):
         self.c_name = c_lower(self.name)
         if self.c_name in _C_KW:
             self.c_name += '_'
-        if self.c_name == family.c_name:
+        if self.c_name == self.family.c_name:
             self.c_name = ''
 
-        val = 0
-        for elem in self.yaml['attributes']:
-            if 'value' in elem:
-                val = elem['value']
-            else:
-                elem['value'] = val
-            val += 1
-
-            if 'multi-attr' in elem and elem['multi-attr']:
-                attr = TypeMultiAttr(family, self, elem)
-            elif elem['type'] in scalars:
-                attr = TypeScalar(family, self, elem)
-            elif elem['type'] == 'unused':
-                attr = TypeUnused(family, self, elem)
-            elif elem['type'] == 'pad':
-                attr = TypePad(family, self, elem)
-            elif elem['type'] == 'flag':
-                attr = TypeFlag(family, self, elem)
-            elif elem['type'] == 'string':
-                attr = TypeString(family, self, elem)
-            elif elem['type'] == 'binary':
-                attr = TypeBinary(family, self, elem)
-            elif elem['type'] == 'nest':
-                attr = TypeNest(family, self, elem)
-            elif elem['type'] == 'array-nest':
-                attr = TypeArrayNest(family, self, elem)
-            elif elem['type'] == 'nest-type-value':
-                attr = TypeNestTypeValue(family, self, elem)
-            else:
-                raise Exception(f"No typed class for type {elem['type']}")
-
-            self.attrs[elem['name']] = attr
-
-    def __getitem__(self, key):
-        return self.attrs[key]
-
-    def __contains__(self, key):
-        return key in self.yaml
-
-    def __iter__(self):
-        yield from self.attrs
+    def new_attr(self, elem, value):
+        if 'multi-attr' in elem and elem['multi-attr']:
+            return TypeMultiAttr(self.family, self, elem, value)
+        elif elem['type'] in scalars:
+            return TypeScalar(self.family, self, elem, value)
+        elif elem['type'] == 'unused':
+            return TypeUnused(self.family, self, elem, value)
+        elif elem['type'] == 'pad':
+            return TypePad(self.family, self, elem, value)
+        elif elem['type'] == 'flag':
+            return TypeFlag(self.family, self, elem, value)
+        elif elem['type'] == 'string':
+            return TypeString(self.family, self, elem, value)
+        elif elem['type'] == 'binary':
+            return TypeBinary(self.family, self, elem, value)
+        elif elem['type'] == 'nest':
+            return TypeNest(self.family, self, elem, value)
+        elif elem['type'] == 'array-nest':
+            return TypeArrayNest(self.family, self, elem, value)
+        elif elem['type'] == 'nest-type-value':
+            return TypeNestTypeValue(self.family, self, elem, value)
+        else:
+            raise Exception(f"No typed class for type {elem['type']}")
 
-    def items(self):
-        return self.attrs.items()
 
+class Operation(SpecOperation):
+    def __init__(self, family, yaml, req_value, rsp_value):
+        super().__init__(family, yaml, req_value, rsp_value)
 
-class Operation:
-    def __init__(self, family, yaml, value):
-        self.yaml = yaml
-        self.value = value
+        if req_value != rsp_value:
+            raise Exception("Directional messages not supported by codegen")
 
-        self.name = self.yaml['name']
         self.render_name = family.name + '_' + c_lower(self.name)
-        self.is_async = 'notify' in yaml or 'event' in yaml
-        if not self.is_async:
-            self.enum_name = family.op_prefix + c_upper(self.name)
-        else:
-            self.enum_name = family.async_op_prefix + c_upper(self.name)
 
         self.dual_policy = ('do' in yaml and 'request' in yaml['do']) and \
                          ('dump' in yaml and 'request' in yaml['dump'])
 
-    def __getitem__(self, key):
-        return self.yaml[key]
+        # Added by resolve:
+        self.enum_name = None
+        delattr(self, "enum_name")
 
-    def __contains__(self, key):
-        return key in self.yaml
+    def resolve(self):
+        self.resolve_up(super())
+
+        if not self.is_async:
+            self.enum_name = self.family.op_prefix + c_upper(self.name)
+        else:
+            self.enum_name = self.family.async_op_prefix + c_upper(self.name)
 
     def add_notification(self, op):
         if 'notify' not in self.yaml:
@@ -751,21 +744,23 @@ class Operation:
         self.yaml['notify']['cmds'].append(op)
 
 
-class Family:
+class Family(SpecFamily):
     def __init__(self, file_name):
-        with open(file_name, "r") as stream:
-            self.yaml = yaml.safe_load(stream)
-
-        self.proto = self.yaml.get('protocol', 'genetlink')
-
-        with open(os.path.dirname(os.path.dirname(file_name)) +
-                  f'/{self.proto}.yaml', "r") as stream:
-            schema = yaml.safe_load(stream)
-
-        jsonschema.validate(self.yaml, schema)
-
-        if self.yaml.get('protocol', 'genetlink') not in {'genetlink', 'genetlink-c', 'genetlink-legacy'}:
-            raise Exception("Codegen only supported for genetlink")
+        # Added by resolve:
+        self.c_name = None
+        delattr(self, "c_name")
+        self.op_prefix = None
+        delattr(self, "op_prefix")
+        self.async_op_prefix = None
+        delattr(self, "async_op_prefix")
+        self.mcgrps = None
+        delattr(self, "mcgrps")
+        self.consts = None
+        delattr(self, "consts")
+        self.hooks = None
+        delattr(self, "hooks")
+
+        super().__init__(file_name)
 
         self.fam_key = c_upper(self.yaml.get('c-family-name', self.yaml["name"] + '_FAMILY_NAME'))
         self.ver_key = c_upper(self.yaml.get('c-version-name', self.yaml["name"] + '_FAMILY_VERSION'))
@@ -773,12 +768,18 @@ class Family:
         if 'definitions' not in self.yaml:
             self.yaml['definitions'] = []
 
-        self.name = self.yaml['name']
-        self.c_name = c_lower(self.name)
         if 'uapi-header' in self.yaml:
             self.uapi_header = self.yaml['uapi-header']
         else:
             self.uapi_header = f"linux/{self.name}.h"
+
+    def resolve(self):
+        self.resolve_up(super())
+
+        if self.yaml.get('protocol', 'genetlink') not in {'genetlink', 'genetlink-c', 'genetlink-legacy'}:
+            raise Exception("Codegen only supported for genetlink")
+
+        self.c_name = c_lower(self.name)
         if 'name-prefix' in self.yaml['operations']:
             self.op_prefix = c_upper(self.yaml['operations']['name-prefix'])
         else:
@@ -791,12 +792,6 @@ class Family:
         self.mcgrps = self.yaml.get('mcast-groups', {'list': []})
 
         self.consts = dict()
-        # list of all operations
-        self.msg_list = []
-        # dict of operations which have their own message type (have attributes)
-        self.ops = collections.OrderedDict()
-        self.attr_sets = dict()
-        self.attr_sets_list = []
 
         self.hooks = dict()
         for when in ['pre', 'post']:
@@ -824,11 +819,11 @@ class Family:
         if self.kernel_policy == 'global':
             self._load_global_policy()
 
-    def __getitem__(self, key):
-        return self.yaml[key]
+    def new_attr_set(self, elem):
+        return AttrSet(self, elem)
 
-    def get(self, key, default=None):
-        return self.yaml.get(key, default)
+    def new_operation(self, elem, req_value, rsp_value):
+        return Operation(self, elem, req_value, rsp_value)
 
     # Fake a 'do' equivalent of all events, so that we can render their response parsing
     def _mock_up_events(self):
@@ -847,27 +842,10 @@ class Family:
             else:
                 self.consts[elem['name']] = elem
 
-        for elem in self.yaml['attribute-sets']:
-            attr_set = AttrSet(self, elem)
-            self.attr_sets[elem['name']] = attr_set
-            self.attr_sets_list.append((elem['name'], attr_set), )
-
         ntf = []
-        val = 0
-        for elem in self.yaml['operations']['list']:
-            if 'value' in elem:
-                val = elem['value']
-
-            op = Operation(self, elem, val)
-            val += 1
-
-            self.msg_list.append(op)
-            if 'notify' in elem:
-                ntf.append(op)
-                continue
-            if 'attribute-set' not in elem:
-                continue
-            self.ops[elem['name']] = op
+        for msg in self.msgs.values():
+            if 'notify' in msg:
+                ntf.append(msg)
         for n in ntf:
             self.ops[n['notify']].add_notification(n)
 
@@ -2033,7 +2011,7 @@ def render_uapi(family, cw):
 
     max_by_define = family.get('max-by-define', False)
 
-    for _, attr_set in family.attr_sets_list:
+    for _, attr_set in family.attr_sets.items():
         if attr_set.subset_of:
             continue
 
@@ -2044,9 +2022,9 @@ def render_uapi(family, cw):
         uapi_enum_start(family, cw, attr_set.yaml, 'enum-name')
         for _, attr in attr_set.items():
             suffix = ','
-            if attr['value'] != val:
-                suffix = f" = {attr['value']},"
-                val = attr['value']
+            if attr.value != val:
+                suffix = f" = {attr.value},"
+                val = attr.value
             val += 1
             cw.p(attr.enum_name + suffix)
         cw.nl()
@@ -2066,7 +2044,7 @@ def render_uapi(family, cw):
     max_value = f"({cnt_name} - 1)"
 
     uapi_enum_start(family, cw, family['operations'], 'enum-name')
-    for op in family.msg_list:
+    for op in family.msgs.values():
         if separate_ntf and ('notify' in op or 'event' in op):
             continue
 
@@ -2085,7 +2063,7 @@ def render_uapi(family, cw):
 
     if separate_ntf:
         uapi_enum_start(family, cw, family['operations'], enum_name='async-enum')
-        for op in family.msg_list:
+        for op in family.msgs.values():
             if separate_ntf and not ('notify' in op or 'event' in op):
                 continue
 
-- 
cgit 


From 19b64b48a33e986dd21430e759629af2562373b2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:45 -0800
Subject: tools: ynl: add support for types needed by ethtool

Ethtool needs support for handful of extra types.
It doesn't have the definitions section yet.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/ynl.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
index 0ceb627ba686..a656b655d302 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/lib/ynl.py
@@ -75,6 +75,9 @@ class NlAttr:
         self.full_len = (self.payload_len + 3) & ~3
         self.raw = raw[offset + 4:offset + self.payload_len]
 
+    def as_u8(self):
+        return struct.unpack("B", self.raw)[0]
+
     def as_u16(self):
         return struct.unpack("H", self.raw)[0]
 
@@ -302,7 +305,7 @@ class YnlFamily(SpecFamily):
 
         self._types = dict()
 
-        for elem in self.yaml['definitions']:
+        for elem in self.yaml.get('definitions', []):
             self._types[elem['name']] = elem
 
         self.async_msg_ids = set()
@@ -334,6 +337,8 @@ class YnlFamily(SpecFamily):
             attr_payload = b''
             for subname, subvalue in value.items():
                 attr_payload += self._add_attr(attr['nested-attributes'], subname, subvalue)
+        elif attr["type"] == 'flag':
+            attr_payload = b''
         elif attr["type"] == 'u32':
             attr_payload = struct.pack("I", int(value))
         elif attr["type"] == 'string':
@@ -369,6 +374,8 @@ class YnlFamily(SpecFamily):
             if attr_spec["type"] == 'nest':
                 subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'])
                 rsp[attr_spec['name']] = subdict
+            elif attr_spec['type'] == 'u8':
+                rsp[attr_spec['name']] = attr.as_u8()
             elif attr_spec['type'] == 'u32':
                 rsp[attr_spec['name']] = attr.as_u32()
             elif attr_spec['type'] == 'u64':
@@ -377,6 +384,8 @@ class YnlFamily(SpecFamily):
                 rsp[attr_spec['name']] = attr.as_strz()
             elif attr_spec["type"] == 'binary':
                 rsp[attr_spec['name']] = attr.as_bin()
+            elif attr_spec["type"] == 'flag':
+                rsp[attr_spec['name']] = True
             else:
                 raise Exception(f'Unknown {attr.type} {attr_spec["name"]} {attr_spec["type"]}')
 
-- 
cgit 


From fd0616d34274fb489e3ea9aed089414d533b58e6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:46 -0800
Subject: tools: ynl: support directional enum-model in CLI

Support families which use different IDs for messages
to and from the kernel.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/ynl.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
index a656b655d302..690065003935 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/lib/ynl.py
@@ -313,7 +313,7 @@ class YnlFamily(SpecFamily):
 
         for msg in self.msgs.values():
             if msg.is_async:
-                self.async_msg_ids.add(msg.value)
+                self.async_msg_ids.add(msg.rsp_value)
 
         for op_name, op in self.ops.items():
             bound_f = functools.partial(self._op, op_name)
@@ -398,7 +398,7 @@ class YnlFamily(SpecFamily):
         if self.include_raw:
             msg['nlmsg'] = nl_msg
             msg['genlmsg'] = genl_msg
-        op = self.msgs_by_value[genl_msg.genl_cmd]
+        op = self.rsp_by_value[genl_msg.genl_cmd]
         msg['name'] = op['name']
         msg['msg'] = self._decode(genl_msg.raw_attrs, op.attr_set.name)
         self.async_msg_queue.append(msg)
@@ -435,7 +435,7 @@ class YnlFamily(SpecFamily):
             nl_flags |= Netlink.NLM_F_DUMP
 
         req_seq = random.randint(1024, 65535)
-        msg = _genl_msg(self.family.family_id, nl_flags, op.value, 1, req_seq)
+        msg = _genl_msg(self.family.family_id, nl_flags, op.req_value, 1, req_seq)
         for name, value in vals.items():
             msg += self._add_attr(op.attr_set.name, name, value)
         msg = _genl_msg_finalize(msg)
@@ -458,7 +458,7 @@ class YnlFamily(SpecFamily):
 
                 gm = GenlMsg(nl_msg)
                 # Check if this is a reply to our request
-                if nl_msg.nl_seq != req_seq or gm.genl_cmd != op.value:
+                if nl_msg.nl_seq != req_seq or gm.genl_cmd != op.rsp_value:
                     if gm.genl_cmd in self.async_msg_ids:
                         self.handle_ntf(nl_msg, gm)
                         continue
-- 
cgit 


From 90256f3f8093284ef4f162229438ff1eea3a928e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:47 -0800
Subject: tools: ynl: support multi-attr

Ethtool uses mutli-attr, add the support to YNL.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/ynl.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
index 690065003935..c16326495cb7 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/lib/ynl.py
@@ -373,22 +373,29 @@ class YnlFamily(SpecFamily):
             attr_spec = attr_space.attrs_by_val[attr.type]
             if attr_spec["type"] == 'nest':
                 subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'])
-                rsp[attr_spec['name']] = subdict
+                decoded = subdict
             elif attr_spec['type'] == 'u8':
-                rsp[attr_spec['name']] = attr.as_u8()
+                decoded = attr.as_u8()
             elif attr_spec['type'] == 'u32':
-                rsp[attr_spec['name']] = attr.as_u32()
+                decoded = attr.as_u32()
             elif attr_spec['type'] == 'u64':
-                rsp[attr_spec['name']] = attr.as_u64()
+                decoded = attr.as_u64()
             elif attr_spec["type"] == 'string':
-                rsp[attr_spec['name']] = attr.as_strz()
+                decoded = attr.as_strz()
             elif attr_spec["type"] == 'binary':
-                rsp[attr_spec['name']] = attr.as_bin()
+                decoded = attr.as_bin()
             elif attr_spec["type"] == 'flag':
-                rsp[attr_spec['name']] = True
+                decoded = True
             else:
                 raise Exception(f'Unknown {attr.type} {attr_spec["name"]} {attr_spec["type"]}')
 
+            if not attr_spec.is_multi:
+                rsp[attr_spec['name']] = decoded
+            elif attr_spec.name in rsp:
+                rsp[attr_spec.name].append(decoded)
+            else:
+                rsp[attr_spec.name] = [decoded]
+
             if 'enum' in attr_spec:
                 self._decode_enum(rsp, attr_spec)
         return rsp
-- 
cgit 


From 4cd2796f3f8d82aa3a13d3eca39950fe6fe9d672 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:48 -0800
Subject: tools: ynl: support pretty printing bad attribute names

One of my favorite features of the Netlink specs is that they
make decoding structured extack a ton easier.
Implement pretty printing bad attribute names in YNL.

For example it will now say:

  'bad-attr': '.header.flags'

rather than the useless:

  'bad-attr-offs': 32

Proof:

  $ ./cli.py --spec ethtool.yaml --do rings_get \
     --json '{"header":{"dev-index":1, "flags":4}}'
  Netlink error: Invalid argument
  nl_len = 68 (52) nl_flags = 0x300 nl_type = 2
	error: -22	extack: {'msg': 'reserved bit set',
				 'bad-attr': '.header.flags'}

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/ynl.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'tools')

diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
index c16326495cb7..2ff3e6dbdbf6 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/lib/ynl.py
@@ -400,6 +400,40 @@ class YnlFamily(SpecFamily):
                 self._decode_enum(rsp, attr_spec)
         return rsp
 
+    def _decode_extack_path(self, attrs, attr_set, offset, target):
+        for attr in attrs:
+            attr_spec = attr_set.attrs_by_val[attr.type]
+            if offset > target:
+                break
+            if offset == target:
+                return '.' + attr_spec.name
+
+            if offset + attr.full_len <= target:
+                offset += attr.full_len
+                continue
+            if attr_spec['type'] != 'nest':
+                raise Exception(f"Can't dive into {attr.type} ({attr_spec['name']}) for extack")
+            offset += 4
+            subpath = self._decode_extack_path(NlAttrs(attr.raw),
+                                               self.attr_sets[attr_spec['nested-attributes']],
+                                               offset, target)
+            if subpath is None:
+                return None
+            return '.' + attr_spec.name + subpath
+
+        return None
+
+    def _decode_extack(self, request, attr_space, extack):
+        if 'bad-attr-offs' not in extack:
+            return
+
+        genl_req = GenlMsg(NlMsg(request, 0, attr_space=attr_space))
+        path = self._decode_extack_path(genl_req.raw_attrs, attr_space,
+                                        20, extack['bad-attr-offs'])
+        if path:
+            del extack['bad-attr-offs']
+            extack['bad-attr'] = path
+
     def handle_ntf(self, nl_msg, genl_msg):
         msg = dict()
         if self.include_raw:
@@ -455,11 +489,17 @@ class YnlFamily(SpecFamily):
             reply = self.sock.recv(128 * 1024)
             nms = NlMsgs(reply, attr_space=op.attr_set)
             for nl_msg in nms:
+                if nl_msg.extack:
+                    self._decode_extack(msg, op.attr_set, nl_msg.extack)
+
                 if nl_msg.error:
                     print("Netlink error:", os.strerror(-nl_msg.error))
                     print(nl_msg)
                     return
                 if nl_msg.done:
+                    if nl_msg.extack:
+                        print("Netlink warning:")
+                        print(nl_msg)
                     done = True
                     break
 
-- 
cgit 


From 8dfec0a8886880868802094967c6a769b6d15737 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:49 -0800
Subject: tools: ynl: use operation names from spec on the CLI

When I wrote the first version of the Python code I was quite
excited that we can generate class methods directly from the
spec. Unfortunately we need to use valid identifiers for method
names (specifically no dashes are allowed). Don't reuse those
names on the CLI, it's much more natural to use the operation
names exactly as listed in the spec.

Instead of:
  ./cli --do rings_get
use:
  ./cli --do rings-get

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/cli.py     | 9 +++++----
 tools/net/ynl/lib/ynl.py | 6 ++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py
index 5c4eb5a68514..05d1f4069ce1 100755
--- a/tools/net/ynl/cli.py
+++ b/tools/net/ynl/cli.py
@@ -32,10 +32,11 @@ def main():
     if args.sleep:
         time.sleep(args.sleep)
 
-    if args.do or args.dump:
-        method = getattr(ynl, args.do if args.do else args.dump)
-
-        reply = method(attrs, dump=bool(args.dump))
+    if args.do:
+        reply = ynl.do(args.do, attrs)
+        pprint.PrettyPrinter().pprint(reply)
+    if args.dump:
+        reply = ynl.dump(args.dump, attrs)
         pprint.PrettyPrinter().pprint(reply)
 
     if args.ntf:
diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
index 2ff3e6dbdbf6..1c7411ee04dc 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/lib/ynl.py
@@ -520,3 +520,9 @@ class YnlFamily(SpecFamily):
         if not dump and len(rsp) == 1:
             return rsp[0]
         return rsp
+
+    def do(self, method, vals):
+        return self._op(method, vals)
+
+    def dump(self, method, vals):
+        return self._op(method, vals, dump=True)
-- 
cgit 


From 5c6674f6eb52f7968b805b25c7478b3d96b6b4f7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:50 -0800
Subject: tools: ynl: load jsonschema on demand

The CLI script tries to validate jsonschema by default.
It's seems better to validate too many times than too few.
However, when copying the scripts to random servers having
to install jsonschema is tedious. Load jsonschema via
importlib, and let the user opt out.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/cli.py        |  4 ++++
 tools/net/ynl/lib/nlspec.py | 11 ++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py
index 05d1f4069ce1..e64f1478764f 100755
--- a/tools/net/ynl/cli.py
+++ b/tools/net/ynl/cli.py
@@ -13,6 +13,7 @@ def main():
     parser = argparse.ArgumentParser(description='YNL CLI sample')
     parser.add_argument('--spec', dest='spec', type=str, required=True)
     parser.add_argument('--schema', dest='schema', type=str)
+    parser.add_argument('--no-schema', action='store_true')
     parser.add_argument('--json', dest='json_text', type=str)
     parser.add_argument('--do', dest='do', type=str)
     parser.add_argument('--dump', dest='dump', type=str)
@@ -20,6 +21,9 @@ def main():
     parser.add_argument('--subscribe', dest='ntf', type=str)
     args = parser.parse_args()
 
+    if args.no_schema:
+        args.schema = ''
+
     attrs = {}
     if args.json_text:
         attrs = json.loads(args.json_text)
diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
index 4aa3b1ad97f0..e204679ad8b7 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/lib/nlspec.py
@@ -1,12 +1,16 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import collections
-import jsonschema
+import importlib
 import os
 import traceback
 import yaml
 
 
+# To be loaded dynamically as needed
+jsonschema = None
+
+
 class SpecElement:
     """Netlink spec element.
 
@@ -197,9 +201,14 @@ class SpecFamily(SpecElement):
         if schema_path is None:
             schema_path = os.path.dirname(os.path.dirname(spec_path)) + f'/{self.proto}.yaml'
         if schema_path:
+            global jsonschema
+
             with open(schema_path, "r") as stream:
                 schema = yaml.safe_load(stream)
 
+            if jsonschema is None:
+                jsonschema = importlib.import_module("jsonschema")
+
             jsonschema.validate(self.yaml, schema)
 
         self.attr_sets = collections.OrderedDict()
-- 
cgit 


From 981cbcb030d919ee49ebbfc2889839c6882d9ea7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 30 Jan 2023 18:33:54 -0800
Subject: tools: net: use python3 explicitly

The scripts require Python 3 and some distros are dropping
Python 2 support.

Reported-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/cli.py       | 2 +-
 tools/net/ynl/ynl-gen-c.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py
index e64f1478764f..db410b74d539 100755
--- a/tools/net/ynl/cli.py
+++ b/tools/net/ynl/cli.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # SPDX-License-Identifier: BSD-3-Clause
 
 import argparse
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index dc14da634e8e..3942f24b9163 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import argparse
 import collections
-- 
cgit 


From 1680801ef64d12b44048e0a35a913d7b13d78be0 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Mon, 30 Jan 2023 17:40:01 +0100
Subject: selftests: mlxsw: qos_dscp_bridge: Convert from lldptool to dcb

Set up DSCP prioritization through the iproute2 dcb tool, which is easier
to understand and manage.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/drivers/net/mlxsw/qos_dscp_bridge.sh | 23 +++++-----------------
 1 file changed, 5 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
index 28a570006d4d..87c41f5727c9 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
@@ -20,7 +20,7 @@
 # | SW |                                                                |     |
 # |  +-|----------------------------------------------------------------|-+   |
 # |  | + $swp1                       BR                           $swp2 + |   |
-# |  |   APP=0,5,10 .. 7,5,17                      APP=0,5,20 .. 7,5,27   |   |
+# |  |   dcb dscp-prio 10:0...17:7            dcb dscp-prio 20:0...27:7   |   |
 # |  +--------------------------------------------------------------------+   |
 # +---------------------------------------------------------------------------+
 
@@ -62,16 +62,6 @@ h2_destroy()
 	simple_if_fini $h2 192.0.2.2/28
 }
 
-dscp_map()
-{
-	local base=$1; shift
-	local prio
-
-	for prio in {0..7}; do
-		echo app=$prio,5,$((base + prio))
-	done
-}
-
 switch_create()
 {
 	ip link add name br1 type bridge vlan_filtering 1
@@ -81,17 +71,14 @@ switch_create()
 	ip link set dev $swp2 master br1
 	ip link set dev $swp2 up
 
-	lldptool -T -i $swp1 -V APP $(dscp_map 10) >/dev/null
-	lldptool -T -i $swp2 -V APP $(dscp_map 20) >/dev/null
-	lldpad_app_wait_set $swp1
-	lldpad_app_wait_set $swp2
+	dcb app add dev $swp1 dscp-prio 10:0 11:1 12:2 13:3 14:4 15:5 16:6 17:7
+	dcb app add dev $swp2 dscp-prio 20:0 21:1 22:2 23:3 24:4 25:5 26:6 27:7
 }
 
 switch_destroy()
 {
-	lldptool -T -i $swp2 -V APP -d $(dscp_map 20) >/dev/null
-	lldptool -T -i $swp1 -V APP -d $(dscp_map 10) >/dev/null
-	lldpad_app_wait_del
+	dcb app del dev $swp2 dscp-prio 20:0 21:1 22:2 23:3 24:4 25:5 26:6 27:7
+	dcb app del dev $swp1 dscp-prio 10:0 11:1 12:2 13:3 14:4 15:5 16:6 17:7
 
 	ip link set dev $swp2 down
 	ip link set dev $swp2 nomaster
-- 
cgit 


From 10d5bd0b69edb5af7933ba76dd23af3ff9d63b29 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Mon, 30 Jan 2023 17:40:02 +0100
Subject: selftests: mlxsw: qos_dscp_router: Convert from lldptool to dcb

Set up DSCP prioritization through the iproute2 dcb tool, which is easier
to understand and manage.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/drivers/net/mlxsw/qos_dscp_router.sh | 27 +++++-----------------
 1 file changed, 6 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh
index 4cb2aa65278a..f6c23f84423e 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh
@@ -94,16 +94,6 @@ h2_destroy()
 	simple_if_fini $h2 192.0.2.18/28
 }
 
-dscp_map()
-{
-	local base=$1; shift
-	local prio
-
-	for prio in {0..7}; do
-		echo app=$prio,5,$((base + prio))
-	done
-}
-
 switch_create()
 {
 	simple_if_init $swp1 192.0.2.2/28
@@ -112,17 +102,14 @@ switch_create()
 	tc qdisc add dev $swp1 clsact
 	tc qdisc add dev $swp2 clsact
 
-	lldptool -T -i $swp1 -V APP $(dscp_map 0) >/dev/null
-	lldptool -T -i $swp2 -V APP $(dscp_map 0) >/dev/null
-	lldpad_app_wait_set $swp1
-	lldpad_app_wait_set $swp2
+	dcb app add dev $swp1 dscp-prio 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7
+	dcb app add dev $swp2 dscp-prio 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7
 }
 
 switch_destroy()
 {
-	lldptool -T -i $swp2 -V APP -d $(dscp_map 0) >/dev/null
-	lldptool -T -i $swp1 -V APP -d $(dscp_map 0) >/dev/null
-	lldpad_app_wait_del
+	dcb app del dev $swp2 dscp-prio 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7
+	dcb app del dev $swp1 dscp-prio 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7
 
 	tc qdisc del dev $swp2 clsact
 	tc qdisc del dev $swp1 clsact
@@ -265,13 +252,11 @@ test_dscp_leftover()
 {
 	echo "Test that last removed DSCP rule is deconfigured correctly"
 
-	lldptool -T -i $swp2 -V APP -d $(dscp_map 0) >/dev/null
-	lldpad_app_wait_del
+	dcb app del dev $swp2 dscp-prio 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7
 
 	__test_update 0 zero
 
-	lldptool -T -i $swp2 -V APP $(dscp_map 0) >/dev/null
-	lldpad_app_wait_set $swp2
+	dcb app add dev $swp2 dscp-prio 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7
 }
 
 trap cleanup EXIT
-- 
cgit 


From 5b3ef0452c593b3439278cbc919119c770786f25 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Mon, 30 Jan 2023 17:40:03 +0100
Subject: selftests: mlxsw: qos_defprio: Convert from lldptool to dcb

Set up default port priority through the iproute2 dcb tool, which is easier
to understand and manage.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/drivers/net/mlxsw/qos_defprio.sh     | 68 +++++-----------------
 1 file changed, 16 insertions(+), 52 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh
index 71066bc4b886..5492fa5550d7 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh
@@ -5,18 +5,18 @@
 # prioritized according to the default priority specified at the port.
 # rx_octets_prio_* counters are used to verify the prioritization.
 #
-# +-----------------------+
-# | H1                    |
-# |    + $h1              |
-# |    | 192.0.2.1/28     |
-# +----|------------------+
+# +----------------------------------+
+# | H1                               |
+# |    + $h1                         |
+# |    | 192.0.2.1/28                |
+# +----|-----------------------------+
 #      |
-# +----|------------------+
-# | SW |                  |
-# |    + $swp1            |
-# |      192.0.2.2/28     |
-# |      APP=<prio>,1,0   |
-# +-----------------------+
+# +----|-----------------------------+
+# | SW |                             |
+# |    + $swp1                       |
+# |      192.0.2.2/28                |
+# |      dcb app default-prio <prio> |
+# +----------------------------------+
 
 ALL_TESTS="
 	ping_ipv4
@@ -29,42 +29,6 @@ NUM_NETIFS=2
 : ${HIT_TIMEOUT:=1000} # ms
 source $lib_dir/lib.sh
 
-declare -a APP
-
-defprio_install()
-{
-	local dev=$1; shift
-	local prio=$1; shift
-	local app="app=$prio,1,0"
-
-	lldptool -T -i $dev -V APP $app >/dev/null
-	lldpad_app_wait_set $dev
-	APP[$prio]=$app
-}
-
-defprio_uninstall()
-{
-	local dev=$1; shift
-	local prio=$1; shift
-	local app=${APP[$prio]}
-
-	lldptool -T -i $dev -V APP -d $app >/dev/null
-	lldpad_app_wait_del
-	unset APP[$prio]
-}
-
-defprio_flush()
-{
-	local dev=$1; shift
-	local prio
-
-	if ((${#APP[@]})); then
-		lldptool -T -i $dev -V APP -d ${APP[@]} >/dev/null
-	fi
-	lldpad_app_wait_del
-	APP=()
-}
-
 h1_create()
 {
 	simple_if_init $h1 192.0.2.1/28
@@ -83,7 +47,7 @@ switch_create()
 
 switch_destroy()
 {
-	defprio_flush $swp1
+	dcb app flush dev $swp1 default-prio
 	ip addr del dev $swp1 192.0.2.2/28
 	ip link set dev $swp1 down
 }
@@ -124,7 +88,7 @@ __test_defprio()
 
 	RET=0
 
-	defprio_install $swp1 $prio_install
+	dcb app add dev $swp1 default-prio $prio_install
 
 	local t0=$(ethtool_stats_get $swp1 rx_frames_prio_$prio_observe)
 	mausezahn -q $h1 -d 100m -c 10 -t arp reply
@@ -134,7 +98,7 @@ __test_defprio()
 	check_err $? "Default priority $prio_install/$prio_observe: Expected to capture 10 packets, got $((t1 - t0))."
 	log_test "Default priority $prio_install/$prio_observe"
 
-	defprio_uninstall $swp1 $prio_install
+	dcb app del dev $swp1 default-prio $prio_install
 }
 
 test_defprio()
@@ -145,7 +109,7 @@ test_defprio()
 		__test_defprio $prio $prio
 	done
 
-	defprio_install $swp1 3
+	dcb app add dev $swp1 default-prio 3
 	__test_defprio 0 3
 	__test_defprio 1 3
 	__test_defprio 2 3
@@ -153,7 +117,7 @@ test_defprio()
 	__test_defprio 5 5
 	__test_defprio 6 6
 	__test_defprio 7 7
-	defprio_uninstall $swp1 3
+	dcb app del dev $swp1 default-prio 3
 }
 
 trap cleanup EXIT
-- 
cgit 


From bd32ff68721c3c1c5943f5493b62aae99190af36 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Mon, 30 Jan 2023 17:40:04 +0100
Subject: selftests: net: forwarding: lib: Drop lldpad_app_wait_set(), _del()

The existing users of these helpers have been converted to iproute2 dcb.
Drop the helpers.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/lib.sh | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 1c4f866de7d7..29cd4705c752 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -524,27 +524,6 @@ cmd_jq()
 	[ ! -z "$output" ]
 }
 
-lldpad_app_wait_set()
-{
-	local dev=$1; shift
-
-	while lldptool -t -i $dev -V APP -c app | grep -Eq "pending|unknown"; do
-		echo "$dev: waiting for lldpad to push pending APP updates"
-		sleep 5
-	done
-}
-
-lldpad_app_wait_del()
-{
-	# Give lldpad a chance to push down the changes. If the device is downed
-	# too soon, the updates will be left pending. However, they will have
-	# been struck off the lldpad's DB already, so we won't be able to tell
-	# they are pending. Then on next test iteration this would cause
-	# weirdness as newly-added APP rules conflict with the old ones,
-	# sometimes getting stuck in an "unknown" state.
-	sleep 5
-}
-
 pre_cleanup()
 {
 	if [ "${PAUSE_ON_CLEANUP}" = "yes" ]; then
-- 
cgit 


From cd955bdd6aa5ec54cdef622a142f8899a64b5446 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Jan 2023 11:06:06 -0800
Subject: objtool: Fix HOSTCC flag usage

HOSTCC is always wanted when building objtool. Setting CC to HOSTCC
happens after tools/scripts/Makefile.include is included, meaning
flags (like CFLAGS) are set assuming say CC is gcc, but then it can be
later set to HOSTCC which may be clang. tools/scripts/Makefile.include
is needed for host set up and common macros in objtool's
Makefile. Rather than override the CC variable to HOSTCC, just pass CC
as HOSTCC to the sub-makes of Makefile.build, the libsubcmd builds and
also to the linkage step.

Signed-off-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20230126190606.40739-4-irogers@google.com
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/Makefile | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index d54b66986627..83b100c1e7f6 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -2,11 +2,6 @@
 include ../scripts/Makefile.include
 include ../scripts/Makefile.arch
 
-# always use the host compiler
-AR	 = $(HOSTAR)
-CC	 = $(HOSTCC)
-LD	 = $(HOSTLD)
-
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(CURDIR)))
 srctree := $(patsubst %/,%,$(dir $(srctree)))
@@ -34,13 +29,18 @@ INCLUDES := -I$(srctree)/tools/include \
 	    -I$(srctree)/tools/objtool/include \
 	    -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include \
 	    -I$(LIBSUBCMD_OUTPUT)/include
+# Note, EXTRA_WARNINGS here was determined for CC and not HOSTCC, it
+# is passed here to match a legacy behavior.
 WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs
-CFLAGS   := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS)
-LDFLAGS  += $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS)
+OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS)
+OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS)
 
 # Allow old libelf to be used:
-elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr)
-CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED)
+elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - | grep elf_getshdr)
+OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED)
+
+# Always want host compilation.
+HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)"
 
 AWK = awk
 MKDIR = mkdir
@@ -61,12 +61,14 @@ export BUILD_ORC
 export srctree OUTPUT CFLAGS SRCARCH AWK
 include $(srctree)/tools/build/Makefile.include
 
-$(OBJTOOL_IN): fixdep FORCE
+$(OBJTOOL_IN): fixdep $(LIBSUBCMD) FORCE
 	$(Q)$(CONFIG_SHELL) ./sync-check.sh
-	$(Q)$(MAKE) $(build)=objtool
+	$(Q)$(MAKE) $(build)=objtool $(HOST_OVERRIDES) CFLAGS="$(OBJTOOL_CFLAGS)" \
+		LDFLAGS="$(OBJTOOL_LDFLAGS)"
+
 
 $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN)
-	$(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@
+	$(QUIET_LINK)$(HOSTCC) $(OBJTOOL_IN) $(OBJTOOL_LDFLAGS) -o $@
 
 
 $(LIBSUBCMD_OUTPUT):
@@ -75,6 +77,7 @@ $(LIBSUBCMD_OUTPUT):
 $(LIBSUBCMD): fixdep $(LIBSUBCMD_OUTPUT) FORCE
 	$(Q)$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \
 		DESTDIR=$(LIBSUBCMD_OUTPUT) prefix= subdir= \
+		$(HOST_OVERRIDES) EXTRA_CFLAGS="$(OBJTOOL_CFLAGS)" \
 		$@ install_headers
 
 $(LIBSUBCMD)-clean:
-- 
cgit 


From d93ee0553cf2e83c1696a18423bcf05b94b85e1d Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 27 Dec 2022 16:00:57 +0000
Subject: objtool: Make struct entries[] static and const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This data is not modified and not used outside of special.c.

Also adapt its users to the constness.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20221216-objtool-memory-v2-1-17968f85a464@weissschuh.net
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/special.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/special.c b/tools/objtool/special.c
index 9c8d827f69af..baa85c31526b 100644
--- a/tools/objtool/special.c
+++ b/tools/objtool/special.c
@@ -26,7 +26,7 @@ struct special_entry {
 	unsigned char key; /* jump_label key */
 };
 
-struct special_entry entries[] = {
+static const struct special_entry entries[] = {
 	{
 		.sec = ".altinstructions",
 		.group = true,
@@ -65,7 +65,7 @@ static void reloc_to_sec_off(struct reloc *reloc, struct section **sec,
 	*off = reloc->sym->offset + reloc->addend;
 }
 
-static int get_alt_entry(struct elf *elf, struct special_entry *entry,
+static int get_alt_entry(struct elf *elf, const struct special_entry *entry,
 			 struct section *sec, int idx,
 			 struct special_alt *alt)
 {
@@ -139,7 +139,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry,
  */
 int special_get_alts(struct elf *elf, struct list_head *alts)
 {
-	struct special_entry *entry;
+	const struct special_entry *entry;
 	struct section *sec;
 	unsigned int nr_entries;
 	struct special_alt *alt;
-- 
cgit 


From cfd66e81799f4a2fdc6447fa99bdb1871f45ff08 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 27 Dec 2022 16:00:58 +0000
Subject: objtool: Make struct check_options static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is not used outside of builtin-check.c.

Also remove the unused declaration from builtin.h .

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20221216-objtool-memory-v2-2-17968f85a464@weissschuh.net
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/builtin-check.c           | 2 +-
 tools/objtool/include/objtool/builtin.h | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index a4f39407bf59..7c175198d09f 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -65,7 +65,7 @@ static int parse_hacks(const struct option *opt, const char *str, int unset)
 	return found ? 0 : -1;
 }
 
-const struct option check_options[] = {
+static const struct option check_options[] = {
 	OPT_GROUP("Actions:"),
 	OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks),
 	OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"),
diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h
index fa45044e3863..2a108e648b7a 100644
--- a/tools/objtool/include/objtool/builtin.h
+++ b/tools/objtool/include/objtool/builtin.h
@@ -7,8 +7,6 @@
 
 #include <subcmd/parse-options.h>
 
-extern const struct option check_options[];
-
 struct opts {
 	/* actions: */
 	bool dump_orc;
-- 
cgit 


From 8045b8f0b17edf375849f83c80dd05194850b6ed Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 27 Dec 2022 16:00:59 +0000
Subject: objtool: Allocate multiple structures with calloc()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By using calloc() instead of malloc() in a loop, libc does not have to
keep around bookkeeping information for each single structure.

This reduces maximum memory usage while processing vmlinux.o from
3153325 KB to 3035668 KB (-3.7%) on my notebooks "localmodconfig".

Note this introduces memory leaks, because some additional structs get
added to the lists later after reading the symbols and sections from the
original object.  Luckily we don't really care about memory leaks in
objtool.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20221216-objtool-memory-v2-3-17968f85a464@weissschuh.net
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/elf.c                 | 42 ++++++++++++++++++-------------------
 tools/objtool/include/objtool/elf.h |  4 ++++
 2 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 64443a7f4bbf..6806ce01d933 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -284,13 +284,13 @@ static int read_sections(struct elf *elf)
 	    !elf_alloc_hash(section_name, sections_nr))
 		return -1;
 
+	elf->section_data = calloc(sections_nr, sizeof(*sec));
+	if (!elf->section_data) {
+		perror("calloc");
+		return -1;
+	}
 	for (i = 0; i < sections_nr; i++) {
-		sec = malloc(sizeof(*sec));
-		if (!sec) {
-			perror("malloc");
-			return -1;
-		}
-		memset(sec, 0, sizeof(*sec));
+		sec = &elf->section_data[i];
 
 		INIT_LIST_HEAD(&sec->symbol_list);
 		INIT_LIST_HEAD(&sec->reloc_list);
@@ -422,13 +422,13 @@ static int read_symbols(struct elf *elf)
 	    !elf_alloc_hash(symbol_name, symbols_nr))
 		return -1;
 
+	elf->symbol_data = calloc(symbols_nr, sizeof(*sym));
+	if (!elf->symbol_data) {
+		perror("calloc");
+		return -1;
+	}
 	for (i = 0; i < symbols_nr; i++) {
-		sym = malloc(sizeof(*sym));
-		if (!sym) {
-			perror("malloc");
-			return -1;
-		}
-		memset(sym, 0, sizeof(*sym));
+		sym = &elf->symbol_data[i];
 
 		sym->idx = i;
 
@@ -918,13 +918,13 @@ static int read_relocs(struct elf *elf)
 		sec->base->reloc = sec;
 
 		nr_reloc = 0;
+		sec->reloc_data = calloc(sec->sh.sh_size / sec->sh.sh_entsize, sizeof(*reloc));
+		if (!sec->reloc_data) {
+			perror("calloc");
+			return -1;
+		}
 		for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) {
-			reloc = malloc(sizeof(*reloc));
-			if (!reloc) {
-				perror("malloc");
-				return -1;
-			}
-			memset(reloc, 0, sizeof(*reloc));
+			reloc = &sec->reloc_data[i];
 			switch (sec->sh.sh_type) {
 			case SHT_REL:
 				if (read_rel_reloc(sec, i, reloc, &symndx))
@@ -1453,16 +1453,16 @@ void elf_close(struct elf *elf)
 		list_for_each_entry_safe(sym, tmpsym, &sec->symbol_list, list) {
 			list_del(&sym->list);
 			hash_del(&sym->hash);
-			free(sym);
 		}
 		list_for_each_entry_safe(reloc, tmpreloc, &sec->reloc_list, list) {
 			list_del(&reloc->list);
 			hash_del(&reloc->hash);
-			free(reloc);
 		}
 		list_del(&sec->list);
-		free(sec);
+		free(sec->reloc_data);
 	}
 
+	free(elf->symbol_data);
+	free(elf->section_data);
 	free(elf);
 }
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index bb60fd42b46f..1c90f0ac0d53 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -39,6 +39,7 @@ struct section {
 	char *name;
 	int idx;
 	bool changed, text, rodata, noinstr, init, truncate;
+	struct reloc *reloc_data;
 };
 
 struct symbol {
@@ -104,6 +105,9 @@ struct elf {
 	struct hlist_head *section_hash;
 	struct hlist_head *section_name_hash;
 	struct hlist_head *reloc_hash;
+
+	struct section *section_data;
+	struct symbol *symbol_data;
 };
 
 #define OFFSET_STRIDE_BITS	4
-- 
cgit 


From 21a899f9fc78be6b289ee4627bccadf560930eb5 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 27 Dec 2022 16:01:02 +0000
Subject: objtool: Optimize layout of struct symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reduce the size of struct symbol on x86_64 from 208 to 200 bytes.
This structure is allocated a lot and never freed.

This reduces maximum memory usage while processing vmlinux.o from
2919716 KB to 2917988 KB (-0.5%) on my notebooks "localmodconfig".

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20221216-objtool-memory-v2-6-17968f85a464@weissschuh.net
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/include/objtool/elf.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index 1c90f0ac0d53..ad0024da262b 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -50,12 +50,11 @@ struct symbol {
 	GElf_Sym sym;
 	struct section *sec;
 	char *name;
-	unsigned int idx;
-	unsigned char bind, type;
+	unsigned int idx, len;
 	unsigned long offset;
-	unsigned int len;
 	unsigned long __subtree_last;
 	struct symbol *pfunc, *cfunc, *alias;
+	unsigned char bind, type;
 	u8 uaccess_safe      : 1;
 	u8 static_call_tramp : 1;
 	u8 retpoline_thunk   : 1;
-- 
cgit 


From a20717aca33b1ff133f513721050fe6c3d7f97b5 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 27 Dec 2022 16:01:03 +0000
Subject: objtool: Optimize layout of struct special_alt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reduce the size of struct special_alt from 72 to 64 bytes.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20221216-objtool-memory-v2-7-17968f85a464@weissschuh.net
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/include/objtool/special.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h
index dc4721e19002..86d4af9c5aa9 100644
--- a/tools/objtool/include/objtool/special.h
+++ b/tools/objtool/include/objtool/special.h
@@ -19,6 +19,7 @@ struct special_alt {
 	bool skip_orig;
 	bool skip_alt;
 	bool jump_or_nop;
+	u8 key_addend;
 
 	struct section *orig_sec;
 	unsigned long orig_off;
@@ -27,7 +28,6 @@ struct special_alt {
 	unsigned long new_off;
 
 	unsigned int orig_len, new_len; /* group only */
-	u8 key_addend;
 };
 
 int special_get_alts(struct elf *elf, struct list_head *alts);
-- 
cgit 


From 4365eec8190c237aea723e6ac9529789215558e1 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 31 Jan 2023 18:28:05 +0000
Subject: kselftest/arm64: Don't require FA64 for streaming SVE tests

During early development a dependedncy was added on having FA64
available so we could use the full FPSIMD register set in the signal
handler.  Subsequently the ABI was finialised so the handler is run with
streaming mode disabled meaning this is redundant but the dependency was
never removed, do so now.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230131-arm64-kselfetest-ssve-fa64-v1-1-f418efcc2b60@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/testcases/ssve_regs.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
index cd738265cdcd..d0eceea92073 100644
--- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
@@ -121,12 +121,7 @@ static int sme_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
 struct tdescr tde = {
 	.name = "Streaming SVE registers",
 	.descr = "Check that we get the right Streaming SVE registers reported",
-	/*
-	 * We shouldn't require FA64 but things like memset() used in the
-	 * helpers might use unsupported instructions so for now disable
-	 * the test unless we've got the full instruction set.
-	 */
-	.feats_required = FEAT_SME | FEAT_SME_FA64,
+	.feats_required = FEAT_SME,
 	.timeout = 3,
 	.init = sme_get_vls,
 	.run = sme_regs,
-- 
cgit 


From 5f389238534ac8ca4ee3ab12eeb89d3984d303a1 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 31 Jan 2023 22:56:34 +0000
Subject: kselftest/arm64: Fix enumeration of systems without 128 bit SME

The current signal handling tests for SME do not account for the fact that
unlike SVE all SME vector lengths are optional so we can't guarantee that
we will encounter the minimum possible VL, they will hang enumerating VLs
on such systems. Abort enumeration when we find the lowest VL.

Fixes: 4963aeb35a9e ("kselftest/arm64: signal: Add SME signal handling tests")
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230131-arm64-kselftest-sig-sme-no-128-v1-1-d47c13dc8e1e@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/testcases/ssve_regs.c | 4 ++++
 tools/testing/selftests/arm64/signal/testcases/za_regs.c   | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
index d0eceea92073..3d37daafcff5 100644
--- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
@@ -34,6 +34,10 @@ static bool sme_get_vls(struct tdescr *td)
 
 		vl &= PR_SME_VL_LEN_MASK;
 
+		/* Did we find the lowest supported VL? */
+		if (vq < sve_vq_from_vl(vl))
+			break;
+
 		/* Skip missing VLs */
 		vq = sve_vq_from_vl(vl);
 
diff --git a/tools/testing/selftests/arm64/signal/testcases/za_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_regs.c
index ea45acb115d5..174ad6656696 100644
--- a/tools/testing/selftests/arm64/signal/testcases/za_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/za_regs.c
@@ -34,6 +34,10 @@ static bool sme_get_vls(struct tdescr *td)
 
 		vl &= PR_SME_VL_LEN_MASK;
 
+		/* Did we find the lowest supported VL? */
+		if (vq < sve_vq_from_vl(vl))
+			break;
+
 		/* Skip missing VLs */
 		vq = sve_vq_from_vl(vl);
 
-- 
cgit 


From a7db82f18cd3d85ea8ef70fca5946b441187ed6d Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 31 Jan 2023 22:56:35 +0000
Subject: kselftest/arm64: Fix enumeration of systems without 128 bit SME for
 SSVE+ZA

The current signal handling tests for SME do not account for the fact that
unlike SVE all SME vector lengths are optional so we can't guarantee that
we will encounter the minimum possible VL, they will hang enumerating VLs
on such systems. Abort enumeration when we find the lowest VL in the newly
added ssve_za_regs test.

Fixes: bc69da5ff087 ("kselftest/arm64: Verify simultaneous SSVE and ZA context generation")
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230131-arm64-kselftest-sig-sme-no-128-v1-2-d47c13dc8e1e@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c
index 954a21f6121a..1f62621794d5 100644
--- a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c
@@ -34,6 +34,10 @@ static bool sme_get_vls(struct tdescr *td)
 
 		vl &= PR_SME_VL_LEN_MASK;
 
+		/* Did we find the lowest supported VL? */
+		if (vq < sve_vq_from_vl(vl))
+			break;
+
 		/* Skip missing VLs */
 		vq = sve_vq_from_vl(vl);
 
-- 
cgit 


From c4a46627e5a846e59f0097e3196f142eb6142f4f Mon Sep 17 00:00:00 2001
From: Vipin Sharma <vipinsh@google.com>
Date: Mon, 12 Dec 2022 10:37:17 -0800
Subject: KVM: selftests: Test Hyper-V extended hypercall enablement

Test Hyper-V extended hypercall, HV_EXT_CALL_QUERY_CAPABILITIES
(0x8001), access denied and invalid parameter cases.

Access is denied if CPUID.0x40000003.EBX BIT(20) is not set.
Invalid parameter if call has fast bit set.

Signed-off-by: Vipin Sharma <vipinsh@google.com>
Link: https://lore.kernel.org/r/20221212183720.4062037-11-vipinsh@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86_64/hyperv.h  | 5 +++++
 tools/testing/selftests/kvm/x86_64/hyperv_features.c | 9 +++++++++
 2 files changed, 14 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
index 28eb99046475..fa65b908b13e 100644
--- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h
+++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
@@ -137,6 +137,8 @@
 	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 11)
 #define HV_CPU_MANAGEMENT			\
 	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 12)
+#define HV_ENABLE_EXTENDED_HYPERCALLS		\
+	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 20)
 #define HV_ISOLATION				\
 	KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 22)
 
@@ -213,6 +215,9 @@
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
 
+/* Extended hypercalls */
+#define HV_EXT_CALL_QUERY_CAPABILITIES		0x8001
+
 #define HV_FLUSH_ALL_PROCESSORS			BIT(0)
 #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES	BIT(1)
 #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY	BIT(2)
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index 258267df8e2a..c5e3b39edd07 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -649,6 +649,15 @@ static void guest_test_hcalls_access(void)
 			hcall->expect = HV_STATUS_SUCCESS;
 			break;
 		case 19:
+			hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES;
+			hcall->expect = HV_STATUS_ACCESS_DENIED;
+			break;
+		case 20:
+			vcpu_set_cpuid_feature(vcpu, HV_ENABLE_EXTENDED_HYPERCALLS);
+			hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES | HV_HYPERCALL_FAST_BIT;
+			hcall->expect = HV_STATUS_INVALID_PARAMETER;
+			break;
+		case 21:
 			kvm_vm_free(vm);
 			return;
 		}
-- 
cgit 


From f65092015a83348fdc2f509e4ad8278e0c2df0cd Mon Sep 17 00:00:00 2001
From: Vipin Sharma <vipinsh@google.com>
Date: Mon, 12 Dec 2022 10:37:18 -0800
Subject: KVM: selftests: Replace hardcoded Linux OS id with HYPERV_LINUX_OS_ID

Use HYPERV_LINUX_OS_ID macro instead of hardcoded 0x8100 << 48

Signed-off-by: Vipin Sharma <vipinsh@google.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20221212183720.4062037-12-vipinsh@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
index d576bc8ce823..2ee0af0d449e 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
@@ -104,7 +104,7 @@ static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_
 
 	/* Set Guest OS id to enable Hyper-V emulation */
 	GUEST_SYNC(1);
-	wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48);
+	wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
 	GUEST_SYNC(2);
 
 	check_tsc_msr_rdtsc();
-- 
cgit 


From 60325261235accc838158c82b403086e0e76e6a9 Mon Sep 17 00:00:00 2001
From: Vipin Sharma <vipinsh@google.com>
Date: Mon, 12 Dec 2022 10:37:20 -0800
Subject: KVM: selftests: Test Hyper-V extended hypercall exit to userspace

Hyper-V extended hypercalls by default exit to userspace. Verify
userspace gets the call, update the result and then verify in guest
correct result is received.

Add KVM_EXIT_HYPERV to list of "known" hypercalls so errors generate
pretty strings.

Signed-off-by: Vipin Sharma <vipinsh@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20221212183720.4062037-14-vipinsh@google.com
[sean: add KVM_EXIT_HYPERV to exit_reasons_known]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile               |  1 +
 tools/testing/selftests/kvm/lib/kvm_util.c         |  1 +
 .../kvm/x86_64/hyperv_extended_hypercalls.c        | 97 ++++++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 1750f91dd936..844417601618 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -67,6 +67,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs
+TEST_GEN_PROGS_x86_64 += x86_64/hyperv_extended_hypercalls
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 56d5ea949cbb..f25b3e9b5a07 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1844,6 +1844,7 @@ static struct exit_reason {
 	{KVM_EXIT_X86_RDMSR, "RDMSR"},
 	{KVM_EXIT_X86_WRMSR, "WRMSR"},
 	{KVM_EXIT_XEN, "XEN"},
+	{KVM_EXIT_HYPERV, "HYPERV"},
 #ifdef KVM_EXIT_MEMORY_NOT_PRESENT
 	{KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
 #endif
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c b/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c
new file mode 100644
index 000000000000..73af44d2167f
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test Hyper-V extended hypercall, HV_EXT_CALL_QUERY_CAPABILITIES (0x8001),
+ * exit to userspace and receive result in guest.
+ *
+ * Negative tests are present in hyperv_features.c
+ *
+ * Copyright 2022 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "hyperv.h"
+
+/* Any value is fine */
+#define EXT_CAPABILITIES 0xbull
+
+static void guest_code(vm_paddr_t in_pg_gpa, vm_paddr_t out_pg_gpa,
+		       vm_vaddr_t out_pg_gva)
+{
+	uint64_t *output_gva;
+
+	wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+	wrmsr(HV_X64_MSR_HYPERCALL, in_pg_gpa);
+
+	output_gva = (uint64_t *)out_pg_gva;
+
+	hyperv_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, in_pg_gpa, out_pg_gpa);
+
+	/* TLFS states output will be a uint64_t value */
+	GUEST_ASSERT_EQ(*output_gva, EXT_CAPABILITIES);
+
+	GUEST_DONE();
+}
+
+int main(void)
+{
+	vm_vaddr_t hcall_out_page;
+	vm_vaddr_t hcall_in_page;
+	struct kvm_vcpu *vcpu;
+	struct kvm_run *run;
+	struct kvm_vm *vm;
+	uint64_t *outval;
+	struct ucall uc;
+
+	/* Verify if extended hypercalls are supported */
+	if (!kvm_cpuid_has(kvm_get_supported_hv_cpuid(),
+			   HV_ENABLE_EXTENDED_HYPERCALLS)) {
+		print_skip("Extended calls not supported by the kernel");
+		exit(KSFT_SKIP);
+	}
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+	run = vcpu->run;
+	vcpu_set_hv_cpuid(vcpu);
+
+	/* Hypercall input */
+	hcall_in_page = vm_vaddr_alloc_pages(vm, 1);
+	memset(addr_gva2hva(vm, hcall_in_page), 0x0, vm->page_size);
+
+	/* Hypercall output */
+	hcall_out_page = vm_vaddr_alloc_pages(vm, 1);
+	memset(addr_gva2hva(vm, hcall_out_page), 0x0, vm->page_size);
+
+	vcpu_args_set(vcpu, 3, addr_gva2gpa(vm, hcall_in_page),
+		      addr_gva2gpa(vm, hcall_out_page), hcall_out_page);
+
+	vcpu_run(vcpu);
+
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_HYPERV,
+		    "Unexpected exit reason: %u (%s)",
+		    run->exit_reason, exit_reason_str(run->exit_reason));
+
+	outval = addr_gpa2hva(vm, run->hyperv.u.hcall.params[1]);
+	*outval = EXT_CAPABILITIES;
+	run->hyperv.u.hcall.result = HV_STATUS_SUCCESS;
+
+	vcpu_run(vcpu);
+
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+		    "Unexpected exit reason: %u (%s)",
+		    run->exit_reason, exit_reason_str(run->exit_reason));
+
+	switch (get_ucall(vcpu, &uc)) {
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT_2(uc, "arg1 = %ld, arg2 = %ld");
+		break;
+	case UCALL_DONE:
+		break;
+	default:
+		TEST_FAIL("Unhandled ucall: %ld", uc.cmd);
+	}
+
+	kvm_vm_free(vm);
+	return 0;
+}
-- 
cgit 


From 400031e05adfcef9e80eca80bdfc3f4b63658be4 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 1 Feb 2023 11:30:15 -0600
Subject: bpf: Add __bpf_kfunc tag to all kfuncs

Now that we have the __bpf_kfunc tag, we should use add it to all
existing kfuncs to ensure that they'll never be elided in LTO builds.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/20230201173016.342758-4-void@manifault.com
---
 kernel/bpf/cpumask.c                               | 60 +++++++++++-----------
 kernel/bpf/helpers.c                               | 38 +++++++-------
 kernel/cgroup/rstat.c                              |  4 +-
 kernel/kexec_core.c                                |  3 +-
 kernel/trace/bpf_trace.c                           |  8 +--
 net/bpf/test_run.c                                 | 55 ++++++++++----------
 net/core/xdp.c                                     |  5 +-
 net/ipv4/tcp_bbr.c                                 | 16 +++---
 net/ipv4/tcp_cong.c                                | 10 ++--
 net/ipv4/tcp_cubic.c                               | 12 ++---
 net/ipv4/tcp_dctcp.c                               | 12 ++---
 net/netfilter/nf_conntrack_bpf.c                   | 20 ++++----
 net/netfilter/nf_nat_bpf.c                         |  6 +--
 net/xfrm/xfrm_interface_bpf.c                      |  7 +--
 .../selftests/bpf/bpf_testmod/bpf_testmod.c        |  2 +-
 15 files changed, 130 insertions(+), 128 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 6bbb67dfc998..52b981512a35 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -48,7 +48,7 @@ __diag_ignore_all("-Wmissing-prototypes",
  * bpf_cpumask_create() allocates memory using the BPF memory allocator, and
  * will not block. It may return NULL if no memory is available.
  */
-struct bpf_cpumask *bpf_cpumask_create(void)
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
 {
 	struct bpf_cpumask *cpumask;
 
@@ -74,7 +74,7 @@ struct bpf_cpumask *bpf_cpumask_create(void)
  * must either be embedded in a map as a kptr, or freed with
  * bpf_cpumask_release().
  */
-struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
 {
 	refcount_inc(&cpumask->usage);
 	return cpumask;
@@ -90,7 +90,7 @@ struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
  * kptr, or freed with bpf_cpumask_release(). This function may return NULL if
  * no BPF cpumask was found in the specified map value.
  */
-struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
 {
 	struct bpf_cpumask *cpumask;
 
@@ -116,7 +116,7 @@ struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
  * reference of the BPF cpumask has been released, it is subsequently freed in
  * an RCU callback in the BPF memory allocator.
  */
-void bpf_cpumask_release(struct bpf_cpumask *cpumask)
+__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
 {
 	if (!cpumask)
 		return;
@@ -135,7 +135,7 @@ void bpf_cpumask_release(struct bpf_cpumask *cpumask)
  * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
  * pointer may be safely passed to this function.
  */
-u32 bpf_cpumask_first(const struct cpumask *cpumask)
+__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
 {
 	return cpumask_first(cpumask);
 }
@@ -148,7 +148,7 @@ u32 bpf_cpumask_first(const struct cpumask *cpumask)
  * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
  * pointer may be safely passed to this function.
  */
-u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
+__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
 {
 	return cpumask_first_zero(cpumask);
 }
@@ -158,7 +158,7 @@ u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
  * @cpu: The CPU to be set in the cpumask.
  * @cpumask: The BPF cpumask in which a bit is being set.
  */
-void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+__bpf_kfunc void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
 {
 	if (!cpu_valid(cpu))
 		return;
@@ -171,7 +171,7 @@ void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
  * @cpu: The CPU to be cleared from the cpumask.
  * @cpumask: The BPF cpumask in which a bit is being cleared.
  */
-void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+__bpf_kfunc void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
 {
 	if (!cpu_valid(cpu))
 		return;
@@ -188,7 +188,7 @@ void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
  * * true  - @cpu is set in the cpumask
  * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu.
  */
-bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
+__bpf_kfunc bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
 {
 	if (!cpu_valid(cpu))
 		return false;
@@ -205,7 +205,7 @@ bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
  * * true  - @cpu is set in the cpumask
  * * false - @cpu was not set in the cpumask, or @cpu is invalid.
  */
-bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+__bpf_kfunc bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
 {
 	if (!cpu_valid(cpu))
 		return false;
@@ -223,7 +223,7 @@ bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
  * * true  - @cpu is set in the cpumask
  * * false - @cpu was not set in the cpumask, or @cpu is invalid.
  */
-bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+__bpf_kfunc bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
 {
 	if (!cpu_valid(cpu))
 		return false;
@@ -235,7 +235,7 @@ bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
  * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask.
  * @cpumask: The BPF cpumask having all of its bits set.
  */
-void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
+__bpf_kfunc void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
 {
 	cpumask_setall((struct cpumask *)cpumask);
 }
@@ -244,7 +244,7 @@ void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
  * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask.
  * @cpumask: The BPF cpumask being cleared.
  */
-void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
+__bpf_kfunc void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
 {
 	cpumask_clear((struct cpumask *)cpumask);
 }
@@ -261,9 +261,9 @@ void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
  *
  * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
  */
-bool bpf_cpumask_and(struct bpf_cpumask *dst,
-		     const struct cpumask *src1,
-		     const struct cpumask *src2)
+__bpf_kfunc bool bpf_cpumask_and(struct bpf_cpumask *dst,
+				 const struct cpumask *src1,
+				 const struct cpumask *src2)
 {
 	return cpumask_and((struct cpumask *)dst, src1, src2);
 }
@@ -276,9 +276,9 @@ bool bpf_cpumask_and(struct bpf_cpumask *dst,
  *
  * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
  */
-void bpf_cpumask_or(struct bpf_cpumask *dst,
-		    const struct cpumask *src1,
-		    const struct cpumask *src2)
+__bpf_kfunc void bpf_cpumask_or(struct bpf_cpumask *dst,
+				const struct cpumask *src1,
+				const struct cpumask *src2)
 {
 	cpumask_or((struct cpumask *)dst, src1, src2);
 }
@@ -291,9 +291,9 @@ void bpf_cpumask_or(struct bpf_cpumask *dst,
  *
  * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
  */
-void bpf_cpumask_xor(struct bpf_cpumask *dst,
-		     const struct cpumask *src1,
-		     const struct cpumask *src2)
+__bpf_kfunc void bpf_cpumask_xor(struct bpf_cpumask *dst,
+				 const struct cpumask *src1,
+				 const struct cpumask *src2)
 {
 	cpumask_xor((struct cpumask *)dst, src1, src2);
 }
@@ -309,7 +309,7 @@ void bpf_cpumask_xor(struct bpf_cpumask *dst,
  *
  * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
  */
-bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
+__bpf_kfunc bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
 {
 	return cpumask_equal(src1, src2);
 }
@@ -325,7 +325,7 @@ bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
  *
  * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
  */
-bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2)
+__bpf_kfunc bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2)
 {
 	return cpumask_intersects(src1, src2);
 }
@@ -341,7 +341,7 @@ bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *sr
  *
  * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
  */
-bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
+__bpf_kfunc bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
 {
 	return cpumask_subset(src1, src2);
 }
@@ -356,7 +356,7 @@ bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
  *
  * A struct bpf_cpumask pointer may be safely passed to @cpumask.
  */
-bool bpf_cpumask_empty(const struct cpumask *cpumask)
+__bpf_kfunc bool bpf_cpumask_empty(const struct cpumask *cpumask)
 {
 	return cpumask_empty(cpumask);
 }
@@ -371,7 +371,7 @@ bool bpf_cpumask_empty(const struct cpumask *cpumask)
  *
  * A struct bpf_cpumask pointer may be safely passed to @cpumask.
  */
-bool bpf_cpumask_full(const struct cpumask *cpumask)
+__bpf_kfunc bool bpf_cpumask_full(const struct cpumask *cpumask)
 {
 	return cpumask_full(cpumask);
 }
@@ -383,7 +383,7 @@ bool bpf_cpumask_full(const struct cpumask *cpumask)
  *
  * A struct bpf_cpumask pointer may be safely passed to @src.
  */
-void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
+__bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
 {
 	cpumask_copy((struct cpumask *)dst, src);
 }
@@ -398,7 +398,7 @@ void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
  *
  * A struct bpf_cpumask pointer may be safely passed to @src.
  */
-u32 bpf_cpumask_any(const struct cpumask *cpumask)
+__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask)
 {
 	return cpumask_any(cpumask);
 }
@@ -415,7 +415,7 @@ u32 bpf_cpumask_any(const struct cpumask *cpumask)
  *
  * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
  */
-u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2)
+__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2)
 {
 	return cpumask_any_and(src1, src2);
 }
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 458db2db2f81..2dae44581922 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1776,7 +1776,7 @@ __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in vmlinux BTF");
 
-void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 {
 	struct btf_struct_meta *meta = meta__ign;
 	u64 size = local_type_id__k;
@@ -1790,7 +1790,7 @@ void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	return p;
 }
 
-void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 {
 	struct btf_struct_meta *meta = meta__ign;
 	void *p = p__alloc;
@@ -1811,12 +1811,12 @@ static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *hea
 	tail ? list_add_tail(n, h) : list_add(n, h);
 }
 
-void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
 {
 	return __bpf_list_add(node, head, false);
 }
 
-void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
 {
 	return __bpf_list_add(node, head, true);
 }
@@ -1834,12 +1834,12 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai
 	return (struct bpf_list_node *)n;
 }
 
-struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
 {
 	return __bpf_list_del(head, false);
 }
 
-struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
 {
 	return __bpf_list_del(head, true);
 }
@@ -1850,7 +1850,7 @@ struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
  * bpf_task_release().
  * @p: The task on which a reference is being acquired.
  */
-struct task_struct *bpf_task_acquire(struct task_struct *p)
+__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
 {
 	return get_task_struct(p);
 }
@@ -1861,7 +1861,7 @@ struct task_struct *bpf_task_acquire(struct task_struct *p)
  * released by calling bpf_task_release().
  * @p: The task on which a reference is being acquired.
  */
-struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
+__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
 {
 	/* For the time being this function returns NULL, as it's not currently
 	 * possible to safely acquire a reference to a task with RCU protection
@@ -1913,7 +1913,7 @@ struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
  * be released by calling bpf_task_release().
  * @pp: A pointer to a task kptr on which a reference is being acquired.
  */
-struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
+__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
 {
 	/* We must return NULL here until we have clarity on how to properly
 	 * leverage RCU for ensuring a task's lifetime. See the comment above
@@ -1926,7 +1926,7 @@ struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
  * bpf_task_release - Release the reference acquired on a task.
  * @p: The task on which a reference is being released.
  */
-void bpf_task_release(struct task_struct *p)
+__bpf_kfunc void bpf_task_release(struct task_struct *p)
 {
 	if (!p)
 		return;
@@ -1941,7 +1941,7 @@ void bpf_task_release(struct task_struct *p)
  * calling bpf_cgroup_release().
  * @cgrp: The cgroup on which a reference is being acquired.
  */
-struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
+__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
 {
 	cgroup_get(cgrp);
 	return cgrp;
@@ -1953,7 +1953,7 @@ struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
  * be released by calling bpf_cgroup_release().
  * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
  */
-struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
+__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
 {
 	struct cgroup *cgrp;
 
@@ -1985,7 +1985,7 @@ struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
  * drops to 0.
  * @cgrp: The cgroup on which a reference is being released.
  */
-void bpf_cgroup_release(struct cgroup *cgrp)
+__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
 {
 	if (!cgrp)
 		return;
@@ -2000,7 +2000,7 @@ void bpf_cgroup_release(struct cgroup *cgrp)
  * @cgrp: The cgroup for which we're performing a lookup.
  * @level: The level of ancestor to look up.
  */
-struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
+__bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
 {
 	struct cgroup *ancestor;
 
@@ -2019,7 +2019,7 @@ struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
  * stored in a map, or released with bpf_task_release().
  * @pid: The pid of the task being looked up.
  */
-struct task_struct *bpf_task_from_pid(s32 pid)
+__bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
 {
 	struct task_struct *p;
 
@@ -2032,22 +2032,22 @@ struct task_struct *bpf_task_from_pid(s32 pid)
 	return p;
 }
 
-void *bpf_cast_to_kern_ctx(void *obj)
+__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 {
 	return obj;
 }
 
-void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
 {
 	return obj__ign;
 }
 
-void bpf_rcu_read_lock(void)
+__bpf_kfunc void bpf_rcu_read_lock(void)
 {
 	rcu_read_lock();
 }
 
-void bpf_rcu_read_unlock(void)
+__bpf_kfunc void bpf_rcu_read_unlock(void)
 {
 	rcu_read_unlock();
 }
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 793ecff29038..831f1f472bb8 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -26,7 +26,7 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
  * rstat_cpu->updated_children list.  See the comment on top of
  * cgroup_rstat_cpu definition for details.
  */
-void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 {
 	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
 	unsigned long flags;
@@ -231,7 +231,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
  *
  * This function may block.
  */
-void cgroup_rstat_flush(struct cgroup *cgrp)
+__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
 {
 	might_sleep();
 
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 969e8f52f7da..b1cf259854ca 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -6,6 +6,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/btf.h>
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/file.h>
@@ -975,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
 }
 STACK_FRAME_NON_STANDARD(__crash_kexec);
 
-void crash_kexec(struct pt_regs *regs)
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
 {
 	int old_cpu, this_cpu;
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b1eff2efd3b4..ff1458e541a8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1236,7 +1236,7 @@ __diag_ignore_all("-Wmissing-prototypes",
  * Return: a bpf_key pointer with a valid key pointer if the key is found, a
  *         NULL pointer otherwise.
  */
-struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
 {
 	key_ref_t key_ref;
 	struct bpf_key *bkey;
@@ -1285,7 +1285,7 @@ struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
  * Return: a bpf_key pointer with an invalid key pointer set from the
  *         pre-determined ID on success, a NULL pointer otherwise
  */
-struct bpf_key *bpf_lookup_system_key(u64 id)
+__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
 {
 	struct bpf_key *bkey;
 
@@ -1309,7 +1309,7 @@ struct bpf_key *bpf_lookup_system_key(u64 id)
  * Decrement the reference count of the key inside *bkey*, if the pointer
  * is valid, and free *bkey*.
  */
-void bpf_key_put(struct bpf_key *bkey)
+__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
 {
 	if (bkey->has_ref)
 		key_put(bkey->key);
@@ -1329,7 +1329,7 @@ void bpf_key_put(struct bpf_key *bkey)
  *
  * Return: 0 on success, a negative value on error.
  */
-int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
 			       struct bpf_dynptr_kern *sig_ptr,
 			       struct bpf_key *trusted_keyring)
 {
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 7dbefa4fd2eb..af9827c4b351 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -484,7 +484,7 @@ out:
 __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in vmlinux BTF");
-int noinline bpf_fentry_test1(int a)
+__bpf_kfunc int bpf_fentry_test1(int a)
 {
 	return a + 1;
 }
@@ -529,23 +529,23 @@ int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg)
 	return (long)arg->a;
 }
 
-int noinline bpf_modify_return_test(int a, int *b)
+__bpf_kfunc int bpf_modify_return_test(int a, int *b)
 {
 	*b += 1;
 	return a + *b;
 }
 
-u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d)
+__bpf_kfunc u64 bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d)
 {
 	return a + b + c + d;
 }
 
-int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b)
+__bpf_kfunc int bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b)
 {
 	return a + b;
 }
 
-struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
+__bpf_kfunc struct sock *bpf_kfunc_call_test3(struct sock *sk)
 {
 	return sk;
 }
@@ -582,21 +582,21 @@ static struct prog_test_ref_kfunc prog_test_struct = {
 	.cnt = REFCOUNT_INIT(1),
 };
 
-noinline struct prog_test_ref_kfunc *
+__bpf_kfunc struct prog_test_ref_kfunc *
 bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr)
 {
 	refcount_inc(&prog_test_struct.cnt);
 	return &prog_test_struct;
 }
 
-noinline struct prog_test_member *
+__bpf_kfunc struct prog_test_member *
 bpf_kfunc_call_memb_acquire(void)
 {
 	WARN_ON_ONCE(1);
 	return NULL;
 }
 
-noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
+__bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
 {
 	if (!p)
 		return;
@@ -604,11 +604,11 @@ noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
 	refcount_dec(&p->cnt);
 }
 
-noinline void bpf_kfunc_call_memb_release(struct prog_test_member *p)
+__bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p)
 {
 }
 
-noinline void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p)
+__bpf_kfunc void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p)
 {
 	WARN_ON_ONCE(1);
 }
@@ -621,12 +621,14 @@ static int *__bpf_kfunc_call_test_get_mem(struct prog_test_ref_kfunc *p, const i
 	return (int *)p;
 }
 
-noinline int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size)
+__bpf_kfunc int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p,
+						  const int rdwr_buf_size)
 {
 	return __bpf_kfunc_call_test_get_mem(p, rdwr_buf_size);
 }
 
-noinline int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size)
+__bpf_kfunc int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p,
+						    const int rdonly_buf_size)
 {
 	return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size);
 }
@@ -636,16 +638,17 @@ noinline int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p,
  * Acquire functions must return struct pointers, so these ones are
  * failing.
  */
-noinline int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size)
+__bpf_kfunc int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p,
+						    const int rdonly_buf_size)
 {
 	return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size);
 }
 
-noinline void bpf_kfunc_call_int_mem_release(int *p)
+__bpf_kfunc void bpf_kfunc_call_int_mem_release(int *p)
 {
 }
 
-noinline struct prog_test_ref_kfunc *
+__bpf_kfunc struct prog_test_ref_kfunc *
 bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **pp, int a, int b)
 {
 	struct prog_test_ref_kfunc *p = READ_ONCE(*pp);
@@ -694,47 +697,47 @@ struct prog_test_fail3 {
 	char arr2[];
 };
 
-noinline void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb)
+__bpf_kfunc void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb)
 {
 }
 
-noinline void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p)
+__bpf_kfunc void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p)
 {
 }
 
-noinline void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p)
+__bpf_kfunc void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p)
 {
 }
 
-noinline void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p)
+__bpf_kfunc void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p)
 {
 }
 
-noinline void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p)
+__bpf_kfunc void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p)
 {
 }
 
-noinline void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p)
+__bpf_kfunc void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p)
 {
 }
 
-noinline void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz)
+__bpf_kfunc void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz)
 {
 }
 
-noinline void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len)
+__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len)
 {
 }
 
-noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
+__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
 {
 }
 
-noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
+__bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
 {
 }
 
-noinline void bpf_kfunc_call_test_destructive(void)
+__bpf_kfunc void bpf_kfunc_call_test_destructive(void)
 {
 }
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index a5a7ecf6391c..787fb9f92b36 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -4,6 +4,7 @@
  * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
  */
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/btf_ids.h>
 #include <linux/filter.h>
 #include <linux/types.h>
@@ -722,7 +723,7 @@ __diag_ignore_all("-Wmissing-prototypes",
  *
  * Returns 0 on success or ``-errno`` on error.
  */
-int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
+__bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
 {
 	return -EOPNOTSUPP;
 }
@@ -734,7 +735,7 @@ int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
  *
  * Returns 0 on success or ``-errno`` on error.
  */
-int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash)
+__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index d2c470524e58..146792cd26fe 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -295,7 +295,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 }
 
 /* override sysctl_tcp_min_tso_segs */
-static u32 bbr_min_tso_segs(struct sock *sk)
+__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
 {
 	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
 }
@@ -328,7 +328,7 @@ static void bbr_save_cwnd(struct sock *sk)
 		bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp));
 }
 
-static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+__bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
@@ -1023,7 +1023,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 	bbr_update_gains(sk);
 }
 
-static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
 	u32 bw;
@@ -1035,7 +1035,7 @@ static void bbr_main(struct sock *sk, const struct rate_sample *rs)
 	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
 }
 
-static void bbr_init(struct sock *sk)
+__bpf_kfunc static void bbr_init(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
@@ -1077,7 +1077,7 @@ static void bbr_init(struct sock *sk)
 	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
 }
 
-static u32 bbr_sndbuf_expand(struct sock *sk)
+__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
 {
 	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
 	return 3;
@@ -1086,7 +1086,7 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
 /* In theory BBR does not need to undo the cwnd since it does not
  * always reduce cwnd on losses (see bbr_main()). Keep it for now.
  */
-static u32 bbr_undo_cwnd(struct sock *sk)
+__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
 
@@ -1097,7 +1097,7 @@ static u32 bbr_undo_cwnd(struct sock *sk)
 }
 
 /* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
-static u32 bbr_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 bbr_ssthresh(struct sock *sk)
 {
 	bbr_save_cwnd(sk);
 	return tcp_sk(sk)->snd_ssthresh;
@@ -1125,7 +1125,7 @@ static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
 	return 0;
 }
 
-static void bbr_set_state(struct sock *sk, u8 new_state)
+__bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
 
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index d3cae40749e8..db8b4b488c31 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -403,7 +403,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
  * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
  * returns the leftover acks to adjust cwnd in congestion avoidance mode.
  */
-u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
+__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
 {
 	u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);
 
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
 /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
  * for every packet that was ACKed.
  */
-void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
+__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
 {
 	/* If credits accumulated at a higher w, apply them gently now. */
 	if (tp->snd_cwnd_cnt >= w) {
@@ -443,7 +443,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
 /* This is Jacobson's slow start and congestion avoidance.
  * SIGCOMM '88, p. 328.
  */
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -462,7 +462,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
 
 /* Slow start threshold is half the congestion window (min 2) */
-u32 tcp_reno_ssthresh(struct sock *sk)
+__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -470,7 +470,7 @@ u32 tcp_reno_ssthresh(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
 
-u32 tcp_reno_undo_cwnd(struct sock *sk)
+__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 768c10c1f649..0fd78ecb67e7 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -126,7 +126,7 @@ static inline void bictcp_hystart_reset(struct sock *sk)
 	ca->sample_cnt = 0;
 }
 
-static void cubictcp_init(struct sock *sk)
+__bpf_kfunc static void cubictcp_init(struct sock *sk)
 {
 	struct bictcp *ca = inet_csk_ca(sk);
 
@@ -139,7 +139,7 @@ static void cubictcp_init(struct sock *sk)
 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+__bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 {
 	if (event == CA_EVENT_TX_START) {
 		struct bictcp *ca = inet_csk_ca(sk);
@@ -321,7 +321,7 @@ tcp_friendliness:
 	ca->cnt = max(ca->cnt, 2U);
 }
 
-static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+__bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
@@ -338,7 +338,7 @@ static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	tcp_cong_avoid_ai(tp, ca->cnt, acked);
 }
 
-static u32 cubictcp_recalc_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
@@ -355,7 +355,7 @@ static u32 cubictcp_recalc_ssthresh(struct sock *sk)
 	return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
 }
 
-static void cubictcp_state(struct sock *sk, u8 new_state)
+__bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss) {
 		bictcp_reset(inet_csk_ca(sk));
@@ -445,7 +445,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 	}
 }
 
-static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
+__bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index e0a2ca7456ff..bb23bb5b387a 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -75,7 +75,7 @@ static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
 	ca->old_delivered_ce = tp->delivered_ce;
 }
 
-static void dctcp_init(struct sock *sk)
+__bpf_kfunc static void dctcp_init(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -104,7 +104,7 @@ static void dctcp_init(struct sock *sk)
 	INET_ECN_dontxmit(sk);
 }
 
-static u32 dctcp_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk)
 {
 	struct dctcp *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -113,7 +113,7 @@ static u32 dctcp_ssthresh(struct sock *sk)
 	return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U);
 }
 
-static void dctcp_update_alpha(struct sock *sk, u32 flags)
+__bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct dctcp *ca = inet_csk_ca(sk);
@@ -169,7 +169,7 @@ static void dctcp_react_to_loss(struct sock *sk)
 	tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);
 }
 
-static void dctcp_state(struct sock *sk, u8 new_state)
+__bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Recovery &&
 	    new_state != inet_csk(sk)->icsk_ca_state)
@@ -179,7 +179,7 @@ static void dctcp_state(struct sock *sk, u8 new_state)
 	 */
 }
 
-static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+__bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 {
 	struct dctcp *ca = inet_csk_ca(sk);
 
@@ -229,7 +229,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
 	return 0;
 }
 
-static u32 dctcp_cwnd_undo(struct sock *sk)
+__bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk)
 {
 	const struct dctcp *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 24002bc61e07..34913521c385 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -249,7 +249,7 @@ __diag_ignore_all("-Wmissing-prototypes",
  * @opts__sz	- Length of the bpf_ct_opts structure
  *		    Must be NF_BPF_CT_OPTS_SZ (12)
  */
-struct nf_conn___init *
+__bpf_kfunc struct nf_conn___init *
 bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
 		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
 {
@@ -283,7 +283,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
  * @opts__sz	- Length of the bpf_ct_opts structure
  *		    Must be NF_BPF_CT_OPTS_SZ (12)
  */
-struct nf_conn *
+__bpf_kfunc struct nf_conn *
 bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
 		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
 {
@@ -316,7 +316,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
  * @opts__sz	- Length of the bpf_ct_opts structure
  *		    Must be NF_BPF_CT_OPTS_SZ (12)
  */
-struct nf_conn___init *
+__bpf_kfunc struct nf_conn___init *
 bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
 		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
 {
@@ -351,7 +351,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
  * @opts__sz	- Length of the bpf_ct_opts structure
  *		    Must be NF_BPF_CT_OPTS_SZ (12)
  */
-struct nf_conn *
+__bpf_kfunc struct nf_conn *
 bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
 		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
 {
@@ -376,7 +376,7 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
  * @nfct	 - Pointer to referenced nf_conn___init object, obtained
  *		   using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
  */
-struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
+__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
 {
 	struct nf_conn *nfct = (struct nf_conn *)nfct_i;
 	int err;
@@ -400,7 +400,7 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
  * @nf_conn	 - Pointer to referenced nf_conn object, obtained using
  *		   bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
  */
-void bpf_ct_release(struct nf_conn *nfct)
+__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
 {
 	if (!nfct)
 		return;
@@ -417,7 +417,7 @@ void bpf_ct_release(struct nf_conn *nfct)
  *                 bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
  * @timeout      - Timeout in msecs.
  */
-void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
+__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
 {
 	__nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
 }
@@ -432,7 +432,7 @@ void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
  *		   bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
  * @timeout      - New timeout in msecs.
  */
-int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
+__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
 {
 	return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
 }
@@ -447,7 +447,7 @@ int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
  *		   bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
  * @status       - New status value.
  */
-int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
+__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
 {
 	return nf_ct_change_status_common((struct nf_conn *)nfct, status);
 }
@@ -462,7 +462,7 @@ int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
  *		   bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
  * @status       - New status value.
  */
-int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
+__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
 {
 	return nf_ct_change_status_common(nfct, status);
 }
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
index 0fa5a0bbb0ff..141ee7783223 100644
--- a/net/netfilter/nf_nat_bpf.c
+++ b/net/netfilter/nf_nat_bpf.c
@@ -30,9 +30,9 @@ __diag_ignore_all("-Wmissing-prototypes",
  *		  interpreted as select a random port.
  * @manip	- NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
  */
-int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
-			union nf_inet_addr *addr, int port,
-			enum nf_nat_manip_type manip)
+__bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
+				    union nf_inet_addr *addr, int port,
+				    enum nf_nat_manip_type manip)
 {
 	struct nf_conn *ct = (struct nf_conn *)nfct;
 	u16 proto = nf_ct_l3num(ct);
diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
index 1ef2162cebcf..d74f3fd20f2b 100644
--- a/net/xfrm/xfrm_interface_bpf.c
+++ b/net/xfrm/xfrm_interface_bpf.c
@@ -39,8 +39,7 @@ __diag_ignore_all("-Wmissing-prototypes",
  * @to		- Pointer to memory to which the metadata will be copied
  *		    Cannot be NULL
  */
-__used noinline
-int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
+__bpf_kfunc int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
 {
 	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
 	struct xfrm_md_info *info;
@@ -62,9 +61,7 @@ int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
  * @from	- Pointer to memory from which the metadata will be copied
  *		    Cannot be NULL
  */
-__used noinline
-int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
-			  const struct bpf_xfrm_info *from)
+__bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bpf_xfrm_info *from)
 {
 	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
 	struct metadata_dst *md_dst;
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 5085fea3cac5..46500636d8cd 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -59,7 +59,7 @@ bpf_testmod_test_struct_arg_5(void) {
 	return bpf_testmod_test_struct_arg_result;
 }
 
-noinline void
+__bpf_kfunc void
 bpf_testmod_test_mod_kfunc(int i)
 {
 	*(int *)this_cpu_ptr(&bpf_testmod_ksym_percpu) = i;
-- 
cgit 


From 6aed15e330bfec6a423f40582b2a8b53d9ce1757 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 1 Feb 2023 11:30:16 -0600
Subject: selftests/bpf: Add testcase for static kfunc with unused arg

kfuncs are allowed to be static, or not use one or more of their
arguments. For example, bpf_xdp_metadata_rx_hash() in net/core/xdp.c is
meant to be implemented by drivers, with the default implementation just
returning -EOPNOTSUPP. As described in [0], such kfuncs can have their
arguments elided, which can cause BTF encoding to be skipped. The new
__bpf_kfunc macro should address this, and this patch adds a selftest
which verifies that a static kfunc with at least one unused argument can
still be encoded and invoked by a BPF program.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230201173016.342758-5-void@manifault.com
---
 net/bpf/test_run.c                                  |  6 ++++++
 tools/testing/selftests/bpf/prog_tests/kfunc_call.c |  1 +
 tools/testing/selftests/bpf/progs/kfunc_call_test.c | 11 +++++++++++
 3 files changed, 18 insertions(+)

(limited to 'tools')

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index af9827c4b351..e6f773d12045 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -741,6 +741,11 @@ __bpf_kfunc void bpf_kfunc_call_test_destructive(void)
 {
 }
 
+__bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused)
+{
+	return arg;
+}
+
 __diag_pop();
 
 BTF_SET8_START(bpf_test_modify_return_ids)
@@ -779,6 +784,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
 BTF_SET8_END(test_sk_check_kfunc_ids)
 
 static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
index bb4cd82a788a..a543742cd7bd 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -77,6 +77,7 @@ static struct kfunc_test_params kfunc_tests[] = {
 	TC_TEST(kfunc_call_test_get_mem, 42),
 	SYSCALL_TEST(kfunc_syscall_test, 0),
 	SYSCALL_NULL_CTX_TEST(kfunc_syscall_test_null, 0),
+	TC_TEST(kfunc_call_test_static_unused_arg, 0),
 };
 
 struct syscall_test_args {
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
index d91c58d06d38..7daa8f5720b9 100644
--- a/tools/testing/selftests/bpf/progs/kfunc_call_test.c
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
@@ -17,6 +17,7 @@ extern void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym;
 extern void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym;
 extern int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) __ksym;
 extern int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym;
+extern u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused) __ksym;
 
 SEC("tc")
 int kfunc_call_test4(struct __sk_buff *skb)
@@ -181,4 +182,14 @@ int kfunc_call_test_get_mem(struct __sk_buff *skb)
 	return ret;
 }
 
+SEC("tc")
+int kfunc_call_test_static_unused_arg(struct __sk_buff *skb)
+{
+
+	u32 expected = 5, actual;
+
+	actual = bpf_kfunc_call_test_static_unused_arg(expected, 0xdeadbeef);
+	return actual != expected ? -1 : 0;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit 


From f2922f77a6a6e6c549f1fd639ec01d2f7999fbc0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 1 Feb 2023 19:12:54 +0100
Subject: selftests/bpf: Fix unmap bug in prog_tests/xdp_metadata.c

The function close_xsk() unmap via munmap() the wrong memory pointer.
The call xsk_umem__delete(xsk->umem) have already freed xsk->umem.
Thus the call to munmap(xsk->umem, UMEM_SIZE) will have unpredictable
behavior that can lead to Segmentation fault elsewhere, as man page
explain subsequent references to these pages will generate SIGSEGV.

Fixes: e2a46d54d7a1 ("selftests/bpf: Verify xdp_metadata xdp->af_xdp path")
Reported-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/167527517464.938135.13750760520577765269.stgit@firesoul
---
 tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
index e033d48288c0..241909d71c7e 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -121,7 +121,7 @@ static void close_xsk(struct xsk *xsk)
 		xsk_umem__delete(xsk->umem);
 	if (xsk->socket)
 		xsk_socket__delete(xsk->socket);
-	munmap(xsk->umem, UMEM_SIZE);
+	munmap(xsk->umem_area, UMEM_SIZE);
 }
 
 static void ip_csum(struct iphdr *iph)
-- 
cgit 


From 3fd9dcd689a5582e682fde57b3139066d032367a Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 1 Feb 2023 18:31:50 +0100
Subject: selftests/bpf: xdp_hw_metadata clear metadata when -EOPNOTSUPP

The AF_XDP userspace part of xdp_hw_metadata see non-zero as a signal of
the availability of rx_timestamp and rx_hash in data_meta area. The
kernel-side BPF-prog code doesn't initialize these members when kernel
returns an error e.g. -EOPNOTSUPP.  This memory area is not guaranteed to
be zeroed, and can contain garbage/previous values, which will be read
and interpreted by AF_XDP userspace side.

Tested this on different drivers. The experiences are that for most
packets they will have zeroed this data_meta area, but occasionally it
will contain garbage data.

Example of failure tested on ixgbe:

 poll: 1 (0)
 xsk_ring_cons__peek: 1
 0x18ec788: rx_desc[0]->addr=100000000008000 addr=8100 comp_addr=8000
 rx_hash: 3697961069
 rx_timestamp:  9024981991734834796 (sec:9024981991.7348)
 0x18ec788: complete idx=8 addr=8000

Converting to date:

 date -d @9024981991
 2255-12-28T20:26:31 CET

I choose a simple fix in this patch. When kfunc fails or isn't supported
assign zero to the corresponding struct meta value.

It's up to the individual BPF-programmer to do something smarter e.g.
that fits their use-case, like getting a software timestamp and marking
a flag that gives the type of timestamp.

Fixes: 297a3f124155 ("selftests/bpf: Simple program to dump XDP RX metadata")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/167527271027.937063.5177725618616476592.stgit@firesoul
---
 tools/testing/selftests/bpf/progs/xdp_hw_metadata.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
index 25b8178735ee..4c55b4d79d3d 100644
--- a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
@@ -70,10 +70,14 @@ int rx(struct xdp_md *ctx)
 	}
 
 	if (!bpf_xdp_metadata_rx_timestamp(ctx, &meta->rx_timestamp))
-		bpf_printk("populated rx_timestamp with %u", meta->rx_timestamp);
+		bpf_printk("populated rx_timestamp with %llu", meta->rx_timestamp);
+	else
+		meta->rx_timestamp = 0; /* Used by AF_XDP as not avail signal */
 
 	if (!bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash))
 		bpf_printk("populated rx_hash with %u", meta->rx_hash);
+	else
+		meta->rx_hash = 0; /* Used by AF_XDP as not avail signal */
 
 	return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);
 }
-- 
cgit 


From a19a62e56478ba4afadfa7df94d0819542b7ccf8 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 1 Feb 2023 18:31:55 +0100
Subject: selftests/bpf: xdp_hw_metadata cleanup cause segfault

Using xdp_hw_metadata I experince Segmentation fault after seeing
"detaching bpf program....".

On my system the segfault happened when accessing bpf_obj->skeleton
in xdp_hw_metadata__destroy(bpf_obj) call. That doesn't make any sense
as this memory have not been freed by program at this point in time.

Prior to calling xdp_hw_metadata__destroy(bpf_obj) the function
close_xsk() is called for each RX-queue xsk.  The real bug lays
in close_xsk() that unmap via munmap() the wrong memory pointer.
The call xsk_umem__delete(xsk->umem) will free xsk->umem, thus
the call to munmap(xsk->umem, UMEM_SIZE) will have unpredictable
behavior. And man page explain subsequent references to these
pages will generate SIGSEGV.

Unmapping xsk->umem_area instead removes the segfault.

Fixes: 297a3f124155 ("selftests/bpf: Simple program to dump XDP RX metadata")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/167527271533.937063.5717065138099679142.stgit@firesoul
---
 tools/testing/selftests/bpf/xdp_hw_metadata.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 3823b1c499cc..438083e34cce 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -121,7 +121,7 @@ static void close_xsk(struct xsk *xsk)
 		xsk_umem__delete(xsk->umem);
 	if (xsk->socket)
 		xsk_socket__delete(xsk->socket);
-	munmap(xsk->umem, UMEM_SIZE);
+	munmap(xsk->umem_area, UMEM_SIZE);
 }
 
 static void refill_rx(struct xsk *xsk, __u64 addr)
-- 
cgit 


From 7bd4224deecd2d917fcbb52f9d13ab1453be219a Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 1 Feb 2023 18:32:00 +0100
Subject: selftests/bpf: xdp_hw_metadata correct status value in error(3)

The glibc error reporting function error():

 void error(int status, int errnum, const char *format, ...);

The status argument should be a positive value between 0-255 as it
is passed over to the exit(3) function as the value as the shell exit
status. The least significant byte of status (i.e., status & 0xFF) is
returned to the shell parent.

Fix this by using 1 instead of -1. As 1 corresponds to C standard
constant EXIT_FAILURE.

Fixes: 297a3f124155 ("selftests/bpf: Simple program to dump XDP RX metadata")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/167527272038.937063.9137108142012298120.stgit@firesoul
---
 tools/testing/selftests/bpf/xdp_hw_metadata.c | 28 +++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 438083e34cce..58fde35abad7 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -165,7 +165,7 @@ static void verify_skb_metadata(int fd)
 	hdr.msg_controllen = sizeof(cmsg_buf);
 
 	if (recvmsg(fd, &hdr, 0) < 0)
-		error(-1, errno, "recvmsg");
+		error(1, errno, "recvmsg");
 
 	for (cmsg = CMSG_FIRSTHDR(&hdr); cmsg != NULL;
 	     cmsg = CMSG_NXTHDR(&hdr, cmsg)) {
@@ -275,11 +275,11 @@ static int rxq_num(const char *ifname)
 
 	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 	if (fd < 0)
-		error(-1, errno, "socket");
+		error(1, errno, "socket");
 
 	ret = ioctl(fd, SIOCETHTOOL, &ifr);
 	if (ret < 0)
-		error(-1, errno, "ioctl(SIOCETHTOOL)");
+		error(1, errno, "ioctl(SIOCETHTOOL)");
 
 	close(fd);
 
@@ -296,11 +296,11 @@ static void hwtstamp_ioctl(int op, const char *ifname, struct hwtstamp_config *c
 
 	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 	if (fd < 0)
-		error(-1, errno, "socket");
+		error(1, errno, "socket");
 
 	ret = ioctl(fd, op, &ifr);
 	if (ret < 0)
-		error(-1, errno, "ioctl(%d)", op);
+		error(1, errno, "ioctl(%d)", op);
 
 	close(fd);
 }
@@ -360,7 +360,7 @@ static void timestamping_enable(int fd, int val)
 
 	ret = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
 	if (ret < 0)
-		error(-1, errno, "setsockopt(SO_TIMESTAMPING)");
+		error(1, errno, "setsockopt(SO_TIMESTAMPING)");
 }
 
 int main(int argc, char *argv[])
@@ -386,13 +386,13 @@ int main(int argc, char *argv[])
 
 	rx_xsk = malloc(sizeof(struct xsk) * rxq);
 	if (!rx_xsk)
-		error(-1, ENOMEM, "malloc");
+		error(1, ENOMEM, "malloc");
 
 	for (i = 0; i < rxq; i++) {
 		printf("open_xsk(%s, %p, %d)\n", ifname, &rx_xsk[i], i);
 		ret = open_xsk(ifindex, &rx_xsk[i], i);
 		if (ret)
-			error(-1, -ret, "open_xsk");
+			error(1, -ret, "open_xsk");
 
 		printf("xsk_socket__fd() -> %d\n", xsk_socket__fd(rx_xsk[i].socket));
 	}
@@ -400,7 +400,7 @@ int main(int argc, char *argv[])
 	printf("open bpf program...\n");
 	bpf_obj = xdp_hw_metadata__open();
 	if (libbpf_get_error(bpf_obj))
-		error(-1, libbpf_get_error(bpf_obj), "xdp_hw_metadata__open");
+		error(1, libbpf_get_error(bpf_obj), "xdp_hw_metadata__open");
 
 	prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx");
 	bpf_program__set_ifindex(prog, ifindex);
@@ -409,12 +409,12 @@ int main(int argc, char *argv[])
 	printf("load bpf program...\n");
 	ret = xdp_hw_metadata__load(bpf_obj);
 	if (ret)
-		error(-1, -ret, "xdp_hw_metadata__load");
+		error(1, -ret, "xdp_hw_metadata__load");
 
 	printf("prepare skb endpoint...\n");
 	server_fd = start_server(AF_INET6, SOCK_DGRAM, NULL, 9092, 1000);
 	if (server_fd < 0)
-		error(-1, errno, "start_server");
+		error(1, errno, "start_server");
 	timestamping_enable(server_fd,
 			    SOF_TIMESTAMPING_SOFTWARE |
 			    SOF_TIMESTAMPING_RAW_HARDWARE);
@@ -427,7 +427,7 @@ int main(int argc, char *argv[])
 		printf("map[%d] = %d\n", queue_id, sock_fd);
 		ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0);
 		if (ret)
-			error(-1, -ret, "bpf_map_update_elem");
+			error(1, -ret, "bpf_map_update_elem");
 	}
 
 	printf("attach bpf program...\n");
@@ -435,12 +435,12 @@ int main(int argc, char *argv[])
 			     bpf_program__fd(bpf_obj->progs.rx),
 			     XDP_FLAGS, NULL);
 	if (ret)
-		error(-1, -ret, "bpf_xdp_attach");
+		error(1, -ret, "bpf_xdp_attach");
 
 	signal(SIGINT, handle_signal);
 	ret = verify_metadata(rx_xsk, rxq, server_fd);
 	close(server_fd);
 	cleanup();
 	if (ret)
-		error(-1, -ret, "verify_metadata");
+		error(1, -ret, "verify_metadata");
 }
-- 
cgit 


From e8a3c8bd687068bafb640ca524905f0bec716a13 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 1 Feb 2023 18:32:05 +0100
Subject: selftests/bpf: xdp_hw_metadata use strncpy for ifname

The ifname char pointer is taken directly from the command line
as input and the string is copied directly into struct ifreq
via strcpy. This makes it easy to corrupt other members of ifreq
and generally do stack overflows.

Most often the ioctl will fail with:

 ./xdp_hw_metadata: ioctl(SIOCETHTOOL): Bad address

As people will likely copy-paste code for getting NIC queue
channels (rxq_num) and enabling HW timestamping (hwtstamp_ioctl)
lets make this code a bit more secure by using strncpy.

Fixes: 297a3f124155 ("selftests/bpf: Simple program to dump XDP RX metadata")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/167527272543.937063.16993147790832546209.stgit@firesoul
---
 tools/testing/selftests/bpf/xdp_hw_metadata.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 58fde35abad7..2a66bd3f2c9f 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -270,7 +270,7 @@ static int rxq_num(const char *ifname)
 	struct ifreq ifr = {
 		.ifr_data = (void *)&ch,
 	};
-	strcpy(ifr.ifr_name, ifname);
+	strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1);
 	int fd, ret;
 
 	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
@@ -291,7 +291,7 @@ static void hwtstamp_ioctl(int op, const char *ifname, struct hwtstamp_config *c
 	struct ifreq ifr = {
 		.ifr_data = (void *)cfg,
 	};
-	strcpy(ifr.ifr_name, ifname);
+	strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1);
 	int fd, ret;
 
 	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
-- 
cgit 


From eb98192576315d3f4c6c990d589ab398e7091782 Mon Sep 17 00:00:00 2001
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Date: Mon, 9 Jan 2023 08:06:05 -0500
Subject: KVM: selftests: Verify APIC_ID is set when forcing x2APIC=>xAPIC
 transition

Add a sub-test to verify that KVM stuffs the APIC_ID when userspace forces
a transition from x2APIC to xAPIC without first disabling the APIC.  Such
a transition is architecturally disallowed (WRMSR will #GP), but needs to
be handled by KVM to allow userspace to emulate RESET (ignoring that
userspace should also stuff local APIC state on RESET).

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Link: https://lore.kernel.org/r/20230109130605.2013555-3-eesposit@redhat.com
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86_64/xapic_state_test.c        | 55 ++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
index d7d37dae3eeb..396c13f42457 100644
--- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
@@ -132,6 +132,59 @@ static void test_icr(struct xapic_vcpu *x)
 	__test_icr(x, -1ull & ~APIC_DM_FIXED_MASK);
 }
 
+static void __test_apic_id(struct kvm_vcpu *vcpu, uint64_t apic_base)
+{
+	uint32_t apic_id, expected;
+	struct kvm_lapic_state xapic;
+
+	vcpu_set_msr(vcpu, MSR_IA32_APICBASE, apic_base);
+
+	vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic);
+
+	expected = apic_base & X2APIC_ENABLE ? vcpu->id : vcpu->id << 24;
+	apic_id = *((u32 *)&xapic.regs[APIC_ID]);
+
+	TEST_ASSERT(apic_id == expected,
+		    "APIC_ID not set back to %s format; wanted = %x, got = %x",
+		    (apic_base & X2APIC_ENABLE) ? "x2APIC" : "xAPIC",
+		    expected, apic_id);
+}
+
+/*
+ * Verify that KVM switches the APIC_ID between xAPIC and x2APIC when userspace
+ * stuffs MSR_IA32_APICBASE.  Setting the APIC_ID when x2APIC is enabled and
+ * when the APIC transitions for DISABLED to ENABLED is architectural behavior
+ * (on Intel), whereas the x2APIC => xAPIC transition behavior is KVM ABI since
+ * attempted to transition from x2APIC to xAPIC without disabling the APIC is
+ * architecturally disallowed.
+ */
+static void test_apic_id(void)
+{
+	const uint32_t NR_VCPUS = 3;
+	struct kvm_vcpu *vcpus[NR_VCPUS];
+	uint64_t apic_base;
+	struct kvm_vm *vm;
+	int i;
+
+	vm = vm_create_with_vcpus(NR_VCPUS, NULL, vcpus);
+	vm_enable_cap(vm, KVM_CAP_X2APIC_API, KVM_X2APIC_API_USE_32BIT_IDS);
+
+	for (i = 0; i < NR_VCPUS; i++) {
+		apic_base = vcpu_get_msr(vcpus[i], MSR_IA32_APICBASE);
+
+		TEST_ASSERT(apic_base & MSR_IA32_APICBASE_ENABLE,
+			    "APIC not in ENABLED state at vCPU RESET");
+		TEST_ASSERT(!(apic_base & X2APIC_ENABLE),
+			    "APIC not in xAPIC mode at vCPU RESET");
+
+		__test_apic_id(vcpus[i], apic_base);
+		__test_apic_id(vcpus[i], apic_base | X2APIC_ENABLE);
+		__test_apic_id(vcpus[i], apic_base);
+	}
+
+	kvm_vm_free(vm);
+}
+
 int main(int argc, char *argv[])
 {
 	struct xapic_vcpu x = {
@@ -157,4 +210,6 @@ int main(int argc, char *argv[])
 	virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
 	test_icr(&x);
 	kvm_vm_free(vm);
+
+	test_apic_id();
 }
-- 
cgit 


From 8b79b34a66cd61769802255a2a6bdd446d4f93ca Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 1 Feb 2023 15:36:40 -0800
Subject: selftests/bpf: Don't refill on completion in xdp_metadata

We only need to consume TX completion instead of refilling 'fill' ring.
It's currently not an issue because we never RX more than 8 packets.

Fixes: e2a46d54d7a1 ("selftests/bpf: Verify xdp_metadata xdp->af_xdp path")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230201233640.367646-1-sdf@google.com
---
 tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
index 241909d71c7e..aa4beae99f4f 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -205,9 +205,8 @@ static void complete_tx(struct xsk *xsk)
 	if (ASSERT_EQ(xsk_ring_cons__peek(&xsk->comp, 1, &idx), 1, "xsk_ring_cons__peek")) {
 		addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
 
-		printf("%p: refill idx=%u addr=%llx\n", xsk, idx, addr);
-		*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
-		xsk_ring_prod__submit(&xsk->fill, 1);
+		printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
+		xsk_ring_cons__release(&xsk->comp, 1);
 	}
 }
 
-- 
cgit 


From 4bc32df7a9c387ea1a1b7336e2c7d44aa8cee9f4 Mon Sep 17 00:00:00 2001
From: Ye Xingchen <ye.xingchen@zte.com.cn>
Date: Tue, 31 Jan 2023 14:40:51 +0800
Subject: selftests/bpf: Remove duplicate include header in xdp_hw_metadata

The linux/net_tstamp.h is included more than once, thus clean it up.

Signed-off-by: Ye Xingchen <ye.xingchen@zte.com.cn>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/202301311440516312161@zte.com.cn
---
 tools/testing/selftests/bpf/xdp_hw_metadata.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 2a66bd3f2c9f..1c8acb68b977 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -24,7 +24,6 @@
 #include <linux/net_tstamp.h>
 #include <linux/udp.h>
 #include <linux/sockios.h>
-#include <linux/net_tstamp.h>
 #include <sys/mman.h>
 #include <net/if.h>
 #include <poll.h>
-- 
cgit 


From cf3e0251868c56b10ccb3e0b3628fcfb0c9c4ec5 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <zwisler@chromium.org>
Date: Mon, 30 Jan 2023 11:19:11 -0700
Subject: PM: tools: use canonical ftrace path

The canonical location for the tracefs filesystem is at /sys/kernel/tracing.

But, from Documentation/trace/ftrace.rst:

  Before 4.1, all ftrace tracing control files were within the debugfs
  file system, which is typically located at /sys/kernel/debug/tracing.
  For backward compatibility, when mounting the debugfs file system,
  the tracefs file system will be automatically mounted at:

  /sys/kernel/debug/tracing

A few scripts in tools/power still refer to this older debugfs path, so
let's update them to avoid confusion.

Signed-off-by: Ross Zwisler <zwisler@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 tools/power/pm-graph/sleepgraph.py                         |  4 ++--
 tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py      |  4 ++--
 tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py | 10 +++++-----
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py
index c60c90f35d18..82c09cd25cc2 100755
--- a/tools/power/pm-graph/sleepgraph.py
+++ b/tools/power/pm-graph/sleepgraph.py
@@ -120,9 +120,9 @@ class SystemValues:
 	cgexp = False
 	testdir = ''
 	outdir = ''
-	tpath = '/sys/kernel/debug/tracing/'
+	tpath = '/sys/kernel/tracing/'
 	fpdtpath = '/sys/firmware/acpi/tables/FPDT'
-	epath = '/sys/kernel/debug/tracing/events/power/'
+	epath = '/sys/kernel/tracing/events/power/'
 	pmdpath = '/sys/power/pm_debug_messages'
 	s0ixpath = '/sys/module/intel_pmc_core/parameters/warn_on_s0ix_failures'
 	s0ixres = '/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us'
diff --git a/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py b/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py
index 2dea4032ac56..904df0ea0a1e 100755
--- a/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py
+++ b/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py
@@ -248,7 +248,7 @@ def signal_handler(signal, frame):
         ipt.free_trace_buffer()
         sys.exit(0)
 
-trace_file = "/sys/kernel/debug/tracing/events/amd_cpu/enable"
+trace_file = "/sys/kernel/tracing/events/amd_cpu/enable"
 signal.signal(signal.SIGINT, signal_handler)
 
 interval = ""
@@ -319,7 +319,7 @@ print(cur_version)
 cleanup_data_files()
 
 if interval:
-    file_name = "/sys/kernel/debug/tracing/trace"
+    file_name = "/sys/kernel/tracing/trace"
     ipt.clear_trace_file()
     ipt.set_trace_buffer_size(memory)
     ipt.enable_trace(trace_file)
diff --git a/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py b/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py
index b46e9eb8f5aa..ec3323100e1a 100755
--- a/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py
+++ b/tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py
@@ -373,7 +373,7 @@ def clear_trace_file():
     """ Clear trace file """
 
     try:
-        f_handle = open('/sys/kernel/debug/tracing/trace', 'w')
+        f_handle = open('/sys/kernel/tracing/trace', 'w')
         f_handle.close()
     except:
         print('IO error clearing trace file ')
@@ -401,7 +401,7 @@ def set_trace_buffer_size(memory):
     """ Set trace buffer size """
 
     try:
-       with open('/sys/kernel/debug/tracing/buffer_size_kb', 'w') as fp:
+       with open('/sys/kernel/tracing/buffer_size_kb', 'w') as fp:
           fp.write(memory)
     except:
        print('IO error setting trace buffer size ')
@@ -411,7 +411,7 @@ def free_trace_buffer():
     """ Free the trace buffer memory """
 
     try:
-       open('/sys/kernel/debug/tracing/buffer_size_kb'
+       open('/sys/kernel/tracing/buffer_size_kb'
                  , 'w').write("1")
     except:
         print('IO error freeing trace buffer ')
@@ -495,7 +495,7 @@ def signal_handler(signal, frame):
         sys.exit(0)
 
 if __name__ == "__main__":
-    trace_file = "/sys/kernel/debug/tracing/events/power/pstate_sample/enable"
+    trace_file = "/sys/kernel/tracing/events/power/pstate_sample/enable"
     signal.signal(signal.SIGINT, signal_handler)
 
     interval = ""
@@ -569,7 +569,7 @@ if __name__ == "__main__":
     cleanup_data_files()
 
     if interval:
-        filename = "/sys/kernel/debug/tracing/trace"
+        filename = "/sys/kernel/tracing/trace"
         clear_trace_file()
         set_trace_buffer_size(memory)
         enable_trace(trace_file)
-- 
cgit 


From 27e348b221f6a78cbe86e7def8e2611f84509211 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Tue, 31 Jan 2023 16:38:53 +0100
Subject: rtla/timerlat: Add auto-analysis core

Currently, timerlat displays a summary of the timerlat tracer results
saving the trace if the system hits a stop condition.

While this represented a huge step forward, the root cause was not
that is accessible to non-expert users.

The auto-analysis fulfill this gap by parsing the trace timerlat runs,
printing an intuitive auto-analysis.

Link: https://lkml.kernel.org/r/1ee073822f6a2cbb33da0c817331d0d4045e837f.1675179318.git.bristot@kernel.org

Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/tracing/rtla/src/timerlat_aa.c | 990 +++++++++++++++++++++++++++++++++++
 tools/tracing/rtla/src/timerlat_aa.h |  12 +
 tools/tracing/rtla/src/utils.h       |   3 +
 3 files changed, 1005 insertions(+)
 create mode 100644 tools/tracing/rtla/src/timerlat_aa.c
 create mode 100644 tools/tracing/rtla/src/timerlat_aa.h

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/timerlat_aa.c b/tools/tracing/rtla/src/timerlat_aa.c
new file mode 100644
index 000000000000..ec4e0f4b0e6c
--- /dev/null
+++ b/tools/tracing/rtla/src/timerlat_aa.c
@@ -0,0 +1,990 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "utils.h"
+#include "osnoise.h"
+#include "timerlat.h"
+
+enum timelat_state {
+	TIMERLAT_INIT = 0,
+	TIMERLAT_WAITING_IRQ,
+	TIMERLAT_WAITING_THREAD,
+};
+
+#define MAX_COMM		24
+
+/*
+ * Per-cpu data statistics and data.
+ */
+struct timerlat_aa_data {
+	/* Current CPU state */
+	int			curr_state;
+
+	/* timerlat IRQ latency */
+	unsigned long long	tlat_irq_seqnum;
+	unsigned long long	tlat_irq_latency;
+	unsigned long long	tlat_irq_timstamp;
+
+	/* timerlat Thread latency */
+	unsigned long long	tlat_thread_seqnum;
+	unsigned long long	tlat_thread_latency;
+	unsigned long long	tlat_thread_timstamp;
+
+	/*
+	 * Information about the thread running when the IRQ
+	 * arrived.
+	 *
+	 * This can be blocking or interference, depending on the
+	 * priority of the thread. Assuming timerlat is the highest
+	 * prio, it is blocking. If timerlat has a lower prio, it is
+	 * interference.
+	 * note: "unsigned long long" because they are fetch using tep_get_field_val();
+	 */
+	unsigned long long	run_thread_pid;
+	char			run_thread_comm[MAX_COMM];
+	unsigned long long	thread_blocking_duration;
+	unsigned long long	max_exit_idle_latency;
+
+	/* Information about the timerlat timer irq */
+	unsigned long long	timer_irq_start_time;
+	unsigned long long	timer_irq_start_delay;
+	unsigned long long	timer_irq_duration;
+	unsigned long long	timer_exit_from_idle;
+
+	/*
+	 * Information about the last IRQ before the timerlat irq
+	 * arrived.
+	 *
+	 * If now - timestamp is <= latency, it might have influenced
+	 * in the timerlat irq latency. Otherwise, ignore it.
+	 */
+	unsigned long long	prev_irq_duration;
+	unsigned long long	prev_irq_timstamp;
+
+	/*
+	 * Interference sum.
+	 */
+	unsigned long long	thread_nmi_sum;
+	unsigned long long	thread_irq_sum;
+	unsigned long long	thread_softirq_sum;
+	unsigned long long	thread_thread_sum;
+
+	/*
+	 * Interference task information.
+	 */
+	struct trace_seq	*prev_irqs_seq;
+	struct trace_seq	*nmi_seq;
+	struct trace_seq	*irqs_seq;
+	struct trace_seq	*softirqs_seq;
+	struct trace_seq	*threads_seq;
+	struct trace_seq	*stack_seq;
+
+	/*
+	 * Current thread.
+	 */
+	char			current_comm[MAX_COMM];
+	unsigned long long	current_pid;
+
+	/*
+	 * Is the system running a kworker?
+	 */
+	unsigned long long	kworker;
+	unsigned long long	kworker_func;
+};
+
+/*
+ * The analysis context and system wide view
+ */
+struct timerlat_aa_context {
+	int nr_cpus;
+	int dump_tasks;
+
+	/* per CPU data */
+	struct timerlat_aa_data *taa_data;
+
+	/*
+	 * required to translate function names and register
+	 * events.
+	 */
+	struct osnoise_tool *tool;
+};
+
+/*
+ * The data is stored as a local variable, but accessed via a helper function.
+ *
+ * It could be stored inside the trace context. But every access would
+ * require container_of() + a series of pointers. Do we need it? Not sure.
+ *
+ * For now keep it simple. If needed, store it in the tool, add the *context
+ * as a parameter in timerlat_aa_get_ctx() and do the magic there.
+ */
+static struct timerlat_aa_context *__timerlat_aa_ctx;
+
+static struct timerlat_aa_context *timerlat_aa_get_ctx(void)
+{
+	return __timerlat_aa_ctx;
+}
+
+/*
+ * timerlat_aa_get_data - Get the per-cpu data from the timerlat context
+ */
+static struct timerlat_aa_data
+*timerlat_aa_get_data(struct timerlat_aa_context *taa_ctx, int cpu)
+{
+	return &taa_ctx->taa_data[cpu];
+}
+
+/*
+ * timerlat_aa_irq_latency - Handles timerlat IRQ event
+ */
+static int timerlat_aa_irq_latency(struct timerlat_aa_data *taa_data,
+				   struct trace_seq *s, struct tep_record *record,
+				   struct tep_event *event)
+{
+	/*
+	 * For interference, we start now looking for things that can delay
+	 * the thread.
+	 */
+	taa_data->curr_state = TIMERLAT_WAITING_THREAD;
+	taa_data->tlat_irq_timstamp = record->ts;
+
+	/*
+	 * Zero values.
+	 */
+	taa_data->thread_nmi_sum = 0;
+	taa_data->thread_irq_sum = 0;
+	taa_data->thread_softirq_sum = 0;
+	taa_data->thread_blocking_duration = 0;
+	taa_data->timer_irq_start_time = 0;
+	taa_data->timer_irq_duration = 0;
+	taa_data->timer_exit_from_idle = 0;
+
+	/*
+	 * Zero interference tasks.
+	 */
+	trace_seq_reset(taa_data->nmi_seq);
+	trace_seq_reset(taa_data->irqs_seq);
+	trace_seq_reset(taa_data->softirqs_seq);
+	trace_seq_reset(taa_data->threads_seq);
+
+	/* IRQ latency values */
+	tep_get_field_val(s, event, "timer_latency", record, &taa_data->tlat_irq_latency, 1);
+	tep_get_field_val(s, event, "seqnum", record, &taa_data->tlat_irq_seqnum, 1);
+
+	/* The thread that can cause blocking */
+	tep_get_common_field_val(s, event, "common_pid", record, &taa_data->run_thread_pid, 1);
+
+	/*
+	 * Get exit from idle case.
+	 *
+	 * If it is not idle thread:
+	 */
+	if (taa_data->run_thread_pid)
+		return 0;
+
+	/*
+	 * if the latency is shorter than the known exit from idle:
+	 */
+	if (taa_data->tlat_irq_latency < taa_data->max_exit_idle_latency)
+		return 0;
+
+	/*
+	 * To be safe, ignore the cases in which an IRQ/NMI could have
+	 * interfered with the timerlat IRQ.
+	 */
+	if (taa_data->tlat_irq_timstamp - taa_data->tlat_irq_latency
+	    < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration)
+		return 0;
+
+	taa_data->max_exit_idle_latency = taa_data->tlat_irq_latency;
+
+	return 0;
+}
+
+/*
+ * timerlat_aa_thread_latency - Handles timerlat thread event
+ */
+static int timerlat_aa_thread_latency(struct timerlat_aa_data *taa_data,
+				      struct trace_seq *s, struct tep_record *record,
+				      struct tep_event *event)
+{
+	/*
+	 * For interference, we start now looking for things that can delay
+	 * the IRQ of the next cycle.
+	 */
+	taa_data->curr_state = TIMERLAT_WAITING_IRQ;
+	taa_data->tlat_thread_timstamp = record->ts;
+
+	/* Thread latency values */
+	tep_get_field_val(s, event, "timer_latency", record, &taa_data->tlat_thread_latency, 1);
+	tep_get_field_val(s, event, "seqnum", record, &taa_data->tlat_thread_seqnum, 1);
+
+	return 0;
+}
+
+/*
+ * timerlat_aa_handler - Handle timerlat events
+ *
+ * This function is called to handle timerlat events recording statistics.
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+int timerlat_aa_handler(struct trace_seq *s, struct tep_record *record,
+			struct tep_event *event, void *context)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
+	unsigned long long thread;
+
+	if (!taa_data)
+		return -1;
+
+	tep_get_field_val(s, event, "context", record, &thread, 1);
+	if (!thread)
+		return timerlat_aa_irq_latency(taa_data, s, record, event);
+	else
+		return timerlat_aa_thread_latency(taa_data, s, record, event);
+}
+
+/*
+ * timerlat_aa_nmi_handler - Handles NMI noise
+ *
+ * It is used to collect information about interferences from NMI. It is
+ * hooked to the osnoise:nmi_noise event.
+ */
+static int timerlat_aa_nmi_handler(struct trace_seq *s, struct tep_record *record,
+				   struct tep_event *event, void *context)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
+	unsigned long long duration;
+	unsigned long long start;
+
+	tep_get_field_val(s, event, "duration", record, &duration, 1);
+	tep_get_field_val(s, event, "start", record, &start, 1);
+
+	if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) {
+		taa_data->prev_irq_duration = duration;
+		taa_data->prev_irq_timstamp = start;
+
+		trace_seq_reset(taa_data->prev_irqs_seq);
+		trace_seq_printf(taa_data->prev_irqs_seq, "\t%24s	\t\t\t%9.2f us\n",
+			 "nmi", ns_to_usf(duration));
+		return 0;
+	}
+
+	taa_data->thread_nmi_sum += duration;
+	trace_seq_printf(taa_data->nmi_seq, "	%24s	\t\t\t%9.2f us\n",
+		 "nmi", ns_to_usf(duration));
+
+	return 0;
+}
+
+/*
+ * timerlat_aa_irq_handler - Handles IRQ noise
+ *
+ * It is used to collect information about interferences from IRQ. It is
+ * hooked to the osnoise:irq_noise event.
+ *
+ * It is a little bit more complex than the other because it measures:
+ *	- The IRQs that can delay the timer IRQ before it happened.
+ *	- The Timerlat IRQ handler
+ *	- The IRQs that happened between the timerlat IRQ and the timerlat thread
+ *	  (IRQ interference).
+ */
+static int timerlat_aa_irq_handler(struct trace_seq *s, struct tep_record *record,
+				   struct tep_event *event, void *context)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
+	unsigned long long expected_start;
+	unsigned long long duration;
+	unsigned long long vector;
+	unsigned long long start;
+	char *desc;
+	int val;
+
+	tep_get_field_val(s, event, "duration", record, &duration, 1);
+	tep_get_field_val(s, event, "start", record, &start, 1);
+	tep_get_field_val(s, event, "vector", record, &vector, 1);
+	desc = tep_get_field_raw(s, event, "desc", record, &val, 1);
+
+	/*
+	 * Before the timerlat IRQ.
+	 */
+	if (taa_data->curr_state == TIMERLAT_WAITING_IRQ) {
+		taa_data->prev_irq_duration = duration;
+		taa_data->prev_irq_timstamp = start;
+
+		trace_seq_reset(taa_data->prev_irqs_seq);
+		trace_seq_printf(taa_data->prev_irqs_seq, "\t%24s:%-3llu	\t\t%9.2f us\n",
+				 desc, vector, ns_to_usf(duration));
+		return 0;
+	}
+
+	/*
+	 * The timerlat IRQ: taa_data->timer_irq_start_time is zeroed at
+	 * the timerlat irq handler.
+	 */
+	if (!taa_data->timer_irq_start_time) {
+		expected_start = taa_data->tlat_irq_timstamp - taa_data->tlat_irq_latency;
+
+		taa_data->timer_irq_start_time = start;
+		taa_data->timer_irq_duration = duration;
+
+		taa_data->timer_irq_start_delay = taa_data->timer_irq_start_time - expected_start;
+
+		/*
+		 * not exit from idle.
+		 */
+		if (taa_data->run_thread_pid)
+			return 0;
+
+		if (expected_start > taa_data->prev_irq_timstamp + taa_data->prev_irq_duration)
+			taa_data->timer_exit_from_idle = taa_data->timer_irq_start_delay;
+
+		return 0;
+	}
+
+	/*
+	 * IRQ interference.
+	 */
+	taa_data->thread_irq_sum += duration;
+	trace_seq_printf(taa_data->irqs_seq, "	%24s:%-3llu	\t	%9.2f us\n",
+			 desc, vector, ns_to_usf(duration));
+
+	return 0;
+}
+
+static char *softirq_name[] = { "HI", "TIMER",	"NET_TX", "NET_RX", "BLOCK",
+				"IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" };
+
+
+/*
+ * timerlat_aa_softirq_handler - Handles Softirq noise
+ *
+ * It is used to collect information about interferences from Softirq. It is
+ * hooked to the osnoise:softirq_noise event.
+ *
+ * It is only printed in the non-rt kernel, as softirqs become thread on RT.
+ */
+static int timerlat_aa_softirq_handler(struct trace_seq *s, struct tep_record *record,
+				       struct tep_event *event, void *context)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
+	unsigned long long duration;
+	unsigned long long vector;
+	unsigned long long start;
+
+	if (taa_data->curr_state == TIMERLAT_WAITING_IRQ)
+		return 0;
+
+	tep_get_field_val(s, event, "duration", record, &duration, 1);
+	tep_get_field_val(s, event, "start", record, &start, 1);
+	tep_get_field_val(s, event, "vector", record, &vector, 1);
+
+	taa_data->thread_softirq_sum += duration;
+
+	trace_seq_printf(taa_data->softirqs_seq, "\t%24s:%-3llu	\t	%9.2f us\n",
+			 softirq_name[vector], vector, ns_to_usf(duration));
+	return 0;
+}
+
+/*
+ * timerlat_aa_softirq_handler - Handles thread noise
+ *
+ * It is used to collect information about interferences from threads. It is
+ * hooked to the osnoise:thread_noise event.
+ *
+ * Note: if you see thread noise, your timerlat thread was not the highest prio one.
+ */
+static int timerlat_aa_thread_handler(struct trace_seq *s, struct tep_record *record,
+				      struct tep_event *event, void *context)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
+	unsigned long long duration;
+	unsigned long long start;
+	unsigned long long pid;
+	const char *comm;
+	int val;
+
+	if (taa_data->curr_state == TIMERLAT_WAITING_IRQ)
+		return 0;
+
+	tep_get_field_val(s, event, "duration", record, &duration, 1);
+	tep_get_field_val(s, event, "start", record, &start, 1);
+
+	tep_get_common_field_val(s, event, "common_pid", record, &pid, 1);
+	comm = tep_get_field_raw(s, event, "comm", record, &val, 1);
+
+	if (pid == taa_data->run_thread_pid && !taa_data->thread_blocking_duration) {
+		taa_data->thread_blocking_duration = duration;
+
+		if (comm)
+			strncpy(taa_data->run_thread_comm, comm, MAX_COMM);
+		else
+			sprintf(taa_data->run_thread_comm, "<...>");
+
+	} else {
+		taa_data->thread_thread_sum += duration;
+
+		trace_seq_printf(taa_data->threads_seq, "\t%24s:%-3llu	\t\t%9.2f us\n",
+			 comm, pid, ns_to_usf(duration));
+	}
+
+	return 0;
+}
+
+/*
+ * timerlat_aa_stack_handler - Handles timerlat IRQ stack trace
+ *
+ * Saves and parse the stack trace generated by the timerlat IRQ.
+ */
+static int timerlat_aa_stack_handler(struct trace_seq *s, struct tep_record *record,
+			      struct tep_event *event, void *context)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
+	unsigned long *caller;
+	const char *function;
+	int val, i;
+
+	trace_seq_reset(taa_data->stack_seq);
+
+	trace_seq_printf(taa_data->stack_seq, "    Blocking thread stack trace\n");
+	caller = tep_get_field_raw(s, event, "caller", record, &val, 1);
+	if (caller) {
+		for (i = 0; ; i++) {
+			function = tep_find_function(taa_ctx->tool->trace.tep, caller[i]);
+			if (!function)
+				break;
+			trace_seq_printf(taa_data->stack_seq, "\t\t-> %s\n", function);
+		}
+	}
+	return 0;
+}
+
+/*
+ * timerlat_aa_sched_switch_handler - Tracks the current thread running on the CPU
+ *
+ * Handles the sched:sched_switch event to trace the current thread running on the
+ * CPU. It is used to display the threads running on the other CPUs when the trace
+ * stops.
+ */
+static int timerlat_aa_sched_switch_handler(struct trace_seq *s, struct tep_record *record,
+					    struct tep_event *event, void *context)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
+	const char *comm;
+	int val;
+
+	tep_get_field_val(s, event, "next_pid", record, &taa_data->current_pid, 1);
+	comm = tep_get_field_raw(s, event, "next_comm", record, &val, 1);
+
+	strncpy(taa_data->current_comm, comm, MAX_COMM);
+
+	/*
+	 * If this was a kworker, clean the last kworkers that ran.
+	 */
+	taa_data->kworker = 0;
+	taa_data->kworker_func = 0;
+
+	return 0;
+}
+
+/*
+ * timerlat_aa_kworker_start_handler - Tracks a kworker running on the CPU
+ *
+ * Handles workqueue:workqueue_execute_start event, keeping track of
+ * the job that a kworker could be doing in the CPU.
+ *
+ * We already catch problems of hardware related latencies caused by work queues
+ * running driver code that causes hardware stall. For example, with DRM drivers.
+ */
+static int timerlat_aa_kworker_start_handler(struct trace_seq *s, struct tep_record *record,
+					     struct tep_event *event, void *context)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+	struct timerlat_aa_data *taa_data = timerlat_aa_get_data(taa_ctx, record->cpu);
+
+	tep_get_field_val(s, event, "work", record, &taa_data->kworker, 1);
+	tep_get_field_val(s, event, "function", record, &taa_data->kworker_func, 1);
+	return 0;
+}
+
+/*
+ * timerlat_thread_analysis - Prints the analysis of a CPU that hit a stop tracing
+ *
+ * This is the core of the analysis.
+ */
+static void timerlat_thread_analysis(struct timerlat_aa_data *taa_data, int cpu,
+				     int irq_thresh, int thread_thresh)
+{
+	unsigned long long exp_irq_ts;
+	int total;
+	int irq;
+
+	/*
+	 * IRQ latency or Thread latency?
+	 */
+	if (taa_data->tlat_irq_seqnum > taa_data->tlat_thread_seqnum) {
+		irq = 1;
+		total = taa_data->tlat_irq_latency;
+	} else {
+		irq = 0;
+		total = taa_data->tlat_thread_latency;
+	}
+
+	/*
+	 * Expected IRQ arrival time using the trace clock as the base.
+	 */
+	exp_irq_ts = taa_data->timer_irq_start_time - taa_data->timer_irq_start_delay;
+
+	if (exp_irq_ts < taa_data->prev_irq_timstamp + taa_data->prev_irq_duration)
+		printf("  Previous IRQ interference:	\t	up to %9.2f us",
+			ns_to_usf(taa_data->prev_irq_duration));
+
+	/*
+	 * The delay that the IRQ suffered before starting.
+	 */
+	printf("  IRQ handler delay:		%16s	%9.2f us (%.2f %%)\n",
+		(ns_to_usf(taa_data->timer_exit_from_idle) > 10) ? "(exit from idle)" : "",
+		ns_to_usf(taa_data->timer_irq_start_delay),
+		ns_to_per(total, taa_data->timer_irq_start_delay));
+
+	/*
+	 * Timerlat IRQ.
+	 */
+	printf("  IRQ latency:	\t\t\t\t	%9.2f us\n",
+		ns_to_usf(taa_data->tlat_irq_latency));
+
+	if (irq) {
+		/*
+		 * If the trace stopped due to IRQ, the other events will not happen
+		 * because... the trace stopped :-).
+		 *
+		 * That is all folks, the stack trace was printed before the stop,
+		 * so it will be displayed, it is the key.
+		 */
+		printf("  Blocking thread:\n");
+		printf("	%24s:%-9llu\n",
+			taa_data->run_thread_comm, taa_data->run_thread_pid);
+	} else  {
+		/*
+		 * The duration of the IRQ handler that handled the timerlat IRQ.
+		 */
+		printf("  Timerlat IRQ duration:	\t\t	%9.2f us (%.2f %%)\n",
+			ns_to_usf(taa_data->timer_irq_duration),
+			ns_to_per(total, taa_data->timer_irq_duration));
+
+		/*
+		 * The amount of time that the current thread postponed the scheduler.
+		 *
+		 * Recalling that it is net from NMI/IRQ/Softirq interference, so there
+		 * is no need to compute values here.
+		 */
+		printf("  Blocking thread:	\t\t\t	%9.2f us (%.2f %%)\n",
+			ns_to_usf(taa_data->thread_blocking_duration),
+			ns_to_per(total, taa_data->thread_blocking_duration));
+
+		printf("	%24s:%-9llu		%9.2f us\n",
+			taa_data->run_thread_comm, taa_data->run_thread_pid,
+			ns_to_usf(taa_data->thread_blocking_duration));
+	}
+
+	/*
+	 * Print the stack trace!
+	 */
+	trace_seq_do_printf(taa_data->stack_seq);
+
+	/*
+	 * NMIs can happen during the IRQ, so they are always possible.
+	 */
+	if (taa_data->thread_nmi_sum)
+		printf("  NMI interference	\t\t\t	%9.2f us (%.2f %%)\n",
+			ns_to_usf(taa_data->thread_nmi_sum),
+			ns_to_per(total, taa_data->thread_nmi_sum));
+
+	/*
+	 * If it is an IRQ latency, the other factors can be skipped.
+	 */
+	if (irq)
+		goto print_total;
+
+	/*
+	 * Prints the interference caused by IRQs to the thread latency.
+	 */
+	if (taa_data->thread_irq_sum) {
+		printf("  IRQ interference	\t\t\t	%9.2f us (%.2f %%)\n",
+			ns_to_usf(taa_data->thread_irq_sum),
+			ns_to_per(total, taa_data->thread_irq_sum));
+
+		trace_seq_do_printf(taa_data->irqs_seq);
+	}
+
+	/*
+	 * Prints the interference caused by Softirqs to the thread latency.
+	 */
+	if (taa_data->thread_softirq_sum) {
+		printf("  Softirq interference	\t\t\t	%9.2f us (%.2f %%)\n",
+			ns_to_usf(taa_data->thread_softirq_sum),
+			ns_to_per(total, taa_data->thread_softirq_sum));
+
+		trace_seq_do_printf(taa_data->softirqs_seq);
+	}
+
+	/*
+	 * Prints the interference caused by other threads to the thread latency.
+	 *
+	 * If this happens, your timerlat is not the highest prio. OK, migration
+	 * thread can happen. But otherwise, you are not measuring the "scheduling
+	 * latency" only, and here is the difference from scheduling latency and
+	 * timer handling latency.
+	 */
+	if (taa_data->thread_thread_sum) {
+		printf("  Thread interference	\t\t\t	%9.2f us (%.2f %%)\n",
+			ns_to_usf(taa_data->thread_thread_sum),
+			ns_to_per(total, taa_data->thread_thread_sum));
+
+		trace_seq_do_printf(taa_data->threads_seq);
+	}
+
+	/*
+	 * Done.
+	 */
+print_total:
+	printf("------------------------------------------------------------------------\n");
+	printf("  %s latency:	\t\t\t	%9.2f us (100%%)\n", irq ? "IRQ" : "Thread",
+		ns_to_usf(total));
+}
+
+/**
+ * timerlat_auto_analysis - Analyze the collected data
+ */
+void timerlat_auto_analysis(int irq_thresh, int thread_thresh)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+	unsigned long long max_exit_from_idle = 0;
+	struct timerlat_aa_data *taa_data;
+	int max_exit_from_idle_cpu;
+	struct tep_handle *tep;
+	int cpu;
+
+	/* bring stop tracing to the ns scale */
+	irq_thresh = irq_thresh * 1000;
+	thread_thresh = thread_thresh * 1000;
+
+	for (cpu = 0; cpu < taa_ctx->nr_cpus; cpu++) {
+		taa_data = timerlat_aa_get_data(taa_ctx, cpu);
+
+		if (irq_thresh && taa_data->tlat_irq_latency >= irq_thresh) {
+			printf("## CPU %d hit stop tracing, analyzing it ##\n", cpu);
+			timerlat_thread_analysis(taa_data, cpu, irq_thresh, thread_thresh);
+		} else if (thread_thresh && (taa_data->tlat_thread_latency) >= thread_thresh) {
+			printf("## CPU %d hit stop tracing, analyzing it ##\n", cpu);
+			timerlat_thread_analysis(taa_data, cpu, irq_thresh, thread_thresh);
+		}
+
+		if (taa_data->max_exit_idle_latency > max_exit_from_idle) {
+			max_exit_from_idle = taa_data->max_exit_idle_latency;
+			max_exit_from_idle_cpu = cpu;
+		}
+
+	}
+
+	if (max_exit_from_idle) {
+		printf("\n");
+		printf("Max timerlat IRQ latency from idle: %.2f us in cpu %d\n",
+			ns_to_usf(max_exit_from_idle), max_exit_from_idle_cpu);
+	}
+	if (!taa_ctx->dump_tasks)
+		return;
+
+	printf("\n");
+	printf("Printing CPU tasks:\n");
+	for (cpu = 0; cpu < taa_ctx->nr_cpus; cpu++) {
+		taa_data = timerlat_aa_get_data(taa_ctx, cpu);
+		tep = taa_ctx->tool->trace.tep;
+
+		printf("    [%.3d] %24s:%llu", cpu, taa_data->current_comm, taa_data->current_pid);
+
+		if (taa_data->kworker_func)
+			printf(" kworker:%s:%s",
+				tep_find_function(tep, taa_data->kworker) ? : "<...>",
+				tep_find_function(tep, taa_data->kworker_func));
+		printf("\n");
+	}
+
+}
+
+/*
+ * timerlat_aa_destroy_seqs - Destroy seq files used to store parsed data
+ */
+static void timerlat_aa_destroy_seqs(struct timerlat_aa_context *taa_ctx)
+{
+	struct timerlat_aa_data *taa_data;
+	int i;
+
+	if (!taa_ctx->taa_data)
+		return;
+
+	for (i = 0; i < taa_ctx->nr_cpus; i++) {
+		taa_data = timerlat_aa_get_data(taa_ctx, i);
+
+		if (taa_data->prev_irqs_seq) {
+			trace_seq_destroy(taa_data->prev_irqs_seq);
+			free(taa_data->prev_irqs_seq);
+		}
+
+		if (taa_data->nmi_seq) {
+			trace_seq_destroy(taa_data->nmi_seq);
+			free(taa_data->nmi_seq);
+		}
+
+		if (taa_data->irqs_seq) {
+			trace_seq_destroy(taa_data->irqs_seq);
+			free(taa_data->irqs_seq);
+		}
+
+		if (taa_data->softirqs_seq) {
+			trace_seq_destroy(taa_data->softirqs_seq);
+			free(taa_data->softirqs_seq);
+		}
+
+		if (taa_data->threads_seq) {
+			trace_seq_destroy(taa_data->threads_seq);
+			free(taa_data->threads_seq);
+		}
+
+		if (taa_data->stack_seq) {
+			trace_seq_destroy(taa_data->stack_seq);
+			free(taa_data->stack_seq);
+		}
+	}
+}
+
+/*
+ * timerlat_aa_init_seqs - Init seq files used to store parsed information
+ *
+ * Instead of keeping data structures to store raw data, use seq files to
+ * store parsed data.
+ *
+ * Allocates and initialize seq files.
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+static int timerlat_aa_init_seqs(struct timerlat_aa_context *taa_ctx)
+{
+	struct timerlat_aa_data *taa_data;
+	int i;
+
+	for (i = 0; i < taa_ctx->nr_cpus; i++) {
+
+		taa_data = timerlat_aa_get_data(taa_ctx, i);
+
+		taa_data->prev_irqs_seq = calloc(1, sizeof(*taa_data->prev_irqs_seq));
+		if (!taa_data->prev_irqs_seq)
+			goto out_err;
+
+		trace_seq_init(taa_data->prev_irqs_seq);
+
+		taa_data->nmi_seq = calloc(1, sizeof(*taa_data->nmi_seq));
+		if (!taa_data->nmi_seq)
+			goto out_err;
+
+		trace_seq_init(taa_data->nmi_seq);
+
+		taa_data->irqs_seq = calloc(1, sizeof(*taa_data->irqs_seq));
+		if (!taa_data->irqs_seq)
+			goto out_err;
+
+		trace_seq_init(taa_data->irqs_seq);
+
+		taa_data->softirqs_seq = calloc(1, sizeof(*taa_data->softirqs_seq));
+		if (!taa_data->softirqs_seq)
+			goto out_err;
+
+		trace_seq_init(taa_data->softirqs_seq);
+
+		taa_data->threads_seq = calloc(1, sizeof(*taa_data->threads_seq));
+		if (!taa_data->threads_seq)
+			goto out_err;
+
+		trace_seq_init(taa_data->threads_seq);
+
+		taa_data->stack_seq = calloc(1, sizeof(*taa_data->stack_seq));
+		if (!taa_data->stack_seq)
+			goto out_err;
+
+		trace_seq_init(taa_data->stack_seq);
+	}
+
+	return 0;
+
+out_err:
+	timerlat_aa_destroy_seqs(taa_ctx);
+	return -1;
+}
+
+/*
+ * timerlat_aa_unregister_events - Unregister events used in the auto-analysis
+ */
+static void timerlat_aa_unregister_events(struct osnoise_tool *tool, int dump_tasks)
+{
+	tracefs_event_disable(tool->trace.inst, "osnoise", NULL);
+
+	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "nmi_noise",
+				     timerlat_aa_nmi_handler, tool);
+
+	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "irq_noise",
+				     timerlat_aa_irq_handler, tool);
+
+	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "softirq_noise",
+				     timerlat_aa_softirq_handler, tool);
+
+	tep_unregister_event_handler(tool->trace.tep, -1, "osnoise", "thread_noise",
+				     timerlat_aa_thread_handler, tool);
+
+	tep_unregister_event_handler(tool->trace.tep, -1, "ftrace", "kernel_stack",
+				     timerlat_aa_stack_handler, tool);
+	if (!dump_tasks)
+		return;
+
+	tracefs_event_disable(tool->trace.inst, "sched", "sched_switch");
+	tep_unregister_event_handler(tool->trace.tep, -1, "sched", "sched_switch",
+				     timerlat_aa_sched_switch_handler, tool);
+
+	tracefs_event_disable(tool->trace.inst, "workqueue", "workqueue_execute_start");
+	tep_unregister_event_handler(tool->trace.tep, -1, "workqueue", "workqueue_execute_start",
+				     timerlat_aa_kworker_start_handler, tool);
+}
+
+/*
+ * timerlat_aa_register_events - Register events used in the auto-analysis
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+static int timerlat_aa_register_events(struct osnoise_tool *tool, int dump_tasks)
+{
+	int retval;
+
+	/*
+	 * register auto-analysis handlers.
+	 */
+	retval = tracefs_event_enable(tool->trace.inst, "osnoise", NULL);
+	if (retval < 0 && !errno) {
+		err_msg("Could not find osnoise events\n");
+		goto out_err;
+	}
+
+	tep_register_event_handler(tool->trace.tep, -1, "osnoise", "nmi_noise",
+				   timerlat_aa_nmi_handler, tool);
+
+	tep_register_event_handler(tool->trace.tep, -1, "osnoise", "irq_noise",
+				   timerlat_aa_irq_handler, tool);
+
+	tep_register_event_handler(tool->trace.tep, -1, "osnoise", "softirq_noise",
+				   timerlat_aa_softirq_handler, tool);
+
+	tep_register_event_handler(tool->trace.tep, -1, "osnoise", "thread_noise",
+				   timerlat_aa_thread_handler, tool);
+
+	tep_register_event_handler(tool->trace.tep, -1, "ftrace", "kernel_stack",
+				   timerlat_aa_stack_handler, tool);
+
+	if (!dump_tasks)
+		return 0;
+
+	/*
+	 * Dump task events.
+	 */
+	retval = tracefs_event_enable(tool->trace.inst, "sched", "sched_switch");
+	if (retval < 0 && !errno) {
+		err_msg("Could not find sched_switch\n");
+		goto out_err;
+	}
+
+	tep_register_event_handler(tool->trace.tep, -1, "sched", "sched_switch",
+				   timerlat_aa_sched_switch_handler, tool);
+
+	retval = tracefs_event_enable(tool->trace.inst, "workqueue", "workqueue_execute_start");
+	if (retval < 0 && !errno) {
+		err_msg("Could not find workqueue_execute_start\n");
+		goto out_err;
+	}
+
+	tep_register_event_handler(tool->trace.tep, -1, "workqueue", "workqueue_execute_start",
+				   timerlat_aa_kworker_start_handler, tool);
+
+	return 0;
+
+out_err:
+	timerlat_aa_unregister_events(tool, dump_tasks);
+	return -1;
+}
+
+/**
+ * timerlat_aa_destroy - Destroy timerlat auto-analysis
+ */
+void timerlat_aa_destroy(void)
+{
+	struct timerlat_aa_context *taa_ctx = timerlat_aa_get_ctx();
+
+	if (!taa_ctx)
+		return;
+
+	if (!taa_ctx->taa_data)
+		goto out_ctx;
+
+	timerlat_aa_unregister_events(taa_ctx->tool, taa_ctx->dump_tasks);
+	timerlat_aa_destroy_seqs(taa_ctx);
+	free(taa_ctx->taa_data);
+out_ctx:
+	free(taa_ctx);
+}
+
+/**
+ * timerlat_aa_init - Initialize timerlat auto-analysis
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+int timerlat_aa_init(struct osnoise_tool *tool, int nr_cpus, int dump_tasks)
+{
+	struct timerlat_aa_context *taa_ctx;
+	int retval;
+
+	taa_ctx = calloc(1, sizeof(*taa_ctx));
+	if (!taa_ctx)
+		return -1;
+
+	__timerlat_aa_ctx = taa_ctx;
+
+	taa_ctx->nr_cpus = nr_cpus;
+	taa_ctx->tool = tool;
+	taa_ctx->dump_tasks = dump_tasks;
+
+	taa_ctx->taa_data = calloc(nr_cpus, sizeof(*taa_ctx->taa_data));
+	if (!taa_ctx->taa_data)
+		goto out_err;
+
+	retval = timerlat_aa_init_seqs(taa_ctx);
+	if (retval)
+		goto out_err;
+
+	retval = timerlat_aa_register_events(tool, dump_tasks);
+	if (retval)
+		goto out_err;
+
+	return 0;
+
+out_err:
+	timerlat_aa_destroy();
+	return -1;
+}
diff --git a/tools/tracing/rtla/src/timerlat_aa.h b/tools/tracing/rtla/src/timerlat_aa.h
new file mode 100644
index 000000000000..d4f6ca7e342a
--- /dev/null
+++ b/tools/tracing/rtla/src/timerlat_aa.h
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Red Hat Inc, Daniel Bristot de Oliveira <bristot@kernel.org>
+ */
+
+int timerlat_aa_init(struct osnoise_tool *tool, int nr_cpus, int dump_task);
+void timerlat_aa_destroy(void);
+
+int timerlat_aa_handler(struct trace_seq *s, struct tep_record *record,
+			struct tep_event *event, void *context);
+
+void timerlat_auto_analysis(int irq_thresh, int thread_thresh);
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index 5571afd3b549..90e4f52a030b 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -56,3 +56,6 @@ struct sched_attr {
 int parse_prio(char *arg, struct sched_attr *sched_param);
 int set_comm_sched_attr(const char *comm_prefix, struct sched_attr *attr);
 int set_cpu_dma_latency(int32_t latency);
+
+#define ns_to_usf(x) (((double)x/1000))
+#define ns_to_per(total, part) ((part * 100) / (double)total)
-- 
cgit 


From 5def33df84d2326ebf5f29ae9ddc702a4593c337 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Tue, 31 Jan 2023 16:38:54 +0100
Subject: rtla/timerlat: Add auto-analysis support to timerlat top

Currently, timerlat top displays the timerlat tracer latency results, saving
the intuitive timerlat trace for the developer to analyze.

This patch goes a step forward in the automaton of the scheduling latency
analysis by providing a summary of the root cause of a latency higher than
the passed "stop tracing" parameter if the trace stops.

The output is intuitive enough for non-expert users to have a general idea
of the root cause by looking at each factor's contribution percentage while
keeping the technical detail in the output for more expert users to start
an in dept debug or to correlate a root cause with an existing one.

The terminology is in line with recent industry and academic publications
to facilitate the understanding of both audiences.

Here is one example of tool output:
 ----------------------------------------- %< -----------------------------------------------------
  # taskset -c 0 timerlat -a 40 -c 1-23 -q
                                     Timer Latency
    0 00:00:12   |          IRQ Timer Latency (us)        |         Thread Timer Latency (us)
  CPU COUNT      |      cur       min       avg       max |      cur       min       avg       max
    1 #12322     |        0         0         1        15 |       10         3         9        31
    2 #12322     |        3         0         1        12 |       10         3         9        23
    3 #12322     |        1         0         1        21 |        8         2         8        34
    4 #12322     |        1         0         1        17 |       10         2        11        33
    5 #12322     |        0         0         1        12 |        8         3         8        25
    6 #12322     |        1         0         1        14 |       16         3        11        35
    7 #12322     |        0         0         1        14 |        9         2         8        29
    8 #12322     |        1         0         1        22 |        9         3         9        34
    9 #12322     |        0         0         1        14 |        8         2         8        24
   10 #12322     |        1         0         0        12 |        9         3         8        24
   11 #12322     |        0         0         0        15 |        6         2         7        29
   12 #12321     |        1         0         0        13 |        5         3         8        23
   13 #12319     |        0         0         1        14 |        9         3         9        26
   14 #12321     |        1         0         0        13 |        6         2         8        24
   15 #12321     |        1         0         1        15 |       12         3        11        27
   16 #12318     |        0         0         1        13 |        7         3        10        24
   17 #12319     |        0         0         1        13 |       11         3         9        25
   18 #12318     |        0         0         0        12 |        8         2         8        20
   19 #12319     |        0         0         1        18 |       10         2         9        28
   20 #12317     |        0         0         0        20 |        9         3         8        34
   21 #12318     |        0         0         0        13 |        8         3         8        28
   22 #12319     |        0         0         1        11 |        8         3        10        22
   23 #12320     |       28         0         1        28 |       41         3        11        41
  rtla timerlat hit stop tracing
  ## CPU 23 hit stop tracing, analyzing it ##
  IRQ handler delay:				      	    27.49 us (65.52 %)
  IRQ latency:						    28.13 us
  Timerlat IRQ duration:				     9.59 us (22.85 %)
  Blocking thread:					     3.79 us (9.03 %)
			objtool:49256    		     3.79 us
    Blocking thread stacktrace
		-> timerlat_irq
		-> __hrtimer_run_queues
		-> hrtimer_interrupt
		-> __sysvec_apic_timer_interrupt
		-> sysvec_apic_timer_interrupt
		-> asm_sysvec_apic_timer_interrupt
		-> _raw_spin_unlock_irqrestore
		-> cgroup_rstat_flush_locked
		-> cgroup_rstat_flush_irqsafe
		-> mem_cgroup_flush_stats
		-> mem_cgroup_wb_stats
		-> balance_dirty_pages
		-> balance_dirty_pages_ratelimited_flags
		-> btrfs_buffered_write
		-> btrfs_do_write_iter
		-> vfs_write
		-> __x64_sys_pwrite64
		-> do_syscall_64
		-> entry_SYSCALL_64_after_hwframe
  ------------------------------------------------------------------------
    Thread latency:					    41.96 us (100%)

  The system has exit from idle latency!
    Max timerlat IRQ latency from idle: 17.48 us in cpu 4
  Saving trace to timerlat_trace.txt
 ----------------------------------------- >% -----------------------------------------------------

In this case, the major factor was the delay suffered by the IRQ handler
that handles timerlat wakeup: 65.52 %. This can be caused by the
current thread masking interrupts, which can be seen in the blocking
thread stacktrace: the current thread (objtool:49256) disabled interrupts
via raw spin lock operations inside mem cgroup, while doing write
syscall in a btrfs file system.

A simple search for the function name on Google shows that this is
a legit case for disabling the interrupts:

  cgroup: Use irqsave in cgroup_rstat_flush_locked()
  lore.kernel.org/linux-mm/20220301122143.1521823-2-bigeasy@linutronix.de/

The output also prints other reasons for the latency root cause, such as:

  - an IRQ that happened before the IRQ handler that caused delays
  - The interference from NMI, IRQ, Softirq, and Threads

The details about how these factors affect the scheduling latency
can be found here:

   https://bristot.me/demystifying-the-real-time-linux-latency/

Link: https://lkml.kernel.org/r/3d45f40e630317f51ac6d678e2d96d310e495729.1675179318.git.bristot@kernel.org

Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/tracing/rtla/src/timerlat_top.c | 46 +++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 334271935222..eea5b3357e27 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -10,10 +10,12 @@
 #include <unistd.h>
 #include <stdio.h>
 #include <time.h>
+#include <errno.h>
 
 #include "utils.h"
 #include "osnoise.h"
 #include "timerlat.h"
+#include "timerlat_aa.h"
 
 struct timerlat_top_params {
 	char			*cpus;
@@ -30,6 +32,8 @@ struct timerlat_top_params {
 	int			quiet;
 	int			set_sched;
 	int			dma_latency;
+	int			no_aa;
+	int			dump_tasks;
 	struct sched_attr	sched_param;
 	struct trace_events	*events;
 };
@@ -130,17 +134,22 @@ timerlat_top_handler(struct trace_seq *s, struct tep_record *record,
 		     struct tep_event *event, void *context)
 {
 	struct trace_instance *trace = context;
+	struct timerlat_top_params *params;
 	unsigned long long latency, thread;
 	struct osnoise_tool *top;
 	int cpu = record->cpu;
 
 	top = container_of(trace, struct osnoise_tool, trace);
+	params = top->params;
 
 	tep_get_field_val(s, event, "context", record, &thread, 1);
 	tep_get_field_val(s, event, "timer_latency", record, &latency, 1);
 
 	timerlat_top_update(top, cpu, thread, latency);
 
+	if (!params->no_aa)
+		timerlat_aa_handler(s, record, event, context);
+
 	return 0;
 }
 
@@ -281,11 +290,13 @@ static void timerlat_top_usage(char *usage)
 		"	  -c/--cpus cpus: run the tracer only on the given cpus",
 		"	  -d/--duration time[m|h|d]: duration of the session in seconds",
 		"	  -D/--debug: print debug info",
+		"	     --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)",
 		"	  -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]",
 		"	  -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed",
 		"	     --filter <command>: enable a trace event filter to the previous -e event",
 		"	     --trigger <command>: enable a trace event trigger to the previous -e event",
 		"	  -n/--nano: display data in nanoseconds",
+		"	     --no-aa: disable auto-analysis, reducing rtla timerlat cpu usage",
 		"	  -q/--quiet print only a summary at the end",
 		"	     --dma-latency us: set /dev/cpu_dma_latency latency <us> to reduce exit from idle latency",
 		"	  -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters",
@@ -349,13 +360,15 @@ static struct timerlat_top_params
 			{"trigger",		required_argument,	0, '0'},
 			{"filter",		required_argument,	0, '1'},
 			{"dma-latency",		required_argument,	0, '2'},
+			{"no-aa",		no_argument,		0, '3'},
+			{"dump-tasks",		no_argument,		0, '4'},
 			{0, 0, 0, 0}
 		};
 
 		/* getopt_long stores the option index here. */
 		int option_index = 0;
 
-		c = getopt_long(argc, argv, "a:c:d:De:hi:np:P:qs:t::T:0:1:2:",
+		c = getopt_long(argc, argv, "a:c:d:De:hi:np:P:qs:t::T:0:1:2:34",
 				 long_options, &option_index);
 
 		/* detect the end of the options. */
@@ -368,13 +381,13 @@ static struct timerlat_top_params
 
 			/* set thread stop to auto_thresh */
 			params->stop_total_us = auto_thresh;
+			params->stop_us = auto_thresh;
 
 			/* get stack trace */
 			params->print_stack = auto_thresh;
 
 			/* set trace */
 			params->trace_output = "timerlat_trace.txt";
-
 			break;
 		case 'c':
 			retval = parse_cpu_list(optarg, &params->monitored_cpus);
@@ -437,6 +450,7 @@ static struct timerlat_top_params
 				params->trace_output = &optarg[1];
 			else
 				params->trace_output = "timerlat_trace.txt";
+
 			break;
 		case '0': /* trigger */
 			if (params->events) {
@@ -467,6 +481,12 @@ static struct timerlat_top_params
 				exit(EXIT_FAILURE);
 			}
 			break;
+		case '3': /* no-aa */
+			params->no_aa = 1;
+			break;
+		case '4':
+			params->dump_tasks = 1;
+			break;
 		default:
 			timerlat_top_usage("Invalid option");
 		}
@@ -477,6 +497,12 @@ static struct timerlat_top_params
 		exit(EXIT_FAILURE);
 	}
 
+	/*
+	 * Auto analysis only happens if stop tracing, thus:
+	 */
+	if (!params->stop_us && !params->stop_total_us)
+		params->no_aa = 1;
+
 	return params;
 }
 
@@ -547,6 +573,7 @@ static struct osnoise_tool
 {
 	struct osnoise_tool *top;
 	int nr_cpus;
+	int retval;
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
 
@@ -563,6 +590,16 @@ static struct osnoise_tool
 	tep_register_event_handler(top->trace.tep, -1, "ftrace", "timerlat",
 				   timerlat_top_handler, top);
 
+	/*
+	 * If no auto analysis, we are ready.
+	 */
+	if (params->no_aa)
+		return top;
+
+	retval = timerlat_aa_init(top, nr_cpus, params->dump_tasks);
+	if (retval)
+		goto out_err;
+
 	return top;
 
 out_err:
@@ -688,6 +725,10 @@ int timerlat_top_main(int argc, char *argv[])
 
 	if (trace_is_off(&top->trace, &record->trace)) {
 		printf("rtla timerlat hit stop tracing\n");
+
+		if (!params->no_aa)
+			timerlat_auto_analysis(params->stop_us, params->stop_total_us);
+
 		if (params->trace_output) {
 			printf("  Saving trace to %s\n", params->trace_output);
 			save_trace_to_file(record->trace.inst, params->trace_output);
@@ -701,6 +742,7 @@ out_top:
 	params->events = NULL;
 out_free:
 	timerlat_free_top(top->data);
+	timerlat_aa_destroy();
 	osnoise_destroy_tool(record);
 	osnoise_destroy_tool(top);
 	free(params);
-- 
cgit 


From 354bb4a0e0b6be8f55bacbe7f08c94b4741f5658 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 3 Feb 2023 00:53:35 +0100
Subject: selftests/bpf: Initialize tc in xdp_synproxy

xdp_synproxy/xdp fails in CI with:

    Error: bpf_tc_hook_create: File exists

The XDP version of the test should not be calling bpf_tc_hook_create();
the reason it's happening anyway is that if we don't specify --tc on the
command line, tc variable remains uninitialized.

Fixes: 784d5dc0efc2 ("selftests/bpf: Add selftests for raw syncookie helpers in TC mode")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Reported-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20230202235335.3403781-1-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/xdp_synproxy.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdp_synproxy.c b/tools/testing/selftests/bpf/xdp_synproxy.c
index 410a1385a01d..6dbe0b745198 100644
--- a/tools/testing/selftests/bpf/xdp_synproxy.c
+++ b/tools/testing/selftests/bpf/xdp_synproxy.c
@@ -116,6 +116,7 @@ static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 *
 	*tcpipopts = 0;
 	*ports = NULL;
 	*single = false;
+	*tc = false;
 
 	while (true) {
 		int opt;
-- 
cgit 


From e2bd9742989b6af873371de48c228a9ac1e87cba Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Thu, 2 Feb 2023 14:31:25 +0800
Subject: tools/bpf: Use tab instead of white spaces to sync bpf.h

Just silence the following build warning:

Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Link: https://lore.kernel.org/r/1675319486-27744-2-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7f024ac22edd..ba0f0cfb5e42 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5817,8 +5817,8 @@ enum {
 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
 	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
-	BPF_F_ADJ_ROOM_DECAP_L3_IPV4    = (1ULL << 7),
-	BPF_F_ADJ_ROOM_DECAP_L3_IPV6    = (1ULL << 8),
+	BPF_F_ADJ_ROOM_DECAP_L3_IPV4	= (1ULL << 7),
+	BPF_F_ADJ_ROOM_DECAP_L3_IPV6	= (1ULL << 8),
 };
 
 enum {
-- 
cgit 


From 150809082aab95f7078f343c1c82770d6c9ebe68 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Thu, 2 Feb 2023 14:31:26 +0800
Subject: selftests/bpf: Use semicolon instead of comma in test_verifier.c

Just silence the following checkpatch warning:

WARNING: Possible comma where semicolon could be used

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Link: https://lore.kernel.org/r/1675319486-27744-3-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 8c808551dfd7..887c49dc5abd 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -209,7 +209,7 @@ loop:
 		insn[i++] = BPF_MOV64_IMM(BPF_REG_2, 1);
 		insn[i++] = BPF_MOV64_IMM(BPF_REG_3, 2);
 		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
-					 BPF_FUNC_skb_vlan_push),
+					 BPF_FUNC_skb_vlan_push);
 		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 3);
 		i++;
 	}
@@ -220,7 +220,7 @@ loop:
 		i++;
 		insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
 		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
-					 BPF_FUNC_skb_vlan_pop),
+					 BPF_FUNC_skb_vlan_pop);
 		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 3);
 		i++;
 	}
-- 
cgit 


From d3d854fd6a1d97157f790604e07f6386e8df8fe4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 1 Feb 2023 11:24:17 +0100
Subject: netdev-genl: create a simple family for netdev stuff

Add a Netlink spec-compatible family for netdevs.
This is a very simple implementation without much
thought going into it.

It allows us to reap all the benefits of Netlink specs,
one can use the generic client to issue the commands:

  $ ./cli.py --spec netdev.yaml --dump dev_get
  [{'ifindex': 1, 'xdp-features': set()},
   {'ifindex': 2, 'xdp-features': {'basic', 'ndo-xmit', 'redirect'}},
   {'ifindex': 3, 'xdp-features': {'rx-sg'}}]

the generic python library does not have flags-by-name
support, yet, but we also don't have to carry strings
in the messages, as user space can get the names from
the spec.

Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Co-developed-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Co-developed-by: Marek Majtyka <alardam@gmail.com>
Signed-off-by: Marek Majtyka <alardam@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/327ad9c9868becbe1e601b580c962549c8cd81f2.1675245258.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml | 100 ++++++++++++++++++
 include/linux/netdevice.h               |   3 +
 include/net/xdp.h                       |   3 +
 include/uapi/linux/netdev.h             |  59 +++++++++++
 net/core/Makefile                       |   3 +-
 net/core/dev.c                          |   1 +
 net/core/netdev-genl-gen.c              |  48 +++++++++
 net/core/netdev-genl-gen.h              |  23 ++++
 net/core/netdev-genl.c                  | 179 ++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/netdev.h       |  59 +++++++++++
 10 files changed, 477 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/netlink/specs/netdev.yaml
 create mode 100644 include/uapi/linux/netdev.h
 create mode 100644 net/core/netdev-genl-gen.c
 create mode 100644 net/core/netdev-genl-gen.h
 create mode 100644 net/core/netdev-genl.c
 create mode 100644 tools/include/uapi/linux/netdev.h

(limited to 'tools')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
new file mode 100644
index 000000000000..b4dcdae54ffd
--- /dev/null
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -0,0 +1,100 @@
+name: netdev
+
+doc:
+  netdev configuration over generic netlink.
+
+definitions:
+  -
+    type: flags
+    name: xdp-act
+    entries:
+      -
+        name: basic
+        doc:
+          XDP feautues set supported by all drivers
+          (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX)
+      -
+        name: redirect
+        doc:
+          The netdev supports XDP_REDIRECT
+      -
+        name: ndo-xmit
+        doc:
+          This feature informs if netdev implements ndo_xdp_xmit callback.
+      -
+        name: xsk-zerocopy
+        doc:
+          This feature informs if netdev supports AF_XDP in zero copy mode.
+      -
+        name: hw-offload
+        doc:
+         This feature informs if netdev supports XDP hw oflloading.
+      -
+        name: rx-sg
+        doc:
+          This feature informs if netdev implements non-linear XDP buffer
+          support in the driver napi callback.
+      -
+        name: ndo-xmit-sg
+        doc:
+          This feature informs if netdev implements non-linear XDP buffer
+          support in ndo_xdp_xmit callback.
+
+attribute-sets:
+  -
+    name: dev
+    attributes:
+      -
+        name: ifindex
+        doc: netdev ifindex
+        type: u32
+        value: 1
+        checks:
+          min: 1
+      -
+        name: pad
+        type: pad
+      -
+        name: xdp-features
+        doc: Bitmask of enabled xdp-features.
+        type: u64
+        enum: xdp-act
+        enum-as-flags: true
+
+operations:
+  list:
+    -
+      name: dev-get
+      doc: Get / dump information about a netdev.
+      value: 1
+      attribute-set: dev
+      do:
+        request:
+          attributes:
+            - ifindex
+        reply: &dev-all
+          attributes:
+            - ifindex
+            - xdp-features
+      dump:
+        reply: *dev-all
+    -
+      name: dev-add-ntf
+      doc: Notification about device appearing.
+      notify: dev-get
+      mcgrp: mgmt
+    -
+      name: dev-del-ntf
+      doc: Notification about device disappearing.
+      notify: dev-get
+      mcgrp: mgmt
+    -
+      name: dev-change-ntf
+      doc: Notification about device configuration being changed.
+      notify: dev-get
+      mcgrp: mgmt
+
+mcast-groups:
+  list:
+    -
+      name: mgmt
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2466afa25078..0f7967591288 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -47,6 +47,7 @@
 #include <uapi/linux/netdevice.h>
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
+#include <uapi/linux/netdev.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
@@ -2055,6 +2056,7 @@ struct net_device {
 
 	/* Read-mostly cache-line for fast-path access */
 	unsigned int		flags;
+	xdp_features_t		xdp_features;
 	unsigned long long	priv_flags;
 	const struct net_device_ops *netdev_ops;
 	const struct xdp_metadata_ops *xdp_metadata_ops;
@@ -2839,6 +2841,7 @@ enum netdev_cmd {
 	NETDEV_OFFLOAD_XSTATS_DISABLE,
 	NETDEV_OFFLOAD_XSTATS_REPORT_USED,
 	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
+	NETDEV_XDP_FEAT_CHANGE,
 };
 const char *netdev_cmd_to_name(enum netdev_cmd cmd);
 
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 91292aa13bc0..8d1c86914f4c 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -7,6 +7,7 @@
 #define __LINUX_NET_XDP_H__
 
 #include <linux/skbuff.h> /* skb_shared_info */
+#include <uapi/linux/netdev.h>
 
 /**
  * DOC: XDP RX-queue information
@@ -43,6 +44,8 @@ enum xdp_mem_type {
 	MEM_TYPE_MAX,
 };
 
+typedef u32 xdp_features_t;
+
 /* XDP flags for ndo_xdp_xmit */
 #define XDP_XMIT_FLUSH		(1U << 0)	/* doorbell signal consumer */
 #define XDP_XMIT_FLAGS_MASK	XDP_XMIT_FLUSH
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
new file mode 100644
index 000000000000..9ee459872600
--- /dev/null
+++ b/include/uapi/linux/netdev.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_NETDEV_H
+#define _UAPI_LINUX_NETDEV_H
+
+#define NETDEV_FAMILY_NAME	"netdev"
+#define NETDEV_FAMILY_VERSION	1
+
+/**
+ * enum netdev_xdp_act
+ * @NETDEV_XDP_ACT_BASIC: XDP feautues set supported by all drivers
+ *   (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX)
+ * @NETDEV_XDP_ACT_REDIRECT: The netdev supports XDP_REDIRECT
+ * @NETDEV_XDP_ACT_NDO_XMIT: This feature informs if netdev implements
+ *   ndo_xdp_xmit callback.
+ * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP
+ *   in zero copy mode.
+ * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw
+ *   oflloading.
+ * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear
+ *   XDP buffer support in the driver napi callback.
+ * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements
+ *   non-linear XDP buffer support in ndo_xdp_xmit callback.
+ */
+enum netdev_xdp_act {
+	NETDEV_XDP_ACT_BASIC = 1,
+	NETDEV_XDP_ACT_REDIRECT = 2,
+	NETDEV_XDP_ACT_NDO_XMIT = 4,
+	NETDEV_XDP_ACT_XSK_ZEROCOPY = 8,
+	NETDEV_XDP_ACT_HW_OFFLOAD = 16,
+	NETDEV_XDP_ACT_RX_SG = 32,
+	NETDEV_XDP_ACT_NDO_XMIT_SG = 64,
+};
+
+enum {
+	NETDEV_A_DEV_IFINDEX = 1,
+	NETDEV_A_DEV_PAD,
+	NETDEV_A_DEV_XDP_FEATURES,
+
+	__NETDEV_A_DEV_MAX,
+	NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
+};
+
+enum {
+	NETDEV_CMD_DEV_GET = 1,
+	NETDEV_CMD_DEV_ADD_NTF,
+	NETDEV_CMD_DEV_DEL_NTF,
+	NETDEV_CMD_DEV_CHANGE_NTF,
+
+	__NETDEV_CMD_MAX,
+	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
+};
+
+#define NETDEV_MCGRP_MGMT	"mgmt"
+
+#endif /* _UAPI_LINUX_NETDEV_H */
diff --git a/net/core/Makefile b/net/core/Makefile
index 10edd66a8a37..8f367813bc68 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -12,7 +12,8 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 obj-y		     += dev.o dev_addr_lists.o dst.o netevent.o \
 			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
 			sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
-			fib_notifier.o xdp.o flow_offload.o gro.o
+			fib_notifier.o xdp.o flow_offload.o gro.o \
+			netdev-genl.o netdev-genl-gen.o
 
 obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
 
diff --git a/net/core/dev.c b/net/core/dev.c
index f72f5c4ee7e2..9ac0eeb2c8cd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1614,6 +1614,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
 	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
+	N(XDP_FEAT_CHANGE)
 	}
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
new file mode 100644
index 000000000000..48812ec843f5
--- /dev/null
+++ b/net/core/netdev-genl-gen.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netdev-genl-gen.h"
+
+#include <linux/netdev.h>
+
+/* NETDEV_CMD_DEV_GET - do */
+static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = {
+	[NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* Ops table for netdev */
+static const struct genl_split_ops netdev_nl_ops[2] = {
+	{
+		.cmd		= NETDEV_CMD_DEV_GET,
+		.doit		= netdev_nl_dev_get_doit,
+		.policy		= netdev_dev_get_nl_policy,
+		.maxattr	= NETDEV_A_DEV_IFINDEX,
+		.flags		= GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd	= NETDEV_CMD_DEV_GET,
+		.dumpit	= netdev_nl_dev_get_dumpit,
+		.flags	= GENL_CMD_CAP_DUMP,
+	},
+};
+
+static const struct genl_multicast_group netdev_nl_mcgrps[] = {
+	[NETDEV_NLGRP_MGMT] = { "mgmt", },
+};
+
+struct genl_family netdev_nl_family __ro_after_init = {
+	.name		= NETDEV_FAMILY_NAME,
+	.version	= NETDEV_FAMILY_VERSION,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.module		= THIS_MODULE,
+	.split_ops	= netdev_nl_ops,
+	.n_split_ops	= ARRAY_SIZE(netdev_nl_ops),
+	.mcgrps		= netdev_nl_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(netdev_nl_mcgrps),
+};
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
new file mode 100644
index 000000000000..b16dc7e026bb
--- /dev/null
+++ b/net/core/netdev-genl-gen.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_NETDEV_GEN_H
+#define _LINUX_NETDEV_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <linux/netdev.h>
+
+int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+
+enum {
+	NETDEV_NLGRP_MGMT,
+};
+
+extern struct genl_family netdev_nl_family;
+
+#endif /* _LINUX_NETDEV_GEN_H */
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
new file mode 100644
index 000000000000..a4270fafdf11
--- /dev/null
+++ b/net/core/netdev-genl.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "netdev-genl-gen.h"
+
+static int
+netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
+		   u32 portid, u32 seq, int flags, u32 cmd)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(rsp, portid, seq, &netdev_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
+	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
+			      netdev->xdp_features, NETDEV_A_DEV_PAD)) {
+		genlmsg_cancel(rsp, hdr);
+		return -EINVAL;
+	}
+
+	genlmsg_end(rsp, hdr);
+
+	return 0;
+}
+
+static void
+netdev_genl_dev_notify(struct net_device *netdev, int cmd)
+{
+	struct sk_buff *ntf;
+
+	if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev),
+				NETDEV_NLGRP_MGMT))
+		return;
+
+	ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!ntf)
+		return;
+
+	if (netdev_nl_dev_fill(netdev, ntf, 0, 0, 0, cmd)) {
+		nlmsg_free(ntf);
+		return;
+	}
+
+	genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf,
+				0, NETDEV_NLGRP_MGMT, GFP_KERNEL);
+}
+
+int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net_device *netdev;
+	struct sk_buff *rsp;
+	u32 ifindex;
+	int err;
+
+	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX))
+		return -EINVAL;
+
+	ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+
+	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!rsp)
+		return -ENOMEM;
+
+	rtnl_lock();
+
+	netdev = __dev_get_by_index(genl_info_net(info), ifindex);
+	if (netdev)
+		err = netdev_nl_dev_fill(netdev, rsp, info->snd_portid,
+					 info->snd_seq, 0, info->genlhdr->cmd);
+	else
+		err = -ENODEV;
+
+	rtnl_unlock();
+
+	if (err)
+		goto err_free_msg;
+
+	return genlmsg_reply(rsp, info);
+
+err_free_msg:
+	nlmsg_free(rsp);
+	return err;
+}
+
+int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct net_device *netdev;
+	int idx = 0, s_idx;
+	int h, s_h;
+	int err;
+
+	s_h = cb->args[0];
+	s_idx = cb->args[1];
+
+	rtnl_lock();
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		struct hlist_head *head;
+
+		idx = 0;
+		head = &net->dev_index_head[h];
+		hlist_for_each_entry(netdev, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			err = netdev_nl_dev_fill(netdev, skb,
+						 NETLINK_CB(cb->skb).portid,
+						 cb->nlh->nlmsg_seq, 0,
+						 NETDEV_CMD_DEV_GET);
+			if (err < 0)
+				break;
+cont:
+			idx++;
+		}
+	}
+
+	rtnl_unlock();
+
+	if (err != -EMSGSIZE)
+		return err;
+
+	cb->args[1] = idx;
+	cb->args[0] = h;
+	cb->seq = net->dev_base_seq;
+
+	return skb->len;
+}
+
+static int netdev_genl_netdevice_event(struct notifier_block *nb,
+				       unsigned long event, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF);
+		break;
+	case NETDEV_UNREGISTER:
+		netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF);
+		break;
+	case NETDEV_XDP_FEAT_CHANGE:
+		netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block netdev_genl_nb = {
+	.notifier_call	= netdev_genl_netdevice_event,
+};
+
+static int __init netdev_genl_init(void)
+{
+	int err;
+
+	err = register_netdevice_notifier(&netdev_genl_nb);
+	if (err)
+		return err;
+
+	err = genl_register_family(&netdev_nl_family);
+	if (err)
+		goto err_unreg_ntf;
+
+	return 0;
+
+err_unreg_ntf:
+	unregister_netdevice_notifier(&netdev_genl_nb);
+	return err;
+}
+
+subsys_initcall(netdev_genl_init);
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
new file mode 100644
index 000000000000..9ee459872600
--- /dev/null
+++ b/tools/include/uapi/linux/netdev.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_NETDEV_H
+#define _UAPI_LINUX_NETDEV_H
+
+#define NETDEV_FAMILY_NAME	"netdev"
+#define NETDEV_FAMILY_VERSION	1
+
+/**
+ * enum netdev_xdp_act
+ * @NETDEV_XDP_ACT_BASIC: XDP feautues set supported by all drivers
+ *   (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX)
+ * @NETDEV_XDP_ACT_REDIRECT: The netdev supports XDP_REDIRECT
+ * @NETDEV_XDP_ACT_NDO_XMIT: This feature informs if netdev implements
+ *   ndo_xdp_xmit callback.
+ * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP
+ *   in zero copy mode.
+ * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw
+ *   oflloading.
+ * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear
+ *   XDP buffer support in the driver napi callback.
+ * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements
+ *   non-linear XDP buffer support in ndo_xdp_xmit callback.
+ */
+enum netdev_xdp_act {
+	NETDEV_XDP_ACT_BASIC = 1,
+	NETDEV_XDP_ACT_REDIRECT = 2,
+	NETDEV_XDP_ACT_NDO_XMIT = 4,
+	NETDEV_XDP_ACT_XSK_ZEROCOPY = 8,
+	NETDEV_XDP_ACT_HW_OFFLOAD = 16,
+	NETDEV_XDP_ACT_RX_SG = 32,
+	NETDEV_XDP_ACT_NDO_XMIT_SG = 64,
+};
+
+enum {
+	NETDEV_A_DEV_IFINDEX = 1,
+	NETDEV_A_DEV_PAD,
+	NETDEV_A_DEV_XDP_FEATURES,
+
+	__NETDEV_A_DEV_MAX,
+	NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
+};
+
+enum {
+	NETDEV_CMD_DEV_GET = 1,
+	NETDEV_CMD_DEV_ADD_NTF,
+	NETDEV_CMD_DEV_DEL_NTF,
+	NETDEV_CMD_DEV_CHANGE_NTF,
+
+	__NETDEV_CMD_MAX,
+	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
+};
+
+#define NETDEV_MCGRP_MGMT	"mgmt"
+
+#endif /* _UAPI_LINUX_NETDEV_H */
-- 
cgit 


From 8f1669319c31dcdd07ae292822c9455aeb3bc7c7 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Wed, 1 Feb 2023 11:24:20 +0100
Subject: libbpf: add the capability to specify netlink proto in
 libbpf_netlink_send_recv

This is a preliminary patch in order to introduce netlink_generic
protocol support to libbpf.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/7878a54667e74afeec3ee519999c044bd514b44c.1675245258.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/netlink.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index 35104580870c..d2468a04a6c3 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -41,7 +41,7 @@ struct xdp_id_md {
 	struct xdp_link_info info;
 };
 
-static int libbpf_netlink_open(__u32 *nl_pid)
+static int libbpf_netlink_open(__u32 *nl_pid, int proto)
 {
 	struct sockaddr_nl sa;
 	socklen_t addrlen;
@@ -51,7 +51,7 @@ static int libbpf_netlink_open(__u32 *nl_pid)
 	memset(&sa, 0, sizeof(sa));
 	sa.nl_family = AF_NETLINK;
 
-	sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, proto);
 	if (sock < 0)
 		return -errno;
 
@@ -212,14 +212,14 @@ done:
 }
 
 static int libbpf_netlink_send_recv(struct libbpf_nla_req *req,
-				    __dump_nlmsg_t parse_msg,
+				    int proto, __dump_nlmsg_t parse_msg,
 				    libbpf_dump_nlmsg_t parse_attr,
 				    void *cookie)
 {
 	__u32 nl_pid = 0;
 	int sock, ret;
 
-	sock = libbpf_netlink_open(&nl_pid);
+	sock = libbpf_netlink_open(&nl_pid, proto);
 	if (sock < 0)
 		return sock;
 
@@ -271,7 +271,7 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd,
 	}
 	nlattr_end_nested(&req, nla);
 
-	return libbpf_netlink_send_recv(&req, NULL, NULL, NULL);
+	return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL);
 }
 
 int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts)
@@ -382,7 +382,7 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts)
 	xdp_id.ifindex = ifindex;
 	xdp_id.flags = xdp_flags;
 
-	err = libbpf_netlink_send_recv(&req, __dump_link_nlmsg,
+	err = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, __dump_link_nlmsg,
 				       get_xdp_info, &xdp_id);
 	if (err)
 		return libbpf_err(err);
@@ -493,7 +493,7 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags)
 	if (ret < 0)
 		return ret;
 
-	return libbpf_netlink_send_recv(&req, NULL, NULL, NULL);
+	return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL);
 }
 
 static int tc_qdisc_create_excl(struct bpf_tc_hook *hook)
@@ -673,7 +673,8 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts)
 
 	info.opts = opts;
 
-	ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info);
+	ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL,
+				       &info);
 	if (ret < 0)
 		return libbpf_err(ret);
 	if (!info.processed)
@@ -739,7 +740,7 @@ static int __bpf_tc_detach(const struct bpf_tc_hook *hook,
 			return ret;
 	}
 
-	return libbpf_netlink_send_recv(&req, NULL, NULL, NULL);
+	return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL);
 }
 
 int bpf_tc_detach(const struct bpf_tc_hook *hook,
@@ -804,7 +805,8 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts)
 
 	info.opts = opts;
 
-	ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info);
+	ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL,
+				       &info);
 	if (ret < 0)
 		return libbpf_err(ret);
 	if (!info.processed)
-- 
cgit 


From 04d58f1b26a436c429581d286528ea3c01624462 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Wed, 1 Feb 2023 11:24:21 +0100
Subject: libbpf: add API to get XDP/XSK supported features

Extend bpf_xdp_query routine in order to get XDP/XSK supported features
of netdev over route netlink interface.
Extend libbpf netlink implementation in order to support netlink_generic
protocol.

Co-developed-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Co-developed-by: Marek Majtyka <alardam@gmail.com>
Signed-off-by: Marek Majtyka <alardam@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/a72609ef4f0de7fee5376c40dbf54ad7f13bfb8d.1675245258.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/libbpf.h  |  3 +-
 tools/lib/bpf/netlink.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/nlattr.h  | 12 +++++++
 3 files changed, 110 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 8777ff21ea1d..b18581277eb2 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1048,9 +1048,10 @@ struct bpf_xdp_query_opts {
 	__u32 hw_prog_id;	/* output */
 	__u32 skb_prog_id;	/* output */
 	__u8 attach_mode;	/* output */
+	__u64 feature_flags;	/* output */
 	size_t :0;
 };
-#define bpf_xdp_query_opts__last_field attach_mode
+#define bpf_xdp_query_opts__last_field feature_flags
 
 LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags,
 			      const struct bpf_xdp_attach_opts *opts);
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index d2468a04a6c3..32b13b7a11b0 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -9,6 +9,7 @@
 #include <linux/if_ether.h>
 #include <linux/pkt_cls.h>
 #include <linux/rtnetlink.h>
+#include <linux/netdev.h>
 #include <sys/socket.h>
 #include <errno.h>
 #include <time.h>
@@ -39,6 +40,12 @@ struct xdp_id_md {
 	int ifindex;
 	__u32 flags;
 	struct xdp_link_info info;
+	__u64 feature_flags;
+};
+
+struct xdp_features_md {
+	int ifindex;
+	__u64 flags;
 };
 
 static int libbpf_netlink_open(__u32 *nl_pid, int proto)
@@ -238,6 +245,43 @@ out:
 	return ret;
 }
 
+static int parse_genl_family_id(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn,
+				void *cookie)
+{
+	struct genlmsghdr *gnl = NLMSG_DATA(nh);
+	struct nlattr *na = (struct nlattr *)((void *)gnl + GENL_HDRLEN);
+	struct nlattr *tb[CTRL_ATTR_FAMILY_ID + 1];
+	__u16 *id = cookie;
+
+	libbpf_nla_parse(tb, CTRL_ATTR_FAMILY_ID, na,
+			 NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL);
+	if (!tb[CTRL_ATTR_FAMILY_ID])
+		return NL_CONT;
+
+	*id = libbpf_nla_getattr_u16(tb[CTRL_ATTR_FAMILY_ID]);
+	return NL_DONE;
+}
+
+static int libbpf_netlink_resolve_genl_family_id(const char *name,
+						 __u16 len, __u16 *id)
+{
+	struct libbpf_nla_req req = {
+		.nh.nlmsg_len	= NLMSG_LENGTH(GENL_HDRLEN),
+		.nh.nlmsg_type	= GENL_ID_CTRL,
+		.nh.nlmsg_flags	= NLM_F_REQUEST,
+		.gnl.cmd	= CTRL_CMD_GETFAMILY,
+		.gnl.version	= 2,
+	};
+	int err;
+
+	err = nlattr_add(&req, CTRL_ATTR_FAMILY_NAME, name, len);
+	if (err < 0)
+		return err;
+
+	return libbpf_netlink_send_recv(&req, NETLINK_GENERIC,
+					parse_genl_family_id, NULL, id);
+}
+
 static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd,
 					 __u32 flags)
 {
@@ -357,6 +401,29 @@ static int get_xdp_info(void *cookie, void *msg, struct nlattr **tb)
 	return 0;
 }
 
+static int parse_xdp_features(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn,
+			      void *cookie)
+{
+	struct genlmsghdr *gnl = NLMSG_DATA(nh);
+	struct nlattr *na = (struct nlattr *)((void *)gnl + GENL_HDRLEN);
+	struct nlattr *tb[NETDEV_CMD_MAX + 1];
+	struct xdp_features_md *md = cookie;
+	__u32 ifindex;
+
+	libbpf_nla_parse(tb, NETDEV_CMD_MAX, na,
+			 NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL);
+
+	if (!tb[NETDEV_A_DEV_IFINDEX] || !tb[NETDEV_A_DEV_XDP_FEATURES])
+		return NL_CONT;
+
+	ifindex = libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_IFINDEX]);
+	if (ifindex != md->ifindex)
+		return NL_CONT;
+
+	md->flags = libbpf_nla_getattr_u64(tb[NETDEV_A_DEV_XDP_FEATURES]);
+	return NL_DONE;
+}
+
 int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts)
 {
 	struct libbpf_nla_req req = {
@@ -366,6 +433,10 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts)
 		.ifinfo.ifi_family = AF_PACKET,
 	};
 	struct xdp_id_md xdp_id = {};
+	struct xdp_features_md md = {
+		.ifindex = ifindex,
+	};
+	__u16 id;
 	int err;
 
 	if (!OPTS_VALID(opts, bpf_xdp_query_opts))
@@ -393,6 +464,31 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts)
 	OPTS_SET(opts, skb_prog_id, xdp_id.info.skb_prog_id);
 	OPTS_SET(opts, attach_mode, xdp_id.info.attach_mode);
 
+	if (!OPTS_HAS(opts, feature_flags))
+		return 0;
+
+	err = libbpf_netlink_resolve_genl_family_id("netdev", sizeof("netdev"), &id);
+	if (err < 0)
+		return libbpf_err(err);
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+	req.nh.nlmsg_flags = NLM_F_REQUEST;
+	req.nh.nlmsg_type = id;
+	req.gnl.cmd = NETDEV_CMD_DEV_GET;
+	req.gnl.version = 2;
+
+	err = nlattr_add(&req, NETDEV_A_DEV_IFINDEX, &ifindex, sizeof(ifindex));
+	if (err < 0)
+		return err;
+
+	err = libbpf_netlink_send_recv(&req, NETLINK_GENERIC,
+				       parse_xdp_features, NULL, &md);
+	if (err)
+		return libbpf_err(err);
+
+	opts->feature_flags = md.flags;
+
 	return 0;
 }
 
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h
index 4d15ae2ff812..d92d1c1de700 100644
--- a/tools/lib/bpf/nlattr.h
+++ b/tools/lib/bpf/nlattr.h
@@ -14,6 +14,7 @@
 #include <errno.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
+#include <linux/genetlink.h>
 
 /* avoid multiple definition of netlink features */
 #define __LINUX_NETLINK_H
@@ -58,6 +59,7 @@ struct libbpf_nla_req {
 	union {
 		struct ifinfomsg ifinfo;
 		struct tcmsg tc;
+		struct genlmsghdr gnl;
 	};
 	char buf[128];
 };
@@ -89,11 +91,21 @@ static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla)
 	return *(uint8_t *)libbpf_nla_data(nla);
 }
 
+static inline uint16_t libbpf_nla_getattr_u16(const struct nlattr *nla)
+{
+	return *(uint16_t *)libbpf_nla_data(nla);
+}
+
 static inline uint32_t libbpf_nla_getattr_u32(const struct nlattr *nla)
 {
 	return *(uint32_t *)libbpf_nla_data(nla);
 }
 
+static inline uint64_t libbpf_nla_getattr_u64(const struct nlattr *nla)
+{
+	return *(uint64_t *)libbpf_nla_data(nla);
+}
+
 static inline const char *libbpf_nla_getattr_str(const struct nlattr *nla)
 {
 	return (const char *)libbpf_nla_data(nla);
-- 
cgit 


From 84050074e51b68e6f1d3e89ebe8f8f6d73472ec3 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Wed, 1 Feb 2023 11:24:23 +0100
Subject: selftests/bpf: add test for bpf_xdp_query xdp-features support

Introduce a self-test to verify libbpf bpf_xdp_query capability to dump
the xdp-features supported by the device (lo and veth in this case).

Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/534550318a2c883e174811683909544c63632f05.1675245258.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/xdp_do_redirect.c     | 27 +++++++++++++++++++++-
 tools/testing/selftests/bpf/prog_tests/xdp_info.c  |  8 +++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
index ac70e871d62f..2666c84dbd01 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
@@ -4,10 +4,12 @@
 #include <net/if.h>
 #include <linux/if_ether.h>
 #include <linux/if_packet.h>
+#include <linux/if_link.h>
 #include <linux/ipv6.h>
 #include <linux/in6.h>
 #include <linux/udp.h>
 #include <bpf/bpf_endian.h>
+#include <uapi/linux/netdev.h>
 #include "test_xdp_do_redirect.skel.h"
 
 #define SYS(fmt, ...)						\
@@ -96,7 +98,7 @@ void test_xdp_do_redirect(void)
 	struct test_xdp_do_redirect *skel = NULL;
 	struct nstoken *nstoken = NULL;
 	struct bpf_link *link;
-
+	LIBBPF_OPTS(bpf_xdp_query_opts, query_opts);
 	struct xdp_md ctx_in = { .data = sizeof(__u32),
 				 .data_end = sizeof(data) };
 	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
@@ -157,6 +159,29 @@ void test_xdp_do_redirect(void)
 	    !ASSERT_NEQ(ifindex_dst, 0, "ifindex_dst"))
 		goto out;
 
+	/* Check xdp features supported by veth driver */
+	err = bpf_xdp_query(ifindex_src, XDP_FLAGS_DRV_MODE, &query_opts);
+	if (!ASSERT_OK(err, "veth_src bpf_xdp_query"))
+		goto out;
+
+	if (!ASSERT_EQ(query_opts.feature_flags,
+		       NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
+		       NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_RX_SG |
+		       NETDEV_XDP_ACT_NDO_XMIT_SG,
+		       "veth_src query_opts.feature_flags"))
+		goto out;
+
+	err = bpf_xdp_query(ifindex_dst, XDP_FLAGS_DRV_MODE, &query_opts);
+	if (!ASSERT_OK(err, "veth_dst bpf_xdp_query"))
+		goto out;
+
+	if (!ASSERT_EQ(query_opts.feature_flags,
+		       NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
+		       NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_RX_SG |
+		       NETDEV_XDP_ACT_NDO_XMIT_SG,
+		       "veth_dst query_opts.feature_flags"))
+		goto out;
+
 	memcpy(skel->rodata->expect_dst, &pkt_udp.eth.h_dest, ETH_ALEN);
 	skel->rodata->ifindex_out = ifindex_src; /* redirect back to the same iface */
 	skel->rodata->ifindex_in = ifindex_src;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_info.c b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
index cd3aa340e65e..286c21ecdc65 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_info.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
@@ -8,6 +8,7 @@ void serial_test_xdp_info(void)
 {
 	__u32 len = sizeof(struct bpf_prog_info), duration = 0, prog_id;
 	const char *file = "./xdp_dummy.bpf.o";
+	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
 	struct bpf_prog_info info = {};
 	struct bpf_object *obj;
 	int err, prog_fd;
@@ -61,6 +62,13 @@ void serial_test_xdp_info(void)
 	if (CHECK(prog_id, "prog_id_drv", "unexpected prog_id=%u\n", prog_id))
 		goto out;
 
+	/* Check xdp features supported by lo device */
+	opts.feature_flags = ~0;
+	err = bpf_xdp_query(IFINDEX_LO, XDP_FLAGS_DRV_MODE, &opts);
+	if (!ASSERT_OK(err, "bpf_xdp_query"))
+		goto out;
+
+	ASSERT_EQ(opts.feature_flags, 0, "opts.feature_flags");
 out:
 	bpf_xdp_detach(IFINDEX_LO, 0, NULL);
 out_close:
-- 
cgit 


From 4dba3e7852b7717a20ba206ab23ad289a0a5c504 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Wed, 1 Feb 2023 11:24:24 +0100
Subject: selftests/bpf: introduce XDP compliance test tool

Introduce xdp_features tool in order to test XDP features supported by
the NIC and match them against advertised ones.
In order to test supported/advertised XDP features, xdp_features must
run on the Device Under Test (DUT) and on a Tester device.
xdp_features opens a control TCP channel between DUT and Tester devices
to send control commands from Tester to the DUT and a UDP data channel
where the Tester sends UDP 'echo' packets and the DUT is expected to
reply back with the same packet. DUT installs multiple XDP programs on the
NIC to test XDP capabilities and reports back to the Tester some XDP stats.
Currently xdp_features supports the following XDP features:
- XDP_DROP
- XDP_ABORTED
- XDP_PASS
- XDP_TX
- XDP_REDIRECT
- XDP_NDO_XMIT

Co-developed-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/7c1af8e7e6ef0614cf32fa9e6bdaa2d8d605f859.1675245258.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/.gitignore           |   1 +
 tools/testing/selftests/bpf/Makefile             |  11 +-
 tools/testing/selftests/bpf/progs/xdp_features.c | 269 +++++++++
 tools/testing/selftests/bpf/test_xdp_features.sh | 107 ++++
 tools/testing/selftests/bpf/xdp_features.c       | 699 +++++++++++++++++++++++
 tools/testing/selftests/bpf/xdp_features.h       |  20 +
 6 files changed, 1105 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/xdp_features.c
 create mode 100755 tools/testing/selftests/bpf/test_xdp_features.sh
 create mode 100644 tools/testing/selftests/bpf/xdp_features.c
 create mode 100644 tools/testing/selftests/bpf/xdp_features.h

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 4aa5bba956ff..116fecf80ca1 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -48,3 +48,4 @@ xskxceiver
 xdp_redirect_multi
 xdp_synproxy
 xdp_hw_metadata
+xdp_features
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index e79039726173..b2eb3201b85a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -73,7 +73,8 @@ TEST_PROGS := test_kmod.sh \
 	test_bpftool.sh \
 	test_bpftool_metadata.sh \
 	test_doc_build.sh \
-	test_xsk.sh
+	test_xsk.sh \
+	test_xdp_features.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
 	with_tunnels.sh ima_setup.sh verify_sig_setup.sh \
@@ -83,7 +84,8 @@ TEST_PROGS_EXTENDED := with_addr.sh \
 TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
 	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
 	test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
-	xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata
+	xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \
+	xdp_features
 
 TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/sign-file
 TEST_GEN_FILES += liburandom_read.so
@@ -385,6 +387,7 @@ test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton
 test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o
 xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o
 xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o
+xdp_features.skel.h-deps := xdp_features.bpf.o
 
 LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
 
@@ -587,6 +590,10 @@ $(OUTPUT)/xdp_hw_metadata: xdp_hw_metadata.c $(OUTPUT)/network_helpers.o $(OUTPU
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
 
+$(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp_features.skel.h | $(OUTPUT)
+	$(call msg,BINARY,,$@)
+	$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
+
 # Make sure we are able to include and link libbpf against c++.
 $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
 	$(call msg,CXX,,$@)
diff --git a/tools/testing/selftests/bpf/progs/xdp_features.c b/tools/testing/selftests/bpf/progs/xdp_features.c
new file mode 100644
index 000000000000..87c247d56f72
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_features.c
@@ -0,0 +1,269 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <linux/netdev.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_tracing.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/udp.h>
+#include <asm-generic/errno-base.h>
+
+#include "xdp_features.h"
+
+#define ipv6_addr_equal(a, b)	((a).s6_addr32[0] == (b).s6_addr32[0] &&	\
+				 (a).s6_addr32[1] == (b).s6_addr32[1] &&	\
+				 (a).s6_addr32[2] == (b).s6_addr32[2] &&	\
+				 (a).s6_addr32[3] == (b).s6_addr32[3])
+
+struct net_device;
+struct bpf_prog;
+
+struct xdp_cpumap_stats {
+	unsigned int redirect;
+	unsigned int pass;
+	unsigned int drop;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, __u32);
+	__type(value, __u32);
+	__uint(max_entries, 1);
+} stats SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, __u32);
+	__type(value, __u32);
+	__uint(max_entries, 1);
+} dut_stats SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_CPUMAP);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_cpumap_val));
+	__uint(max_entries, 1);
+} cpu_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_DEVMAP);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_devmap_val));
+	__uint(max_entries, 1);
+} dev_map SEC(".maps");
+
+const volatile struct in6_addr tester_addr;
+const volatile struct in6_addr dut_addr;
+
+static __always_inline int
+xdp_process_echo_packet(struct xdp_md *xdp, bool dut)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eh = data;
+	struct tlv_hdr *tlv;
+	struct udphdr *uh;
+	__be16 port;
+	__u8 *cmd;
+
+	if (eh + 1 > (struct ethhdr *)data_end)
+		return -EINVAL;
+
+	if (eh->h_proto == bpf_htons(ETH_P_IP)) {
+		struct iphdr *ih = (struct iphdr *)(eh + 1);
+		__be32 saddr = dut ? tester_addr.s6_addr32[3]
+				   : dut_addr.s6_addr32[3];
+		__be32 daddr = dut ? dut_addr.s6_addr32[3]
+				   : tester_addr.s6_addr32[3];
+
+		ih = (struct iphdr *)(eh + 1);
+		if (ih + 1 > (struct iphdr *)data_end)
+			return -EINVAL;
+
+		if (saddr != ih->saddr)
+			return -EINVAL;
+
+		if (daddr != ih->daddr)
+			return -EINVAL;
+
+		if (ih->protocol != IPPROTO_UDP)
+			return -EINVAL;
+
+		uh = (struct udphdr *)(ih + 1);
+	} else if (eh->h_proto == bpf_htons(ETH_P_IPV6)) {
+		struct in6_addr saddr = dut ? tester_addr : dut_addr;
+		struct in6_addr daddr = dut ? dut_addr : tester_addr;
+		struct ipv6hdr *ih6 = (struct ipv6hdr *)(eh + 1);
+
+		if (ih6 + 1 > (struct ipv6hdr *)data_end)
+			return -EINVAL;
+
+		if (!ipv6_addr_equal(saddr, ih6->saddr))
+			return -EINVAL;
+
+		if (!ipv6_addr_equal(daddr, ih6->daddr))
+			return -EINVAL;
+
+		if (ih6->nexthdr != IPPROTO_UDP)
+			return -EINVAL;
+
+		uh = (struct udphdr *)(ih6 + 1);
+	} else {
+		return -EINVAL;
+	}
+
+	if (uh + 1 > (struct udphdr *)data_end)
+		return -EINVAL;
+
+	port = dut ? uh->dest : uh->source;
+	if (port != bpf_htons(DUT_ECHO_PORT))
+		return -EINVAL;
+
+	tlv = (struct tlv_hdr *)(uh + 1);
+	if (tlv + 1 > data_end)
+		return -EINVAL;
+
+	return bpf_htons(tlv->type) == CMD_ECHO ? 0 : -EINVAL;
+}
+
+static __always_inline int
+xdp_update_stats(struct xdp_md *xdp, bool tx, bool dut)
+{
+	__u32 *val, key = 0;
+
+	if (xdp_process_echo_packet(xdp, tx))
+		return -EINVAL;
+
+	if (dut)
+		val = bpf_map_lookup_elem(&dut_stats, &key);
+	else
+		val = bpf_map_lookup_elem(&stats, &key);
+
+	if (val)
+		__sync_add_and_fetch(val, 1);
+
+	return 0;
+}
+
+/* Tester */
+
+SEC("xdp")
+int xdp_tester_check_tx(struct xdp_md *xdp)
+{
+	xdp_update_stats(xdp, true, false);
+
+	return XDP_PASS;
+}
+
+SEC("xdp")
+int xdp_tester_check_rx(struct xdp_md *xdp)
+{
+	xdp_update_stats(xdp, false, false);
+
+	return XDP_PASS;
+}
+
+/* DUT */
+
+SEC("xdp")
+int xdp_do_pass(struct xdp_md *xdp)
+{
+	xdp_update_stats(xdp, true, true);
+
+	return XDP_PASS;
+}
+
+SEC("xdp")
+int xdp_do_drop(struct xdp_md *xdp)
+{
+	if (xdp_update_stats(xdp, true, true))
+		return XDP_PASS;
+
+	return XDP_DROP;
+}
+
+SEC("xdp")
+int xdp_do_aborted(struct xdp_md *xdp)
+{
+	if (xdp_process_echo_packet(xdp, true))
+		return XDP_PASS;
+
+	return XDP_ABORTED;
+}
+
+SEC("xdp")
+int xdp_do_tx(struct xdp_md *xdp)
+{
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eh = data;
+	__u8 tmp_mac[ETH_ALEN];
+
+	if (xdp_update_stats(xdp, true, true))
+		return XDP_PASS;
+
+	__builtin_memcpy(tmp_mac, eh->h_source, ETH_ALEN);
+	__builtin_memcpy(eh->h_source, eh->h_dest, ETH_ALEN);
+	__builtin_memcpy(eh->h_dest, tmp_mac, ETH_ALEN);
+
+	return XDP_TX;
+}
+
+SEC("xdp")
+int xdp_do_redirect(struct xdp_md *xdp)
+{
+	if (xdp_process_echo_packet(xdp, true))
+		return XDP_PASS;
+
+	return bpf_redirect_map(&cpu_map, 0, 0);
+}
+
+SEC("tp_btf/xdp_exception")
+int BPF_PROG(xdp_exception, const struct net_device *dev,
+	     const struct bpf_prog *xdp, __u32 act)
+{
+	__u32 *val, key = 0;
+
+	val = bpf_map_lookup_elem(&dut_stats, &key);
+	if (val)
+		__sync_add_and_fetch(val, 1);
+
+	return 0;
+}
+
+SEC("tp_btf/xdp_cpumap_kthread")
+int BPF_PROG(tp_xdp_cpumap_kthread, int map_id, unsigned int processed,
+	     unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats)
+{
+	__u32 *val, key = 0;
+
+	val = bpf_map_lookup_elem(&dut_stats, &key);
+	if (val)
+		__sync_add_and_fetch(val, 1);
+
+	return 0;
+}
+
+SEC("xdp/cpumap")
+int xdp_do_redirect_cpumap(struct xdp_md *xdp)
+{
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eh = data;
+	__u8 tmp_mac[ETH_ALEN];
+
+	if (xdp_process_echo_packet(xdp, true))
+		return XDP_PASS;
+
+	__builtin_memcpy(tmp_mac, eh->h_source, ETH_ALEN);
+	__builtin_memcpy(eh->h_source, eh->h_dest, ETH_ALEN);
+	__builtin_memcpy(eh->h_dest, tmp_mac, ETH_ALEN);
+
+	return bpf_redirect_map(&dev_map, 0, 0);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_xdp_features.sh b/tools/testing/selftests/bpf/test_xdp_features.sh
new file mode 100755
index 000000000000..0aa71c4455c0
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_features.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+readonly NS="ns1-$(mktemp -u XXXXXX)"
+readonly V0_IP4=10.10.0.11
+readonly V1_IP4=10.10.0.1
+readonly V0_IP6=2001:db8::11
+readonly V1_IP6=2001:db8::1
+
+ret=1
+
+setup() {
+	{
+		ip netns add ${NS}
+
+		ip link add v1 type veth peer name v0 netns ${NS}
+
+		ip link set v1 up
+		ip addr add $V1_IP4/24 dev v1
+		ip addr add $V1_IP6/64 nodad dev v1
+		ip -n ${NS} link set dev v0 up
+		ip -n ${NS} addr add $V0_IP4/24 dev v0
+		ip -n ${NS} addr add $V0_IP6/64 nodad dev v0
+
+		# Enable XDP mode and disable checksum offload
+		ethtool -K v1 gro on
+		ethtool -K v1 tx-checksumming off
+		ip netns exec ${NS} ethtool -K v0 gro on
+		ip netns exec ${NS} ethtool -K v0 tx-checksumming off
+	} > /dev/null 2>&1
+}
+
+cleanup() {
+	ip link del v1 2> /dev/null
+	ip netns del ${NS} 2> /dev/null
+	[ "$(pidof xdp_features)" = "" ] || kill $(pidof xdp_features) 2> /dev/null
+}
+
+wait_for_dut_server() {
+	while sleep 1; do
+		ss -tlp | grep -q xdp_features
+		[ $? -eq 0 ] && break
+	done
+}
+
+test_xdp_features() {
+	setup
+
+	## XDP_PASS
+	./xdp_features -f XDP_PASS -D $V1_IP6 -T $V0_IP6 v1 &
+	wait_for_dut_server
+	ip netns exec ${NS} ./xdp_features -t -f XDP_PASS \
+					   -D $V1_IP6 -C $V1_IP6 \
+					   -T $V0_IP6 v0
+	[ $? -ne 0 ] && exit
+
+	## XDP_DROP
+	./xdp_features -f XDP_DROP -D ::ffff:$V1_IP4 -T ::ffff:$V0_IP4 v1 &
+	wait_for_dut_server
+	ip netns exec ${NS} ./xdp_features -t -f XDP_DROP \
+					   -D ::ffff:$V1_IP4 \
+					   -C ::ffff:$V1_IP4 \
+					   -T ::ffff:$V0_IP4 v0
+	[ $? -ne 0 ] && exit
+
+	## XDP_ABORTED
+	./xdp_features -f XDP_ABORTED -D $V1_IP6 -T $V0_IP6 v1 &
+	wait_for_dut_server
+	ip netns exec ${NS} ./xdp_features -t -f XDP_ABORTED \
+					   -D $V1_IP6 -C $V1_IP6 \
+					   -T $V0_IP6 v0
+	[ $? -ne 0 ] && exit
+
+	## XDP_TX
+	./xdp_features -f XDP_TX -D ::ffff:$V1_IP4 -T ::ffff:$V0_IP4 v1 &
+	wait_for_dut_server
+	ip netns exec ${NS} ./xdp_features -t -f XDP_TX \
+					   -D ::ffff:$V1_IP4 \
+					   -C ::ffff:$V1_IP4 \
+					   -T ::ffff:$V0_IP4 v0
+	[ $? -ne 0 ] && exit
+
+	## XDP_REDIRECT
+	./xdp_features -f XDP_REDIRECT -D $V1_IP6 -T $V0_IP6 v1 &
+	wait_for_dut_server
+	ip netns exec ${NS} ./xdp_features -t -f XDP_REDIRECT \
+					   -D $V1_IP6 -C $V1_IP6 \
+					   -T $V0_IP6 v0
+	[ $? -ne 0 ] && exit
+
+	## XDP_NDO_XMIT
+	./xdp_features -f XDP_NDO_XMIT -D ::ffff:$V1_IP4 -T ::ffff:$V0_IP4 v1 &
+	wait_for_dut_server
+	ip netns exec ${NS} ./xdp_features -t -f XDP_NDO_XMIT \
+					   -D ::ffff:$V1_IP4 \
+					   -C ::ffff:$V1_IP4 \
+					   -T ::ffff:$V0_IP4 v0
+	ret=$?
+	cleanup
+}
+
+set -e
+trap cleanup 2 3 6 9
+
+test_xdp_features
+
+exit $ret
diff --git a/tools/testing/selftests/bpf/xdp_features.c b/tools/testing/selftests/bpf/xdp_features.c
new file mode 100644
index 000000000000..10fad1243573
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdp_features.c
@@ -0,0 +1,699 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/netdev.h>
+#include <linux/if_link.h>
+#include <signal.h>
+#include <argp.h>
+#include <net/if.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <pthread.h>
+
+#include <network_helpers.h>
+
+#include "xdp_features.skel.h"
+#include "xdp_features.h"
+
+#define RED(str)	"\033[0;31m" str "\033[0m"
+#define GREEN(str)	"\033[0;32m" str "\033[0m"
+#define YELLOW(str)	"\033[0;33m" str "\033[0m"
+
+static struct env {
+	bool verbosity;
+	int ifindex;
+	bool is_tester;
+	struct {
+		enum netdev_xdp_act drv_feature;
+		enum xdp_action action;
+	} feature;
+	struct sockaddr_storage dut_ctrl_addr;
+	struct sockaddr_storage dut_addr;
+	struct sockaddr_storage tester_addr;
+} env;
+
+#define BUFSIZE		128
+
+void test__fail(void) { /* for network_helpers.c */ }
+
+static int libbpf_print_fn(enum libbpf_print_level level,
+			   const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !env.verbosity)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static volatile bool exiting;
+
+static void sig_handler(int sig)
+{
+	exiting = true;
+}
+
+const char *argp_program_version = "xdp-features 0.0";
+const char argp_program_doc[] =
+"XDP features detecion application.\n"
+"\n"
+"XDP features application checks the XDP advertised features match detected ones.\n"
+"\n"
+"USAGE: ./xdp-features [-vt] [-f <xdp-feature>] [-D <dut-data-ip>] [-T <tester-data-ip>] [-C <dut-ctrl-ip>] <iface-name>\n"
+"\n"
+"dut-data-ip, tester-data-ip, dut-ctrl-ip: IPv6 or IPv4-mapped-IPv6 addresses;\n"
+"\n"
+"XDP features\n:"
+"- XDP_PASS\n"
+"- XDP_DROP\n"
+"- XDP_ABORTED\n"
+"- XDP_REDIRECT\n"
+"- XDP_NDO_XMIT\n"
+"- XDP_TX\n";
+
+static const struct argp_option opts[] = {
+	{ "verbose", 'v', NULL, 0, "Verbose debug output" },
+	{ "tester", 't', NULL, 0, "Tester mode" },
+	{ "feature", 'f', "XDP-FEATURE", 0, "XDP feature to test" },
+	{ "dut_data_ip", 'D', "DUT-DATA-IP", 0, "DUT IP data channel" },
+	{ "dut_ctrl_ip", 'C', "DUT-CTRL-IP", 0, "DUT IP control channel" },
+	{ "tester_data_ip", 'T', "TESTER-DATA-IP", 0, "Tester IP data channel" },
+	{},
+};
+
+static int get_xdp_feature(const char *arg)
+{
+	if (!strcmp(arg, "XDP_PASS")) {
+		env.feature.action = XDP_PASS;
+		env.feature.drv_feature = NETDEV_XDP_ACT_BASIC;
+	} else if (!strcmp(arg, "XDP_DROP")) {
+		env.feature.drv_feature = NETDEV_XDP_ACT_BASIC;
+		env.feature.action = XDP_DROP;
+	} else if (!strcmp(arg, "XDP_ABORTED")) {
+		env.feature.drv_feature = NETDEV_XDP_ACT_BASIC;
+		env.feature.action = XDP_ABORTED;
+	} else if (!strcmp(arg, "XDP_TX")) {
+		env.feature.drv_feature = NETDEV_XDP_ACT_BASIC;
+		env.feature.action = XDP_TX;
+	} else if (!strcmp(arg, "XDP_REDIRECT")) {
+		env.feature.drv_feature = NETDEV_XDP_ACT_REDIRECT;
+		env.feature.action = XDP_REDIRECT;
+	} else if (!strcmp(arg, "XDP_NDO_XMIT")) {
+		env.feature.drv_feature = NETDEV_XDP_ACT_NDO_XMIT;
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static char *get_xdp_feature_str(void)
+{
+	switch (env.feature.action) {
+	case XDP_PASS:
+		return YELLOW("XDP_PASS");
+	case XDP_DROP:
+		return YELLOW("XDP_DROP");
+	case XDP_ABORTED:
+		return YELLOW("XDP_ABORTED");
+	case XDP_TX:
+		return YELLOW("XDP_TX");
+	case XDP_REDIRECT:
+		return YELLOW("XDP_REDIRECT");
+	default:
+		break;
+	}
+
+	if (env.feature.drv_feature == NETDEV_XDP_ACT_NDO_XMIT)
+		return YELLOW("XDP_NDO_XMIT");
+
+	return "";
+}
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case 'v':
+		env.verbosity = true;
+		break;
+	case 't':
+		env.is_tester = true;
+		break;
+	case 'f':
+		if (get_xdp_feature(arg) < 0) {
+			fprintf(stderr, "Invalid xdp feature: %s\n", arg);
+			argp_usage(state);
+			return ARGP_ERR_UNKNOWN;
+		}
+		break;
+	case 'D':
+		if (make_sockaddr(AF_INET6, arg, DUT_ECHO_PORT,
+				  &env.dut_addr, NULL)) {
+			fprintf(stderr, "Invalid DUT address: %s\n", arg);
+			return ARGP_ERR_UNKNOWN;
+		}
+		break;
+	case 'C':
+		if (make_sockaddr(AF_INET6, arg, DUT_CTRL_PORT,
+				  &env.dut_ctrl_addr, NULL)) {
+			fprintf(stderr, "Invalid DUT CTRL address: %s\n", arg);
+			return ARGP_ERR_UNKNOWN;
+		}
+		break;
+	case 'T':
+		if (make_sockaddr(AF_INET6, arg, 0, &env.tester_addr, NULL)) {
+			fprintf(stderr, "Invalid Tester address: %s\n", arg);
+			return ARGP_ERR_UNKNOWN;
+		}
+		break;
+	case ARGP_KEY_ARG:
+		errno = 0;
+		if (strlen(arg) >= IF_NAMESIZE) {
+			fprintf(stderr, "Invalid device name: %s\n", arg);
+			argp_usage(state);
+			return ARGP_ERR_UNKNOWN;
+		}
+
+		env.ifindex = if_nametoindex(arg);
+		if (!env.ifindex)
+			env.ifindex = strtoul(arg, NULL, 0);
+		if (!env.ifindex) {
+			fprintf(stderr,
+				"Bad interface index or name (%d): %s\n",
+				errno, strerror(errno));
+			argp_usage(state);
+			return ARGP_ERR_UNKNOWN;
+		}
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+static const struct argp argp = {
+	.options = opts,
+	.parser = parse_arg,
+	.doc = argp_program_doc,
+};
+
+static void set_env_default(void)
+{
+	env.feature.drv_feature = NETDEV_XDP_ACT_NDO_XMIT;
+	env.feature.action = -EINVAL;
+	env.ifindex = -ENODEV;
+	make_sockaddr(AF_INET6, "::ffff:127.0.0.1", DUT_CTRL_PORT,
+		      &env.dut_ctrl_addr, NULL);
+	make_sockaddr(AF_INET6, "::ffff:127.0.0.1", DUT_ECHO_PORT,
+		      &env.dut_addr, NULL);
+	make_sockaddr(AF_INET6, "::ffff:127.0.0.1", 0, &env.tester_addr, NULL);
+}
+
+static void *dut_echo_thread(void *arg)
+{
+	unsigned char buf[sizeof(struct tlv_hdr)];
+	int sockfd = *(int *)arg;
+
+	while (!exiting) {
+		struct tlv_hdr *tlv = (struct tlv_hdr *)buf;
+		struct sockaddr_storage addr;
+		socklen_t addrlen;
+		size_t n;
+
+		n = recvfrom(sockfd, buf, sizeof(buf), MSG_WAITALL,
+			     (struct sockaddr *)&addr, &addrlen);
+		if (n != ntohs(tlv->len))
+			continue;
+
+		if (ntohs(tlv->type) != CMD_ECHO)
+			continue;
+
+		sendto(sockfd, buf, sizeof(buf), MSG_NOSIGNAL | MSG_CONFIRM,
+		       (struct sockaddr *)&addr, addrlen);
+	}
+
+	pthread_exit((void *)0);
+	close(sockfd);
+
+	return NULL;
+}
+
+static int dut_run_echo_thread(pthread_t *t, int *sockfd)
+{
+	int err;
+
+	sockfd = start_reuseport_server(AF_INET6, SOCK_DGRAM, NULL,
+					DUT_ECHO_PORT, 0, 1);
+	if (!sockfd) {
+		fprintf(stderr, "Failed to create echo socket\n");
+		return -errno;
+	}
+
+	/* start echo channel */
+	err = pthread_create(t, NULL, dut_echo_thread, sockfd);
+	if (err) {
+		fprintf(stderr, "Failed creating dut_echo thread: %s\n",
+			strerror(-err));
+		free_fds(sockfd, 1);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int dut_attach_xdp_prog(struct xdp_features *skel, int flags)
+{
+	enum xdp_action action = env.feature.action;
+	struct bpf_program *prog;
+	unsigned int key = 0;
+	int err, fd = 0;
+
+	if (env.feature.drv_feature == NETDEV_XDP_ACT_NDO_XMIT) {
+		struct bpf_devmap_val entry = {
+			.ifindex = env.ifindex,
+		};
+
+		err = bpf_map__update_elem(skel->maps.dev_map,
+					   &key, sizeof(key),
+					   &entry, sizeof(entry), 0);
+		if (err < 0)
+			return err;
+
+		fd = bpf_program__fd(skel->progs.xdp_do_redirect_cpumap);
+		action = XDP_REDIRECT;
+	}
+
+	switch (action) {
+	case XDP_TX:
+		prog = skel->progs.xdp_do_tx;
+		break;
+	case XDP_DROP:
+		prog = skel->progs.xdp_do_drop;
+		break;
+	case XDP_ABORTED:
+		prog = skel->progs.xdp_do_aborted;
+		break;
+	case XDP_PASS:
+		prog = skel->progs.xdp_do_pass;
+		break;
+	case XDP_REDIRECT: {
+		struct bpf_cpumap_val entry = {
+			.qsize = 2048,
+			.bpf_prog.fd = fd,
+		};
+
+		err = bpf_map__update_elem(skel->maps.cpu_map,
+					   &key, sizeof(key),
+					   &entry, sizeof(entry), 0);
+		if (err < 0)
+			return err;
+
+		prog = skel->progs.xdp_do_redirect;
+		break;
+	}
+	default:
+		return -EINVAL;
+	}
+
+	err = bpf_xdp_attach(env.ifindex, bpf_program__fd(prog), flags, NULL);
+	if (err)
+		fprintf(stderr,
+			"Failed to attach XDP program to ifindex %d\n",
+			env.ifindex);
+	return err;
+}
+
+static int recv_msg(int sockfd, void *buf, size_t bufsize, void *val,
+		    size_t val_size)
+{
+	struct tlv_hdr *tlv = (struct tlv_hdr *)buf;
+	size_t len;
+
+	len = recv(sockfd, buf, bufsize, 0);
+	if (len != ntohs(tlv->len) || len < sizeof(*tlv))
+		return -EINVAL;
+
+	if (val) {
+		len -= sizeof(*tlv);
+		if (len > val_size)
+			return -ENOMEM;
+
+		memcpy(val, tlv->data, len);
+	}
+
+	return 0;
+}
+
+static int dut_run(struct xdp_features *skel)
+{
+	int flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_DRV_MODE;
+	int state, err, *sockfd, ctrl_sockfd, echo_sockfd;
+	struct sockaddr_storage ctrl_addr;
+	pthread_t dut_thread;
+	socklen_t addrlen;
+
+	sockfd = start_reuseport_server(AF_INET6, SOCK_STREAM, NULL,
+					DUT_CTRL_PORT, 0, 1);
+	if (!sockfd) {
+		fprintf(stderr, "Failed to create DUT socket\n");
+		return -errno;
+	}
+
+	ctrl_sockfd = accept(*sockfd, (struct sockaddr *)&ctrl_addr, &addrlen);
+	if (ctrl_sockfd < 0) {
+		fprintf(stderr, "Failed to accept connection on DUT socket\n");
+		free_fds(sockfd, 1);
+		return -errno;
+	}
+
+	/* CTRL loop */
+	while (!exiting) {
+		unsigned char buf[BUFSIZE] = {};
+		struct tlv_hdr *tlv = (struct tlv_hdr *)buf;
+
+		err = recv_msg(ctrl_sockfd, buf, BUFSIZE, NULL, 0);
+		if (err)
+			continue;
+
+		switch (ntohs(tlv->type)) {
+		case CMD_START: {
+			if (state == CMD_START)
+				continue;
+
+			state = CMD_START;
+			/* Load the XDP program on the DUT */
+			err = dut_attach_xdp_prog(skel, flags);
+			if (err)
+				goto out;
+
+			err = dut_run_echo_thread(&dut_thread, &echo_sockfd);
+			if (err < 0)
+				goto out;
+
+			tlv->type = htons(CMD_ACK);
+			tlv->len = htons(sizeof(*tlv));
+			err = send(ctrl_sockfd, buf, sizeof(*tlv), 0);
+			if (err < 0)
+				goto end_thread;
+			break;
+		}
+		case CMD_STOP:
+			if (state != CMD_START)
+				break;
+
+			state = CMD_STOP;
+
+			exiting = true;
+			bpf_xdp_detach(env.ifindex, flags, NULL);
+
+			tlv->type = htons(CMD_ACK);
+			tlv->len = htons(sizeof(*tlv));
+			err = send(ctrl_sockfd, buf, sizeof(*tlv), 0);
+			goto end_thread;
+		case CMD_GET_XDP_CAP: {
+			LIBBPF_OPTS(bpf_xdp_query_opts, opts);
+			unsigned long long val;
+			size_t n;
+
+			err = bpf_xdp_query(env.ifindex, XDP_FLAGS_DRV_MODE,
+					    &opts);
+			if (err) {
+				fprintf(stderr,
+					"Failed to query XDP cap for ifindex %d\n",
+					env.ifindex);
+				goto end_thread;
+			}
+
+			tlv->type = htons(CMD_ACK);
+			n = sizeof(*tlv) + sizeof(opts.feature_flags);
+			tlv->len = htons(n);
+
+			val = htobe64(opts.feature_flags);
+			memcpy(tlv->data, &val, sizeof(val));
+
+			err = send(ctrl_sockfd, buf, n, 0);
+			if (err < 0)
+				goto end_thread;
+			break;
+		}
+		case CMD_GET_STATS: {
+			unsigned int key = 0, val;
+			size_t n;
+
+			err = bpf_map__lookup_elem(skel->maps.dut_stats,
+						   &key, sizeof(key),
+						   &val, sizeof(val), 0);
+			if (err) {
+				fprintf(stderr, "bpf_map_lookup_elem failed\n");
+				goto end_thread;
+			}
+
+			tlv->type = htons(CMD_ACK);
+			n = sizeof(*tlv) + sizeof(val);
+			tlv->len = htons(n);
+
+			val = htonl(val);
+			memcpy(tlv->data, &val, sizeof(val));
+
+			err = send(ctrl_sockfd, buf, n, 0);
+			if (err < 0)
+				goto end_thread;
+			break;
+		}
+		default:
+			break;
+		}
+	}
+
+end_thread:
+	pthread_join(dut_thread, NULL);
+out:
+	bpf_xdp_detach(env.ifindex, flags, NULL);
+	close(ctrl_sockfd);
+	free_fds(sockfd, 1);
+
+	return err;
+}
+
+static bool tester_collect_detected_cap(struct xdp_features *skel,
+					unsigned int dut_stats)
+{
+	unsigned int err, key = 0, val;
+
+	if (!dut_stats)
+		return false;
+
+	err = bpf_map__lookup_elem(skel->maps.stats, &key, sizeof(key),
+				   &val, sizeof(val), 0);
+	if (err) {
+		fprintf(stderr, "bpf_map_lookup_elem failed\n");
+		return false;
+	}
+
+	switch (env.feature.action) {
+	case XDP_PASS:
+	case XDP_TX:
+	case XDP_REDIRECT:
+		return val > 0;
+	case XDP_DROP:
+	case XDP_ABORTED:
+		return val == 0;
+	default:
+		break;
+	}
+
+	if (env.feature.drv_feature == NETDEV_XDP_ACT_NDO_XMIT)
+		return val > 0;
+
+	return false;
+}
+
+static int send_and_recv_msg(int sockfd, enum test_commands cmd, void *val,
+			     size_t val_size)
+{
+	unsigned char buf[BUFSIZE] = {};
+	struct tlv_hdr *tlv = (struct tlv_hdr *)buf;
+	int err;
+
+	tlv->type = htons(cmd);
+	tlv->len = htons(sizeof(*tlv));
+
+	err = send(sockfd, buf, sizeof(*tlv), 0);
+	if (err < 0)
+		return err;
+
+	err = recv_msg(sockfd, buf, BUFSIZE, val, val_size);
+	if (err < 0)
+		return err;
+
+	return ntohs(tlv->type) == CMD_ACK ? 0 : -EINVAL;
+}
+
+static int send_echo_msg(void)
+{
+	unsigned char buf[sizeof(struct tlv_hdr)];
+	struct tlv_hdr *tlv = (struct tlv_hdr *)buf;
+	int sockfd, n;
+
+	sockfd = socket(AF_INET6, SOCK_DGRAM, 0);
+	if (sockfd < 0) {
+		fprintf(stderr, "Failed to create echo socket\n");
+		return -errno;
+	}
+
+	tlv->type = htons(CMD_ECHO);
+	tlv->len = htons(sizeof(*tlv));
+
+	n = sendto(sockfd, buf, sizeof(*tlv), MSG_NOSIGNAL | MSG_CONFIRM,
+		   (struct sockaddr *)&env.dut_addr, sizeof(env.dut_addr));
+	close(sockfd);
+
+	return n == ntohs(tlv->len) ? 0 : -EINVAL;
+}
+
+static int tester_run(struct xdp_features *skel)
+{
+	int flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_DRV_MODE;
+	unsigned long long advertised_feature;
+	struct bpf_program *prog;
+	unsigned int stats;
+	int i, err, sockfd;
+	bool detected_cap;
+
+	sockfd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (sockfd < 0) {
+		fprintf(stderr, "Failed to create tester socket\n");
+		return -errno;
+	}
+
+	if (settimeo(sockfd, 1000) < 0)
+		return -EINVAL;
+
+	err = connect(sockfd, (struct sockaddr *)&env.dut_ctrl_addr,
+		      sizeof(env.dut_ctrl_addr));
+	if (err) {
+		fprintf(stderr, "Failed to connect to the DUT\n");
+		return -errno;
+	}
+
+	err = send_and_recv_msg(sockfd, CMD_GET_XDP_CAP, &advertised_feature,
+				sizeof(advertised_feature));
+	if (err < 0) {
+		close(sockfd);
+		return err;
+	}
+
+	advertised_feature = be64toh(advertised_feature);
+
+	if (env.feature.drv_feature == NETDEV_XDP_ACT_NDO_XMIT ||
+	    env.feature.action == XDP_TX)
+		prog = skel->progs.xdp_tester_check_tx;
+	else
+		prog = skel->progs.xdp_tester_check_rx;
+
+	err = bpf_xdp_attach(env.ifindex, bpf_program__fd(prog), flags, NULL);
+	if (err) {
+		fprintf(stderr, "Failed to attach XDP program to ifindex %d\n",
+			env.ifindex);
+		goto out;
+	}
+
+	err = send_and_recv_msg(sockfd, CMD_START, NULL, 0);
+	if (err)
+		goto out;
+
+	for (i = 0; i < 10 && !exiting; i++) {
+		err = send_echo_msg();
+		if (err < 0)
+			goto out;
+
+		sleep(1);
+	}
+
+	err = send_and_recv_msg(sockfd, CMD_GET_STATS, &stats, sizeof(stats));
+	if (err)
+		goto out;
+
+	/* stop the test */
+	err = send_and_recv_msg(sockfd, CMD_STOP, NULL, 0);
+	/* send a new echo message to wake echo thread of the dut */
+	send_echo_msg();
+
+	detected_cap = tester_collect_detected_cap(skel, ntohl(stats));
+
+	fprintf(stdout, "Feature %s: [%s][%s]\n", get_xdp_feature_str(),
+		detected_cap ? GREEN("DETECTED") : RED("NOT DETECTED"),
+		env.feature.drv_feature & advertised_feature ? GREEN("ADVERTISED")
+							     : RED("NOT ADVERTISED"));
+out:
+	bpf_xdp_detach(env.ifindex, flags, NULL);
+	close(sockfd);
+	return err < 0 ? err : 0;
+}
+
+int main(int argc, char **argv)
+{
+	struct xdp_features *skel;
+	int err;
+
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+	libbpf_set_print(libbpf_print_fn);
+
+	signal(SIGINT, sig_handler);
+	signal(SIGTERM, sig_handler);
+
+	set_env_default();
+
+	/* Parse command line arguments */
+	err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+	if (err)
+		return err;
+
+	if (env.ifindex < 0) {
+		fprintf(stderr, "Invalid ifindex\n");
+		return -ENODEV;
+	}
+
+	/* Load and verify BPF application */
+	skel = xdp_features__open();
+	if (!skel) {
+		fprintf(stderr, "Failed to open and load BPF skeleton\n");
+		return -EINVAL;
+	}
+
+	skel->rodata->tester_addr =
+		((struct sockaddr_in6 *)&env.tester_addr)->sin6_addr;
+	skel->rodata->dut_addr =
+		((struct sockaddr_in6 *)&env.dut_addr)->sin6_addr;
+
+	/* Load & verify BPF programs */
+	err = xdp_features__load(skel);
+	if (err) {
+		fprintf(stderr, "Failed to load and verify BPF skeleton\n");
+		goto cleanup;
+	}
+
+	err = xdp_features__attach(skel);
+	if (err) {
+		fprintf(stderr, "Failed to attach BPF skeleton\n");
+		goto cleanup;
+	}
+
+	if (env.is_tester) {
+		/* Tester */
+		fprintf(stdout, "Starting tester on device %d\n", env.ifindex);
+		err = tester_run(skel);
+	} else {
+		/* DUT */
+		fprintf(stdout, "Starting DUT on device %d\n", env.ifindex);
+		err = dut_run(skel);
+	}
+
+cleanup:
+	xdp_features__destroy(skel);
+
+	return err < 0 ? -err : 0;
+}
diff --git a/tools/testing/selftests/bpf/xdp_features.h b/tools/testing/selftests/bpf/xdp_features.h
new file mode 100644
index 000000000000..2670c541713b
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdp_features.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* test commands */
+enum test_commands {
+	CMD_STOP,		/* CMD */
+	CMD_START,		/* CMD */
+	CMD_ECHO,		/* CMD */
+	CMD_ACK,		/* CMD + data */
+	CMD_GET_XDP_CAP,	/* CMD */
+	CMD_GET_STATS,		/* CMD */
+};
+
+#define DUT_CTRL_PORT	12345
+#define DUT_ECHO_PORT	12346
+
+struct tlv_hdr {
+	__be16 type;
+	__be16 len;
+	__u8 data[];
+};
-- 
cgit 


From 377c16fa3f3c60d21e4b05314c8be034ce37f2eb Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <tong@infragraf.org>
Date: Thu, 2 Feb 2023 21:17:01 +0800
Subject: bpftool: profile online CPUs instead of possible

The number of online cpu may be not equal to possible cpu.
"bpftool prog profile" can not create pmu event on possible
but on online cpu.

$ dmidecode -s system-product-name
PowerEdge R620
$ cat /sys/devices/system/cpu/possible
0-47
$ cat /sys/devices/system/cpu/online
0-31

Disable cpu dynamically:
$ echo 0 > /sys/devices/system/cpu/cpuX/online

If one cpu is offline, perf_event_open will return ENODEV.
To fix this issue:
* check value returned and skip offline cpu.
* close pmu_fd immediately on error path, avoid fd leaking.

Fixes: 47c09d6a9f67 ("bpftool: Introduce "prog profile" command")
Signed-off-by: Tonghao Zhang <tong@infragraf.org>
Cc: Quentin Monnet <quentin@isovalent.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Song Liu <song@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Stanislav Fomichev <sdf@google.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20230202131701.29519-1-tong@infragraf.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/bpf/bpftool/prog.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index cfc9fdc1e863..e87738dbffc1 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -2233,10 +2233,38 @@ static void profile_close_perf_events(struct profiler_bpf *obj)
 	profile_perf_event_cnt = 0;
 }
 
+static int profile_open_perf_event(int mid, int cpu, int map_fd)
+{
+	int pmu_fd;
+
+	pmu_fd = syscall(__NR_perf_event_open, &metrics[mid].attr,
+			 -1 /*pid*/, cpu, -1 /*group_fd*/, 0);
+	if (pmu_fd < 0) {
+		if (errno == ENODEV) {
+			p_info("cpu %d may be offline, skip %s profiling.",
+				cpu, metrics[mid].name);
+			profile_perf_event_cnt++;
+			return 0;
+		}
+		return -1;
+	}
+
+	if (bpf_map_update_elem(map_fd,
+				&profile_perf_event_cnt,
+				&pmu_fd, BPF_ANY) ||
+	    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
+		close(pmu_fd);
+		return -1;
+	}
+
+	profile_perf_events[profile_perf_event_cnt++] = pmu_fd;
+	return 0;
+}
+
 static int profile_open_perf_events(struct profiler_bpf *obj)
 {
 	unsigned int cpu, m;
-	int map_fd, pmu_fd;
+	int map_fd;
 
 	profile_perf_events = calloc(
 		sizeof(int), obj->rodata->num_cpu * obj->rodata->num_metric);
@@ -2255,17 +2283,11 @@ static int profile_open_perf_events(struct profiler_bpf *obj)
 		if (!metrics[m].selected)
 			continue;
 		for (cpu = 0; cpu < obj->rodata->num_cpu; cpu++) {
-			pmu_fd = syscall(__NR_perf_event_open, &metrics[m].attr,
-					 -1/*pid*/, cpu, -1/*group_fd*/, 0);
-			if (pmu_fd < 0 ||
-			    bpf_map_update_elem(map_fd, &profile_perf_event_cnt,
-						&pmu_fd, BPF_ANY) ||
-			    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
+			if (profile_open_perf_event(m, cpu, map_fd)) {
 				p_err("failed to create event %s on cpu %d",
 				      metrics[m].name, cpu);
 				return -1;
 			}
-			profile_perf_events[profile_perf_event_cnt++] = pmu_fd;
 		}
 	}
 	return 0;
-- 
cgit 


From 16ddcb15497e11a2695c604357e77140010d3d51 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 10 Jan 2023 19:03:59 +0000
Subject: selftests/damon/sysfs: hide expected write failures

DAMON selftests for sysfs (sysfs.sh) tests if some writes to DAMON sysfs
interface files fails as expected.  It makes the test results noisy with
the failure error message because it tests a number of such failures.
Redirect the expected failure error messages to /dev/null to make the
results clean.

Link: https://lkml.kernel.org/r/20230110190400.119388-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/sysfs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh
index a00336ffdcad..bcd4734ca094 100644
--- a/tools/testing/selftests/damon/sysfs.sh
+++ b/tools/testing/selftests/damon/sysfs.sh
@@ -24,7 +24,7 @@ ensure_write_fail()
 	content=$2
 	reason=$3
 
-	if echo "$content" > "$file"
+	if (echo "$content" > "$file") 2> /dev/null
 	then
 		echo "writing $content to $file succeed ($fail_reason)"
 		echo "expected failure because $reason"
-- 
cgit 


From 75cb348714f527ce2de3446202b76ce74808b668 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 10 Jan 2023 19:04:00 +0000
Subject: selftests/damon/debugfs_rm_non_contexts: hide expected write error
 messages

A selftest case for DAMON debugfs interface has a test for expected
failure.  To make the test output clean, hide the expected failure error
message.

Link: https://lkml.kernel.org/r/20230110190400.119388-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/debugfs_rm_non_contexts.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh
index 48b7af6b022c..f3ffeb1343cf 100644
--- a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh
+++ b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh
@@ -10,7 +10,7 @@ dmesg -C
 
 for file in "$DBGFS/"*
 do
-	echo "$(basename "$f")" > "$DBGFS/rm_contexts"
+	(echo "$(basename "$f")" > "$DBGFS/rm_contexts") &> /dev/null
 	if dmesg | grep -q BUG
 	then
 		dmesg
-- 
cgit 


From c5d5546ea06512accc894cd19265c7041a6ac81a Mon Sep 17 00:00:00 2001
From: Vernon Yang <vernon2gm@gmail.com>
Date: Tue, 10 Jan 2023 23:42:11 +0800
Subject: maple_tree: remove the parameter entry of mas_preallocate

The parameter entry of mas_preallocate is not used, so drop it.

Link: https://lkml.kernel.org/r/20230110154211.1758562-1-vernon2gm@gmail.com
Signed-off-by: Vernon Yang <vernon2gm@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h       |  2 +-
 lib/maple_tree.c                 |  3 +--
 mm/mmap.c                        | 16 ++++++++--------
 mm/nommu.c                       |  8 ++++----
 tools/testing/radix-tree/maple.c | 32 ++++++++++++++++----------------
 5 files changed, 30 insertions(+), 31 deletions(-)

(limited to 'tools')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 815a27661517..a7bf58fd7cc6 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -455,7 +455,7 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
 void mas_store_prealloc(struct ma_state *mas, void *entry);
 void *mas_find(struct ma_state *mas, unsigned long max);
 void *mas_find_rev(struct ma_state *mas, unsigned long min);
-int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
+int mas_preallocate(struct ma_state *mas, gfp_t gfp);
 bool mas_is_err(struct ma_state *mas);
 
 bool mas_nomem(struct ma_state *mas, gfp_t gfp);
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index baff62a012e1..5be99550e36d 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -5700,12 +5700,11 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc);
 /**
  * mas_preallocate() - Preallocate enough nodes for a store operation
  * @mas: The maple state
- * @entry: The entry that will be stored
  * @gfp: The GFP_FLAGS to use for allocations.
  *
  * Return: 0 on success, -ENOMEM if memory could not be allocated.
  */
-int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
+int mas_preallocate(struct ma_state *mas, gfp_t gfp)
 {
 	int ret;
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 425a9349e610..4fe29b8f99b0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -472,7 +472,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
 	MA_STATE(mas, &mm->mm_mt, 0, 0);
 	struct address_space *mapping = NULL;
 
-	if (mas_preallocate(&mas, vma, GFP_KERNEL))
+	if (mas_preallocate(&mas, GFP_KERNEL))
 		return -ENOMEM;
 
 	if (vma->vm_file) {
@@ -538,7 +538,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
 	/* Only handles expanding */
 	VM_BUG_ON(vma->vm_start < start || vma->vm_end > end);
 
-	if (mas_preallocate(mas, vma, GFP_KERNEL))
+	if (mas_preallocate(mas, GFP_KERNEL))
 		goto nomem;
 
 	vma_adjust_trans_huge(vma, start, end, 0);
@@ -712,7 +712,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		}
 	}
 
-	if (mas_preallocate(&mas, vma, GFP_KERNEL))
+	if (mas_preallocate(&mas, GFP_KERNEL))
 		return -ENOMEM;
 
 	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
@@ -1938,7 +1938,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 		/* Check that both stack segments have the same anon_vma? */
 	}
 
-	if (mas_preallocate(&mas, vma, GFP_KERNEL))
+	if (mas_preallocate(&mas, GFP_KERNEL))
 		return -ENOMEM;
 
 	/* We must make sure the anon_vma is allocated. */
@@ -2019,7 +2019,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
 			return -ENOMEM;
 	}
 
-	if (mas_preallocate(&mas, vma, GFP_KERNEL))
+	if (mas_preallocate(&mas, GFP_KERNEL))
 		return -ENOMEM;
 
 	/* We must make sure the anon_vma is allocated. */
@@ -2311,7 +2311,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
 	mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
 	mt_set_external_lock(&mt_detach, &mm->mmap_lock);
 
-	if (mas_preallocate(mas, vma, GFP_KERNEL))
+	if (mas_preallocate(mas, GFP_KERNEL))
 		return -ENOMEM;
 
 	mas->last = end - 1;
@@ -2680,7 +2680,7 @@ cannot_expand:
 			goto free_vma;
 	}
 
-	if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+	if (mas_preallocate(&mas, GFP_KERNEL)) {
 		error = -ENOMEM;
 		if (file)
 			goto close_and_free_vma;
@@ -2953,7 +2953,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
 	    can_vma_merge_after(vma, flags, NULL, NULL,
 				addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
 		mas_set_range(mas, vma->vm_start, addr + len - 1);
-		if (mas_preallocate(mas, vma, GFP_KERNEL))
+		if (mas_preallocate(mas, GFP_KERNEL))
 			goto unacct_fail;
 
 		vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
diff --git a/mm/nommu.c b/mm/nommu.c
index df1711acdf5b..0481922fe66e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -602,7 +602,7 @@ static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
 
-	if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+	if (mas_preallocate(&mas, GFP_KERNEL)) {
 		pr_warn("Allocation of vma tree for process %d failed\n",
 		       current->pid);
 		return -ENOMEM;
@@ -633,7 +633,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
 {
 	MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0);
 
-	if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+	if (mas_preallocate(&mas, GFP_KERNEL)) {
 		pr_warn("Allocation of vma tree for process %d failed\n",
 		       current->pid);
 		return -ENOMEM;
@@ -1091,7 +1091,7 @@ unsigned long do_mmap(struct file *file,
 	if (!vma)
 		goto error_getting_vma;
 
-	if (mas_preallocate(&mas, vma, GFP_KERNEL))
+	if (mas_preallocate(&mas, GFP_KERNEL))
 		goto error_maple_preallocate;
 
 	region->vm_usage = 1;
@@ -1369,7 +1369,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!new)
 		goto err_vma_dup;
 
-	if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+	if (mas_preallocate(&mas, GFP_KERNEL)) {
 		pr_warn("Allocation of vma tree for process %d failed\n",
 			current->pid);
 		goto err_mas_preallocate;
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 1f36bc1c5d36..958ee9bdb316 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -35342,7 +35342,7 @@ static noinline void check_prealloc(struct maple_tree *mt)
 	for (i = 0; i <= max; i++)
 		mtree_test_store_range(mt, i * 10, i * 10 + 5, &i);
 
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
@@ -35351,18 +35351,18 @@ static noinline void check_prealloc(struct maple_tree *mt)
 	allocated = mas_allocated(&mas);
 	MT_BUG_ON(mt, allocated != 0);
 
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
 	MT_BUG_ON(mt, allocated != 1 + height * 3);
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	mas_destroy(&mas);
 	allocated = mas_allocated(&mas);
 	MT_BUG_ON(mt, allocated != 0);
 
 
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
@@ -35370,25 +35370,25 @@ static noinline void check_prealloc(struct maple_tree *mt)
 	mn = mas_pop_node(&mas);
 	MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
 	ma_free_rcu(mn);
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	mas_destroy(&mas);
 	allocated = mas_allocated(&mas);
 	MT_BUG_ON(mt, allocated != 0);
 
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
 	MT_BUG_ON(mt, allocated != 1 + height * 3);
 	mn = mas_pop_node(&mas);
 	MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	mas_destroy(&mas);
 	allocated = mas_allocated(&mas);
 	MT_BUG_ON(mt, allocated != 0);
 	ma_free_rcu(mn);
 
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
@@ -35397,12 +35397,12 @@ static noinline void check_prealloc(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
 	mas_push_node(&mas, mn);
 	MT_BUG_ON(mt, mas_allocated(&mas) != allocated);
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	mas_destroy(&mas);
 	allocated = mas_allocated(&mas);
 	MT_BUG_ON(mt, allocated != 0);
 
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
@@ -35410,21 +35410,21 @@ static noinline void check_prealloc(struct maple_tree *mt)
 	mas_store_prealloc(&mas, ptr);
 	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
 
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
 	MT_BUG_ON(mt, allocated != 1 + height * 3);
 	mas_store_prealloc(&mas, ptr);
 	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
 	MT_BUG_ON(mt, allocated != 1 + height * 3);
 	mas_store_prealloc(&mas, ptr);
 
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
@@ -35432,14 +35432,14 @@ static noinline void check_prealloc(struct maple_tree *mt)
 	mas_store_prealloc(&mas, ptr);
 	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
 	mt_set_non_kernel(1);
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated != 0);
 	mas_destroy(&mas);
 
 
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated == 0);
@@ -35447,7 +35447,7 @@ static noinline void check_prealloc(struct maple_tree *mt)
 	mas_store_prealloc(&mas, ptr);
 	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
 	mt_set_non_kernel(1);
-	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0);
+	MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
 	MT_BUG_ON(mt, allocated != 0);
-- 
cgit 


From 2973d8229b78d3f148e0c45916a1e8b237dc6167 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 13 Jan 2023 11:12:17 +0000
Subject: mm: discard __GFP_ATOMIC

__GFP_ATOMIC serves little purpose.  Its main effect is to set
ALLOC_HARDER which adds a few little boosts to increase the chance of an
allocation succeeding, one of which is to lower the water-mark at which it
will succeed.

It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH which also
adjusts this watermark.  It is probable that other users of __GFP_HIGH
should benefit from the other little bonuses that __GFP_ATOMIC gets.

__GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM.
There is little point to this.  We already get a might_sleep() warning if
__GFP_DIRECT_RECLAIM is set.

__GFP_ATOMIC allows the "watermark_boost" to be side-stepped.  It is
probable that testing ALLOC_HARDER is a better fit here.

__GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might
sleep.  This should test __GFP_DIRECT_RECLAIM instead.

This patch:
 - removes __GFP_ATOMIC
 - allows __GFP_HIGH allocations to ignore watermark boosting as well
   as GFP_ATOMIC requests.
 - makes other adjustments as suggested by the above.

The net result is not change to GFP_ATOMIC allocations.  Other
allocations that use __GFP_HIGH will benefit from a few different extra
privileges.  This affects:
  xen, dm, md, ntfs3
  the vermillion frame buffer
  hibernation
  ksm
  swap
all of which likely produce more benefit than cost if these selected
allocation are more likely to succeed quickly.

[mgorman: Minor adjustments to rework on top of a series]
Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name
Link: https://lkml.kernel.org/r/20230113111217.14134-7-mgorman@techsingularity.net
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Thierry Reding <thierry.reding@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/balance.rst   |  2 +-
 drivers/iommu/tegra-smmu.c     |  4 ++--
 include/linux/gfp_types.h      | 12 ++++--------
 include/trace/events/mmflags.h |  1 -
 lib/test_printf.c              |  8 ++++----
 mm/internal.h                  |  2 +-
 mm/page_alloc.c                | 13 +++----------
 tools/perf/builtin-kmem.c      |  1 -
 8 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'tools')

diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst
index 6a1fadf3e173..e38e9d83c1c7 100644
--- a/Documentation/mm/balance.rst
+++ b/Documentation/mm/balance.rst
@@ -6,7 +6,7 @@ Memory Balancing
 
 Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
 
-Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as
 well as for non __GFP_IO allocations.
 
 The first reason why a caller may avoid reclaim is that the caller can not
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 5b1af40221ec..af8d0e685260 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
 	 * allocate page in a sleeping context if GFP flags permit. Hence
 	 * spinlock needs to be unlocked and re-locked after allocation.
 	 */
-	if (!(gfp & __GFP_ATOMIC))
+	if (gfpflags_allow_blocking(gfp))
 		spin_unlock_irqrestore(&as->lock, *flags);
 
 	page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO);
 
-	if (!(gfp & __GFP_ATOMIC))
+	if (gfpflags_allow_blocking(gfp))
 		spin_lock_irqsave(&as->lock, *flags);
 
 	/*
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index d88c46ca82e1..5088637fe5c2 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t;
 #define ___GFP_IO		0x40u
 #define ___GFP_FS		0x80u
 #define ___GFP_ZERO		0x100u
-#define ___GFP_ATOMIC		0x200u
+/* 0x200u unused */
 #define ___GFP_DIRECT_RECLAIM	0x400u
 #define ___GFP_KSWAPD_RECLAIM	0x800u
 #define ___GFP_WRITE		0x1000u
@@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t;
  *
  * %__GFP_HIGH indicates that the caller is high-priority and that granting
  * the request is necessary before the system can make forward progress.
- * For example, creating an IO context to clean pages.
- *
- * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
- * high priority. Users are typically interrupt handlers. This may be
- * used in conjunction with %__GFP_HIGH
+ * For example creating an IO context to clean pages and requests
+ * from atomic context.
  *
  * %__GFP_MEMALLOC allows access to all memory. This should only be used when
  * the caller guarantees the allocation will allow more memory to be freed
@@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t;
  * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
  * This takes precedence over the %__GFP_MEMALLOC flag if both are set.
  */
-#define __GFP_ATOMIC	((__force gfp_t)___GFP_ATOMIC)
 #define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)
 #define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)
 #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
@@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t;
  * version does not attempt reclaim/compaction at all and is by default used
  * in page fault path, while the non-light is used by khugepaged.
  */
-#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_ATOMIC	(__GFP_HIGH|__GFP_KSWAPD_RECLAIM)
 #define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
 #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
 #define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 412b5a46374c..9db52bc4ce19 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -31,7 +31,6 @@
 	gfpflag_string(__GFP_HIGHMEM),		\
 	gfpflag_string(GFP_DMA32),		\
 	gfpflag_string(__GFP_HIGH),		\
-	gfpflag_string(__GFP_ATOMIC),		\
 	gfpflag_string(__GFP_IO),		\
 	gfpflag_string(__GFP_FS),		\
 	gfpflag_string(__GFP_NOWARN),		\
diff --git a/lib/test_printf.c b/lib/test_printf.c
index d34dc636b81c..46b4e6c414a3 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -674,17 +674,17 @@ flags(void)
 	gfp = GFP_ATOMIC|__GFP_DMA;
 	test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp);
 
-	gfp = __GFP_ATOMIC;
-	test("__GFP_ATOMIC", "%pGg", &gfp);
+	gfp = __GFP_HIGH;
+	test("__GFP_HIGH", "%pGg", &gfp);
 
 	/* Any flags not translated by the table should remain numeric */
 	gfp = ~__GFP_BITS_MASK;
 	snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp);
 	test(cmp_buffer, "%pGg", &gfp);
 
-	snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx",
+	snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx",
 							(unsigned long) gfp);
-	gfp |= __GFP_ATOMIC;
+	gfp |= __GFP_HIGH;
 	test(cmp_buffer, "%pGg", &gfp);
 
 	kfree(cmp_buffer);
diff --git a/mm/internal.h b/mm/internal.h
index b0b88a95347f..2d09a7a0600a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ struct folio_batch;
 #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
 			__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
 			__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
-			__GFP_ATOMIC|__GFP_NOLOCKDEP)
+			__GFP_NOLOCKDEP)
 
 /* The GFP flags allowed during early boot */
 #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18ca33a1945d..0cfad30fb44c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4105,13 +4105,14 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
 	if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
 					free_pages))
 		return true;
+
 	/*
-	 * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+	 * Ignore watermark boosting for __GFP_HIGH order-0 allocations
 	 * when checking the min watermark. The min watermark is the
 	 * point where boosting is ignored so that kswapd is woken up
 	 * when below the low watermark.
 	 */
-	if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+	if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
 		&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
 		mark = z->_watermark[WMARK_MIN];
 		return __zone_watermark_ok(z, order, mark, highest_zoneidx,
@@ -5076,14 +5077,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned int zonelist_iter_cookie;
 	int reserve_flags;
 
-	/*
-	 * We also sanity check to catch abuse of atomic reserves being used by
-	 * callers that are not in atomic context.
-	 */
-	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
-				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
-		gfp_mask &= ~__GFP_ATOMIC;
-
 restart:
 	compaction_retries = 0;
 	no_progress_loops = 0;
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 8ae0a1535293..f3029742b800 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -653,7 +653,6 @@ static const struct {
 	{ "__GFP_HIGHMEM",		"HM" },
 	{ "GFP_DMA32",			"D32" },
 	{ "__GFP_HIGH",			"H" },
-	{ "__GFP_ATOMIC",		"_A" },
 	{ "__GFP_IO",			"I" },
 	{ "__GFP_FS",			"F" },
 	{ "__GFP_NOWARN",		"NWR" },
-- 
cgit 


From e6d2c436ff693869e83a65e61643b922e193e162 Mon Sep 17 00:00:00 2001
From: "Herton R. Krzesinski" <herton@redhat.com>
Date: Mon, 16 Jan 2023 19:49:21 -0300
Subject: tools/mm: allow users to provide additional cflags/ldflags

Right now there is no way to provide additional cflags/ldflags when
building tools/vm binaries.  And using eg.  make CFLAGS=<options> will
override the CFLAGS being set in the Makefile, making the build fail since
it requires the include of the ../lib dir (for libapi).

This change then allows you to specify:
CFLAGS=<options> LDFLAGS=<options> make V=1 -C tools/vm

And the options will be correctly appended as can be seen from the
make output.

Link: https://lkml.kernel.org/r/20230116224921.4106324-1-herton@redhat.com
Signed-off-by: Herton R. Krzesinski <herton@redhat.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Justin Forbes <jforbes@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Scott Weaver <scweaver@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/mm/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/mm/Makefile b/tools/mm/Makefile
index 9860622cbb15..6c1da51f4177 100644
--- a/tools/mm/Makefile
+++ b/tools/mm/Makefile
@@ -8,8 +8,8 @@ TARGETS=page-types slabinfo page_owner_sort
 LIB_DIR = ../lib/api
 LIBS = $(LIB_DIR)/libapi.a
 
-CFLAGS = -Wall -Wextra -I../lib/
-LDFLAGS = $(LIBS)
+CFLAGS += -Wall -Wextra -I../lib/
+LDFLAGS += $(LIBS)
 
 all: $(TARGETS)
 
-- 
cgit 


From 4cf1fe34fd18b752ae2403927277715d4444f331 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 19 Jan 2023 16:03:44 +0000
Subject: kselftest: vm: add tests for memory-deny-write-execute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add some tests to cover the new PR_SET_MDWE prctl.

Link: https://lkml.kernel.org/r/20230119160344.54358-3-joey.gouly@arm.com
Co-developed-by: Joey Gouly <joey.gouly@arm.com>
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jeremy Linton <jeremy.linton@arm.com>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Mark Brown <broonie@kernel.org>
Cc: nd <nd@arm.com>
Cc: Szabolcs Nagy <szabolcs.nagy@arm.com>
Cc: Topi Miettinen <toiwoton@gmail.com>
Cc: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/Makefile    |   1 +
 tools/testing/selftests/mm/mdwe_test.c | 197 +++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 tools/testing/selftests/mm/mdwe_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 0a44d77f8437..d90cdc06aa59 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -60,6 +60,7 @@ TEST_GEN_PROGS += soft-dirty
 TEST_GEN_PROGS += split_huge_page_test
 TEST_GEN_FILES += ksm_tests
 TEST_GEN_PROGS += ksm_functional_tests
+TEST_GEN_PROGS += mdwe_test
 
 ifeq ($(MACHINE),x86_64)
 CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c
new file mode 100644
index 000000000000..f466a099f1bf
--- /dev/null
+++ b/tools/testing/selftests/mm/mdwe_test.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifdef __aarch64__
+#include <asm/hwcap.h>
+#endif
+
+#include <linux/mman.h>
+#include <linux/prctl.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#ifndef __aarch64__
+# define PROT_BTI	0
+#endif
+
+TEST(prctl_flags)
+{
+	EXPECT_LT(prctl(PR_SET_MDWE, 7L, 0L, 0L, 0L), 0);
+	EXPECT_LT(prctl(PR_SET_MDWE, 0L, 7L, 0L, 0L), 0);
+	EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 7L, 0L), 0);
+	EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 0L, 7L), 0);
+
+	EXPECT_LT(prctl(PR_GET_MDWE, 7L, 0L, 0L, 0L), 0);
+	EXPECT_LT(prctl(PR_GET_MDWE, 0L, 7L, 0L, 0L), 0);
+	EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 7L, 0L), 0);
+	EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 0L, 7L), 0);
+}
+
+FIXTURE(mdwe)
+{
+	void *p;
+	int flags;
+	size_t size;
+	pid_t pid;
+};
+
+FIXTURE_VARIANT(mdwe)
+{
+	bool enabled;
+	bool forked;
+};
+
+FIXTURE_VARIANT_ADD(mdwe, stock)
+{
+        .enabled = false,
+	.forked = false,
+};
+
+FIXTURE_VARIANT_ADD(mdwe, enabled)
+{
+        .enabled = true,
+	.forked = false,
+};
+
+FIXTURE_VARIANT_ADD(mdwe, forked)
+{
+        .enabled = true,
+	.forked = true,
+};
+
+FIXTURE_SETUP(mdwe)
+{
+	int ret, status;
+
+	self->p = NULL;
+	self->flags = MAP_SHARED | MAP_ANONYMOUS;
+	self->size = getpagesize();
+
+	if (!variant->enabled)
+		return;
+
+	ret = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L);
+	ASSERT_EQ(ret, 0) {
+		TH_LOG("PR_SET_MDWE failed or unsupported");
+	}
+
+	ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L);
+	ASSERT_EQ(ret, 1);
+
+	if (variant->forked) {
+		self->pid = fork();
+		ASSERT_GE(self->pid, 0) {
+			TH_LOG("fork failed\n");
+		}
+
+		if (self->pid > 0) {
+			ret = waitpid(self->pid, &status, 0);
+			ASSERT_TRUE(WIFEXITED(status));
+			exit(WEXITSTATUS(status));
+		}
+	}
+}
+
+FIXTURE_TEARDOWN(mdwe)
+{
+	if (self->p && self->p != MAP_FAILED)
+		munmap(self->p, self->size);
+}
+
+TEST_F(mdwe, mmap_READ_EXEC)
+{
+	self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0);
+	EXPECT_NE(self->p, MAP_FAILED);
+}
+
+TEST_F(mdwe, mmap_WRITE_EXEC)
+{
+	self->p = mmap(NULL, self->size, PROT_WRITE | PROT_EXEC, self->flags, 0, 0);
+	if (variant->enabled) {
+		EXPECT_EQ(self->p, MAP_FAILED);
+	} else {
+		EXPECT_NE(self->p, MAP_FAILED);
+	}
+}
+
+TEST_F(mdwe, mprotect_stay_EXEC)
+{
+	int ret;
+
+	self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC);
+	EXPECT_EQ(ret, 0);
+}
+
+TEST_F(mdwe, mprotect_add_EXEC)
+{
+	int ret;
+
+	self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC);
+	if (variant->enabled) {
+		EXPECT_LT(ret, 0);
+	} else {
+		EXPECT_EQ(ret, 0);
+	}
+}
+
+TEST_F(mdwe, mprotect_WRITE_EXEC)
+{
+	int ret;
+
+	self->p = mmap(NULL, self->size, PROT_WRITE, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	ret = mprotect(self->p, self->size, PROT_WRITE | PROT_EXEC);
+	if (variant->enabled) {
+		EXPECT_LT(ret, 0);
+	} else {
+		EXPECT_EQ(ret, 0);
+	}
+}
+
+TEST_F(mdwe, mmap_FIXED)
+{
+	void *p, *p2;
+
+	p2 = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0);
+	self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	p = mmap(self->p + self->size, self->size, PROT_READ | PROT_EXEC,
+		 self->flags | MAP_FIXED, 0, 0);
+	if (variant->enabled) {
+		EXPECT_EQ(p, MAP_FAILED);
+	} else {
+		EXPECT_EQ(p, self->p);
+	}
+}
+
+TEST_F(mdwe, arm64_BTI)
+{
+	int ret;
+
+#ifdef __aarch64__
+	if (!(getauxval(AT_HWCAP2) & HWCAP2_BTI))
+#endif
+		SKIP(return, "HWCAP2_BTI not supported");
+
+	self->p = mmap(NULL, self->size, PROT_EXEC, self->flags, 0, 0);
+	ASSERT_NE(self->p, MAP_FAILED);
+
+	ret = mprotect(self->p, self->size, PROT_EXEC | PROT_BTI);
+	EXPECT_EQ(ret, 0);
+}
+
+TEST_HARNESS_MAIN
-- 
cgit 


From 507fa17a6c46c111f6b0d2bc483c1b3563fd16c5 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Thu, 12 Jan 2023 14:43:33 +0800
Subject: tools/power/x86/intel-speed-select: Remove wrong check in
 set_isst_id()

struct isst_id *id is a pointer, comparing it with less than zero is wrong.

The check is there to make sure the id->pkg and id->die is set to -1, when
it is illegal or unavailable. Here comparing with MAX_PACKAGE_COUNT and
MAX_DIE_PER_PACKAGE is sufficient.

Hence remove the wrong check.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
[srinivas.pandruvada@linux.intel.com: Subject and changelog edits]
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index a160bad291eb..b5822ee67cb3 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -383,11 +383,11 @@ void set_isst_id(struct isst_id *id, int cpu)
 	id->cpu = cpu;
 
 	id->pkg = get_physical_package_id(cpu);
-	if (id < 0 || id->pkg >= MAX_PACKAGE_COUNT)
+	if (id->pkg >= MAX_PACKAGE_COUNT)
 		id->pkg = -1;
 
 	id->die = get_physical_die_id(cpu);
-	if (id < 0 || id->die >= MAX_DIE_PER_PACKAGE)
+	if (id->die >= MAX_DIE_PER_PACKAGE)
 		id->die = -1;
 }
 
-- 
cgit 


From b8bebc8e58d5dc8f2c528b0be4e858959c48e340 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Thu, 12 Jan 2023 14:43:34 +0800
Subject: tools/power/x86/intel-speed-select: Remove unused non_block flag

variable 'non_block' is always 0, thus remove the variable and the
handling for "non_block != 0" case.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/hfi-events.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/hfi-events.c b/tools/power/x86/intel-speed-select/hfi-events.c
index be96e90cc2a1..174b99d9f864 100644
--- a/tools/power/x86/intel-speed-select/hfi-events.c
+++ b/tools/power/x86/intel-speed-select/hfi-events.c
@@ -247,7 +247,6 @@ int hfi_main(void)
 	struct nl_cb *cb;
 	int err = 0;
 	int mcast_id;
-	int no_block = 0;
 
 	if (!check_hf_suport()) {
 		fprintf(stderr, "CPU Doesn't support HFI\n");
@@ -287,9 +286,6 @@ int hfi_main(void)
 	nl_cb_set(cb, NL_CB_SEQ_CHECK, NL_CB_CUSTOM, seq_check_handler, 0);
 	nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, handle_event, NULL);
 
-	if (no_block)
-		nl_socket_set_nonblocking(sock);
-
 	debug_printf("hfi is initialized\n");
 
 	while (!_hfi_exit && !err) {
-- 
cgit 


From 364ba3b7115087ab8960473a94cedc7336f345e2 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Thu, 12 Jan 2023 14:43:35 +0800
Subject: tools/power/x86/intel-speed-select: Handle open() failure case

Add handling for open() failure case to make sure a valid file
descriptor is passed to dup().

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-daemon.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-daemon.c b/tools/power/x86/intel-speed-select/isst-daemon.c
index 0699137c0901..4ad6a64f1545 100644
--- a/tools/power/x86/intel-speed-select/isst-daemon.c
+++ b/tools/power/x86/intel-speed-select/isst-daemon.c
@@ -174,6 +174,8 @@ static void daemonize(char *rundir, char *pidfile)
 		close(i);
 
 	i = open("/dev/null", O_RDWR);
+	if (i < 0)
+		exit(EXIT_FAILURE);
 	ret = dup(i);
 	if (ret == -1)
 		exit(EXIT_FAILURE);
-- 
cgit 


From 8a44d27542cd84352c69e37deaafb8b8976f21a0 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Thu, 12 Jan 2023 14:43:36 +0800
Subject: tools/power/x86/intel-speed-select: Remove duplicate dup()

Remove the duplicate dup() invocation.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-daemon.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-daemon.c b/tools/power/x86/intel-speed-select/isst-daemon.c
index 4ad6a64f1545..c2290ef0e3af 100644
--- a/tools/power/x86/intel-speed-select/isst-daemon.c
+++ b/tools/power/x86/intel-speed-select/isst-daemon.c
@@ -176,9 +176,6 @@ static void daemonize(char *rundir, char *pidfile)
 	i = open("/dev/null", O_RDWR);
 	if (i < 0)
 		exit(EXIT_FAILURE);
-	ret = dup(i);
-	if (ret == -1)
-		exit(EXIT_FAILURE);
 
 	ret = dup(i);
 	if (ret == -1)
-- 
cgit 


From 689dfc9e40036cb74f30c0700905b44b2c935846 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Thu, 12 Jan 2023 14:43:37 +0800
Subject: tools/power/x86/intel-speed-select: Use null-terminated string

strlen() and strtok() takes null-termimated strings as input.
Make sure these strings are null-terminated before using them.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index b5822ee67cb3..b549e6f0946d 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -1583,6 +1583,7 @@ static int set_cpufreq_scaling_min_max_from_cpuinfo(int cpu, int cpuinfo_max, in
 	if (fd < 0)
 		return fd;
 
+	min_freq[15] = '\0';
 	len = strlen(min_freq);
 	ret = write(fd, min_freq, len);
 	if (ret == -1) {
@@ -2015,6 +2016,7 @@ static void set_fact_enable(int arg)
 			if (len < 0)
 				continue;
 
+			sibling_list[127] = '\0';
 			cpu_str = strtok(sibling_list, ",");
 			while (cpu_str != NULL) {
 				int cpu;
-- 
cgit 


From cf3b8e8f55e1a6c747e89599695fb8783c8a69c2 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Fri, 6 Jan 2023 17:44:26 -0800
Subject: tools/power/x86/intel-speed-select: cpufreq reads on offline CPUs

Due to some recent kernel changes, reading cpufreq attributes like
scaling_max_freq on offline CPUs returns error. So avoid reading
cpufreq attributes on offline CPUs.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 33 ++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index b549e6f0946d..2a737bff79cd 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -413,6 +413,33 @@ int get_topo_max_cpus(void)
 	return topo_max_cpus;
 }
 
+static unsigned int is_cpu_online(int cpu)
+{
+	char buffer[128];
+	int fd, ret;
+	unsigned char online;
+
+	snprintf(buffer, sizeof(buffer),
+		 "/sys/devices/system/cpu/cpu%d/online", cpu);
+
+	fd = open(buffer, O_RDONLY);
+	if (fd < 0)
+		return fd;
+
+	ret = read(fd, &online, sizeof(online));
+	close(fd);
+
+	if (ret == -1)
+		return ret;
+
+	if (online == '1')
+		online = 1;
+	else
+		online = 0;
+
+	return online;
+}
+
 void set_cpu_online_offline(int cpu, int state)
 {
 	char buffer[128];
@@ -1603,6 +1630,9 @@ static void set_scaling_min_to_cpuinfo_max(struct isst_id *id)
 		if (!is_cpu_in_power_domain(i, id))
 			continue;
 
+		if (is_cpu_online(i) != 1)
+			continue;
+
 		adjust_scaling_max_from_base_freq(i);
 		set_cpufreq_scaling_min_max_from_cpuinfo(i, 1, 0);
 		adjust_scaling_min_from_base_freq(i);
@@ -1617,6 +1647,9 @@ static void set_scaling_min_to_cpuinfo_min(struct isst_id *id)
 		if (!is_cpu_in_power_domain(i, id))
 			continue;
 
+		if (is_cpu_online(i) != 1)
+			continue;
+
 		adjust_scaling_max_from_base_freq(i);
 		set_cpufreq_scaling_min_max_from_cpuinfo(i, 0, 0);
 	}
-- 
cgit 


From 6ed9e363157cb858bdbb8c595f04839d3f245414 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Fri, 6 Jan 2023 17:48:07 -0800
Subject: tools/power/x86/intel-speed-select: turbo-freq auto mode with SMT off

When SMT is disabled from kernel command line, sibling CPUs still
appears in the sysfs as offline CPUs. This is a problem when turbo-freq
is enabled in auto mode. They are still assigned to CLOS value
of 3 as they are still in the present CPU list. But they are not in the
sibling list of a CPU. When the CPU is a high priority CPU, because of
sibling it will be still set to CLOS to 3 as CLOS is assigned at core
level not at CPU level.

So, avoid setting CLOS 3 to offline CPU.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index 2a737bff79cd..5f5349faeb23 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -2066,6 +2066,9 @@ static void set_fact_enable(int arg)
 			if (!CPU_ISSET_S(i, present_cpumask_size, present_cpumask))
 				continue;
 
+			if (is_cpu_online(i) != 1)
+				continue;
+
 			set_isst_id(&id, i);
 			ret = set_clos_param(&id, 0, 0, 0, 0, 0xff);
 			if (ret)
-- 
cgit 


From 0d5eea3527e4618ee32fce91a5d54638a7f8c411 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Fri, 9 Dec 2022 11:23:02 -0800
Subject: tools/power/x86/intel-speed-select: Fix display of uncore min
 frequency

Uncore P1 is not uncore minmum frequency. This is uncore base frequency.
Correct display from uncore-frequency-min(MHz)
to uncore-frequency-base(Mhz).

To get uncore min frequency use mailbox command
CONFIG_TDP_GET_RATIO_INFO. Use this mailbox to get uncore frequency
limits when present.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-core.c    | 23 +++++++++++++++++++++++
 tools/power/x86/intel-speed-select/isst-display.c | 11 +++++++++--
 tools/power/x86/intel-speed-select/isst.h         |  4 ++++
 3 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c
index f701b45c832c..2bfc118c4b87 100644
--- a/tools/power/x86/intel-speed-select/isst-core.c
+++ b/tools/power/x86/intel-speed-select/isst-core.c
@@ -156,6 +156,29 @@ void isst_get_uncore_p0_p1_info(struct isst_id *id, int config_index,
 {
 	unsigned int resp;
 	int ret;
+
+	ctdp_level->uncore_pm = 0;
+	ctdp_level->uncore_p0 = 0;
+	ctdp_level->uncore_p1 = 0;
+
+	ret = isst_send_mbox_command(id->cpu, CONFIG_TDP,
+				     CONFIG_TDP_GET_RATIO_INFO, 0,
+				     (BIT(16) | config_index), &resp);
+	if (ret)
+		goto try_uncore_mbox;
+
+	ctdp_level->uncore_p0 = resp & GENMASK(7, 0);
+	ctdp_level->uncore_p1 = (resp & GENMASK(15, 8)) >> 8;
+	ctdp_level->uncore_pm = (resp & GENMASK(31, 24)) >> 24;
+
+	debug_printf(
+		"cpu:%d ctdp:%d CONFIG_TDP_GET_RATIO_INFO resp:%x uncore p0:%d uncore p1:%d uncore pm:%d\n",
+		id->cpu, config_index, resp, ctdp_level->uncore_p0, ctdp_level->uncore_p1,
+		ctdp_level->uncore_pm);
+
+	return;
+
+try_uncore_mbox:
 	ret = isst_send_mbox_command(id->cpu, CONFIG_TDP,
 				     CONFIG_TDP_GET_UNCORE_P0_P1_INFO, 0,
 				     config_index, &resp);
diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c
index b19f57d30f55..7feadac04a6f 100644
--- a/tools/power/x86/intel-speed-select/isst-display.c
+++ b/tools/power/x86/intel-speed-select/isst-display.c
@@ -423,10 +423,10 @@ void isst_ctdp_display_information(struct isst_id *id, FILE *outf, int tdp_level
 			format_and_print(outf, level + 2, header, value);
 		}
 
-		if (ctdp_level->uncore_p1) {
+		if (ctdp_level->uncore_pm) {
 			snprintf(header, sizeof(header), "uncore-frequency-min(MHz)");
 			snprintf(value, sizeof(value), "%d",
-				 ctdp_level->uncore_p1 * DISP_FREQ_MULTIPLIER);
+				 ctdp_level->uncore_pm * DISP_FREQ_MULTIPLIER);
 			format_and_print(outf, level + 2, header, value);
 		}
 
@@ -437,6 +437,13 @@ void isst_ctdp_display_information(struct isst_id *id, FILE *outf, int tdp_level
 			format_and_print(outf, level + 2, header, value);
 		}
 
+		if (ctdp_level->uncore_p1) {
+			snprintf(header, sizeof(header), "uncore-frequency-base(MHz)");
+			snprintf(value, sizeof(value), "%d",
+				 ctdp_level->uncore_p1 * DISP_FREQ_MULTIPLIER);
+			format_and_print(outf, level + 2, header, value);
+		}
+
 		if (ctdp_level->mem_freq) {
 			snprintf(header, sizeof(header), "mem-frequency(MHz)");
 			snprintf(value, sizeof(value), "%d",
diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h
index 409fcc9c8033..824876e31e23 100644
--- a/tools/power/x86/intel-speed-select/isst.h
+++ b/tools/power/x86/intel-speed-select/isst.h
@@ -47,6 +47,7 @@
 #define CONFIG_TDP_GET_UNCORE_P0_P1_INFO	0X09
 #define CONFIG_TDP_GET_P1_INFO			0x0a
 #define CONFIG_TDP_GET_MEM_FREQ			0x0b
+#define	CONFIG_TDP_GET_RATIO_INFO		0x0c
 
 #define CONFIG_TDP_GET_FACT_HP_TURBO_LIMIT_NUMCORES	0x10
 #define CONFIG_TDP_GET_FACT_HP_TURBO_LIMIT_RATIOS	0x11
@@ -144,6 +145,7 @@ struct isst_pkg_ctdp_level_info {
 	int t_proc_hot;
 	int uncore_p0;
 	int uncore_p1;
+	int uncore_pm;
 	int sse_p1;
 	int avx2_p1;
 	int avx512_p1;
@@ -208,6 +210,8 @@ extern int isst_get_ctdp_control(struct isst_id *id, int config_index,
 				 struct isst_pkg_ctdp_level_info *ctdp_level);
 extern int isst_get_coremask_info(struct isst_id *id, int config_index,
 			   struct isst_pkg_ctdp_level_info *ctdp_level);
+extern void isst_get_uncore_p0_p1_info(struct isst_id *id, int config_index,
+					struct isst_pkg_ctdp_level_info *ctdp_level);
 extern int isst_get_process_ctdp(struct isst_id *id, int tdp_level,
 				 struct isst_pkg_ctdp *pkg_dev);
 extern void isst_get_process_ctdp_complete(struct isst_id *id,
-- 
cgit 


From 61f9fdcdcd01f9a996b6db4e7092fcdfe8414ad5 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 24 Aug 2022 15:44:42 +0800
Subject: tools/power/x86/intel-speed-select: Add Emerald Rapid quirk

Need memory frequency quirk as Sapphire Rapids in Emerald Rapids.
So add Emerald Rapids CPU model check in is_spr_platform().

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
[srinivas.pandruvada@linux.intel.com: Subject, changelog and code edits]
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index 5f5349faeb23..9f201189a06d 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -110,7 +110,7 @@ int is_skx_based_platform(void)
 
 int is_spr_platform(void)
 {
-	if (cpu_model == 0x8F)
+	if (cpu_model == 0x8F || cpu_model == 0xCF)
 		return 1;
 
 	return 0;
-- 
cgit 


From 2612ae596129ad9792b9cd5bf5937614edca185f Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 17 Jan 2023 14:34:27 -0800
Subject: tools/power/x86/intel-speed-select: Adjust uncore max/min frequency

When perf level is changed, uncore limits can change. Set the uncore
limits via Linux uncore sysfs, when user changes perf level with
-o option.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 36 ++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index 9f201189a06d..15bee7f52044 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -1319,6 +1319,34 @@ static void dump_isst_config(int arg)
 	isst_ctdp_display_information_end(outf);
 }
 
+static int set_uncore_min_max(struct isst_id *id, int max, int freq)
+{
+	char buffer[128], freq_str[16];
+	int fd, ret, len;
+
+	if (max)
+		snprintf(buffer, sizeof(buffer),
+			 "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/max_freq_khz", id->pkg, id->die);
+	else
+		snprintf(buffer, sizeof(buffer),
+			 "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/min_freq_khz", id->pkg, id->die);
+
+	fd = open(buffer, O_WRONLY);
+	if (fd < 0)
+		return fd;
+
+	snprintf(freq_str, sizeof(freq_str), "%d", freq);
+	len = strlen(freq_str);
+	ret = write(fd, freq_str, len);
+	if (ret == -1) {
+		close(fd);
+		return ret;
+	}
+	close(fd);
+
+	return 0;
+}
+
 static void adjust_scaling_max_from_base_freq(int cpu);
 
 static void set_tdp_level_for_cpu(struct isst_id *id, void *arg1, void *arg2, void *arg3,
@@ -1340,6 +1368,14 @@ static void set_tdp_level_for_cpu(struct isst_id *id, void *arg1, void *arg2, vo
 			/* Wait for updated base frequencies */
 			usleep(2000);
 
+			/* Adjusting uncore freq */
+			isst_get_uncore_p0_p1_info(id, tdp_level, &ctdp_level);
+			if (ctdp_level.uncore_pm)
+				set_uncore_min_max(id, 0, ctdp_level.uncore_pm * 100000);
+
+			if (ctdp_level.uncore_p0)
+				set_uncore_min_max(id, 1, ctdp_level.uncore_p0 * 100000);
+
 			fprintf(stderr, "Option is set to online/offline\n");
 			ctdp_level.core_cpumask_size =
 				alloc_cpu_set(&ctdp_level.core_cpumask);
-- 
cgit 


From d1fcb7493fc36a192b19fb8b046c416d05b6d6fe Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 17 Jan 2023 14:40:34 -0800
Subject: tools/power/x86/intel-speed-select: v1.14 release

This release adds following change:
- Minor fixes for coverity static analysis
- Don't read cpufreq on offline CPUs
- SST turbo-freq enable on auto mode when user disables SMT from
kernel command line
- Fix uncore frequency display
- Set uncore frequency max/min limits on perf level change

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index 15bee7f52044..55d0a35df41c 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -15,7 +15,7 @@ struct process_cmd_struct {
 	int arg;
 };
 
-static const char *version_str = "v1.13";
+static const char *version_str = "v1.14";
 
 static const int supported_api_ver = 1;
 static struct isst_if_platform_info isst_platform_info;
-- 
cgit 


From 6043829fdb714f050ba5117044dbd1384865286c Mon Sep 17 00:00:00 2001
From: Shaoqin Huang <shahuang@redhat.com>
Date: Fri, 3 Feb 2023 14:10:36 +0800
Subject: KVM: selftests: Remove redundant setbuf()

Since setbuf(stdout, NULL) has been called in kvm_util.c with
__attribute((constructor)). Selftests no need to setup it in their own
code.

Signed-off-by: Shaoqin Huang <shahuang@redhat.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20230203061038.277655-1-shahuang@redhat.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 tools/testing/selftests/kvm/aarch64/page_fault_test.c               | 2 --
 tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c | 3 ---
 2 files changed, 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
index beb944fa6fd4..513b20bec3c2 100644
--- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c
+++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
@@ -1093,8 +1093,6 @@ int main(int argc, char *argv[])
 	enum vm_mem_backing_src_type src_type;
 	int opt;
 
-	setbuf(stdout, NULL);
-
 	src_type = DEFAULT_VM_MEM_SRC;
 
 	while ((opt = getopt(argc, argv, "hm:s:")) != -1) {
diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c
index 37c61f712fd5..e334844d6e1d 100644
--- a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c
+++ b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c
@@ -26,9 +26,6 @@ int main(int argc, char *argv[])
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 
-	/* Tell stdout not to buffer its content */
-	setbuf(stdout, NULL);
-
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE));
 
 	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
-- 
cgit 


From 17c9b4e1a7d14719378c3eedefc2960018f5a7bb Mon Sep 17 00:00:00 2001
From: Florian Lehner <dev@der-flo.net>
Date: Fri, 3 Feb 2023 13:14:39 +0100
Subject: bpf: fix typo in header for bpf_perf_prog_read_value

Fix a simple typo in the documentation for bpf_perf_prog_read_value.

Signed-off-by: Florian Lehner <dev@der-flo.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20230203121439.25884-1-dev@der-flo.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/uapi/linux/bpf.h       | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ba0f0cfb5e42..17afd2b35ee5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2801,7 +2801,7 @@ union bpf_attr {
  *
  * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
  * 	Description
- * 		For en eBPF program attached to a perf event, retrieve the
+ * 		For an eBPF program attached to a perf event, retrieve the
  * 		value of the event counter associated to *ctx* and store it in
  * 		the structure pointed by *buf* and of size *buf_size*. Enabled
  * 		and running times are also stored in the structure (see
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ba0f0cfb5e42..17afd2b35ee5 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2801,7 +2801,7 @@ union bpf_attr {
  *
  * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
  * 	Description
- * 		For en eBPF program attached to a perf event, retrieve the
+ * 		For an eBPF program attached to a perf event, retrieve the
  * 		value of the event counter associated to *ctx* and store it in
  * 		the structure pointed by *buf* and of size *buf_size*. Enabled
  * 		and running times are also stored in the structure (see
-- 
cgit 


From 69218b59be20689cc62ee87cd2890c58c41bae15 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 3 Feb 2023 19:52:47 +0000
Subject: kselftest/alsa: Run PCM tests for multiple cards in parallel

With each test taking 4 seconds the runtime of pcm-test can add up. Since
generally each card in the system is physically independent and will be
unaffected by what's going on with other cards we can mitigate this by
testing each card in parallel. Make a list of cards as we enumerate the
system and then start a thread for each, then join the threads to ensure
they have all finished. The threads each run the same tests we currently
run for each PCM on the card before exiting.

The list of PCMs is kept global since it helps with global operations
like working out our planned number of tests and identifying missing PCMs
and it seemed neater to check for PCMs on the right card in the card
thread than make every PCM loop iterate over cards as well.

We don't run per-PCM tests in parallel since in embedded systems it can
be the case that resources are shared between the PCMs and operations on
one PCM on a card may constrain what can be done on another PCM on the same
card leading to potentially unstable results.

We use a mutex to ensure that the reporting of results is serialised and we
don't have issues with anything like the current test number, we could do
this in the kselftest framework but it seems like this might cause problems
for other tests that are doing lower level testing and building in
constrained environments such as nolibc so this seems more sensible.

Note that the ordering of the tests can't be guaranteed as things stand,
this does not seem like a major problem since the numbering of tests often
changes as test programs are changed so results parsers are expected to
rely on the test name rather than the test numbers. We also now prefix the
machine generated test name when printing the description of the test since
this is logged before streaming starts.

On my two card desktop system this reduces the overall runtime by a
third.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230203-alsa-pcm-test-card-thread-v1-1-59941640ebba@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/Makefile   |  2 +
 tools/testing/selftests/alsa/pcm-test.c | 80 +++++++++++++++++++++++++++++----
 2 files changed, 74 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile
index 77fba3e498cc..901949db80ad 100644
--- a/tools/testing/selftests/alsa/Makefile
+++ b/tools/testing/selftests/alsa/Makefile
@@ -8,6 +8,8 @@ LDLIBS += -lasound
 endif
 CFLAGS += -L$(OUTPUT) -Wl,-rpath=./
 
+LDLIBS+=-lpthread
+
 OVERRIDE_TARGETS = 1
 
 TEST_GEN_PROGS := mixer-test pcm-test
diff --git a/tools/testing/selftests/alsa/pcm-test.c b/tools/testing/selftests/alsa/pcm-test.c
index 57d3f6dcb46b..58b525a4a32c 100644
--- a/tools/testing/selftests/alsa/pcm-test.c
+++ b/tools/testing/selftests/alsa/pcm-test.c
@@ -15,12 +15,21 @@
 #include <stdbool.h>
 #include <errno.h>
 #include <assert.h>
+#include <pthread.h>
 
 #include "../kselftest.h"
 #include "alsa-local.h"
 
 typedef struct timespec timestamp_t;
 
+struct card_data {
+	int card;
+	pthread_t thread;
+	struct card_data *next;
+};
+
+struct card_data *card_list = NULL;
+
 struct pcm_data {
 	snd_pcm_t *handle;
 	int card;
@@ -36,6 +45,11 @@ struct pcm_data *pcm_list = NULL;
 int num_missing = 0;
 struct pcm_data *pcm_missing = NULL;
 
+snd_config_t *default_pcm_config;
+
+/* Lock while reporting results since kselftest doesn't */
+pthread_mutex_t results_lock = PTHREAD_MUTEX_INITIALIZER;
+
 enum test_class {
 	TEST_CLASS_DEFAULT,
 	TEST_CLASS_SYSTEM,
@@ -141,6 +155,7 @@ static void find_pcms(void)
 	snd_ctl_t *handle;
 	snd_pcm_info_t *pcm_info;
 	snd_config_t *config, *card_config, *pcm_config;
+	struct card_data *card_data;
 
 	snd_pcm_info_alloca(&pcm_info);
 
@@ -162,6 +177,13 @@ static void find_pcms(void)
 
 		card_config = conf_by_card(card);
 
+		card_data = calloc(1, sizeof(*card_data));
+		if (!card_data)
+			ksft_exit_fail_msg("Out of memory\n");
+		card_data->card = card;
+		card_data->next = card_list;
+		card_list = card_data;
+
 		dev = -1;
 		while (1) {
 			if (snd_ctl_pcm_next_device(handle, &dev) < 0)
@@ -246,10 +268,6 @@ static void test_pcm_time(struct pcm_data *data, enum test_class class,
 	bool skip = true;
 	const char *desc;
 
-	desc = conf_get_string(pcm_cfg, "description", NULL, NULL);
-	if (desc)
-		ksft_print_msg("%s\n", desc);
-
 	switch (class) {
 	case TEST_CLASS_DEFAULT:
 		test_class_name = "default";
@@ -262,6 +280,15 @@ static void test_pcm_time(struct pcm_data *data, enum test_class class,
 		break;
 	}
 
+	desc = conf_get_string(pcm_cfg, "description", NULL, NULL);
+	if (desc)
+		ksft_print_msg("%s.%s.%d.%d.%d.%s - %s\n",
+			       test_class_name, test_name,
+			       data->card, data->device, data->subdevice,
+			       snd_pcm_stream_name(data->stream),
+			       desc);
+
+
 	snd_pcm_hw_params_alloca(&hw_params);
 	snd_pcm_sw_params_alloca(&sw_params);
 
@@ -443,6 +470,8 @@ __format:
 	msg[0] = '\0';
 	pass = true;
 __close:
+	pthread_mutex_lock(&results_lock);
+
 	switch (class) {
 	case TEST_CLASS_SYSTEM:
 		test_class_name = "system";
@@ -471,6 +500,9 @@ __close:
 				 data->card, data->device, data->subdevice,
 				 snd_pcm_stream_name(data->stream),
 				 msg[0] ? " " : "", msg);
+
+	pthread_mutex_unlock(&results_lock);
+
 	free(samples);
 	if (handle)
 		snd_pcm_close(handle);
@@ -502,11 +534,30 @@ void run_time_tests(struct pcm_data *pcm, enum test_class class,
 	}
 }
 
+void *card_thread(void *data)
+{
+	struct card_data *card = data;
+	struct pcm_data *pcm;
+
+	for (pcm = pcm_list; pcm != NULL; pcm = pcm->next) {
+		if (pcm->card != card->card)
+			continue;
+
+		run_time_tests(pcm, TEST_CLASS_DEFAULT, default_pcm_config);
+		run_time_tests(pcm, TEST_CLASS_SYSTEM, pcm->pcm_config);
+	}
+
+	return 0;
+}
+
 int main(void)
 {
+	struct card_data *card;
 	struct pcm_data *pcm;
-	snd_config_t *global_config, *default_pcm_config, *cfg, *pcm_cfg;
+	snd_config_t *global_config, *cfg, *pcm_cfg;
 	int num_pcm_tests = 0, num_tests, num_std_pcm_tests;
+	int ret;
+	void *thread_ret;
 
 	ksft_print_header();
 
@@ -540,9 +591,22 @@ int main(void)
 				 snd_pcm_stream_name(pcm->stream));
 	}
 
-	for (pcm = pcm_list; pcm != NULL; pcm = pcm->next) {
-		run_time_tests(pcm, TEST_CLASS_DEFAULT, default_pcm_config);
-		run_time_tests(pcm, TEST_CLASS_SYSTEM, pcm->pcm_config);
+	for (card = card_list; card != NULL; card = card->next) {
+		ret = pthread_create(&card->thread, NULL, card_thread, card);
+		if (ret != 0) {
+			ksft_exit_fail_msg("Failed to create card %d thread: %d (%s)\n",
+					   card->card, ret,
+					   strerror(errno));
+		}
+	}
+
+	for (card = card_list; card != NULL; card = card->next) {
+		ret = pthread_join(card->thread, &thread_ret);
+		if (ret != 0) {
+			ksft_exit_fail_msg("Failed to join card %d thread: %d (%s)\n",
+					   card->card, ret,
+					   strerror(errno));
+		}
 	}
 
 	snd_config_delete(global_config);
-- 
cgit 


From 344dd2c9e74373cca454dcea4b62be50adc35939 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 2 Feb 2023 18:59:27 +0100
Subject: selftests: forwarding: Move IGMP- and MLD-related functions to lib

These functions will be helpful for other testsuites as well. Extract them
to a common place.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../testing/selftests/net/forwarding/bridge_mdb.sh | 49 ----------------------
 tools/testing/selftests/net/forwarding/lib.sh      | 49 ++++++++++++++++++++++
 2 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
index 2fa5973c0c28..51f2b0d77067 100755
--- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
@@ -1018,26 +1018,6 @@ fwd_test()
 	ip -6 address del fe80::1/64 dev br0
 }
 
-igmpv3_is_in_get()
-{
-	local igmpv3
-
-	igmpv3=$(:
-		)"22:"$(			: Type - Membership Report
-		)"00:"$(			: Reserved
-		)"2a:f8:"$(			: Checksum
-		)"00:00:"$(			: Reserved
-		)"00:01:"$(			: Number of Group Records
-		)"01:"$(			: Record Type - IS_IN
-		)"00:"$(			: Aux Data Len
-		)"00:01:"$(			: Number of Sources
-		)"ef:01:01:01:"$(		: Multicast Address - 239.1.1.1
-		)"c0:00:02:02"$(		: Source Address - 192.0.2.2
-		)
-
-	echo $igmpv3
-}
-
 ctrl_igmpv3_is_in_test()
 {
 	RET=0
@@ -1077,35 +1057,6 @@ ctrl_igmpv3_is_in_test()
 	log_test "IGMPv3 MODE_IS_INCLUE tests"
 }
 
-mldv2_is_in_get()
-{
-	local hbh
-	local icmpv6
-
-	hbh=$(:
-		)"3a:"$(			: Next Header - ICMPv6
-		)"00:"$(			: Hdr Ext Len
-		)"00:00:00:00:00:00:"$(		: Options and Padding
-		)
-
-	icmpv6=$(:
-		)"8f:"$(			: Type - MLDv2 Report
-		)"00:"$(			: Code
-		)"45:39:"$(			: Checksum
-		)"00:00:"$(			: Reserved
-		)"00:01:"$(			: Number of Group Records
-		)"01:"$(			: Record Type - IS_IN
-		)"00:"$(			: Aux Data Len
-		)"00:01:"$(			: Number of Sources
-		)"ff:0e:00:00:00:00:00:00:"$(	: Multicast address - ff0e::1
-		)"00:00:00:00:00:00:00:01:"$(	:
-		)"20:01:0d:b8:00:01:00:00:"$(	: Source Address - 2001:db8:1::2
-		)"00:00:00:00:00:00:00:02:"$(	:
-		)
-
-	echo ${hbh}${icmpv6}
-}
-
 ctrl_mldv2_is_in_test()
 {
 	RET=0
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 29cd4705c752..736f56098cc7 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1671,3 +1671,52 @@ hw_stats_monitor_test()
 
 	log_test "${type}_stats notifications"
 }
+
+igmpv3_is_in_get()
+{
+	local igmpv3
+
+	igmpv3=$(:
+		)"22:"$(			: Type - Membership Report
+		)"00:"$(			: Reserved
+		)"2a:f8:"$(			: Checksum
+		)"00:00:"$(			: Reserved
+		)"00:01:"$(			: Number of Group Records
+		)"01:"$(			: Record Type - IS_IN
+		)"00:"$(			: Aux Data Len
+		)"00:01:"$(			: Number of Sources
+		)"ef:01:01:01:"$(		: Multicast Address - 239.1.1.1
+		)"c0:00:02:02"$(		: Source Address - 192.0.2.2
+		)
+
+	echo $igmpv3
+}
+
+mldv2_is_in_get()
+{
+	local hbh
+	local icmpv6
+
+	hbh=$(:
+		)"3a:"$(			: Next Header - ICMPv6
+		)"00:"$(			: Hdr Ext Len
+		)"00:00:00:00:00:00:"$(		: Options and Padding
+		)
+
+	icmpv6=$(:
+		)"8f:"$(			: Type - MLDv2 Report
+		)"00:"$(			: Code
+		)"45:39:"$(			: Checksum
+		)"00:00:"$(			: Reserved
+		)"00:01:"$(			: Number of Group Records
+		)"01:"$(			: Record Type - IS_IN
+		)"00:"$(			: Aux Data Len
+		)"00:01:"$(			: Number of Sources
+		)"ff:0e:00:00:00:00:00:00:"$(	: Multicast address - ff0e::1
+		)"00:00:00:00:00:00:00:01:"$(	:
+		)"20:01:0d:b8:00:01:00:00:"$(	: Source Address - 2001:db8:1::2
+		)"00:00:00:00:00:00:00:02:"$(	:
+		)
+
+	echo ${hbh}${icmpv6}
+}
-- 
cgit 


From f7ccf60c4adada7e13d3d6798621cabcdaf3828e Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 2 Feb 2023 18:59:28 +0100
Subject: selftests: forwarding: bridge_mdb: Fix a typo

Add the letter missing from the word "INCLUDE".

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/bridge_mdb.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
index 51f2b0d77067..4e16677f02ba 100755
--- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
@@ -1054,7 +1054,7 @@ ctrl_igmpv3_is_in_test()
 
 	bridge mdb del dev br0 port $swp1 grp 239.1.1.1 vid 10
 
-	log_test "IGMPv3 MODE_IS_INCLUE tests"
+	log_test "IGMPv3 MODE_IS_INCLUDE tests"
 }
 
 ctrl_mldv2_is_in_test()
-- 
cgit 


From fcf4927632eecb25e40c8c5ac1e40df2090cc2eb Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 2 Feb 2023 18:59:29 +0100
Subject: selftests: forwarding: lib: Add helpers for IP address handling

In order to generate IGMPv3 and MLDv2 packets on the fly, we will need
helpers to expand IPv4 and IPv6 addresses given as parameters in
mausezahn payload notation. Add helpers that do it.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh | 37 +++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 736f56098cc7..9fe3004af6dc 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1672,6 +1672,43 @@ hw_stats_monitor_test()
 	log_test "${type}_stats notifications"
 }
 
+ipv4_to_bytes()
+{
+	local IP=$1; shift
+
+	printf '%02x:' ${IP//./ } |
+	    sed 's/:$//'
+}
+
+# Convert a given IPv6 address, `IP' such that the :: token, if present, is
+# expanded, and each 16-bit group is padded with zeroes to be 4 hexadecimal
+# digits. An optional `BYTESEP' parameter can be given to further separate
+# individual bytes of each 16-bit group.
+expand_ipv6()
+{
+	local IP=$1; shift
+	local bytesep=$1; shift
+
+	local cvt_ip=${IP/::/_}
+	local colons=${cvt_ip//[^:]/}
+	local allcol=:::::::
+	# IP where :: -> the appropriate number of colons:
+	local allcol_ip=${cvt_ip/_/${allcol:${#colons}}}
+
+	echo $allcol_ip | tr : '\n' |
+	    sed s/^/0000/ |
+	    sed 's/.*\(..\)\(..\)/\1'"$bytesep"'\2/' |
+	    tr '\n' : |
+	    sed 's/:$//'
+}
+
+ipv6_to_bytes()
+{
+	local IP=$1; shift
+
+	expand_ipv6 "$IP" :
+}
+
 igmpv3_is_in_get()
 {
 	local igmpv3
-- 
cgit 


From 952e0ee38c7215c45192d8c899acd1830873f28b Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 2 Feb 2023 18:59:30 +0100
Subject: selftests: forwarding: lib: Add helpers for checksum handling

In order to generate IGMPv3 and MLDv2 packets on the fly, we will need
helpers to calculate the packet checksum.

The approach presented in this patch revolves around payload templates
for mausezahn. These are mausezahn-like payload strings (01:23:45:...)
with possibly one 2-byte sequence replaced with the word PAYLOAD. The
main function is payload_template_calc_checksum(), which calculates
RFC 1071 checksum of the message. There are further helpers to then
convert the checksum to the payload format, and to expand it.

For IPv6, MLDv2 message checksum is computed using a pseudoheader that
differs from the header used in the payload itself. The fact that the
two messages are different means that the checksum needs to be
returned as a separate quantity, instead of being expanded in-place in
the payload itself. Furthermore, the pseudoheader includes a length of
the message. Much like the checksum, this needs to be expanded in
mausezahn format. And likewise for number of addresses for (S,G)
entries. Thus we have several places where a computed quantity needs
to be presented in the payload format. Add a helper u16_to_bytes(),
which will be used in all these cases.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh | 56 +++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 9fe3004af6dc..733bee34631a 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1709,6 +1709,62 @@ ipv6_to_bytes()
 	expand_ipv6 "$IP" :
 }
 
+u16_to_bytes()
+{
+	local u16=$1; shift
+
+	printf "%04x" $u16 | sed 's/^/000/;s/^.*\(..\)\(..\)$/\1:\2/'
+}
+
+# Given a mausezahn-formatted payload (colon-separated bytes given as %02x),
+# possibly with a keyword CHECKSUM stashed where a 16-bit checksum should be,
+# calculate checksum as per RFC 1071, assuming the CHECKSUM field (if any)
+# stands for 00:00.
+payload_template_calc_checksum()
+{
+	local payload=$1; shift
+
+	(
+	    # Set input radix.
+	    echo "16i"
+	    # Push zero for the initial checksum.
+	    echo 0
+
+	    # Pad the payload with a terminating 00: in case we get an odd
+	    # number of bytes.
+	    echo "${payload%:}:00:" |
+		sed 's/CHECKSUM/00:00/g' |
+		tr '[:lower:]' '[:upper:]' |
+		# Add the word to the checksum.
+		sed 's/\(..\):\(..\):/\1\2+\n/g' |
+		# Strip the extra odd byte we pushed if left unconverted.
+		sed 's/\(..\):$//'
+
+	    echo "10000 ~ +"	# Calculate and add carry.
+	    echo "FFFF r - p"	# Bit-flip and print.
+	) |
+	    dc |
+	    tr '[:upper:]' '[:lower:]'
+}
+
+payload_template_expand_checksum()
+{
+	local payload=$1; shift
+	local checksum=$1; shift
+
+	local ckbytes=$(u16_to_bytes $checksum)
+
+	echo "$payload" | sed "s/CHECKSUM/$ckbytes/g"
+}
+
+payload_template_nbytes()
+{
+	local payload=$1; shift
+
+	payload_template_expand_checksum "${payload%:}" 0 |
+		sed 's/:/\n/g' | wc -l
+}
+
 igmpv3_is_in_get()
 {
 	local igmpv3
-- 
cgit 


From 506a1ac9d32b52606f59acbb0e20c1a7adbb8511 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 2 Feb 2023 18:59:31 +0100
Subject: selftests: forwarding: lib: Parameterize IGMPv3/MLDv2 generation

In order to generate IGMPv3 and MLDv2 packets on the fly, the
functions that generate these packets need to be able to generate
packets for different groups and different sources. Generating MLDv2
packets further needs the source address of the packet for purposes of
checksum calculation. Add the necessary parameters, and generate the
payload accordingly by dispatching to helpers added in the previous
patches.

Adjust the sole client, bridge_mdb.sh, as well.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../testing/selftests/net/forwarding/bridge_mdb.sh |  9 +++---
 tools/testing/selftests/net/forwarding/lib.sh      | 36 ++++++++++++++++------
 2 files changed, 31 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
index 4e16677f02ba..b48867d8cadf 100755
--- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
@@ -1029,7 +1029,7 @@ ctrl_igmpv3_is_in_test()
 
 	# IS_IN ( 192.0.2.2 )
 	$MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
-		-t ip proto=2,p=$(igmpv3_is_in_get) -q
+		-t ip proto=2,p=$(igmpv3_is_in_get 239.1.1.1 192.0.2.2) -q
 
 	bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -q 192.0.2.2
 	check_fail $? "Permanent entry affected by IGMP packet"
@@ -1042,7 +1042,7 @@ ctrl_igmpv3_is_in_test()
 
 	# IS_IN ( 192.0.2.2 )
 	$MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
-		-t ip proto=2,p=$(igmpv3_is_in_get) -q
+		-t ip proto=2,p=$(igmpv3_is_in_get 239.1.1.1 192.0.2.2) -q
 
 	bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -v "src" | \
 		grep -q 192.0.2.2
@@ -1067,8 +1067,9 @@ ctrl_mldv2_is_in_test()
 		filter_mode include source_list 2001:db8:1::1
 
 	# IS_IN ( 2001:db8:1::2 )
+	local p=$(mldv2_is_in_get fe80::1 ff0e::1 2001:db8:1::2)
 	$MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \
-		-t ip hop=1,next=0,p=$(mldv2_is_in_get) -q
+		-t ip hop=1,next=0,p="$p" -q
 
 	bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | \
 		grep -q 2001:db8:1::2
@@ -1082,7 +1083,7 @@ ctrl_mldv2_is_in_test()
 
 	# IS_IN ( 2001:db8:1::2 )
 	$MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \
-		-t ip hop=1,next=0,p=$(mldv2_is_in_get) -q
+		-t ip hop=1,next=0,p="$p" -q
 
 	bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | grep -v "src" | \
 		grep -q 2001:db8:1::2
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 733bee34631a..4896f21493c8 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1767,26 +1767,35 @@ payload_template_nbytes()
 
 igmpv3_is_in_get()
 {
+	local GRP=$1; shift
+	local IP=$1; shift
+
 	local igmpv3
 
+	# IS_IN ( $IP )
 	igmpv3=$(:
 		)"22:"$(			: Type - Membership Report
 		)"00:"$(			: Reserved
-		)"2a:f8:"$(			: Checksum
+		)"CHECKSUM:"$(			: Checksum
 		)"00:00:"$(			: Reserved
 		)"00:01:"$(			: Number of Group Records
 		)"01:"$(			: Record Type - IS_IN
 		)"00:"$(			: Aux Data Len
 		)"00:01:"$(			: Number of Sources
-		)"ef:01:01:01:"$(		: Multicast Address - 239.1.1.1
-		)"c0:00:02:02"$(		: Source Address - 192.0.2.2
+		)"$(ipv4_to_bytes $GRP):"$(	: Multicast Address
+		)"$(ipv4_to_bytes $IP)"$(	: Source Address
 		)
+	local checksum=$(payload_template_calc_checksum "$igmpv3")
 
-	echo $igmpv3
+	payload_template_expand_checksum "$igmpv3" $checksum
 }
 
 mldv2_is_in_get()
 {
+	local SIP=$1; shift
+	local GRP=$1; shift
+	local IP=$1; shift
+
 	local hbh
 	local icmpv6
 
@@ -1799,17 +1808,24 @@ mldv2_is_in_get()
 	icmpv6=$(:
 		)"8f:"$(			: Type - MLDv2 Report
 		)"00:"$(			: Code
-		)"45:39:"$(			: Checksum
+		)"CHECKSUM:"$(			: Checksum
 		)"00:00:"$(			: Reserved
 		)"00:01:"$(			: Number of Group Records
 		)"01:"$(			: Record Type - IS_IN
 		)"00:"$(			: Aux Data Len
 		)"00:01:"$(			: Number of Sources
-		)"ff:0e:00:00:00:00:00:00:"$(	: Multicast address - ff0e::1
-		)"00:00:00:00:00:00:00:01:"$(	:
-		)"20:01:0d:b8:00:01:00:00:"$(	: Source Address - 2001:db8:1::2
-		)"00:00:00:00:00:00:00:02:"$(	:
+		)"$(ipv6_to_bytes $GRP):"$(	: Multicast address
+		)"$(ipv6_to_bytes $IP):"$(	: Source Address
 		)
 
-	echo ${hbh}${icmpv6}
+	local len=$(u16_to_bytes $(payload_template_nbytes $icmpv6))
+	local sudohdr=$(:
+		)"$(ipv6_to_bytes $SIP):"$(	: SIP
+		)"$(ipv6_to_bytes $GRP):"$(	: DIP is multicast address
+	        )"${len}:"$(			: Upper-layer length
+	        )"00:3a:"$(			: Zero and next-header
+	        )
+	local checksum=$(payload_template_calc_checksum ${sudohdr}${icmpv6})
+
+	payload_template_expand_checksum "$hbh$icmpv6" $checksum
 }
-- 
cgit 


From 705d4bc7b6b6563e7c2d4b3b3da8da9af842aacc Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 2 Feb 2023 18:59:32 +0100
Subject: selftests: forwarding: lib: Allow list of IPs for IGMPv3/MLDv2

The testsuite that checks for mcast_max_groups functionality will need
to generate IGMP and MLD packets with configurable number of (S,G)
addresses. To that end, further extend igmpv3_is_in_get() and
mldv2_is_in_get() to allow a list of IP addresses instead of one
address.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 4896f21493c8..df6b3208e236 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1768,11 +1768,12 @@ payload_template_nbytes()
 igmpv3_is_in_get()
 {
 	local GRP=$1; shift
-	local IP=$1; shift
+	local sources=("$@")
 
 	local igmpv3
+	local nsources=$(u16_to_bytes ${#sources[@]})
 
-	# IS_IN ( $IP )
+	# IS_IN ( $sources )
 	igmpv3=$(:
 		)"22:"$(			: Type - Membership Report
 		)"00:"$(			: Reserved
@@ -1781,9 +1782,12 @@ igmpv3_is_in_get()
 		)"00:01:"$(			: Number of Group Records
 		)"01:"$(			: Record Type - IS_IN
 		)"00:"$(			: Aux Data Len
-		)"00:01:"$(			: Number of Sources
+		)"${nsources}:"$(		: Number of Sources
 		)"$(ipv4_to_bytes $GRP):"$(	: Multicast Address
-		)"$(ipv4_to_bytes $IP)"$(	: Source Address
+		)"$(for src in "${sources[@]}"; do
+			ipv4_to_bytes $src
+			echo -n :
+		    done)"$(			: Source Addresses
 		)
 	local checksum=$(payload_template_calc_checksum "$igmpv3")
 
@@ -1794,10 +1798,11 @@ mldv2_is_in_get()
 {
 	local SIP=$1; shift
 	local GRP=$1; shift
-	local IP=$1; shift
+	local sources=("$@")
 
 	local hbh
 	local icmpv6
+	local nsources=$(u16_to_bytes ${#sources[@]})
 
 	hbh=$(:
 		)"3a:"$(			: Next Header - ICMPv6
@@ -1813,9 +1818,12 @@ mldv2_is_in_get()
 		)"00:01:"$(			: Number of Group Records
 		)"01:"$(			: Record Type - IS_IN
 		)"00:"$(			: Aux Data Len
-		)"00:01:"$(			: Number of Sources
+		)"${nsources}:"$(		: Number of Sources
 		)"$(ipv6_to_bytes $GRP):"$(	: Multicast address
-		)"$(ipv6_to_bytes $IP):"$(	: Source Address
+		)"$(for src in "${sources[@]}"; do
+			ipv6_to_bytes $src
+			echo -n :
+		    done)"$(			: Source Addresses
 		)
 
 	local len=$(u16_to_bytes $(payload_template_nbytes $icmpv6))
-- 
cgit 


From 9ae8546973171230488ad631d2b9eb844466a37d Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 2 Feb 2023 18:59:33 +0100
Subject: selftests: forwarding: lib: Add helpers to build IGMP/MLD leave
 packets

The testsuite that checks for mcast_max_groups functionality will need to
wipe the added groups as well. Add helpers to build an IGMP or MLD packets
announcing that host is leaving a given group.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh | 50 +++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index df6b3208e236..cc7c4f89a097 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1794,6 +1794,21 @@ igmpv3_is_in_get()
 	payload_template_expand_checksum "$igmpv3" $checksum
 }
 
+igmpv2_leave_get()
+{
+	local GRP=$1; shift
+
+	local payload=$(:
+		)"17:"$(			: Type - Leave Group
+		)"00:"$(			: Max Resp Time - not meaningful
+		)"CHECKSUM:"$(			: Checksum
+		)"$(ipv4_to_bytes $GRP)"$(	: Group Address
+		)
+	local checksum=$(payload_template_calc_checksum "$payload")
+
+	payload_template_expand_checksum "$payload" $checksum
+}
+
 mldv2_is_in_get()
 {
 	local SIP=$1; shift
@@ -1837,3 +1852,38 @@ mldv2_is_in_get()
 
 	payload_template_expand_checksum "$hbh$icmpv6" $checksum
 }
+
+mldv1_done_get()
+{
+	local SIP=$1; shift
+	local GRP=$1; shift
+
+	local hbh
+	local icmpv6
+
+	hbh=$(:
+		)"3a:"$(			: Next Header - ICMPv6
+		)"00:"$(			: Hdr Ext Len
+		)"00:00:00:00:00:00:"$(		: Options and Padding
+		)
+
+	icmpv6=$(:
+		)"84:"$(			: Type - MLDv1 Done
+		)"00:"$(			: Code
+		)"CHECKSUM:"$(			: Checksum
+		)"00:00:"$(			: Max Resp Delay - not meaningful
+		)"00:00:"$(			: Reserved
+		)"$(ipv6_to_bytes $GRP):"$(	: Multicast address
+		)
+
+	local len=$(u16_to_bytes $(payload_template_nbytes $icmpv6))
+	local sudohdr=$(:
+		)"$(ipv6_to_bytes $SIP):"$(	: SIP
+		)"$(ipv6_to_bytes $GRP):"$(	: DIP is multicast address
+	        )"${len}:"$(			: Upper-layer length
+	        )"00:3a:"$(			: Zero and next-header
+	        )
+	local checksum=$(payload_template_calc_checksum ${sudohdr}${icmpv6})
+
+	payload_template_expand_checksum "$hbh$icmpv6" $checksum
+}
-- 
cgit 


From 3446dcd7df05ed6cc0095f21f871a61f1f28ed4e Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Thu, 2 Feb 2023 18:59:34 +0100
Subject: selftests: forwarding: bridge_mdb_max: Add a new selftest

Add a suite covering mcast_n_groups and mcast_max_groups bridge features.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/Makefile    |    1 +
 .../selftests/net/forwarding/bridge_mdb_max.sh     | 1336 ++++++++++++++++++++
 2 files changed, 1337 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/bridge_mdb_max.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile
index 453ae006fbcf..91201ab3c4fc 100644
--- a/tools/testing/selftests/net/forwarding/Makefile
+++ b/tools/testing/selftests/net/forwarding/Makefile
@@ -4,6 +4,7 @@ TEST_PROGS = bridge_igmp.sh \
 	bridge_locked_port.sh \
 	bridge_mdb.sh \
 	bridge_mdb_host.sh \
+	bridge_mdb_max.sh \
 	bridge_mdb_port_down.sh \
 	bridge_mld.sh \
 	bridge_port_isolation.sh \
diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh
new file mode 100755
index 000000000000..ae255b662ba3
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh
@@ -0,0 +1,1336 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +-----------------------+                          +------------------------+
+# | H1 (vrf)              |                          | H2 (vrf)               |
+# | + $h1.10              |                          | + $h2.10               |
+# | | 192.0.2.1/28        |                          | | 192.0.2.2/28         |
+# | | 2001:db8:1::1/64    |                          | | 2001:db8:1::2/64     |
+# | |                     |                          | |                      |
+# | |  + $h1.20           |                          | |  + $h2.20            |
+# | \  | 198.51.100.1/24  |                          | \  | 198.51.100.2/24   |
+# |  \ | 2001:db8:2::1/64 |                          |  \ | 2001:db8:2::2/64  |
+# |   \|                  |                          |   \|                   |
+# |    + $h1              |                          |    + $h2               |
+# +----|------------------+                          +----|-------------------+
+#      |                                                  |
+# +----|--------------------------------------------------|-------------------+
+# | SW |                                                  |                   |
+# | +--|--------------------------------------------------|-----------------+ |
+# | |  + $swp1                   BR0 (802.1q)             + $swp2           | |
+# | |     vid 10                                             vid 10         | |
+# | |     vid 20                                             vid 20         | |
+# | |                                                                       | |
+# | +-----------------------------------------------------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+	test_8021d
+	test_8021q
+	test_8021qvs
+"
+
+NUM_NETIFS=4
+source lib.sh
+source tc_common.sh
+
+h1_create()
+{
+	simple_if_init $h1
+	vlan_create $h1 10 v$h1 192.0.2.1/28 2001:db8:1::1/64
+	vlan_create $h1 20 v$h1 198.51.100.1/24 2001:db8:2::1/64
+}
+
+h1_destroy()
+{
+	vlan_destroy $h1 20
+	vlan_destroy $h1 10
+	simple_if_fini $h1
+}
+
+h2_create()
+{
+	simple_if_init $h2
+	vlan_create $h2 10 v$h2 192.0.2.2/28
+	vlan_create $h2 20 v$h2 198.51.100.2/24
+}
+
+h2_destroy()
+{
+	vlan_destroy $h2 20
+	vlan_destroy $h2 10
+	simple_if_fini $h2
+}
+
+switch_create_8021d()
+{
+	log_info "802.1d tests"
+
+	ip link add name br0 type bridge vlan_filtering 0 \
+		mcast_snooping 1 \
+		mcast_igmp_version 3 mcast_mld_version 2
+	ip link set dev br0 up
+
+	ip link set dev $swp1 master br0
+	ip link set dev $swp1 up
+	bridge link set dev $swp1 fastleave on
+
+	ip link set dev $swp2 master br0
+	ip link set dev $swp2 up
+}
+
+switch_create_8021q()
+{
+	local br_flags=$1; shift
+
+	log_info "802.1q $br_flags${br_flags:+ }tests"
+
+	ip link add name br0 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+		mcast_snooping 1 $br_flags \
+		mcast_igmp_version 3 mcast_mld_version 2
+	bridge vlan add vid 10 dev br0 self
+	bridge vlan add vid 20 dev br0 self
+	ip link set dev br0 up
+
+	ip link set dev $swp1 master br0
+	ip link set dev $swp1 up
+	bridge link set dev $swp1 fastleave on
+	bridge vlan add vid 10 dev $swp1
+	bridge vlan add vid 20 dev $swp1
+
+	ip link set dev $swp2 master br0
+	ip link set dev $swp2 up
+	bridge vlan add vid 10 dev $swp2
+	bridge vlan add vid 20 dev $swp2
+}
+
+switch_create_8021qvs()
+{
+	switch_create_8021q "mcast_vlan_snooping 1"
+	bridge vlan global set dev br0 vid 10 mcast_igmp_version 3
+	bridge vlan global set dev br0 vid 10 mcast_mld_version 2
+	bridge vlan global set dev br0 vid 20 mcast_igmp_version 3
+	bridge vlan global set dev br0 vid 20 mcast_mld_version 2
+}
+
+switch_destroy()
+{
+	ip link set dev $swp2 down
+	ip link set dev $swp2 nomaster
+
+	ip link set dev $swp1 down
+	ip link set dev $swp1 nomaster
+
+	ip link set dev br0 down
+	ip link del dev br0
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	vrf_prepare
+	forwarding_enable
+
+	h1_create
+	h2_create
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	switch_destroy 2>/dev/null
+	h2_destroy
+	h1_destroy
+
+	forwarding_restore
+	vrf_cleanup
+}
+
+cfg_src_list()
+{
+	local IPs=("$@")
+	local IPstr=$(echo ${IPs[@]} | tr '[:space:]' , | sed 's/,$//')
+
+	echo ${IPstr:+source_list }${IPstr}
+}
+
+cfg_group_op()
+{
+	local op=$1; shift
+	local locus=$1; shift
+	local GRP=$1; shift
+	local state=$1; shift
+	local IPs=("$@")
+
+	local source_list=$(cfg_src_list ${IPs[@]})
+
+	# Everything besides `bridge mdb' uses the "dev X vid Y" syntax,
+	# so we use it here as well and convert.
+	local br_locus=$(echo "$locus" | sed 's/^dev /port /')
+
+	bridge mdb $op dev br0 $br_locus grp $GRP $state \
+	       filter_mode include $source_list
+}
+
+cfg4_entries_op()
+{
+	local op=$1; shift
+	local locus=$1; shift
+	local state=$1; shift
+	local n=$1; shift
+	local grp=${1:-1}; shift
+
+	local GRP=239.1.1.${grp}
+	local IPs=$(seq -f 192.0.2.%g 1 $((n - 1)))
+	cfg_group_op "$op" "$locus" "$GRP" "$state" ${IPs[@]}
+}
+
+cfg4_entries_add()
+{
+	cfg4_entries_op add "$@"
+}
+
+cfg4_entries_del()
+{
+	cfg4_entries_op del "$@"
+}
+
+cfg6_entries_op()
+{
+	local op=$1; shift
+	local locus=$1; shift
+	local state=$1; shift
+	local n=$1; shift
+	local grp=${1:-1}; shift
+
+	local GRP=ff0e::${grp}
+	local IPs=$(printf "2001:db8:1::%x\n" $(seq 1 $((n - 1))))
+	cfg_group_op "$op" "$locus" "$GRP" "$state" ${IPs[@]}
+}
+
+cfg6_entries_add()
+{
+	cfg6_entries_op add "$@"
+}
+
+cfg6_entries_del()
+{
+	cfg6_entries_op del "$@"
+}
+
+locus_dev_peer()
+{
+	local dev_kw=$1; shift
+	local dev=$1; shift
+	local vid_kw=$1; shift
+	local vid=$1; shift
+
+	echo "$h1.${vid:-10}"
+}
+
+locus_dev()
+{
+	local dev_kw=$1; shift
+	local dev=$1; shift
+
+	echo $dev
+}
+
+ctl4_entries_add()
+{
+	local locus=$1; shift
+	local state=$1; shift
+	local n=$1; shift
+	local grp=${1:-1}; shift
+
+	local IPs=$(seq -f 192.0.2.%g 1 $((n - 1)))
+	local peer=$(locus_dev_peer $locus)
+	local GRP=239.1.1.${grp}
+	$MZ $peer -c 1 -A 192.0.2.1 -B $GRP \
+		-t ip proto=2,p=$(igmpv3_is_in_get $GRP $IPs) -q
+	sleep 1
+
+	local nn=$(bridge mdb show dev br0 | grep $GRP | wc -l)
+	if ((nn != n)); then
+		echo mcast_max_groups > /dev/stderr
+		false
+	fi
+}
+
+ctl4_entries_del()
+{
+	local locus=$1; shift
+	local state=$1; shift
+	local n=$1; shift
+	local grp=${1:-1}; shift
+
+	local peer=$(locus_dev_peer $locus)
+	local GRP=239.1.1.${grp}
+	$MZ $peer -c 1 -A 192.0.2.1 -B 224.0.0.2 \
+		-t ip proto=2,p=$(igmpv2_leave_get $GRP) -q
+	sleep 1
+	! bridge mdb show dev br0 | grep -q $GRP
+}
+
+ctl6_entries_add()
+{
+	local locus=$1; shift
+	local state=$1; shift
+	local n=$1; shift
+	local grp=${1:-1}; shift
+
+	local IPs=$(printf "2001:db8:1::%x\n" $(seq 1 $((n - 1))))
+	local peer=$(locus_dev_peer $locus)
+	local SIP=fe80::1
+	local GRP=ff0e::${grp}
+	local p=$(mldv2_is_in_get $SIP $GRP $IPs)
+	$MZ -6 $peer -c 1 -A $SIP -B $GRP -t ip hop=1,next=0,p="$p" -q
+	sleep 1
+
+	local nn=$(bridge mdb show dev br0 | grep $GRP | wc -l)
+	if ((nn != n)); then
+		echo mcast_max_groups > /dev/stderr
+		false
+	fi
+}
+
+ctl6_entries_del()
+{
+	local locus=$1; shift
+	local state=$1; shift
+	local n=$1; shift
+	local grp=${1:-1}; shift
+
+	local peer=$(locus_dev_peer $locus)
+	local SIP=fe80::1
+	local GRP=ff0e::${grp}
+	local p=$(mldv1_done_get $SIP $GRP)
+	$MZ -6 $peer -c 1 -A $SIP -B $GRP -t ip hop=1,next=0,p="$p" -q
+	sleep 1
+	! bridge mdb show dev br0 | grep -q $GRP
+}
+
+bridge_maxgroups_errmsg_check_cfg()
+{
+	local msg=$1; shift
+	local needle=$1; shift
+
+	echo "$msg" | grep -q mcast_max_groups
+	check_err $? "Adding MDB entries failed for the wrong reason: $msg"
+}
+
+bridge_maxgroups_errmsg_check_cfg4()
+{
+	bridge_maxgroups_errmsg_check_cfg "$@"
+}
+
+bridge_maxgroups_errmsg_check_cfg6()
+{
+	bridge_maxgroups_errmsg_check_cfg "$@"
+}
+
+bridge_maxgroups_errmsg_check_ctl4()
+{
+	:
+}
+
+bridge_maxgroups_errmsg_check_ctl6()
+{
+	:
+}
+
+bridge_port_ngroups_get()
+{
+	local locus=$1; shift
+
+	bridge -j -d link show $locus |
+	    jq '.[].mcast_n_groups'
+}
+
+bridge_port_maxgroups_get()
+{
+	local locus=$1; shift
+
+	bridge -j -d link show $locus |
+	    jq '.[].mcast_max_groups'
+}
+
+bridge_port_maxgroups_set()
+{
+	local locus=$1; shift
+	local max=$1; shift
+
+	bridge link set dev $(locus_dev $locus) mcast_max_groups $max
+}
+
+bridge_port_vlan_ngroups_get()
+{
+	local locus=$1; shift
+
+	bridge -j -d vlan show $locus |
+	    jq '.[].vlans[].mcast_n_groups'
+}
+
+bridge_port_vlan_maxgroups_get()
+{
+	local locus=$1; shift
+
+	bridge -j -d vlan show $locus |
+	    jq '.[].vlans[].mcast_max_groups'
+}
+
+bridge_port_vlan_maxgroups_set()
+{
+	local locus=$1; shift
+	local max=$1; shift
+
+	bridge vlan set $locus mcast_max_groups $max
+}
+
+test_ngroups_reporting()
+{
+	local CFG=$1; shift
+	local context=$1; shift
+	local locus=$1; shift
+
+	RET=0
+
+	local n0=$(bridge_${context}_ngroups_get "$locus")
+	${CFG}_entries_add "$locus" temp 5
+	check_err $? "Couldn't add MDB entries"
+	local n1=$(bridge_${context}_ngroups_get "$locus")
+
+	((n1 == n0 + 5))
+	check_err $? "Number of groups was $n0, now is $n1, but $((n0 + 5)) expected"
+
+	${CFG}_entries_del "$locus" temp 5
+	check_err $? "Couldn't delete MDB entries"
+	local n2=$(bridge_${context}_ngroups_get "$locus")
+
+	((n2 == n0))
+	check_err $? "Number of groups was $n0, now is $n2, but should be back to $n0"
+
+	log_test "$CFG: $context: ngroups reporting"
+}
+
+test_8021d_ngroups_reporting_cfg4()
+{
+	test_ngroups_reporting cfg4 port "dev $swp1"
+}
+
+test_8021d_ngroups_reporting_ctl4()
+{
+	test_ngroups_reporting ctl4 port "dev $swp1"
+}
+
+test_8021d_ngroups_reporting_cfg6()
+{
+	test_ngroups_reporting cfg6 port "dev $swp1"
+}
+
+test_8021d_ngroups_reporting_ctl6()
+{
+	test_ngroups_reporting ctl6 port "dev $swp1"
+}
+
+test_8021q_ngroups_reporting_cfg4()
+{
+	test_ngroups_reporting cfg4 port "dev $swp1 vid 10"
+}
+
+test_8021q_ngroups_reporting_ctl4()
+{
+	test_ngroups_reporting ctl4 port "dev $swp1 vid 10"
+}
+
+test_8021q_ngroups_reporting_cfg6()
+{
+	test_ngroups_reporting cfg6 port "dev $swp1 vid 10"
+}
+
+test_8021q_ngroups_reporting_ctl6()
+{
+	test_ngroups_reporting ctl6 port "dev $swp1 vid 10"
+}
+
+test_8021qvs_ngroups_reporting_cfg4()
+{
+	test_ngroups_reporting cfg4 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_ngroups_reporting_ctl4()
+{
+	test_ngroups_reporting ctl4 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_ngroups_reporting_cfg6()
+{
+	test_ngroups_reporting cfg6 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_ngroups_reporting_ctl6()
+{
+	test_ngroups_reporting ctl6 port_vlan "dev $swp1 vid 10"
+}
+
+test_ngroups_cross_vlan()
+{
+	local CFG=$1; shift
+
+	local locus1="dev $swp1 vid 10"
+	local locus2="dev $swp1 vid 20"
+
+	RET=0
+
+	local n10=$(bridge_port_vlan_ngroups_get "$locus1")
+	local n20=$(bridge_port_vlan_ngroups_get "$locus2")
+	${CFG}_entries_add "$locus1" temp 5 111
+	check_err $? "Couldn't add MDB entries to VLAN 10"
+	local n11=$(bridge_port_vlan_ngroups_get "$locus1")
+	local n21=$(bridge_port_vlan_ngroups_get "$locus2")
+
+	((n11 == n10 + 5))
+	check_err $? "Number of groups at VLAN 10 was $n10, now is $n11, but 5 entries added on VLAN 10, $((n10 + 5)) expected"
+
+	((n21 == n20))
+	check_err $? "Number of groups at VLAN 20 was $n20, now is $n21, but no change expected on VLAN 20"
+
+	${CFG}_entries_add "$locus2" temp 5 112
+	check_err $? "Couldn't add MDB entries to VLAN 20"
+	local n12=$(bridge_port_vlan_ngroups_get "$locus1")
+	local n22=$(bridge_port_vlan_ngroups_get "$locus2")
+
+	((n12 == n11))
+	check_err $? "Number of groups at VLAN 10 was $n11, now is $n12, but no change expected on VLAN 10"
+
+	((n22 == n21 + 5))
+	check_err $? "Number of groups at VLAN 20 was $n21, now is $n22, but 5 entries added on VLAN 20, $((n21 + 5)) expected"
+
+	${CFG}_entries_del "$locus1" temp 5 111
+	check_err $? "Couldn't delete MDB entries from VLAN 10"
+	${CFG}_entries_del "$locus2" temp 5 112
+	check_err $? "Couldn't delete MDB entries from VLAN 20"
+	local n13=$(bridge_port_vlan_ngroups_get "$locus1")
+	local n23=$(bridge_port_vlan_ngroups_get "$locus2")
+
+	((n13 == n10))
+	check_err $? "Number of groups at VLAN 10 was $n10, now is $n13, but should be back to $n10"
+
+	((n23 == n20))
+	check_err $? "Number of groups at VLAN 20 was $n20, now is $n23, but should be back to $n20"
+
+	log_test "$CFG: port_vlan: isolation of port and per-VLAN ngroups"
+}
+
+test_8021qvs_ngroups_cross_vlan_cfg4()
+{
+	test_ngroups_cross_vlan cfg4
+}
+
+test_8021qvs_ngroups_cross_vlan_ctl4()
+{
+	test_ngroups_cross_vlan ctl4
+}
+
+test_8021qvs_ngroups_cross_vlan_cfg6()
+{
+	test_ngroups_cross_vlan cfg6
+}
+
+test_8021qvs_ngroups_cross_vlan_ctl6()
+{
+	test_ngroups_cross_vlan ctl6
+}
+
+test_maxgroups_zero()
+{
+	local CFG=$1; shift
+	local context=$1; shift
+	local locus=$1; shift
+
+	RET=0
+	local max
+
+	max=$(bridge_${context}_maxgroups_get "$locus")
+	((max == 0))
+	check_err $? "Max groups on $locus should be 0, but $max reported"
+
+	bridge_${context}_maxgroups_set "$locus" 100
+	check_err $? "Failed to set max to 100"
+	max=$(bridge_${context}_maxgroups_get "$locus")
+	((max == 100))
+	check_err $? "Max groups expected to be 100, but $max reported"
+
+	bridge_${context}_maxgroups_set "$locus" 0
+	check_err $? "Couldn't set maximum to 0"
+
+	# Test that setting 0 explicitly still serves as infinity.
+	${CFG}_entries_add "$locus" temp 5
+	check_err $? "Adding 5 MDB entries failed but should have passed"
+	${CFG}_entries_del "$locus" temp 5
+	check_err $? "Couldn't delete MDB entries"
+
+	log_test "$CFG: $context maxgroups: reporting and treatment of 0"
+}
+
+test_8021d_maxgroups_zero_cfg4()
+{
+	test_maxgroups_zero cfg4 port "dev $swp1"
+}
+
+test_8021d_maxgroups_zero_ctl4()
+{
+	test_maxgroups_zero ctl4 port "dev $swp1"
+}
+
+test_8021d_maxgroups_zero_cfg6()
+{
+	test_maxgroups_zero cfg6 port "dev $swp1"
+}
+
+test_8021d_maxgroups_zero_ctl6()
+{
+	test_maxgroups_zero ctl6 port "dev $swp1"
+}
+
+test_8021q_maxgroups_zero_cfg4()
+{
+	test_maxgroups_zero cfg4 port "dev $swp1 vid 10"
+}
+
+test_8021q_maxgroups_zero_ctl4()
+{
+	test_maxgroups_zero ctl4 port "dev $swp1 vid 10"
+}
+
+test_8021q_maxgroups_zero_cfg6()
+{
+	test_maxgroups_zero cfg6 port "dev $swp1 vid 10"
+}
+
+test_8021q_maxgroups_zero_ctl6()
+{
+	test_maxgroups_zero ctl6 port "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_zero_cfg4()
+{
+	test_maxgroups_zero cfg4 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_zero_ctl4()
+{
+	test_maxgroups_zero ctl4 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_zero_cfg6()
+{
+	test_maxgroups_zero cfg6 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_zero_ctl6()
+{
+	test_maxgroups_zero ctl6 port_vlan "dev $swp1 vid 10"
+}
+
+test_maxgroups_zero_cross_vlan()
+{
+	local CFG=$1; shift
+
+	local locus0="dev $swp1"
+	local locus1="dev $swp1 vid 10"
+	local locus2="dev $swp1 vid 20"
+	local max
+
+	RET=0
+
+	bridge_port_vlan_maxgroups_set "$locus1" 100
+	check_err $? "$locus1: Failed to set max to 100"
+
+	max=$(bridge_port_maxgroups_get "$locus0")
+	((max == 0))
+	check_err $? "$locus0: Max groups expected to be 0, but $max reported"
+
+	max=$(bridge_port_vlan_maxgroups_get "$locus2")
+	((max == 0))
+	check_err $? "$locus2: Max groups expected to be 0, but $max reported"
+
+	bridge_port_vlan_maxgroups_set "$locus2" 100
+	check_err $? "$locus2: Failed to set max to 100"
+
+	max=$(bridge_port_maxgroups_get "$locus0")
+	((max == 0))
+	check_err $? "$locus0: Max groups expected to be 0, but $max reported"
+
+	max=$(bridge_port_vlan_maxgroups_get "$locus2")
+	((max == 100))
+	check_err $? "$locus2: Max groups expected to be 100, but $max reported"
+
+	bridge_port_maxgroups_set "$locus0" 100
+	check_err $? "$locus0: Failed to set max to 100"
+
+	max=$(bridge_port_maxgroups_get "$locus0")
+	((max == 100))
+	check_err $? "$locus0: Max groups expected to be 100, but $max reported"
+
+	max=$(bridge_port_vlan_maxgroups_get "$locus2")
+	((max == 100))
+	check_err $? "$locus2: Max groups expected to be 100, but $max reported"
+
+	bridge_port_vlan_maxgroups_set "$locus1" 0
+	check_err $? "$locus1: Failed to set max to 0"
+
+	max=$(bridge_port_maxgroups_get "$locus0")
+	((max == 100))
+	check_err $? "$locus0: Max groups expected to be 100, but $max reported"
+
+	max=$(bridge_port_vlan_maxgroups_get "$locus2")
+	((max == 100))
+	check_err $? "$locus2: Max groups expected to be 100, but $max reported"
+
+	bridge_port_vlan_maxgroups_set "$locus2" 0
+	check_err $? "$locus2: Failed to set max to 0"
+
+	max=$(bridge_port_maxgroups_get "$locus0")
+	((max == 100))
+	check_err $? "$locus0: Max groups expected to be 100, but $max reported"
+
+	max=$(bridge_port_vlan_maxgroups_get "$locus2")
+	((max == 0))
+	check_err $? "$locus2: Max groups expected to be 0 but $max reported"
+
+	bridge_port_maxgroups_set "$locus0" 0
+	check_err $? "$locus0: Failed to set max to 0"
+
+	max=$(bridge_port_maxgroups_get "$locus0")
+	((max == 0))
+	check_err $? "$locus0: Max groups expected to be 0, but $max reported"
+
+	max=$(bridge_port_vlan_maxgroups_get "$locus2")
+	((max == 0))
+	check_err $? "$locus2: Max groups expected to be 0, but $max reported"
+
+	log_test "$CFG: port_vlan maxgroups: isolation of port and per-VLAN maximums"
+}
+
+test_8021qvs_maxgroups_zero_cross_vlan_cfg4()
+{
+	test_maxgroups_zero_cross_vlan cfg4
+}
+
+test_8021qvs_maxgroups_zero_cross_vlan_ctl4()
+{
+	test_maxgroups_zero_cross_vlan ctl4
+}
+
+test_8021qvs_maxgroups_zero_cross_vlan_cfg6()
+{
+	test_maxgroups_zero_cross_vlan cfg6
+}
+
+test_8021qvs_maxgroups_zero_cross_vlan_ctl6()
+{
+	test_maxgroups_zero_cross_vlan ctl6
+}
+
+test_maxgroups_too_low()
+{
+	local CFG=$1; shift
+	local context=$1; shift
+	local locus=$1; shift
+
+	RET=0
+
+	local n=$(bridge_${context}_ngroups_get "$locus")
+	local msg
+
+	${CFG}_entries_add "$locus" temp 5 111
+	check_err $? "$locus: Couldn't add MDB entries"
+
+	bridge_${context}_maxgroups_set "$locus" $((n+2))
+	check_err $? "$locus: Setting maxgroups to $((n+2)) failed"
+
+	msg=$(${CFG}_entries_add "$locus" temp 2 112 2>&1)
+	check_fail $? "$locus: Adding more entries passed when max<n"
+	bridge_maxgroups_errmsg_check_cfg "$msg"
+
+	${CFG}_entries_del "$locus" temp 5 111
+	check_err $? "$locus: Couldn't delete MDB entries"
+
+	${CFG}_entries_add "$locus" temp 2 112
+	check_err $? "$locus: Adding more entries failed"
+
+	${CFG}_entries_del "$locus" temp 2 112
+	check_err $? "$locus: Deleting more entries failed"
+
+	bridge_${context}_maxgroups_set "$locus" 0
+	check_err $? "$locus: Couldn't set maximum to 0"
+
+	log_test "$CFG: $context maxgroups: configure below ngroups"
+}
+
+test_8021d_maxgroups_too_low_cfg4()
+{
+	test_maxgroups_too_low cfg4 port "dev $swp1"
+}
+
+test_8021d_maxgroups_too_low_ctl4()
+{
+	test_maxgroups_too_low ctl4 port "dev $swp1"
+}
+
+test_8021d_maxgroups_too_low_cfg6()
+{
+	test_maxgroups_too_low cfg6 port "dev $swp1"
+}
+
+test_8021d_maxgroups_too_low_ctl6()
+{
+	test_maxgroups_too_low ctl6 port "dev $swp1"
+}
+
+test_8021q_maxgroups_too_low_cfg4()
+{
+	test_maxgroups_too_low cfg4 port "dev $swp1 vid 10"
+}
+
+test_8021q_maxgroups_too_low_ctl4()
+{
+	test_maxgroups_too_low ctl4 port "dev $swp1 vid 10"
+}
+
+test_8021q_maxgroups_too_low_cfg6()
+{
+	test_maxgroups_too_low cfg6 port "dev $swp1 vid 10"
+}
+
+test_8021q_maxgroups_too_low_ctl6()
+{
+	test_maxgroups_too_low ctl6 port "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_too_low_cfg4()
+{
+	test_maxgroups_too_low cfg4 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_too_low_ctl4()
+{
+	test_maxgroups_too_low ctl4 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_too_low_cfg6()
+{
+	test_maxgroups_too_low cfg6 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_too_low_ctl6()
+{
+	test_maxgroups_too_low ctl6 port_vlan "dev $swp1 vid 10"
+}
+
+test_maxgroups_too_many_entries()
+{
+	local CFG=$1; shift
+	local context=$1; shift
+	local locus=$1; shift
+
+	RET=0
+
+	local n=$(bridge_${context}_ngroups_get "$locus")
+	local msg
+
+	# Configure a low maximum
+	bridge_${context}_maxgroups_set "$locus" $((n+1))
+	check_err $? "$locus: Couldn't set maximum"
+
+	# Try to add more entries than the configured maximum
+	msg=$(${CFG}_entries_add "$locus" temp 5 2>&1)
+	check_fail $? "Adding 5 MDB entries passed, but should have failed"
+	bridge_maxgroups_errmsg_check_${CFG} "$msg"
+
+	# When adding entries through the control path, as many as possible
+	# get created. That's consistent with the mcast_hash_max behavior.
+	# So there, drop the entries explicitly.
+	if [[ ${CFG%[46]} == ctl ]]; then
+		${CFG}_entries_del "$locus" temp 17 2>&1
+	fi
+
+	local n2=$(bridge_${context}_ngroups_get "$locus")
+	((n2 == n))
+	check_err $? "Number of groups was $n, but after a failed attempt to add MDB entries it changed to $n2"
+
+	bridge_${context}_maxgroups_set "$locus" 0
+	check_err $? "$locus: Couldn't set maximum to 0"
+
+	log_test "$CFG: $context maxgroups: add too many MDB entries"
+}
+
+test_8021d_maxgroups_too_many_entries_cfg4()
+{
+	test_maxgroups_too_many_entries cfg4 port "dev $swp1"
+}
+
+test_8021d_maxgroups_too_many_entries_ctl4()
+{
+	test_maxgroups_too_many_entries ctl4 port "dev $swp1"
+}
+
+test_8021d_maxgroups_too_many_entries_cfg6()
+{
+	test_maxgroups_too_many_entries cfg6 port "dev $swp1"
+}
+
+test_8021d_maxgroups_too_many_entries_ctl6()
+{
+	test_maxgroups_too_many_entries ctl6 port "dev $swp1"
+}
+
+test_8021q_maxgroups_too_many_entries_cfg4()
+{
+	test_maxgroups_too_many_entries cfg4 port "dev $swp1 vid 10"
+}
+
+test_8021q_maxgroups_too_many_entries_ctl4()
+{
+	test_maxgroups_too_many_entries ctl4 port "dev $swp1 vid 10"
+}
+
+test_8021q_maxgroups_too_many_entries_cfg6()
+{
+	test_maxgroups_too_many_entries cfg6 port "dev $swp1 vid 10"
+}
+
+test_8021q_maxgroups_too_many_entries_ctl6()
+{
+	test_maxgroups_too_many_entries ctl6 port "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_too_many_entries_cfg4()
+{
+	test_maxgroups_too_many_entries cfg4 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_too_many_entries_ctl4()
+{
+	test_maxgroups_too_many_entries ctl4 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_too_many_entries_cfg6()
+{
+	test_maxgroups_too_many_entries cfg6 port_vlan "dev $swp1 vid 10"
+}
+
+test_8021qvs_maxgroups_too_many_entries_ctl6()
+{
+	test_maxgroups_too_many_entries ctl6 port_vlan "dev $swp1 vid 10"
+}
+
+test_maxgroups_too_many_cross_vlan()
+{
+	local CFG=$1; shift
+
+	RET=0
+
+	local locus0="dev $swp1"
+	local locus1="dev $swp1 vid 10"
+	local locus2="dev $swp1 vid 20"
+	local n1=$(bridge_port_vlan_ngroups_get "$locus1")
+	local n2=$(bridge_port_vlan_ngroups_get "$locus2")
+	local msg
+
+	if ((n1 > n2)); then
+		local tmp=$n1
+		n1=$n2
+		n2=$tmp
+
+		tmp="$locus1"
+		locus1="$locus2"
+		locus2="$tmp"
+	fi
+
+	# Now 0 <= n1 <= n2.
+	${CFG}_entries_add "$locus2" temp 5 112
+	check_err $? "Couldn't add 5 entries"
+
+	n2=$(bridge_port_vlan_ngroups_get "$locus2")
+	# Now 0 <= n1 < n2-1.
+
+	# Setting locus1'maxgroups to n2-1 should pass. The number is
+	# smaller than both the absolute number of MDB entries, and in
+	# particular than number of locus2's number of entries, but it is
+	# large enough to cover locus1's entries. Thus we check that
+	# individual VLAN's ngroups are independent.
+	bridge_port_vlan_maxgroups_set "$locus1" $((n2-1))
+	check_err $? "Setting ${locus1}'s maxgroups to $((n2-1)) failed"
+
+	msg=$(${CFG}_entries_add "$locus1" temp $n2 111 2>&1)
+	check_fail $? "$locus1: Adding $n2 MDB entries passed, but should have failed"
+	bridge_maxgroups_errmsg_check_${CFG} "$msg"
+
+	bridge_port_maxgroups_set "$locus0" $((n1 + n2 + 2))
+	check_err $? "$locus0: Couldn't set maximum"
+
+	msg=$(${CFG}_entries_add "$locus1" temp 5 111 2>&1)
+	check_fail $? "$locus1: Adding 5 MDB entries passed, but should have failed"
+	bridge_maxgroups_errmsg_check_${CFG} "$msg"
+
+	# IGMP/MLD packets can cause several entries to be added, before
+	# the maximum is hit and the rest is then bounced. Remove what was
+	# committed, if anything.
+	${CFG}_entries_del "$locus1" temp 5 111 2>/dev/null
+
+	${CFG}_entries_add "$locus1" temp 2 111
+	check_err $? "$locus1: Adding 2 MDB entries failed, but should have passed"
+
+	${CFG}_entries_del "$locus1" temp 2 111
+	check_err $? "Couldn't delete MDB entries"
+
+	${CFG}_entries_del "$locus2" temp 5 112
+	check_err $? "Couldn't delete MDB entries"
+
+	bridge_port_vlan_maxgroups_set "$locus1" 0
+	check_err $? "$locus1: Couldn't set maximum to 0"
+
+	bridge_port_maxgroups_set "$locus0" 0
+	check_err $? "$locus0: Couldn't set maximum to 0"
+
+	log_test "$CFG: port_vlan maxgroups: isolation of port and per-VLAN ngroups"
+}
+
+test_8021qvs_maxgroups_too_many_cross_vlan_cfg4()
+{
+	test_maxgroups_too_many_cross_vlan cfg4
+}
+
+test_8021qvs_maxgroups_too_many_cross_vlan_ctl4()
+{
+	test_maxgroups_too_many_cross_vlan ctl4
+}
+
+test_8021qvs_maxgroups_too_many_cross_vlan_cfg6()
+{
+	test_maxgroups_too_many_cross_vlan cfg6
+}
+
+test_8021qvs_maxgroups_too_many_cross_vlan_ctl6()
+{
+	test_maxgroups_too_many_cross_vlan ctl6
+}
+
+test_vlan_attributes()
+{
+	local locus=$1; shift
+	local expect=$1; shift
+
+	RET=0
+
+	local max=$(bridge_port_vlan_maxgroups_get "$locus")
+	local n=$(bridge_port_vlan_ngroups_get "$locus")
+
+	eval "[[ $max $expect ]]"
+	check_err $? "$locus: maxgroups attribute expected to be $expect, but was $max"
+
+	eval "[[ $n $expect ]]"
+	check_err $? "$locus: ngroups attribute expected to be $expect, but was $n"
+
+	log_test "port_vlan: presence of ngroups and maxgroups attributes"
+}
+
+test_8021q_vlan_attributes()
+{
+	test_vlan_attributes "dev $swp1 vid 10" "== null"
+}
+
+test_8021qvs_vlan_attributes()
+{
+	test_vlan_attributes "dev $swp1 vid 10" "-ge 0"
+}
+
+test_toggle_vlan_snooping()
+{
+	local mode=$1; shift
+
+	RET=0
+
+	local CFG=cfg4
+	local context=port_vlan
+	local locus="dev $swp1 vid 10"
+
+	${CFG}_entries_add "$locus" $mode 5
+	check_err $? "Couldn't add MDB entries"
+
+	bridge_${context}_maxgroups_set "$locus" 100
+	check_err $? "Failed to set max to 100"
+
+	ip link set dev br0 type bridge mcast_vlan_snooping 0
+	sleep 1
+	ip link set dev br0 type bridge mcast_vlan_snooping 1
+
+	local n=$(bridge_${context}_ngroups_get "$locus")
+	local nn=$(bridge mdb show dev br0 | grep $swp1 | wc -l)
+	((nn == n))
+	check_err $? "mcast_n_groups expected to be $nn, but $n reported"
+
+	local max=$(bridge_${context}_maxgroups_get "$locus")
+	((max == 100))
+	check_err $? "Max groups expected to be 100 but $max reported"
+
+	bridge_${context}_maxgroups_set "$locus" 0
+	check_err $? "Failed to set max to 0"
+
+	log_test "$CFG: $context: $mode: mcast_vlan_snooping toggle"
+}
+
+test_toggle_vlan_snooping_temp()
+{
+	test_toggle_vlan_snooping temp
+}
+
+test_toggle_vlan_snooping_permanent()
+{
+	test_toggle_vlan_snooping permanent
+}
+
+# ngroup test suites
+
+test_8021d_ngroups_cfg4()
+{
+	test_8021d_ngroups_reporting_cfg4
+}
+
+test_8021d_ngroups_ctl4()
+{
+	test_8021d_ngroups_reporting_ctl4
+}
+
+test_8021d_ngroups_cfg6()
+{
+	test_8021d_ngroups_reporting_cfg6
+}
+
+test_8021d_ngroups_ctl6()
+{
+	test_8021d_ngroups_reporting_ctl6
+}
+
+test_8021q_ngroups_cfg4()
+{
+	test_8021q_ngroups_reporting_cfg4
+}
+
+test_8021q_ngroups_ctl4()
+{
+	test_8021q_ngroups_reporting_ctl4
+}
+
+test_8021q_ngroups_cfg6()
+{
+	test_8021q_ngroups_reporting_cfg6
+}
+
+test_8021q_ngroups_ctl6()
+{
+	test_8021q_ngroups_reporting_ctl6
+}
+
+test_8021qvs_ngroups_cfg4()
+{
+	test_8021qvs_ngroups_reporting_cfg4
+	test_8021qvs_ngroups_cross_vlan_cfg4
+}
+
+test_8021qvs_ngroups_ctl4()
+{
+	test_8021qvs_ngroups_reporting_ctl4
+	test_8021qvs_ngroups_cross_vlan_ctl4
+}
+
+test_8021qvs_ngroups_cfg6()
+{
+	test_8021qvs_ngroups_reporting_cfg6
+	test_8021qvs_ngroups_cross_vlan_cfg6
+}
+
+test_8021qvs_ngroups_ctl6()
+{
+	test_8021qvs_ngroups_reporting_ctl6
+	test_8021qvs_ngroups_cross_vlan_ctl6
+}
+
+# maxgroups test suites
+
+test_8021d_maxgroups_cfg4()
+{
+	test_8021d_maxgroups_zero_cfg4
+	test_8021d_maxgroups_too_low_cfg4
+	test_8021d_maxgroups_too_many_entries_cfg4
+}
+
+test_8021d_maxgroups_ctl4()
+{
+	test_8021d_maxgroups_zero_ctl4
+	test_8021d_maxgroups_too_low_ctl4
+	test_8021d_maxgroups_too_many_entries_ctl4
+}
+
+test_8021d_maxgroups_cfg6()
+{
+	test_8021d_maxgroups_zero_cfg6
+	test_8021d_maxgroups_too_low_cfg6
+	test_8021d_maxgroups_too_many_entries_cfg6
+}
+
+test_8021d_maxgroups_ctl6()
+{
+	test_8021d_maxgroups_zero_ctl6
+	test_8021d_maxgroups_too_low_ctl6
+	test_8021d_maxgroups_too_many_entries_ctl6
+}
+
+test_8021q_maxgroups_cfg4()
+{
+	test_8021q_maxgroups_zero_cfg4
+	test_8021q_maxgroups_too_low_cfg4
+	test_8021q_maxgroups_too_many_entries_cfg4
+}
+
+test_8021q_maxgroups_ctl4()
+{
+	test_8021q_maxgroups_zero_ctl4
+	test_8021q_maxgroups_too_low_ctl4
+	test_8021q_maxgroups_too_many_entries_ctl4
+}
+
+test_8021q_maxgroups_cfg6()
+{
+	test_8021q_maxgroups_zero_cfg6
+	test_8021q_maxgroups_too_low_cfg6
+	test_8021q_maxgroups_too_many_entries_cfg6
+}
+
+test_8021q_maxgroups_ctl6()
+{
+	test_8021q_maxgroups_zero_ctl6
+	test_8021q_maxgroups_too_low_ctl6
+	test_8021q_maxgroups_too_many_entries_ctl6
+}
+
+test_8021qvs_maxgroups_cfg4()
+{
+	test_8021qvs_maxgroups_zero_cfg4
+	test_8021qvs_maxgroups_zero_cross_vlan_cfg4
+	test_8021qvs_maxgroups_too_low_cfg4
+	test_8021qvs_maxgroups_too_many_entries_cfg4
+	test_8021qvs_maxgroups_too_many_cross_vlan_cfg4
+}
+
+test_8021qvs_maxgroups_ctl4()
+{
+	test_8021qvs_maxgroups_zero_ctl4
+	test_8021qvs_maxgroups_zero_cross_vlan_ctl4
+	test_8021qvs_maxgroups_too_low_ctl4
+	test_8021qvs_maxgroups_too_many_entries_ctl4
+	test_8021qvs_maxgroups_too_many_cross_vlan_ctl4
+}
+
+test_8021qvs_maxgroups_cfg6()
+{
+	test_8021qvs_maxgroups_zero_cfg6
+	test_8021qvs_maxgroups_zero_cross_vlan_cfg6
+	test_8021qvs_maxgroups_too_low_cfg6
+	test_8021qvs_maxgroups_too_many_entries_cfg6
+	test_8021qvs_maxgroups_too_many_cross_vlan_cfg6
+}
+
+test_8021qvs_maxgroups_ctl6()
+{
+	test_8021qvs_maxgroups_zero_ctl6
+	test_8021qvs_maxgroups_zero_cross_vlan_ctl6
+	test_8021qvs_maxgroups_too_low_ctl6
+	test_8021qvs_maxgroups_too_many_entries_ctl6
+	test_8021qvs_maxgroups_too_many_cross_vlan_ctl6
+}
+
+# other test suites
+
+test_8021qvs_toggle_vlan_snooping()
+{
+	test_toggle_vlan_snooping_temp
+	test_toggle_vlan_snooping_permanent
+}
+
+# test groups
+
+test_8021d()
+{
+	# Tests for vlan_filtering 0 mcast_vlan_snooping 0.
+
+	switch_create_8021d
+	setup_wait
+
+	test_8021d_ngroups_cfg4
+	test_8021d_ngroups_ctl4
+	test_8021d_ngroups_cfg6
+	test_8021d_ngroups_ctl6
+	test_8021d_maxgroups_cfg4
+	test_8021d_maxgroups_ctl4
+	test_8021d_maxgroups_cfg6
+	test_8021d_maxgroups_ctl6
+
+	switch_destroy
+}
+
+test_8021q()
+{
+	# Tests for vlan_filtering 1 mcast_vlan_snooping 0.
+
+	switch_create_8021q
+	setup_wait
+
+	test_8021q_vlan_attributes
+	test_8021q_ngroups_cfg4
+	test_8021q_ngroups_ctl4
+	test_8021q_ngroups_cfg6
+	test_8021q_ngroups_ctl6
+	test_8021q_maxgroups_cfg4
+	test_8021q_maxgroups_ctl4
+	test_8021q_maxgroups_cfg6
+	test_8021q_maxgroups_ctl6
+
+	switch_destroy
+}
+
+test_8021qvs()
+{
+	# Tests for vlan_filtering 1 mcast_vlan_snooping 1.
+
+	switch_create_8021qvs
+	setup_wait
+
+	test_8021qvs_vlan_attributes
+	test_8021qvs_ngroups_cfg4
+	test_8021qvs_ngroups_ctl4
+	test_8021qvs_ngroups_cfg6
+	test_8021qvs_ngroups_ctl6
+	test_8021qvs_maxgroups_cfg4
+	test_8021qvs_maxgroups_ctl4
+	test_8021qvs_maxgroups_cfg6
+	test_8021qvs_maxgroups_ctl6
+	test_8021qvs_toggle_vlan_snooping
+
+	switch_destroy
+}
+
+trap cleanup EXIT
+
+setup_prepare
+tests_run
+
+exit $EXIT_STATUS
-- 
cgit 


From 5646bbd6684acf5c9b9dedb863b7d2f6f5a330fb Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 25 Nov 2022 10:42:16 +0100
Subject: selftests: Emit a warning if getcpu() is missing on 32bit

The VDSO implementation for getcpu() has been wired up on 32bit so warn if
missing.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://lore.kernel.org/r/20221125094216.3663444-4-bigeasy@linutronix.de
---
 tools/testing/selftests/x86/test_vsyscall.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
index 5b45e6986aea..47cab972807c 100644
--- a/tools/testing/selftests/x86/test_vsyscall.c
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -92,11 +92,8 @@ static void init_vdso(void)
 		printf("[WARN]\tfailed to find time in vDSO\n");
 
 	vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
-	if (!vdso_getcpu) {
-		/* getcpu() was never wired up in the 32-bit vDSO. */
-		printf("[%s]\tfailed to find getcpu in vDSO\n",
-		       sizeof(long) == 8 ? "WARN" : "NOTE");
-	}
+	if (!vdso_getcpu)
+		printf("[WARN]\tfailed to find getcpu in vDSO\n");
 }
 
 static int init_vsys(void)
-- 
cgit 


From d2365054781908bd0bd43b08dd471b40db14b6f4 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 2 Feb 2023 21:01:36 +0000
Subject: KVM: selftests: Enable USERFAULTFD

The page_fault_test KVM selftest requires userfaultfd but the config
fragment for the KVM selftests does not enable it, meaning that those tests
are skipped in CI systems that rely on appropriate settings in the config
fragments except on S/390 which happens to have it in defconfig. Enable
the option in the config fragment so that the tests get run.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20230202-kvm-selftest-userfaultfd-v1-1-8186ac5a33a5@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 tools/testing/selftests/kvm/config | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/config b/tools/testing/selftests/kvm/config
index 63ed533f73d6..d011b38e259e 100644
--- a/tools/testing/selftests/kvm/config
+++ b/tools/testing/selftests/kvm/config
@@ -1,3 +1,4 @@
 CONFIG_KVM=y
 CONFIG_KVM_INTEL=y
 CONFIG_KVM_AMD=y
+CONFIG_USERFAULTFD=y
-- 
cgit 


From d1d7730ff8756c6db20ff82096b577d8cfbaf547 Mon Sep 17 00:00:00 2001
From: Hao Xiang <hao.xiang@bytedance.com>
Date: Fri, 3 Feb 2023 23:48:42 +0000
Subject: libbpf: Correctly set the kernel code version in Debian kernel.

In a previous commit, Ubuntu kernel code version is correctly set
by retrieving the information from /proc/version_signature.

commit<5b3d72987701d51bf31823b39db49d10970f5c2d>
(libbpf: Improve LINUX_VERSION_CODE detection)

The /proc/version_signature file doesn't present in at least the
older versions of Debian distributions (eg, Debian 9, 10). The Debian
kernel has a similar issue where the release information from uname()
syscall doesn't give the kernel code version that matches what the
kernel actually expects. Below is an example content from Debian 10.

release: 4.19.0-23-amd64
version: #1 SMP Debian 4.19.269-1 (2022-12-20) x86_64

Debian reports incorrect kernel version in utsname::release returned
by uname() syscall, which in older kernels (Debian 9, 10) leads to
kprobe BPF programs failing to load due to the version check mismatch.

Fortunately, the correct kernel code version presents in the
utsname::version returned by uname() syscall in Debian kernels. This
change adds another get kernel version function to handle Debian in
addition to the previously added get kernel version function to handle
Ubuntu. Some minor refactoring work is also done to make the code more
readable.

Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
Signed-off-by: Ho-Ren (Jack) Chuang <horenchuang@bytedance.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230203234842.2933903-1-hao.xiang@bytedance.com
---
 tools/lib/bpf/libbpf.c        | 37 -------------------
 tools/lib/bpf/libbpf_probes.c | 83 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 37 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index eed5cec6f510..4191a78b2815 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -34,7 +34,6 @@
 #include <linux/limits.h>
 #include <linux/perf_event.h>
 #include <linux/ring_buffer.h>
-#include <linux/version.h>
 #include <sys/epoll.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
@@ -870,42 +869,6 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data,
 	return 0;
 }
 
-__u32 get_kernel_version(void)
-{
-	/* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release,
-	 * but Ubuntu provides /proc/version_signature file, as described at
-	 * https://ubuntu.com/kernel, with an example contents below, which we
-	 * can use to get a proper LINUX_VERSION_CODE.
-	 *
-	 *   Ubuntu 5.4.0-12.15-generic 5.4.8
-	 *
-	 * In the above, 5.4.8 is what kernel is actually expecting, while
-	 * uname() call will return 5.4.0 in info.release.
-	 */
-	const char *ubuntu_kver_file = "/proc/version_signature";
-	__u32 major, minor, patch;
-	struct utsname info;
-
-	if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) == 0) {
-		FILE *f;
-
-		f = fopen(ubuntu_kver_file, "r");
-		if (f) {
-			if (fscanf(f, "%*s %*s %d.%d.%d\n", &major, &minor, &patch) == 3) {
-				fclose(f);
-				return KERNEL_VERSION(major, minor, patch);
-			}
-			fclose(f);
-		}
-		/* something went wrong, fall back to uname() approach */
-	}
-
-	uname(&info);
-	if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3)
-		return 0;
-	return KERNEL_VERSION(major, minor, patch);
-}
-
 static const struct btf_member *
 find_member_by_offset(const struct btf_type *t, __u32 bit_offset)
 {
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index b44fcbb4b42e..4f3bc968ff8e 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -12,11 +12,94 @@
 #include <linux/btf.h>
 #include <linux/filter.h>
 #include <linux/kernel.h>
+#include <linux/version.h>
 
 #include "bpf.h"
 #include "libbpf.h"
 #include "libbpf_internal.h"
 
+/* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release,
+ * but Ubuntu provides /proc/version_signature file, as described at
+ * https://ubuntu.com/kernel, with an example contents below, which we
+ * can use to get a proper LINUX_VERSION_CODE.
+ *
+ *   Ubuntu 5.4.0-12.15-generic 5.4.8
+ *
+ * In the above, 5.4.8 is what kernel is actually expecting, while
+ * uname() call will return 5.4.0 in info.release.
+ */
+static __u32 get_ubuntu_kernel_version(void)
+{
+	const char *ubuntu_kver_file = "/proc/version_signature";
+	__u32 major, minor, patch;
+	int ret;
+	FILE *f;
+
+	if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) != 0)
+		return 0;
+
+	f = fopen(ubuntu_kver_file, "r");
+	if (!f)
+		return 0;
+
+	ret = fscanf(f, "%*s %*s %u.%u.%u\n", &major, &minor, &patch);
+	fclose(f);
+	if (ret != 3)
+		return 0;
+
+	return KERNEL_VERSION(major, minor, patch);
+}
+
+/* On Debian LINUX_VERSION_CODE doesn't correspond to info.release.
+ * Instead, it is provided in info.version. An example content of
+ * Debian 10 looks like the below.
+ *
+ *   utsname::release   4.19.0-22-amd64
+ *   utsname::version   #1 SMP Debian 4.19.260-1 (2022-09-29)
+ *
+ * In the above, 4.19.260 is what kernel is actually expecting, while
+ * uname() call will return 4.19.0 in info.release.
+ */
+static __u32 get_debian_kernel_version(struct utsname *info)
+{
+	__u32 major, minor, patch;
+	char *p;
+
+	p = strstr(info->version, "Debian ");
+	if (!p) {
+		/* This is not a Debian kernel. */
+		return 0;
+	}
+
+	if (sscanf(p, "Debian %u.%u.%u", &major, &minor, &patch) != 3)
+		return 0;
+
+	return KERNEL_VERSION(major, minor, patch);
+}
+
+__u32 get_kernel_version(void)
+{
+	__u32 major, minor, patch, version;
+	struct utsname info;
+
+	/* Check if this is an Ubuntu kernel. */
+	version = get_ubuntu_kernel_version();
+	if (version != 0)
+		return version;
+
+	uname(&info);
+
+	/* Check if this is a Debian kernel. */
+	version = get_debian_kernel_version(&info);
+	if (version != 0)
+		return version;
+
+	if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3)
+		return 0;
+
+	return KERNEL_VERSION(major, minor, patch);
+}
+
 static int probe_prog_load(enum bpf_prog_type prog_type,
 			   const struct bpf_insn *insns, size_t insns_cnt,
 			   char *log_buf, size_t log_buf_sz)
-- 
cgit 


From 8306829bf845186ec8c470c771243016c30c3d74 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Mon, 6 Feb 2023 09:22:29 +0000
Subject: selftests/bpf: Fix spelling mistake "detecion" -> "detection"

There is a spelling mistake in a literal string. Fix it.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230206092229.46416-1-colin.i.king@gmail.com
---
 tools/testing/selftests/bpf/xdp_features.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/xdp_features.c b/tools/testing/selftests/bpf/xdp_features.c
index 10fad1243573..fce12165213b 100644
--- a/tools/testing/selftests/bpf/xdp_features.c
+++ b/tools/testing/selftests/bpf/xdp_features.c
@@ -57,7 +57,7 @@ static void sig_handler(int sig)
 
 const char *argp_program_version = "xdp-features 0.0";
 const char argp_program_doc[] =
-"XDP features detecion application.\n"
+"XDP features detection application.\n"
 "\n"
 "XDP features application checks the XDP advertised features match detected ones.\n"
 "\n"
-- 
cgit 


From 647037adcad00f2bab8828d3d41cd0553d41f3bd Mon Sep 17 00:00:00 2001
From: Aaron Thompson <dev@aaront.org>
Date: Tue, 7 Feb 2023 08:21:51 +0000
Subject: Revert "mm: Always release pages to the buddy allocator in
 memblock_free_late()."

This reverts commit 115d9d77bb0f9152c60b6e8646369fa7f6167593.

The pages being freed by memblock_free_late() have already been
initialized, but if they are in the deferred init range,
__free_one_page() might access nearby uninitialized pages when trying to
coalesce buddies. This can, for example, trigger this BUG:

  BUG: unable to handle page fault for address: ffffe964c02580c8
  RIP: 0010:__list_del_entry_valid+0x3f/0x70
   <TASK>
   __free_one_page+0x139/0x410
   __free_pages_ok+0x21d/0x450
   memblock_free_late+0x8c/0xb9
   efi_free_boot_services+0x16b/0x25c
   efi_enter_virtual_mode+0x403/0x446
   start_kernel+0x678/0x714
   secondary_startup_64_no_verify+0xd2/0xdb
   </TASK>

A proper fix will be more involved so revert this change for the time
being.

Fixes: 115d9d77bb0f ("mm: Always release pages to the buddy allocator in memblock_free_late().")
Signed-off-by: Aaron Thompson <dev@aaront.org>
Link: https://lore.kernel.org/r/20230207082151.1303-1-dev@aaront.org
Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
---
 mm/memblock.c                     | 8 +-------
 tools/testing/memblock/internal.h | 4 ----
 2 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'tools')

diff --git a/mm/memblock.c b/mm/memblock.c
index 685e30e6d27c..d036c7861310 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1640,13 +1640,7 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
 	end = PFN_DOWN(base + size);
 
 	for (; cursor < end; cursor++) {
-		/*
-		 * Reserved pages are always initialized by the end of
-		 * memblock_free_all() (by memmap_init() and, if deferred
-		 * initialization is enabled, memmap_init_reserved_pages()), so
-		 * these pages can be released directly to the buddy allocator.
-		 */
-		__free_pages_core(pfn_to_page(cursor), 0);
+		memblock_free_pages(pfn_to_page(cursor), cursor, 0);
 		totalram_pages_inc();
 	}
 }
diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h
index 85973e55489e..fdb7f5db7308 100644
--- a/tools/testing/memblock/internal.h
+++ b/tools/testing/memblock/internal.h
@@ -15,10 +15,6 @@ bool mirrored_kernelcore = false;
 
 struct page {};
 
-void __free_pages_core(struct page *page, unsigned int order)
-{
-}
-
 void memblock_free_pages(struct page *page, unsigned long pfn,
 			 unsigned int order)
 {
-- 
cgit 


From bbb253b206b9c417928a6c827d038e457f3012e9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 5 Feb 2023 21:24:09 +0200
Subject: selftests: ocelot: tc_flower_chains: make test_vlan_ingress_modify()
 more comprehensive

We have two IS1 filters of the OCELOT_VCAP_KEY_ANY key type (the one with
"action vlan pop" and the one with "action vlan modify") and one of the
OCELOT_VCAP_KEY_IPV4 key type (the one with "action skbedit priority").
But we have no IS1 filter with the OCELOT_VCAP_KEY_ETYPE key type, and
there was an uncaught breakage there.

To increase test coverage, convert one of the OCELOT_VCAP_KEY_ANY
filters to OCELOT_VCAP_KEY_ETYPE, by making the filter also match on the
MAC SA of the traffic sent by mausezahn, $h1_mac.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://lore.kernel.org/r/20230205192409.1796428-2-vladimir.oltean@nxp.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
index 9c79bbcce5a8..aff0a59f92d9 100755
--- a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
+++ b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
@@ -246,7 +246,7 @@ test_vlan_ingress_modify()
 	bridge vlan add dev $swp2 vid 300
 
 	tc filter add dev $swp1 ingress chain $(IS1 2) pref 3 \
-		protocol 802.1Q flower skip_sw vlan_id 200 \
+		protocol 802.1Q flower skip_sw vlan_id 200 src_mac $h1_mac \
 		action vlan modify id 300 \
 		action goto chain $(IS2 0 0)
 
-- 
cgit 


From d7b9dc14031b7f8865aeedc90d8bc68a4bb16023 Mon Sep 17 00:00:00 2001
From: Nina Schoetterl-Glausch <nsg@linux.ibm.com>
Date: Fri, 27 Jan 2023 18:45:52 +0100
Subject: KVM: selftests: Compile s390 tests with -march=z10

The guest used in s390 kvm selftests is not be set up to handle all
instructions the compiler might emit, i.e. vector instructions, leading
to crashes.
Limit what the compiler emits to the oldest machine model currently
supported by Linux.

Signed-off-by: Nina Schoetterl-Glausch <nsg@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Link: https://lore.kernel.org/r/20230127174552.3370169-1-nsg@linux.ibm.com
Message-Id: <20230127174552.3370169-1-nsg@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 tools/testing/selftests/kvm/Makefile | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 1750f91dd936..df0989949eb5 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -200,6 +200,9 @@ CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
 	-I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
 	-I$(<D) -Iinclude/$(ARCH_DIR) -I ../rseq -I.. $(EXTRA_CFLAGS) \
 	$(KHDR_INCLUDES)
+ifeq ($(ARCH),s390)
+	CFLAGS += -march=z10
+endif
 
 no-pie-option := $(call try-run, echo 'int main(void) { return 0; }' | \
         $(CC) -Werror $(CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie)
-- 
cgit 


From 7d42b38de9927ab04162f36f8c448aed8d344e40 Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Date: Mon, 6 Feb 2023 17:45:49 +0100
Subject: KVM: s390: selftest: memop: Pass mop_desc via pointer

The struct is quite large, so this seems nicer.

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Link: https://lore.kernel.org/r/20230206164602.138068-2-scgl@linux.ibm.com
Message-Id: <20230206164602.138068-2-scgl@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/memop.c | 44 +++++++++++++++----------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index 3fd81e58f40c..9c05d1205114 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -48,53 +48,53 @@ struct mop_desc {
 	uint8_t key;
 };
 
-static struct kvm_s390_mem_op ksmo_from_desc(struct mop_desc desc)
+static struct kvm_s390_mem_op ksmo_from_desc(const struct mop_desc *desc)
 {
 	struct kvm_s390_mem_op ksmo = {
-		.gaddr = (uintptr_t)desc.gaddr,
-		.size = desc.size,
-		.buf = ((uintptr_t)desc.buf),
+		.gaddr = (uintptr_t)desc->gaddr,
+		.size = desc->size,
+		.buf = ((uintptr_t)desc->buf),
 		.reserved = "ignored_ignored_ignored_ignored"
 	};
 
-	switch (desc.target) {
+	switch (desc->target) {
 	case LOGICAL:
-		if (desc.mode == READ)
+		if (desc->mode == READ)
 			ksmo.op = KVM_S390_MEMOP_LOGICAL_READ;
-		if (desc.mode == WRITE)
+		if (desc->mode == WRITE)
 			ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
 		break;
 	case SIDA:
-		if (desc.mode == READ)
+		if (desc->mode == READ)
 			ksmo.op = KVM_S390_MEMOP_SIDA_READ;
-		if (desc.mode == WRITE)
+		if (desc->mode == WRITE)
 			ksmo.op = KVM_S390_MEMOP_SIDA_WRITE;
 		break;
 	case ABSOLUTE:
-		if (desc.mode == READ)
+		if (desc->mode == READ)
 			ksmo.op = KVM_S390_MEMOP_ABSOLUTE_READ;
-		if (desc.mode == WRITE)
+		if (desc->mode == WRITE)
 			ksmo.op = KVM_S390_MEMOP_ABSOLUTE_WRITE;
 		break;
 	case INVALID:
 		ksmo.op = -1;
 	}
-	if (desc.f_check)
+	if (desc->f_check)
 		ksmo.flags |= KVM_S390_MEMOP_F_CHECK_ONLY;
-	if (desc.f_inject)
+	if (desc->f_inject)
 		ksmo.flags |= KVM_S390_MEMOP_F_INJECT_EXCEPTION;
-	if (desc._set_flags)
-		ksmo.flags = desc.set_flags;
-	if (desc.f_key) {
+	if (desc->_set_flags)
+		ksmo.flags = desc->set_flags;
+	if (desc->f_key) {
 		ksmo.flags |= KVM_S390_MEMOP_F_SKEY_PROTECTION;
-		ksmo.key = desc.key;
+		ksmo.key = desc->key;
 	}
-	if (desc._ar)
-		ksmo.ar = desc.ar;
+	if (desc->_ar)
+		ksmo.ar = desc->ar;
 	else
 		ksmo.ar = 0;
-	if (desc._sida_offset)
-		ksmo.sida_offset = desc.sida_offset;
+	if (desc->_sida_offset)
+		ksmo.sida_offset = desc->sida_offset;
 
 	return ksmo;
 }
@@ -183,7 +183,7 @@ static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo)
 		else								\
 			__desc.gaddr = __desc.gaddr_v;				\
 	}									\
-	__ksmo = ksmo_from_desc(__desc);					\
+	__ksmo = ksmo_from_desc(&__desc);					\
 	print_memop(__info.vcpu, &__ksmo);					\
 	err##memop_ioctl(__info, &__ksmo);					\
 })
-- 
cgit 


From 12d27074193bda046122b6f94bacba818e4fd6c6 Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Date: Mon, 6 Feb 2023 17:45:50 +0100
Subject: KVM: s390: selftest: memop: Replace macros by functions

Replace the DEFAULT_* test helpers by functions, as they don't
need the extra flexibility.

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Acked-by: Janosch Frank <frankja@linux.ibm.com>
Link: https://lore.kernel.org/r/20230206164602.138068-3-scgl@linux.ibm.com
Message-Id: <20230206164602.138068-3-scgl@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/memop.c | 82 +++++++++++++++----------------
 1 file changed, 39 insertions(+), 43 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index 9c05d1205114..df1c726294b2 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -48,6 +48,8 @@ struct mop_desc {
 	uint8_t key;
 };
 
+const uint8_t NO_KEY = 0xff;
+
 static struct kvm_s390_mem_op ksmo_from_desc(const struct mop_desc *desc)
 {
 	struct kvm_s390_mem_op ksmo = {
@@ -85,7 +87,7 @@ static struct kvm_s390_mem_op ksmo_from_desc(const struct mop_desc *desc)
 		ksmo.flags |= KVM_S390_MEMOP_F_INJECT_EXCEPTION;
 	if (desc->_set_flags)
 		ksmo.flags = desc->set_flags;
-	if (desc->f_key) {
+	if (desc->f_key && desc->key != NO_KEY) {
 		ksmo.flags |= KVM_S390_MEMOP_F_SKEY_PROTECTION;
 		ksmo.key = desc->key;
 	}
@@ -268,34 +270,28 @@ static void prepare_mem12(void)
 #define ASSERT_MEM_EQ(p1, p2, size) \
 	TEST_ASSERT(!memcmp(p1, p2, size), "Memory contents do not match!")
 
-#define DEFAULT_WRITE_READ(copy_cpu, mop_cpu, mop_target_p, size, ...)		\
-({										\
-	struct test_info __copy_cpu = (copy_cpu), __mop_cpu = (mop_cpu);	\
-	enum mop_target __target = (mop_target_p);				\
-	uint32_t __size = (size);						\
-										\
-	prepare_mem12();							\
-	CHECK_N_DO(MOP, __mop_cpu, __target, WRITE, mem1, __size,		\
-			GADDR_V(mem1), ##__VA_ARGS__);				\
-	HOST_SYNC(__copy_cpu, STAGE_COPIED);					\
-	CHECK_N_DO(MOP, __mop_cpu, __target, READ, mem2, __size,		\
-			GADDR_V(mem2), ##__VA_ARGS__);				\
-	ASSERT_MEM_EQ(mem1, mem2, __size);					\
-})
+static void default_write_read(struct test_info copy_cpu, struct test_info mop_cpu,
+			       enum mop_target mop_target, uint32_t size, uint8_t key)
+{
+	prepare_mem12();
+	CHECK_N_DO(MOP, mop_cpu, mop_target, WRITE, mem1, size,
+		   GADDR_V(mem1), KEY(key));
+	HOST_SYNC(copy_cpu, STAGE_COPIED);
+	CHECK_N_DO(MOP, mop_cpu, mop_target, READ, mem2, size,
+		   GADDR_V(mem2), KEY(key));
+	ASSERT_MEM_EQ(mem1, mem2, size);
+}
 
-#define DEFAULT_READ(copy_cpu, mop_cpu, mop_target_p, size, ...)		\
-({										\
-	struct test_info __copy_cpu = (copy_cpu), __mop_cpu = (mop_cpu);	\
-	enum mop_target __target = (mop_target_p);				\
-	uint32_t __size = (size);						\
-										\
-	prepare_mem12();							\
-	CHECK_N_DO(MOP, __mop_cpu, __target, WRITE, mem1, __size,		\
-			GADDR_V(mem1));						\
-	HOST_SYNC(__copy_cpu, STAGE_COPIED);					\
-	CHECK_N_DO(MOP, __mop_cpu, __target, READ, mem2, __size, ##__VA_ARGS__);\
-	ASSERT_MEM_EQ(mem1, mem2, __size);					\
-})
+static void default_read(struct test_info copy_cpu, struct test_info mop_cpu,
+			 enum mop_target mop_target, uint32_t size, uint8_t key)
+{
+	prepare_mem12();
+	CHECK_N_DO(MOP, mop_cpu, mop_target, WRITE, mem1, size, GADDR_V(mem1));
+	HOST_SYNC(copy_cpu, STAGE_COPIED);
+	CHECK_N_DO(MOP, mop_cpu, mop_target, READ, mem2, size,
+		   GADDR_V(mem2), KEY(key));
+	ASSERT_MEM_EQ(mem1, mem2, size);
+}
 
 static void guest_copy(void)
 {
@@ -310,7 +306,7 @@ static void test_copy(void)
 
 	HOST_SYNC(t.vcpu, STAGE_INITED);
 
-	DEFAULT_WRITE_READ(t.vcpu, t.vcpu, LOGICAL, t.size);
+	default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, NO_KEY);
 
 	kvm_vm_free(t.kvm_vm);
 }
@@ -357,26 +353,26 @@ static void test_copy_key(void)
 	HOST_SYNC(t.vcpu, STAGE_SKEYS_SET);
 
 	/* vm, no key */
-	DEFAULT_WRITE_READ(t.vcpu, t.vm, ABSOLUTE, t.size);
+	default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, NO_KEY);
 
 	/* vm/vcpu, machting key or key 0 */
-	DEFAULT_WRITE_READ(t.vcpu, t.vcpu, LOGICAL, t.size, KEY(0));
-	DEFAULT_WRITE_READ(t.vcpu, t.vcpu, LOGICAL, t.size, KEY(9));
-	DEFAULT_WRITE_READ(t.vcpu, t.vm, ABSOLUTE, t.size, KEY(0));
-	DEFAULT_WRITE_READ(t.vcpu, t.vm, ABSOLUTE, t.size, KEY(9));
+	default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 0);
+	default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 9);
+	default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, 0);
+	default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, 9);
 	/*
 	 * There used to be different code paths for key handling depending on
 	 * if the region crossed a page boundary.
 	 * There currently are not, but the more tests the merrier.
 	 */
-	DEFAULT_WRITE_READ(t.vcpu, t.vcpu, LOGICAL, 1, KEY(0));
-	DEFAULT_WRITE_READ(t.vcpu, t.vcpu, LOGICAL, 1, KEY(9));
-	DEFAULT_WRITE_READ(t.vcpu, t.vm, ABSOLUTE, 1, KEY(0));
-	DEFAULT_WRITE_READ(t.vcpu, t.vm, ABSOLUTE, 1, KEY(9));
+	default_write_read(t.vcpu, t.vcpu, LOGICAL, 1, 0);
+	default_write_read(t.vcpu, t.vcpu, LOGICAL, 1, 9);
+	default_write_read(t.vcpu, t.vm, ABSOLUTE, 1, 0);
+	default_write_read(t.vcpu, t.vm, ABSOLUTE, 1, 9);
 
 	/* vm/vcpu, mismatching keys on read, but no fetch protection */
-	DEFAULT_READ(t.vcpu, t.vcpu, LOGICAL, t.size, GADDR_V(mem2), KEY(2));
-	DEFAULT_READ(t.vcpu, t.vm, ABSOLUTE, t.size, GADDR_V(mem1), KEY(2));
+	default_read(t.vcpu, t.vcpu, LOGICAL, t.size, 2);
+	default_read(t.vcpu, t.vm, ABSOLUTE, t.size, 2);
 
 	kvm_vm_free(t.kvm_vm);
 }
@@ -409,7 +405,7 @@ static void test_copy_key_storage_prot_override(void)
 	HOST_SYNC(t.vcpu, STAGE_SKEYS_SET);
 
 	/* vcpu, mismatching keys, storage protection override in effect */
-	DEFAULT_WRITE_READ(t.vcpu, t.vcpu, LOGICAL, t.size, KEY(2));
+	default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 2);
 
 	kvm_vm_free(t.kvm_vm);
 }
@@ -422,8 +418,8 @@ static void test_copy_key_fetch_prot(void)
 	HOST_SYNC(t.vcpu, STAGE_SKEYS_SET);
 
 	/* vm/vcpu, matching key, fetch protection in effect */
-	DEFAULT_READ(t.vcpu, t.vcpu, LOGICAL, t.size, GADDR_V(mem2), KEY(9));
-	DEFAULT_READ(t.vcpu, t.vm, ABSOLUTE, t.size, GADDR_V(mem2), KEY(9));
+	default_read(t.vcpu, t.vcpu, LOGICAL, t.size, 9);
+	default_read(t.vcpu, t.vm, ABSOLUTE, t.size, 9);
 
 	kvm_vm_free(t.kvm_vm);
 }
-- 
cgit 


From de14e014a7c6a69b31b68374e6fe30f8da683505 Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Date: Mon, 6 Feb 2023 17:45:51 +0100
Subject: KVM: s390: selftest: memop: Move testlist into main

This allows checking if the necessary requirements for a test case are
met via an arbitrary expression. In particular, it is easy to check if
certain bits are set in the memop extension capability.

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Link: https://lore.kernel.org/r/20230206164602.138068-4-scgl@linux.ibm.com
Message-Id: <20230206164602.138068-4-scgl@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/memop.c | 131 +++++++++++++++---------------
 1 file changed, 66 insertions(+), 65 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index df1c726294b2..bbc191a13760 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -690,85 +690,86 @@ static void test_errors(void)
 	kvm_vm_free(t.kvm_vm);
 }
 
-struct testdef {
-	const char *name;
-	void (*test)(void);
-	int extension;
-} testlist[] = {
-	{
-		.name = "simple copy",
-		.test = test_copy,
-	},
-	{
-		.name = "generic error checks",
-		.test = test_errors,
-	},
-	{
-		.name = "copy with storage keys",
-		.test = test_copy_key,
-		.extension = 1,
-	},
-	{
-		.name = "copy with key storage protection override",
-		.test = test_copy_key_storage_prot_override,
-		.extension = 1,
-	},
-	{
-		.name = "copy with key fetch protection",
-		.test = test_copy_key_fetch_prot,
-		.extension = 1,
-	},
-	{
-		.name = "copy with key fetch protection override",
-		.test = test_copy_key_fetch_prot_override,
-		.extension = 1,
-	},
-	{
-		.name = "error checks with key",
-		.test = test_errors_key,
-		.extension = 1,
-	},
-	{
-		.name = "termination",
-		.test = test_termination,
-		.extension = 1,
-	},
-	{
-		.name = "error checks with key storage protection override",
-		.test = test_errors_key_storage_prot_override,
-		.extension = 1,
-	},
-	{
-		.name = "error checks without key fetch prot override",
-		.test = test_errors_key_fetch_prot_override_not_enabled,
-		.extension = 1,
-	},
-	{
-		.name = "error checks with key fetch prot override",
-		.test = test_errors_key_fetch_prot_override_enabled,
-		.extension = 1,
-	},
-};
 
 int main(int argc, char *argv[])
 {
 	int extension_cap, idx;
 
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_MEM_OP));
+	extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION);
 
-	ksft_print_header();
+	struct testdef {
+		const char *name;
+		void (*test)(void);
+		bool requirements_met;
+	} testlist[] = {
+		{
+			.name = "simple copy",
+			.test = test_copy,
+			.requirements_met = true,
+		},
+		{
+			.name = "generic error checks",
+			.test = test_errors,
+			.requirements_met = true,
+		},
+		{
+			.name = "copy with storage keys",
+			.test = test_copy_key,
+			.requirements_met = extension_cap > 0,
+		},
+		{
+			.name = "copy with key storage protection override",
+			.test = test_copy_key_storage_prot_override,
+			.requirements_met = extension_cap > 0,
+		},
+		{
+			.name = "copy with key fetch protection",
+			.test = test_copy_key_fetch_prot,
+			.requirements_met = extension_cap > 0,
+		},
+		{
+			.name = "copy with key fetch protection override",
+			.test = test_copy_key_fetch_prot_override,
+			.requirements_met = extension_cap > 0,
+		},
+		{
+			.name = "error checks with key",
+			.test = test_errors_key,
+			.requirements_met = extension_cap > 0,
+		},
+		{
+			.name = "termination",
+			.test = test_termination,
+			.requirements_met = extension_cap > 0,
+		},
+		{
+			.name = "error checks with key storage protection override",
+			.test = test_errors_key_storage_prot_override,
+			.requirements_met = extension_cap > 0,
+		},
+		{
+			.name = "error checks without key fetch prot override",
+			.test = test_errors_key_fetch_prot_override_not_enabled,
+			.requirements_met = extension_cap > 0,
+		},
+		{
+			.name = "error checks with key fetch prot override",
+			.test = test_errors_key_fetch_prot_override_enabled,
+			.requirements_met = extension_cap > 0,
+		},
+	};
 
+	ksft_print_header();
 	ksft_set_plan(ARRAY_SIZE(testlist));
 
-	extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION);
 	for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) {
-		if (extension_cap >= testlist[idx].extension) {
+		if (testlist[idx].requirements_met) {
 			testlist[idx].test();
 			ksft_test_result_pass("%s\n", testlist[idx].name);
 		} else {
-			ksft_test_result_skip("%s - extension level %d not supported\n",
-					      testlist[idx].name,
-					      testlist[idx].extension);
+			ksft_test_result_skip("%s - requirements not met (kernel has extension cap %#x)\n",
+					      testlist[idx].name, extension_cap);
 		}
 	}
 
-- 
cgit 


From 06e5da81c66cd688d12c0c38ef35dd7c1c1845e5 Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Date: Mon, 6 Feb 2023 17:45:52 +0100
Subject: KVM: s390: selftest: memop: Add bad address test

Add a test that tries a real write to a bad address.
The existing CHECK_ONLY test doesn't cover all paths.

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Reviewed-by: Nico Boehr <nrb@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Link: https://lore.kernel.org/r/20230206164602.138068-5-scgl@linux.ibm.com
Message-Id: <20230206164602.138068-5-scgl@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/memop.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index bbc191a13760..00737cceacda 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -641,7 +641,9 @@ static void _test_errors_common(struct test_info info, enum mop_target target, i
 
 	/* Bad guest address: */
 	rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR((void *)~0xfffUL), CHECK_ONLY);
-	TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory access");
+	TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory address with CHECK_ONLY");
+	rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR((void *)~0xfffUL));
+	TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory address on write");
 
 	/* Bad host address: */
 	rv = ERR_MOP(info, target, WRITE, 0, size, GADDR_V(mem1));
-- 
cgit 


From 76a2ee43ed9a34f163d0331e3ba320c34c66c64e Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Date: Mon, 6 Feb 2023 17:45:53 +0100
Subject: KVM: s390: selftest: memop: Fix typo

"acceeded" isn't a word, should be "exceeded".

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Nico Boehr <nrb@linux.ibm.com>
Link: https://lore.kernel.org/r/20230206164602.138068-6-scgl@linux.ibm.com
Message-Id: <20230206164602.138068-6-scgl@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/memop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index 00737cceacda..033a8603a096 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -602,7 +602,7 @@ static void test_errors_key_fetch_prot_override_enabled(void)
 
 	/*
 	 * vcpu, mismatching keys on fetch,
-	 * fetch protection override does not apply because memory range acceeded
+	 * fetch protection override does not apply because memory range exceeded
 	 */
 	CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, 2048 + 1, GADDR_V(0), KEY(2));
 	CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, PAGE_SIZE + 2048 + 1,
-- 
cgit 


From 12c12a9924b475d6a20760f992fbf5e080747ae3 Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Date: Mon, 6 Feb 2023 17:45:54 +0100
Subject: KVM: s390: selftest: memop: Fix wrong address being used in test

The guest code sets the key for mem1 only. In order to provoke a
protection exception the test codes needs to address mem1.

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Reviewed-by: Nico Boehr <nrb@linux.ibm.com>
Link: https://lore.kernel.org/r/20230206164602.138068-7-scgl@linux.ibm.com
Message-Id: <20230206164602.138068-7-scgl@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/memop.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index 033a8603a096..1ae5c01f9904 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -450,9 +450,9 @@ static void test_errors_key(void)
 
 	/* vm/vcpu, mismatching keys, fetch protection in effect */
 	CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2));
-	CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, t.size, GADDR_V(mem2), KEY(2));
+	CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, t.size, GADDR_V(mem1), KEY(2));
 	CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2));
-	CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, t.size, GADDR_V(mem2), KEY(2));
+	CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, t.size, GADDR_V(mem1), KEY(2));
 
 	kvm_vm_free(t.kvm_vm);
 }
-- 
cgit 


From dc55ceaef616bdac73810d6588c23a5b50d24911 Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Date: Mon, 6 Feb 2023 17:45:55 +0100
Subject: KVM: s390: selftest: memop: Fix integer literal

The address is a 64 bit value, specifying a 32 bit value can crash the
guest. In this case things worked out with -O2 but not -O0.

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Fixes: 1bb873495a9e ("KVM: s390: selftests: Add more copy memop tests")
Reviewed-by: Thomas Huth <thuth@redhat.com>
Link: https://lore.kernel.org/r/20230206164602.138068-8-scgl@linux.ibm.com
Message-Id: <20230206164602.138068-8-scgl@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/memop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index 1ae5c01f9904..c5fec84ef3c2 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -514,7 +514,7 @@ static void guest_copy_key_fetch_prot_override(void)
 	GUEST_SYNC(STAGE_INITED);
 	set_storage_key_range(0, PAGE_SIZE, 0x18);
 	set_storage_key_range((void *)last_page_addr, PAGE_SIZE, 0x0);
-	asm volatile ("sske %[key],%[addr]\n" :: [addr] "r"(0), [key] "r"(0x18) : "cc");
+	asm volatile ("sske %[key],%[addr]\n" :: [addr] "r"(0L), [key] "r"(0x18) : "cc");
 	GUEST_SYNC(STAGE_SKEYS_SET);
 
 	for (;;) {
-- 
cgit 


From 0dd714bfd200068e3c0d6b37861056367868e612 Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Date: Tue, 7 Feb 2023 17:42:25 +0100
Subject: KVM: s390: selftest: memop: Add cmpxchg tests

Test successful exchange, unsuccessful exchange, storage key protection
and invalid arguments.

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Acked-by: Janosch Frank <frankja@linux.ibm.com>
Link: https://lore.kernel.org/r/20230207164225.2114706-1-scgl@linux.ibm.com
Message-Id: <20230207164225.2114706-1-scgl@linux.ibm.com>
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
---
 tools/testing/selftests/kvm/s390x/memop.c | 407 ++++++++++++++++++++++++++++--
 1 file changed, 392 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index c5fec84ef3c2..8e4b94d7b8dd 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
+#include <pthread.h>
 
 #include <linux/bits.h>
 
@@ -26,6 +27,7 @@ enum mop_target {
 enum mop_access_mode {
 	READ,
 	WRITE,
+	CMPXCHG,
 };
 
 struct mop_desc {
@@ -44,13 +46,16 @@ struct mop_desc {
 	enum mop_access_mode mode;
 	void *buf;
 	uint32_t sida_offset;
+	void *old;
+	uint8_t old_value[16];
+	bool *cmpxchg_success;
 	uint8_t ar;
 	uint8_t key;
 };
 
 const uint8_t NO_KEY = 0xff;
 
-static struct kvm_s390_mem_op ksmo_from_desc(const struct mop_desc *desc)
+static struct kvm_s390_mem_op ksmo_from_desc(struct mop_desc *desc)
 {
 	struct kvm_s390_mem_op ksmo = {
 		.gaddr = (uintptr_t)desc->gaddr,
@@ -77,6 +82,11 @@ static struct kvm_s390_mem_op ksmo_from_desc(const struct mop_desc *desc)
 			ksmo.op = KVM_S390_MEMOP_ABSOLUTE_READ;
 		if (desc->mode == WRITE)
 			ksmo.op = KVM_S390_MEMOP_ABSOLUTE_WRITE;
+		if (desc->mode == CMPXCHG) {
+			ksmo.op = KVM_S390_MEMOP_ABSOLUTE_CMPXCHG;
+			ksmo.old_addr = (uint64_t)desc->old;
+			memcpy(desc->old_value, desc->old, desc->size);
+		}
 		break;
 	case INVALID:
 		ksmo.op = -1;
@@ -135,9 +145,13 @@ static void print_memop(struct kvm_vcpu *vcpu, const struct kvm_s390_mem_op *ksm
 	case KVM_S390_MEMOP_ABSOLUTE_WRITE:
 		printf("ABSOLUTE, WRITE, ");
 		break;
+	case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG:
+		printf("ABSOLUTE, CMPXCHG, ");
+		break;
 	}
-	printf("gaddr=%llu, size=%u, buf=%llu, ar=%u, key=%u",
-	       ksmo->gaddr, ksmo->size, ksmo->buf, ksmo->ar, ksmo->key);
+	printf("gaddr=%llu, size=%u, buf=%llu, ar=%u, key=%u, old_addr=%llx",
+	       ksmo->gaddr, ksmo->size, ksmo->buf, ksmo->ar, ksmo->key,
+	       ksmo->old_addr);
 	if (ksmo->flags & KVM_S390_MEMOP_F_CHECK_ONLY)
 		printf(", CHECK_ONLY");
 	if (ksmo->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION)
@@ -147,24 +161,30 @@ static void print_memop(struct kvm_vcpu *vcpu, const struct kvm_s390_mem_op *ksm
 	puts(")");
 }
 
-static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo)
+static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo,
+			   struct mop_desc *desc)
 {
 	struct kvm_vcpu *vcpu = info.vcpu;
 
 	if (!vcpu)
-		vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo);
+		return __vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo);
 	else
-		vcpu_ioctl(vcpu, KVM_S390_MEM_OP, ksmo);
+		return __vcpu_ioctl(vcpu, KVM_S390_MEM_OP, ksmo);
 }
 
-static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo)
+static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo,
+			struct mop_desc *desc)
 {
-	struct kvm_vcpu *vcpu = info.vcpu;
+	int r;
 
-	if (!vcpu)
-		return __vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo);
-	else
-		return __vcpu_ioctl(vcpu, KVM_S390_MEM_OP, ksmo);
+	r = err_memop_ioctl(info, ksmo, desc);
+	if (ksmo->op == KVM_S390_MEMOP_ABSOLUTE_CMPXCHG) {
+		if (desc->cmpxchg_success) {
+			int diff = memcmp(desc->old_value, desc->old, desc->size);
+			*desc->cmpxchg_success = !diff;
+		}
+	}
+	TEST_ASSERT(!r, __KVM_IOCTL_ERROR("KVM_S390_MEM_OP", r));
 }
 
 #define MEMOP(err, info_p, mop_target_p, access_mode_p, buf_p, size_p, ...)	\
@@ -187,7 +207,7 @@ static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo)
 	}									\
 	__ksmo = ksmo_from_desc(&__desc);					\
 	print_memop(__info.vcpu, &__ksmo);					\
-	err##memop_ioctl(__info, &__ksmo);					\
+	err##memop_ioctl(__info, &__ksmo, &__desc);				\
 })
 
 #define MOP(...) MEMOP(, __VA_ARGS__)
@@ -201,6 +221,8 @@ static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo)
 #define AR(a) ._ar = 1, .ar = (a)
 #define KEY(a) .f_key = 1, .key = (a)
 #define INJECT .f_inject = 1
+#define CMPXCHG_OLD(o) .old = (o)
+#define CMPXCHG_SUCCESS(s) .cmpxchg_success = (s)
 
 #define CHECK_N_DO(f, ...) ({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); })
 
@@ -210,8 +232,8 @@ static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo)
 #define CR0_FETCH_PROTECTION_OVERRIDE	(1UL << (63 - 38))
 #define CR0_STORAGE_PROTECTION_OVERRIDE	(1UL << (63 - 39))
 
-static uint8_t mem1[65536];
-static uint8_t mem2[65536];
+static uint8_t __aligned(PAGE_SIZE) mem1[65536];
+static uint8_t __aligned(PAGE_SIZE) mem2[65536];
 
 struct test_default {
 	struct kvm_vm *kvm_vm;
@@ -243,6 +265,8 @@ enum stage {
 	STAGE_SKEYS_SET,
 	/* Guest copied memory (locations up to test case) */
 	STAGE_COPIED,
+	/* End of guest code reached */
+	STAGE_DONE,
 };
 
 #define HOST_SYNC(info_p, stage)					\
@@ -254,6 +278,9 @@ enum stage {
 									\
 	vcpu_run(__vcpu);						\
 	get_ucall(__vcpu, &uc);						\
+	if (uc.cmd == UCALL_ABORT) {					\
+		REPORT_GUEST_ASSERT_2(uc, "hints: %lu, %lu");		\
+	}								\
 	ASSERT_EQ(uc.cmd, UCALL_SYNC);					\
 	ASSERT_EQ(uc.args[1], __stage);					\
 })									\
@@ -293,6 +320,44 @@ static void default_read(struct test_info copy_cpu, struct test_info mop_cpu,
 	ASSERT_MEM_EQ(mem1, mem2, size);
 }
 
+static void default_cmpxchg(struct test_default *test, uint8_t key)
+{
+	for (int size = 1; size <= 16; size *= 2) {
+		for (int offset = 0; offset < 16; offset += size) {
+			uint8_t __aligned(16) new[16] = {};
+			uint8_t __aligned(16) old[16];
+			bool succ;
+
+			prepare_mem12();
+			default_write_read(test->vcpu, test->vcpu, LOGICAL, 16, NO_KEY);
+
+			memcpy(&old, mem1, 16);
+			MOP(test->vm, ABSOLUTE, CMPXCHG, new + offset,
+			    size, GADDR_V(mem1 + offset),
+			    CMPXCHG_OLD(old + offset),
+			    CMPXCHG_SUCCESS(&succ), KEY(key));
+			HOST_SYNC(test->vcpu, STAGE_COPIED);
+			MOP(test->vm, ABSOLUTE, READ, mem2, 16, GADDR_V(mem2));
+			TEST_ASSERT(succ, "exchange of values should succeed");
+			memcpy(mem1 + offset, new + offset, size);
+			ASSERT_MEM_EQ(mem1, mem2, 16);
+
+			memcpy(&old, mem1, 16);
+			new[offset]++;
+			old[offset]++;
+			MOP(test->vm, ABSOLUTE, CMPXCHG, new + offset,
+			    size, GADDR_V(mem1 + offset),
+			    CMPXCHG_OLD(old + offset),
+			    CMPXCHG_SUCCESS(&succ), KEY(key));
+			HOST_SYNC(test->vcpu, STAGE_COPIED);
+			MOP(test->vm, ABSOLUTE, READ, mem2, 16, GADDR_V(mem2));
+			TEST_ASSERT(!succ, "exchange of values should not succeed");
+			ASSERT_MEM_EQ(mem1, mem2, 16);
+			ASSERT_MEM_EQ(&old, mem1, 16);
+		}
+	}
+}
+
 static void guest_copy(void)
 {
 	GUEST_SYNC(STAGE_INITED);
@@ -377,6 +442,248 @@ static void test_copy_key(void)
 	kvm_vm_free(t.kvm_vm);
 }
 
+static void test_cmpxchg_key(void)
+{
+	struct test_default t = test_default_init(guest_copy_key);
+
+	HOST_SYNC(t.vcpu, STAGE_SKEYS_SET);
+
+	default_cmpxchg(&t, NO_KEY);
+	default_cmpxchg(&t, 0);
+	default_cmpxchg(&t, 9);
+
+	kvm_vm_free(t.kvm_vm);
+}
+
+static __uint128_t cut_to_size(int size, __uint128_t val)
+{
+	switch (size) {
+	case 1:
+		return (uint8_t)val;
+	case 2:
+		return (uint16_t)val;
+	case 4:
+		return (uint32_t)val;
+	case 8:
+		return (uint64_t)val;
+	case 16:
+		return val;
+	}
+	GUEST_ASSERT_1(false, "Invalid size");
+	return 0;
+}
+
+static bool popcount_eq(__uint128_t a, __uint128_t b)
+{
+	unsigned int count_a, count_b;
+
+	count_a = __builtin_popcountl((uint64_t)(a >> 64)) +
+		  __builtin_popcountl((uint64_t)a);
+	count_b = __builtin_popcountl((uint64_t)(b >> 64)) +
+		  __builtin_popcountl((uint64_t)b);
+	return count_a == count_b;
+}
+
+static __uint128_t rotate(int size, __uint128_t val, int amount)
+{
+	unsigned int bits = size * 8;
+
+	amount = (amount + bits) % bits;
+	val = cut_to_size(size, val);
+	return (val << (bits - amount)) | (val >> amount);
+}
+
+const unsigned int max_block = 16;
+
+static void choose_block(bool guest, int i, int *size, int *offset)
+{
+	unsigned int rand;
+
+	rand = i;
+	if (guest) {
+		rand = rand * 19 + 11;
+		*size = 1 << ((rand % 3) + 2);
+		rand = rand * 19 + 11;
+		*offset = (rand % max_block) & ~(*size - 1);
+	} else {
+		rand = rand * 17 + 5;
+		*size = 1 << (rand % 5);
+		rand = rand * 17 + 5;
+		*offset = (rand % max_block) & ~(*size - 1);
+	}
+}
+
+static __uint128_t permutate_bits(bool guest, int i, int size, __uint128_t old)
+{
+	unsigned int rand;
+	int amount;
+	bool swap;
+
+	rand = i;
+	rand = rand * 3 + 1;
+	if (guest)
+		rand = rand * 3 + 1;
+	swap = rand % 2 == 0;
+	if (swap) {
+		int i, j;
+		__uint128_t new;
+		uint8_t byte0, byte1;
+
+		rand = rand * 3 + 1;
+		i = rand % size;
+		rand = rand * 3 + 1;
+		j = rand % size;
+		if (i == j)
+			return old;
+		new = rotate(16, old, i * 8);
+		byte0 = new & 0xff;
+		new &= ~0xff;
+		new = rotate(16, new, -i * 8);
+		new = rotate(16, new, j * 8);
+		byte1 = new & 0xff;
+		new = (new & ~0xff) | byte0;
+		new = rotate(16, new, -j * 8);
+		new = rotate(16, new, i * 8);
+		new = new | byte1;
+		new = rotate(16, new, -i * 8);
+		return new;
+	}
+	rand = rand * 3 + 1;
+	amount = rand % (size * 8);
+	return rotate(size, old, amount);
+}
+
+static bool _cmpxchg(int size, void *target, __uint128_t *old_addr, __uint128_t new)
+{
+	bool ret;
+
+	switch (size) {
+	case 4: {
+			uint32_t old = *old_addr;
+
+			asm volatile ("cs %[old],%[new],%[address]"
+			    : [old] "+d" (old),
+			      [address] "+Q" (*(uint32_t *)(target))
+			    : [new] "d" ((uint32_t)new)
+			    : "cc"
+			);
+			ret = old == (uint32_t)*old_addr;
+			*old_addr = old;
+			return ret;
+		}
+	case 8: {
+			uint64_t old = *old_addr;
+
+			asm volatile ("csg %[old],%[new],%[address]"
+			    : [old] "+d" (old),
+			      [address] "+Q" (*(uint64_t *)(target))
+			    : [new] "d" ((uint64_t)new)
+			    : "cc"
+			);
+			ret = old == (uint64_t)*old_addr;
+			*old_addr = old;
+			return ret;
+		}
+	case 16: {
+			__uint128_t old = *old_addr;
+
+			asm volatile ("cdsg %[old],%[new],%[address]"
+			    : [old] "+d" (old),
+			      [address] "+Q" (*(__uint128_t *)(target))
+			    : [new] "d" (new)
+			    : "cc"
+			);
+			ret = old == *old_addr;
+			*old_addr = old;
+			return ret;
+		}
+	}
+	GUEST_ASSERT_1(false, "Invalid size");
+	return 0;
+}
+
+const unsigned int cmpxchg_iter_outer = 100, cmpxchg_iter_inner = 10000;
+
+static void guest_cmpxchg_key(void)
+{
+	int size, offset;
+	__uint128_t old, new;
+
+	set_storage_key_range(mem1, max_block, 0x10);
+	set_storage_key_range(mem2, max_block, 0x10);
+	GUEST_SYNC(STAGE_SKEYS_SET);
+
+	for (int i = 0; i < cmpxchg_iter_outer; i++) {
+		do {
+			old = 1;
+		} while (!_cmpxchg(16, mem1, &old, 0));
+		for (int j = 0; j < cmpxchg_iter_inner; j++) {
+			choose_block(true, i + j, &size, &offset);
+			do {
+				new = permutate_bits(true, i + j, size, old);
+			} while (!_cmpxchg(size, mem2 + offset, &old, new));
+		}
+	}
+
+	GUEST_SYNC(STAGE_DONE);
+}
+
+static void *run_guest(void *data)
+{
+	struct test_info *info = data;
+
+	HOST_SYNC(*info, STAGE_DONE);
+	return NULL;
+}
+
+static char *quad_to_char(__uint128_t *quad, int size)
+{
+	return ((char *)quad) + (sizeof(*quad) - size);
+}
+
+static void test_cmpxchg_key_concurrent(void)
+{
+	struct test_default t = test_default_init(guest_cmpxchg_key);
+	int size, offset;
+	__uint128_t old, new;
+	bool success;
+	pthread_t thread;
+
+	HOST_SYNC(t.vcpu, STAGE_SKEYS_SET);
+	prepare_mem12();
+	MOP(t.vcpu, LOGICAL, WRITE, mem1, max_block, GADDR_V(mem2));
+	pthread_create(&thread, NULL, run_guest, &t.vcpu);
+
+	for (int i = 0; i < cmpxchg_iter_outer; i++) {
+		do {
+			old = 0;
+			new = 1;
+			MOP(t.vm, ABSOLUTE, CMPXCHG, &new,
+			    sizeof(new), GADDR_V(mem1),
+			    CMPXCHG_OLD(&old),
+			    CMPXCHG_SUCCESS(&success), KEY(1));
+		} while (!success);
+		for (int j = 0; j < cmpxchg_iter_inner; j++) {
+			choose_block(false, i + j, &size, &offset);
+			do {
+				new = permutate_bits(false, i + j, size, old);
+				MOP(t.vm, ABSOLUTE, CMPXCHG, quad_to_char(&new, size),
+				    size, GADDR_V(mem2 + offset),
+				    CMPXCHG_OLD(quad_to_char(&old, size)),
+				    CMPXCHG_SUCCESS(&success), KEY(1));
+			} while (!success);
+		}
+	}
+
+	pthread_join(thread, NULL);
+
+	MOP(t.vcpu, LOGICAL, READ, mem2, max_block, GADDR_V(mem2));
+	TEST_ASSERT(popcount_eq(*(__uint128_t *)mem1, *(__uint128_t *)mem2),
+		    "Must retain number of set bits");
+
+	kvm_vm_free(t.kvm_vm);
+}
+
 static void guest_copy_key_fetch_prot(void)
 {
 	/*
@@ -457,6 +764,24 @@ static void test_errors_key(void)
 	kvm_vm_free(t.kvm_vm);
 }
 
+static void test_errors_cmpxchg_key(void)
+{
+	struct test_default t = test_default_init(guest_copy_key_fetch_prot);
+	int i;
+
+	HOST_SYNC(t.vcpu, STAGE_INITED);
+	HOST_SYNC(t.vcpu, STAGE_SKEYS_SET);
+
+	for (i = 1; i <= 16; i *= 2) {
+		__uint128_t old = 0;
+
+		ERR_PROT_MOP(t.vm, ABSOLUTE, CMPXCHG, mem2, i, GADDR_V(mem2),
+			     CMPXCHG_OLD(&old), KEY(2));
+	}
+
+	kvm_vm_free(t.kvm_vm);
+}
+
 static void test_termination(void)
 {
 	struct test_default t = test_default_init(guest_error_key);
@@ -692,6 +1017,38 @@ static void test_errors(void)
 	kvm_vm_free(t.kvm_vm);
 }
 
+static void test_errors_cmpxchg(void)
+{
+	struct test_default t = test_default_init(guest_idle);
+	__uint128_t old;
+	int rv, i, power = 1;
+
+	HOST_SYNC(t.vcpu, STAGE_INITED);
+
+	for (i = 0; i < 32; i++) {
+		if (i == power) {
+			power *= 2;
+			continue;
+		}
+		rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR_V(mem1),
+			     CMPXCHG_OLD(&old));
+		TEST_ASSERT(rv == -1 && errno == EINVAL,
+			    "ioctl allows bad size for cmpxchg");
+	}
+	for (i = 1; i <= 16; i *= 2) {
+		rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR((void *)~0xfffUL),
+			     CMPXCHG_OLD(&old));
+		TEST_ASSERT(rv > 0, "ioctl allows bad guest address for cmpxchg");
+	}
+	for (i = 2; i <= 16; i *= 2) {
+		rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR_V(mem1 + 1),
+			     CMPXCHG_OLD(&old));
+		TEST_ASSERT(rv == -1 && errno == EINVAL,
+			    "ioctl allows bad alignment for cmpxchg");
+	}
+
+	kvm_vm_free(t.kvm_vm);
+}
 
 int main(int argc, char *argv[])
 {
@@ -720,6 +1077,16 @@ int main(int argc, char *argv[])
 			.test = test_copy_key,
 			.requirements_met = extension_cap > 0,
 		},
+		{
+			.name = "cmpxchg with storage keys",
+			.test = test_cmpxchg_key,
+			.requirements_met = extension_cap & 0x2,
+		},
+		{
+			.name = "concurrently cmpxchg with storage keys",
+			.test = test_cmpxchg_key_concurrent,
+			.requirements_met = extension_cap & 0x2,
+		},
 		{
 			.name = "copy with key storage protection override",
 			.test = test_copy_key_storage_prot_override,
@@ -740,6 +1107,16 @@ int main(int argc, char *argv[])
 			.test = test_errors_key,
 			.requirements_met = extension_cap > 0,
 		},
+		{
+			.name = "error checks for cmpxchg with key",
+			.test = test_errors_cmpxchg_key,
+			.requirements_met = extension_cap & 0x2,
+		},
+		{
+			.name = "error checks for cmpxchg",
+			.test = test_errors_cmpxchg,
+			.requirements_met = extension_cap & 0x2,
+		},
 		{
 			.name = "termination",
 			.test = test_termination,
-- 
cgit 


From 6012b8202022d9eec0c09cbb3212e49ccd273438 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 2 Feb 2023 17:30:44 +0000
Subject: kselftest/arm64: Copy whole EXTRA context

When copying the EXTRA context our calculation of the amount of data we
need to copy is incorrect, we only calculate the amount of data needed
within uc_mcontext.__reserved, not taking account of the fixed portion
of the context. Add in the offset of the reserved data so that we copy
everything we should.

This will only cause test failures in cases where the last context in the
EXTRA context is smaller than the missing data since we don't currently
validate any of the register data and all the buffers we copy into are
statically allocated so default to zero meaning that if we walk beyond the
end of what we copied we'll encounter what looks like a context with magic
and length both 0 which is a valid terminator record.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230201-arm64-kselftest-full-extra-v1-1-93741f32dd29@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/test_signals_utils.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index 308e229e58ab..746a4f70f082 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -192,8 +192,10 @@ static bool handle_signal_copyctx(struct tdescr *td,
 		 * in the copy, this was previously validated in
 		 * ASSERT_GOOD_CONTEXT().
 		 */
-		to_copy = offset + sizeof(struct extra_context) + 16 +
-			extra->size;
+		to_copy = __builtin_offsetof(ucontext_t,
+					     uc_mcontext.__reserved);
+		to_copy += offset + sizeof(struct extra_context) + 16;
+		to_copy += extra->size;
 		copied_extra = (struct extra_context *)&(td->live_uc->uc_mcontext.__reserved[offset]);
 	} else {
 		copied_extra = NULL;
-- 
cgit 


From 2c4192c0a7f2d628b5c1667577316ee9e7471e20 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 2 Feb 2023 17:31:25 +0000
Subject: kselftest/arm64: Don't require FA64 for streaming SVE+ZA tests

During early development a dependedncy was added on having FA64
available so we could use the full FPSIMD register set in the signal
handler which got copied over into the SSVE+ZA registers test case.
Subsequently the ABI was finialised so the handler is run with streaming
mode disabled meaning this is redundant but the dependency was never
removed, do so now.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230202-arm64-kselftest-sve-za-fa64-v1-1-5c5f3dabe441@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c
index 1f62621794d5..9dc5f128bbc0 100644
--- a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c
@@ -154,12 +154,7 @@ static int sme_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
 struct tdescr tde = {
 	.name = "Streaming SVE registers",
 	.descr = "Check that we get the right Streaming SVE registers reported",
-	/*
-	 * We shouldn't require FA64 but things like memset() used in the
-	 * helpers might use unsupported instructions so for now disable
-	 * the test unless we've got the full instruction set.
-	 */
-	.feats_required = FEAT_SME | FEAT_SME_FA64,
+	.feats_required = FEAT_SME,
 	.timeout = 3,
 	.init = sme_get_vls,
 	.run = sme_regs,
-- 
cgit 


From 56a2df7615fa050cc67b89245b2a482849077939 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 2 Feb 2023 12:28:39 +0100
Subject: tools/resolve_btfids: Compile resolve_btfids as host program

Making resolve_btfids to be compiled as host program so
we can avoid cross compile issues as reported by Nathan.

Also we no longer need HOST_OVERRIDES for BINARY target,
just for 'prepare' targets.

Fixes: 13e07691a16f ("tools/resolve_btfids: Alter how HOSTCC is forced")
Reported-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/bpf/20230202112839.1131892-1-jolsa@kernel.org
---
 tools/bpf/resolve_btfids/Build    | 4 +++-
 tools/bpf/resolve_btfids/Makefile | 9 ++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/Build b/tools/bpf/resolve_btfids/Build
index ae82da03f9bf..077de3829c72 100644
--- a/tools/bpf/resolve_btfids/Build
+++ b/tools/bpf/resolve_btfids/Build
@@ -1,3 +1,5 @@
+hostprogs := resolve_btfids
+
 resolve_btfids-y += main.o
 resolve_btfids-y += rbtree.o
 resolve_btfids-y += zalloc.o
@@ -7,4 +9,4 @@ resolve_btfids-y += str_error_r.o
 
 $(OUTPUT)%.o: ../../lib/%.c FORCE
 	$(call rule_mkdir)
-	$(call if_changed_dep,cc_o_c)
+	$(call if_changed_dep,host_cc_o_c)
diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index daed388aa5d7..abdd68ac08f4 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -22,6 +22,9 @@ HOST_OVERRIDES := AR="$(HOSTAR)" CC="$(HOSTCC)" LD="$(HOSTLD)" ARCH="$(HOSTARCH)
 		  EXTRA_CFLAGS="$(HOSTCFLAGS) $(KBUILD_HOSTCFLAGS)"
 
 RM      ?= rm
+HOSTCC  ?= gcc
+HOSTLD  ?= ld
+HOSTAR  ?= ar
 CROSS_COMPILE =
 
 OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/
@@ -64,7 +67,7 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU
 LIBELF_FLAGS := $(shell $(HOSTPKG_CONFIG) libelf --cflags 2>/dev/null)
 LIBELF_LIBS  := $(shell $(HOSTPKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf)
 
-CFLAGS += -g \
+HOSTCFLAGS += -g \
           -I$(srctree)/tools/include \
           -I$(srctree)/tools/include/uapi \
           -I$(LIBBPF_INCLUDE) \
@@ -73,11 +76,11 @@ CFLAGS += -g \
 
 LIBS = $(LIBELF_LIBS) -lz
 
-export srctree OUTPUT CFLAGS Q
+export srctree OUTPUT HOSTCFLAGS Q HOSTCC HOSTLD HOSTAR
 include $(srctree)/tools/build/Makefile.include
 
 $(BINARY_IN): fixdep FORCE prepare | $(OUTPUT)
-	$(Q)$(MAKE) $(build)=resolve_btfids $(HOST_OVERRIDES)
+	$(Q)$(MAKE) $(build)=resolve_btfids
 
 $(BINARY): $(BPFOBJ) $(SUBCMDOBJ) $(BINARY_IN)
 	$(call msg,LINK,$@)
-- 
cgit 


From e0975ab92f2406fd3e12834f62dc57cb10404f85 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 2 Feb 2023 14:42:53 -0800
Subject: tools/resolve_btfids: Tidy HOST_OVERRIDES

Don't set EXTRA_CFLAGS to HOSTCFLAGS, ensure CROSS_COMPILE isn't
passed through.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20230202224253.40283-1-irogers@google.com
---
 tools/bpf/resolve_btfids/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index abdd68ac08f4..2abdd85b4a08 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -17,9 +17,9 @@ else
   MAKEFLAGS=--no-print-directory
 endif
 
-# always use the host compiler
+# Overrides for the prepare step libraries.
 HOST_OVERRIDES := AR="$(HOSTAR)" CC="$(HOSTCC)" LD="$(HOSTLD)" ARCH="$(HOSTARCH)" \
-		  EXTRA_CFLAGS="$(HOSTCFLAGS) $(KBUILD_HOSTCFLAGS)"
+		  CROSS_COMPILE=""
 
 RM      ?= rm
 HOSTCC  ?= gcc
-- 
cgit 


From 02fc0e73e852f78ec374004f924b9d84275d0006 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 7 Feb 2023 12:11:03 +0100
Subject: libbpf: Always use libbpf_err to return an error in bpf_xdp_query()

In order to properly set errno, rely on libbpf_err utility routine in
bpf_xdp_query() to return an error to the caller.

Fixes: 04d58f1b26a4 ("libbpf: add API to get XDP/XSK supported features")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/827d40181f9f90fb37702f44328e1614df7c0503.1675768112.git.lorenzo@kernel.org
---
 tools/lib/bpf/netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index 32b13b7a11b0..cb082a04ffa8 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -480,7 +480,7 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts)
 
 	err = nlattr_add(&req, NETDEV_A_DEV_IFINDEX, &ifindex, sizeof(ifindex));
 	if (err < 0)
-		return err;
+		return libbpf_err(err);
 
 	err = libbpf_netlink_send_recv(&req, NETLINK_GENERIC,
 				       parse_xdp_features, NULL, &md);
-- 
cgit 


From e5b426879fc39063d940902adbf38d5485925304 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan@redhat.com>
Date: Wed, 18 Jan 2023 17:21:32 +0800
Subject: KVM: selftests: Remove duplicate VM creation in memslot_perf_test

Remove a spurious call to __vm_create_with_one_vcpu() that was introduced
by a merge gone sideways.

Fixes: eb5618911af0 ("Merge tag 'kvmarm-6.2' of https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD")
Signed-off-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Link: https://lore.kernel.org/r/20230118092133.320003-2-gshan@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/memslot_perf_test.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index e6587e193490..adbbcca3e354 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -308,8 +308,6 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
 	data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
 	TEST_ASSERT(data->hva_slots, "malloc() fail");
 
-	data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code);
-
 	pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
 		data->nslots, data->pages_per_slot, rempages);
 
-- 
cgit 


From 45f679550d7295b52a1cd8b7afaf2bd31453a5b0 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan@redhat.com>
Date: Wed, 18 Jan 2023 17:21:33 +0800
Subject: KVM: selftests: Assign guest page size in sync area early in
 memslot_perf_test

The guest page size in the synchronization area is needed by all test
cases. So it's reasonable to set it in the unified preparation function
(prepare_vm()).

Signed-off-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Link: https://lore.kernel.org/r/20230118092133.320003-3-gshan@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/memslot_perf_test.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index adbbcca3e354..4210cd21d159 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -347,6 +347,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
 	virt_map(data->vm, MEM_GPA, MEM_GPA, data->npages);
 
 	sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL);
+	sync->guest_page_size = data->vm->page_size;
 	atomic_init(&sync->start_flag, false);
 	atomic_init(&sync->exit_flag, false);
 	atomic_init(&sync->sync_flag, false);
@@ -808,8 +809,6 @@ static bool test_execute(int nslots, uint64_t *maxslots,
 	}
 
 	sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL);
-
-	sync->guest_page_size = data->vm->page_size;
 	if (tdata->prepare &&
 	    !tdata->prepare(data, sync, maxslots)) {
 		ret = false;
-- 
cgit 


From a635a8c3df66ab68dc088c08a4e9e955e22c0e64 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 7 Feb 2023 14:04:17 +0100
Subject: selftests: mptcp: allow more slack for slow test-case

A test-case is frequently failing on some extremely slow VMs.
The mptcp transfer completes before the script is able to do
all the required PM manipulation.

Address the issue in the simplest possible way, making the
transfer even more slow.

Additionally dump more info in case of failures, to help debugging
similar problems in the future and init dump_stats var.

Fixes: e274f7154008 ("selftests: mptcp: add subflow limits test-cases")
Cc: stable@vger.kernel.org
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/323
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index d11d3d566608..f8a969300ef4 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -1694,6 +1694,7 @@ chk_subflow_nr()
 	local subflow_nr=$3
 	local cnt1
 	local cnt2
+	local dump_stats
 
 	if [ -n "${need_title}" ]; then
 		printf "%03u %-36s %s" "${TEST_COUNT}" "${TEST_NAME}" "${msg}"
@@ -1711,7 +1712,12 @@ chk_subflow_nr()
 		echo "[ ok ]"
 	fi
 
-	[ "${dump_stats}" = 1 ] && ( ss -N $ns1 -tOni ; ss -N $ns1 -tOni | grep token; ip -n $ns1 mptcp endpoint )
+	if [ "${dump_stats}" = 1 ]; then
+		ss -N $ns1 -tOni
+		ss -N $ns1 -tOni | grep token
+		ip -n $ns1 mptcp endpoint
+		dump_stats
+	fi
 }
 
 chk_link_usage()
@@ -3069,7 +3075,7 @@ endpoint_tests()
 		pm_nl_set_limits $ns1 1 1
 		pm_nl_set_limits $ns2 1 1
 		pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
-		run_tests $ns1 $ns2 10.0.1.1 4 0 0 slow &
+		run_tests $ns1 $ns2 10.0.1.1 4 0 0 speed_20 &
 
 		wait_mpj $ns2
 		pm_nl_del_endpoint $ns2 2 10.0.2.2
-- 
cgit 


From 070d6dafacbaa9d1f2e4e3edc263853d194af15e Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts@tessares.net>
Date: Tue, 7 Feb 2023 14:04:18 +0100
Subject: selftests: mptcp: stop tests earlier

These 'endpoint' tests from 'mptcp_join.sh' selftest start a transfer in
the background and check the status during this transfer.

Once the expected events have been recorded, there is no reason to wait
for the data transfer to finish. It can be stopped earlier to reduce the
execution time by more than half.

For these tests, the exchanged data were not verified. Errors, if any,
were ignored but that's fine, plenty of other tests are looking at that.
It is then OK to mute stderr now that we are sure errors will be printed
(and still ignored) because the transfer is stopped before the end.

Fixes: e274f7154008 ("selftests: mptcp: add subflow limits test-cases")
Cc: stable@vger.kernel.org
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index f8a969300ef4..079f8f46849d 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -498,6 +498,12 @@ kill_events_pids()
 	kill_wait $evts_ns2_pid
 }
 
+kill_tests_wait()
+{
+	kill -SIGUSR1 $(ip netns pids $ns2) $(ip netns pids $ns1)
+	wait
+}
+
 pm_nl_set_limits()
 {
 	local ns=$1
@@ -3055,7 +3061,7 @@ endpoint_tests()
 		pm_nl_set_limits $ns1 2 2
 		pm_nl_set_limits $ns2 2 2
 		pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
-		run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow &
+		run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow 2>/dev/null &
 
 		wait_mpj $ns1
 		pm_nl_check_endpoint 1 "creation" \
@@ -3068,14 +3074,14 @@ endpoint_tests()
 		pm_nl_add_endpoint $ns2 10.0.2.2 flags signal
 		pm_nl_check_endpoint 0 "modif is allowed" \
 			$ns2 10.0.2.2 id 1 flags signal
-		wait
+		kill_tests_wait
 	fi
 
 	if reset "delete and re-add"; then
 		pm_nl_set_limits $ns1 1 1
 		pm_nl_set_limits $ns2 1 1
 		pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
-		run_tests $ns1 $ns2 10.0.1.1 4 0 0 speed_20 &
+		run_tests $ns1 $ns2 10.0.1.1 4 0 0 speed_20 2>/dev/null &
 
 		wait_mpj $ns2
 		pm_nl_del_endpoint $ns2 2 10.0.2.2
@@ -3085,7 +3091,7 @@ endpoint_tests()
 		pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow
 		wait_mpj $ns2
 		chk_subflow_nr "" "after re-add" 2
-		wait
+		kill_tests_wait
 	fi
 }
 
-- 
cgit 


From 7ae69d7087a9a4d014e11b5328a49091cc1682f8 Mon Sep 17 00:00:00 2001
From: Shaoqin Huang <shahuang@redhat.com>
Date: Thu, 2 Feb 2023 10:57:15 +0800
Subject: selftests: KVM: Replace optarg with arg in guest_modes_cmdline

The parameter arg in guest_modes_cmdline not being used now, and the
optarg should be replaced with arg in guest_modes_cmdline.

And this is the chance to change strtoul() to atoi_non_negative(), since
guest mode ID will never be negative.

Signed-off-by: Shaoqin Huang <shahuang@redhat.com>
Fixes: e42ac777d661 ("KVM: selftests: Factor out guest mode code")
Reviewed-by: Andrew Jones <andrew.jones@linux.dev>
Reviewed-by: Vipin Sharma <vipinsh@google.com>
Link: https://lore.kernel.org/r/20230202025716.216323-1-shahuang@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/guest_modes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c
index 99a575bbbc52..1df3ce4b16fd 100644
--- a/tools/testing/selftests/kvm/lib/guest_modes.c
+++ b/tools/testing/selftests/kvm/lib/guest_modes.c
@@ -127,7 +127,7 @@ void guest_modes_cmdline(const char *arg)
 		mode_selected = true;
 	}
 
-	mode = strtoul(optarg, NULL, 10);
+	mode = atoi_non_negative("Guest mode ID", arg);
 	TEST_ASSERT(mode < NUM_VM_MODES, "Guest mode ID %d too big", mode);
 	guest_modes[mode].enabled = true;
 }
-- 
cgit 


From 6c77ae716d546d71b21f0c9ee7d405314a3f3f9e Mon Sep 17 00:00:00 2001
From: Michal Luczaj <mhal@rbox.co>
Date: Mon, 6 Feb 2023 21:24:30 +0100
Subject: KVM: selftests: Clean up misnomers in xen_shinfo_test

As discussed[*], relabel the poorly named structs to align with the
current KVM nomenclature.

Old names are a leftover from before commit 52491a38b2c2 ("KVM:
Initialize gfn_to_pfn_cache locks in dedicated helper"), which i.a.
introduced kvm_gpc_init() and renamed kvm_gfn_to_pfn_cache_init()/
_destroy() to kvm_gpc_activate()/_deactivate(). Partly in an effort
to avoid implying that the cache really is destroyed/freed.

While at it, get rid of #define GPA_INVALID, which being used as a GFN,
is not only misnamed, but also unnecessarily reinvents a UAPI constant.

No functional change intended.

[*] https://lore.kernel.org/r/Y5yZ6CFkEMBqyJ6v@google.com

Signed-off-by: Michal Luczaj <mhal@rbox.co>
Link: https://lore.kernel.org/r/20230206202430.1898057-1-mhal@rbox.co
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index 6a838df69174..fd39a15d6970 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -19,9 +19,6 @@
 
 #include <sys/eventfd.h>
 
-/* Defined in include/linux/kvm_types.h */
-#define GPA_INVALID		(~(ulong)0)
-
 #define SHINFO_REGION_GVA	0xc0000000ULL
 #define SHINFO_REGION_GPA	0xc0000000ULL
 #define SHINFO_REGION_SLOT	10
@@ -412,19 +409,19 @@ static void *juggle_shinfo_state(void *arg)
 {
 	struct kvm_vm *vm = (struct kvm_vm *)arg;
 
-	struct kvm_xen_hvm_attr cache_init = {
+	struct kvm_xen_hvm_attr cache_activate = {
 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
 		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE
 	};
 
-	struct kvm_xen_hvm_attr cache_destroy = {
+	struct kvm_xen_hvm_attr cache_deactivate = {
 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
-		.u.shared_info.gfn = GPA_INVALID
+		.u.shared_info.gfn = KVM_XEN_INVALID_GFN
 	};
 
 	for (;;) {
-		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_init);
-		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_destroy);
+		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate);
+		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate);
 		pthread_testcancel();
 	}
 
-- 
cgit 


From 695fa5a64cf52ab1aa2c89c93bbb1fd08995304a Mon Sep 17 00:00:00 2001
From: Shaoqin Huang <shahuang@redhat.com>
Date: Wed, 8 Feb 2023 15:18:00 +0800
Subject: KVM: selftests: Remove duplicate macro definition

The KVM_GUEST_PAGE_TABLE_MIN_PADDR macro has been defined in
include/kvm_util_base.h. So remove the duplicate definition in
lib/kvm_util.c.

Fixes: cce0c23dd944 ("KVM: selftests: Add wrapper to allocate page table page")
Signed-off-by: Shaoqin Huang <shahuang@redhat.com>
Link: https://lore.kernel.org/r/20230208071801.68620-1-shahuang@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 56d5ea949cbb..a3648c3f273f 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1941,9 +1941,6 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
 	return vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
 }
 
-/* Arbitrary minimum physical address used for virtual translation tables. */
-#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
-
 vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm)
 {
 	return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
-- 
cgit 


From 1fdc6f4f274748f43ebb93eeaaa23c3c69f9c3a5 Mon Sep 17 00:00:00 2001
From: Alexander Pantyukhin <apantykhin@gmail.com>
Date: Wed, 18 Jan 2023 12:42:19 +0500
Subject: tools/testing/kunit/kunit.py: remove redundant double check

The build_tests function contained double checking for not success
result. It is fixed in the current patch. Additional small
simplifications of code like using ternary if were applied (avoid using
the same operation by calculation times differ in two places).

Signed-off-by: Alexander Pantyukhin <apantykhin@gmail.com>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 43fbe96318fe..0e3e08cc0204 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -77,11 +77,8 @@ def config_tests(linux: kunit_kernel.LinuxSourceTree,
 	config_start = time.time()
 	success = linux.build_reconfig(request.build_dir, request.make_options)
 	config_end = time.time()
-	if not success:
-		return KunitResult(KunitStatus.CONFIG_FAILURE,
-				   config_end - config_start)
-	return KunitResult(KunitStatus.SUCCESS,
-			   config_end - config_start)
+	status = KunitStatus.SUCCESS if success else KunitStatus.CONFIG_FAILURE
+	return KunitResult(status, config_end - config_start)
 
 def build_tests(linux: kunit_kernel.LinuxSourceTree,
 		request: KunitBuildRequest) -> KunitResult:
@@ -92,14 +89,8 @@ def build_tests(linux: kunit_kernel.LinuxSourceTree,
 				     request.build_dir,
 				     request.make_options)
 	build_end = time.time()
-	if not success:
-		return KunitResult(KunitStatus.BUILD_FAILURE,
-				   build_end - build_start)
-	if not success:
-		return KunitResult(KunitStatus.BUILD_FAILURE,
-				   build_end - build_start)
-	return KunitResult(KunitStatus.SUCCESS,
-			   build_end - build_start)
+	status = KunitStatus.SUCCESS if success else KunitStatus.BUILD_FAILURE
+	return KunitResult(status, build_end - build_start)
 
 def config_and_build_tests(linux: kunit_kernel.LinuxSourceTree,
 			   request: KunitBuildRequest) -> KunitResult:
@@ -145,7 +136,7 @@ def exec_tests(linux: kunit_kernel.LinuxSourceTree, request: KunitExecRequest) -
 		tests = _list_tests(linux, request)
 		if request.run_isolated == 'test':
 			filter_globs = tests
-		if request.run_isolated == 'suite':
+		elif request.run_isolated == 'suite':
 			filter_globs = _suites_from_test_list(tests)
 			# Apply the test-part of the user's glob, if present.
 			if '.' in request.filter_glob:
-- 
cgit 


From 2dc9d6ca52a47fd00822e818c2a5e48fc5fbbd53 Mon Sep 17 00:00:00 2001
From: Alexander Pantyukhin <apantykhin@gmail.com>
Date: Sun, 22 Jan 2023 02:27:17 +0500
Subject: kunit: kunit.py extract handlers

The main function contains a wide if-elif block that handles different
subcommands. It's possible to make code refactoring to extract
subcommands handlers.

Fixed commit summary line.
Shuah Khan <skhan@linuxfoundation.org>

Signed-off-by: Alexander Pantyukhin <apantykhin@gmail.com>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py | 167 +++++++++++++++++++++++++------------------
 1 file changed, 96 insertions(+), 71 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 0e3e08cc0204..741f15420467 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -386,6 +386,95 @@ def tree_from_args(cli_args: argparse.Namespace) -> kunit_kernel.LinuxSourceTree
 			extra_qemu_args=qemu_args)
 
 
+def run_handler(cli_args):
+	if not os.path.exists(cli_args.build_dir):
+		os.mkdir(cli_args.build_dir)
+
+	linux = tree_from_args(cli_args)
+	request = KunitRequest(build_dir=cli_args.build_dir,
+					make_options=cli_args.make_options,
+					jobs=cli_args.jobs,
+					raw_output=cli_args.raw_output,
+					json=cli_args.json,
+					timeout=cli_args.timeout,
+					filter_glob=cli_args.filter_glob,
+					kernel_args=cli_args.kernel_args,
+					run_isolated=cli_args.run_isolated)
+	result = run_tests(linux, request)
+	if result.status != KunitStatus.SUCCESS:
+		sys.exit(1)
+
+
+def config_handler(cli_args):
+	if cli_args.build_dir and (
+			not os.path.exists(cli_args.build_dir)):
+		os.mkdir(cli_args.build_dir)
+
+	linux = tree_from_args(cli_args)
+	request = KunitConfigRequest(build_dir=cli_args.build_dir,
+						make_options=cli_args.make_options)
+	result = config_tests(linux, request)
+	stdout.print_with_timestamp((
+		'Elapsed time: %.3fs\n') % (
+			result.elapsed_time))
+	if result.status != KunitStatus.SUCCESS:
+		sys.exit(1)
+
+
+def build_handler(cli_args):
+	linux = tree_from_args(cli_args)
+	request = KunitBuildRequest(build_dir=cli_args.build_dir,
+					make_options=cli_args.make_options,
+					jobs=cli_args.jobs)
+	result = config_and_build_tests(linux, request)
+	stdout.print_with_timestamp((
+		'Elapsed time: %.3fs\n') % (
+			result.elapsed_time))
+	if result.status != KunitStatus.SUCCESS:
+		sys.exit(1)
+
+
+def exec_handler(cli_args):
+	linux = tree_from_args(cli_args)
+	exec_request = KunitExecRequest(raw_output=cli_args.raw_output,
+					build_dir=cli_args.build_dir,
+					json=cli_args.json,
+					timeout=cli_args.timeout,
+					filter_glob=cli_args.filter_glob,
+					kernel_args=cli_args.kernel_args,
+					run_isolated=cli_args.run_isolated)
+	result = exec_tests(linux, exec_request)
+	stdout.print_with_timestamp((
+		'Elapsed time: %.3fs\n') % (result.elapsed_time))
+	if result.status != KunitStatus.SUCCESS:
+		sys.exit(1)
+
+
+def parse_handler(cli_args):
+	if cli_args.file is None:
+		sys.stdin.reconfigure(errors='backslashreplace')  # pytype: disable=attribute-error
+		kunit_output = sys.stdin
+	else:
+		with open(cli_args.file, 'r', errors='backslashreplace') as f:
+			kunit_output = f.read().splitlines()
+	# We know nothing about how the result was created!
+	metadata = kunit_json.Metadata()
+	request = KunitParseRequest(raw_output=cli_args.raw_output,
+					json=cli_args.json)
+	result, _ = parse_tests(request, metadata, kunit_output)
+	if result.status != KunitStatus.SUCCESS:
+		sys.exit(1)
+
+
+subcommand_handlers_map = {
+	'run': run_handler,
+	'config': config_handler,
+	'build': build_handler,
+	'exec': exec_handler,
+	'parse': parse_handler
+}
+
+
 def main(argv):
 	parser = argparse.ArgumentParser(
 			description='Helps writing and running KUnit tests.')
@@ -429,78 +518,14 @@ def main(argv):
 	if get_kernel_root_path():
 		os.chdir(get_kernel_root_path())
 
-	if cli_args.subcommand == 'run':
-		if not os.path.exists(cli_args.build_dir):
-			os.mkdir(cli_args.build_dir)
-
-		linux = tree_from_args(cli_args)
-		request = KunitRequest(build_dir=cli_args.build_dir,
-				       make_options=cli_args.make_options,
-				       jobs=cli_args.jobs,
-				       raw_output=cli_args.raw_output,
-				       json=cli_args.json,
-				       timeout=cli_args.timeout,
-				       filter_glob=cli_args.filter_glob,
-				       kernel_args=cli_args.kernel_args,
-				       run_isolated=cli_args.run_isolated)
-		result = run_tests(linux, request)
-		if result.status != KunitStatus.SUCCESS:
-			sys.exit(1)
-	elif cli_args.subcommand == 'config':
-		if cli_args.build_dir and (
-				not os.path.exists(cli_args.build_dir)):
-			os.mkdir(cli_args.build_dir)
-
-		linux = tree_from_args(cli_args)
-		request = KunitConfigRequest(build_dir=cli_args.build_dir,
-					     make_options=cli_args.make_options)
-		result = config_tests(linux, request)
-		stdout.print_with_timestamp((
-			'Elapsed time: %.3fs\n') % (
-				result.elapsed_time))
-		if result.status != KunitStatus.SUCCESS:
-			sys.exit(1)
-	elif cli_args.subcommand == 'build':
-		linux = tree_from_args(cli_args)
-		request = KunitBuildRequest(build_dir=cli_args.build_dir,
-					    make_options=cli_args.make_options,
-					    jobs=cli_args.jobs)
-		result = config_and_build_tests(linux, request)
-		stdout.print_with_timestamp((
-			'Elapsed time: %.3fs\n') % (
-				result.elapsed_time))
-		if result.status != KunitStatus.SUCCESS:
-			sys.exit(1)
-	elif cli_args.subcommand == 'exec':
-		linux = tree_from_args(cli_args)
-		exec_request = KunitExecRequest(raw_output=cli_args.raw_output,
-						build_dir=cli_args.build_dir,
-						json=cli_args.json,
-						timeout=cli_args.timeout,
-						filter_glob=cli_args.filter_glob,
-						kernel_args=cli_args.kernel_args,
-						run_isolated=cli_args.run_isolated)
-		result = exec_tests(linux, exec_request)
-		stdout.print_with_timestamp((
-			'Elapsed time: %.3fs\n') % (result.elapsed_time))
-		if result.status != KunitStatus.SUCCESS:
-			sys.exit(1)
-	elif cli_args.subcommand == 'parse':
-		if cli_args.file is None:
-			sys.stdin.reconfigure(errors='backslashreplace')  # pytype: disable=attribute-error
-			kunit_output = sys.stdin
-		else:
-			with open(cli_args.file, 'r', errors='backslashreplace') as f:
-				kunit_output = f.read().splitlines()
-		# We know nothing about how the result was created!
-		metadata = kunit_json.Metadata()
-		request = KunitParseRequest(raw_output=cli_args.raw_output,
-					    json=cli_args.json)
-		result, _ = parse_tests(request, metadata, kunit_output)
-		if result.status != KunitStatus.SUCCESS:
-			sys.exit(1)
-	else:
+	subcomand_handler = subcommand_handlers_map.get(cli_args.subcommand, None)
+
+	if subcomand_handler is None:
 		parser.print_help()
+		return
+
+	subcomand_handler(cli_args)
+
 
 if __name__ == '__main__':
 	main(sys.argv[1:])
-- 
cgit 


From ab8684b8cecf711cc9e47c1cbb1a8f4b549f8893 Mon Sep 17 00:00:00 2001
From: Jon Doron <jond@wiz.io>
Date: Tue, 7 Feb 2023 10:19:16 +0200
Subject: libbpf: Add sample_period to creation options

Add option to set when the perf buffer should wake up, by default the
perf buffer becomes signaled for every event that is being pushed to it.

In case of a high throughput of events it will be more efficient to wake
up only once you have X events ready to be read.

So your application can wakeup once and drain the entire perf buffer.

Signed-off-by: Jon Doron <jond@wiz.io>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230207081916.3398417-1-arilou@gmail.com
---
 tools/lib/bpf/libbpf.c | 9 +++++++--
 tools/lib/bpf/libbpf.h | 4 +++-
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 4191a78b2815..35a698eb825d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -11673,17 +11673,22 @@ struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt,
 	const size_t attr_sz = sizeof(struct perf_event_attr);
 	struct perf_buffer_params p = {};
 	struct perf_event_attr attr;
+	__u32 sample_period;
 
 	if (!OPTS_VALID(opts, perf_buffer_opts))
 		return libbpf_err_ptr(-EINVAL);
 
+	sample_period = OPTS_GET(opts, sample_period, 1);
+	if (!sample_period)
+		sample_period = 1;
+
 	memset(&attr, 0, attr_sz);
 	attr.size = attr_sz;
 	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
 	attr.type = PERF_TYPE_SOFTWARE;
 	attr.sample_type = PERF_SAMPLE_RAW;
-	attr.sample_period = 1;
-	attr.wakeup_events = 1;
+	attr.sample_period = sample_period;
+	attr.wakeup_events = sample_period;
 
 	p.attr = &attr;
 	p.sample_cb = sample_cb;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index b18581277eb2..2efd80f6f7b9 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1247,8 +1247,10 @@ typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt);
 /* common use perf buffer options */
 struct perf_buffer_opts {
 	size_t sz;
+	__u32 sample_period;
+	size_t :0;
 };
-#define perf_buffer_opts__last_field sz
+#define perf_buffer_opts__last_field sample_period
 
 /**
  * @brief **perf_buffer__new()** creates BPF perfbuf manager for a specified
-- 
cgit 


From b963d9d5b9437a6b99504987310f98537c9e77d4 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Tue, 7 Feb 2023 16:18:19 +0200
Subject: selftests: Fix failing VXLAN VNI filtering test

iproute2 does not recognize the "group6" and "remote6" keywords. Fix by
using "group" and "remote" instead.

Before:

 # ./test_vxlan_vnifiltering.sh
 [...]
 Tests passed:  25
 Tests failed:   2

After:

 # ./test_vxlan_vnifiltering.sh
 [...]
 Tests passed:  27
 Tests failed:   0

Fixes: 3edf5f66c12a ("selftests: add new tests for vxlan vnifiltering")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://lore.kernel.org/r/20230207141819.256689-1-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/test_vxlan_vnifiltering.sh | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/test_vxlan_vnifiltering.sh b/tools/testing/selftests/net/test_vxlan_vnifiltering.sh
index 704997ffc244..8c3ac0a72545 100755
--- a/tools/testing/selftests/net/test_vxlan_vnifiltering.sh
+++ b/tools/testing/selftests/net/test_vxlan_vnifiltering.sh
@@ -293,19 +293,11 @@ setup-vm() {
 	elif [[ -n $vtype && $vtype == "vnifilterg" ]]; then
 	   # Add per vni group config with 'bridge vni' api
 	   if [ -n "$group" ]; then
-	      if [ "$family" == "v4" ]; then
-		 if [ $mcast -eq 1 ]; then
-		    bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group $group
-		 else
-		    bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote $group
-		 fi
-	      else
-		 if [ $mcast -eq 1 ]; then
-		    bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group6 $group
-		 else
-		    bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote6 $group
-		 fi
-	      fi
+		if [ $mcast -eq 1 ]; then
+			bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group $group
+		else
+			bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote $group
+		fi
 	   fi
 	fi
 	done
-- 
cgit 


From 3a082086aa200852545cf15159213582c0c80eba Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 8 Feb 2023 11:21:10 +0800
Subject: selftests: forwarding: lib: quote the sysctl values

When set/restore sysctl value, we should quote the value as some keys
may have multi values, e.g. net.ipv4.ping_group_range

Fixes: f5ae57784ba8 ("selftests: forwarding: lib: Add sysctl_set(), sysctl_restore()")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://lore.kernel.org/r/20230208032110.879205-1-liuhangbin@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 1c4f866de7d7..3d8e4ebda1b6 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -914,14 +914,14 @@ sysctl_set()
 	local value=$1; shift
 
 	SYSCTL_ORIG[$key]=$(sysctl -n $key)
-	sysctl -qw $key=$value
+	sysctl -qw $key="$value"
 }
 
 sysctl_restore()
 {
 	local key=$1; shift
 
-	sysctl -qw $key=${SYSCTL_ORIG["$key"]}
+	sysctl -qw $key="${SYSCTL_ORIG[$key]}"
 }
 
 forwarding_enable()
-- 
cgit 


From a974f0c131891027fe8490e654a220151b4caa82 Mon Sep 17 00:00:00 2001
From: Benjamin Gray <bgray@linux.ibm.com>
Date: Fri, 3 Feb 2023 11:39:43 +1100
Subject: selftests/powerpc: Add generic read/write file util

File read/write is reimplemented in about 5 different ways in the
various PowerPC selftests. This indicates it should be a common util.

Add a common read_file / write_file implementation and convert users
to it where (easily) possible.

Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20230203003947.38033-2-bgray@linux.ibm.com
---
 tools/testing/selftests/powerpc/dscr/dscr.h        |  33 ++----
 .../selftests/powerpc/dscr/dscr_sysfs_test.c       |  21 +---
 tools/testing/selftests/powerpc/include/utils.h    |   2 +
 .../testing/selftests/powerpc/nx-gzip/gzfht_test.c |  48 ++++-----
 tools/testing/selftests/powerpc/pmu/lib.c          |  31 ++----
 tools/testing/selftests/powerpc/ptrace/core-pkey.c |  28 ++---
 tools/testing/selftests/powerpc/utils.c            | 117 +++++++++++++--------
 7 files changed, 120 insertions(+), 160 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/dscr/dscr.h b/tools/testing/selftests/powerpc/dscr/dscr.h
index b703714e7d98..aaa2b0d89f7e 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr.h
+++ b/tools/testing/selftests/powerpc/dscr/dscr.h
@@ -64,48 +64,31 @@ inline void set_dscr_usr(unsigned long val)
 /* Default DSCR access */
 unsigned long get_default_dscr(void)
 {
-	int fd = -1, ret;
-	char buf[16];
+	int err;
+	char buf[16] = {0};
 	unsigned long val;
 
-	if (fd == -1) {
-		fd = open(DSCR_DEFAULT, O_RDONLY);
-		if (fd == -1) {
-			perror("open() failed");
-			exit(1);
-		}
-	}
-	memset(buf, 0, sizeof(buf));
-	lseek(fd, 0, SEEK_SET);
-	ret = read(fd, buf, sizeof(buf));
-	if (ret == -1) {
+	err = read_file(DSCR_DEFAULT, buf, sizeof(buf) - 1, NULL);
+	if (err) {
 		perror("read() failed");
 		exit(1);
 	}
 	sscanf(buf, "%lx", &val);
-	close(fd);
 	return val;
 }
 
 void set_default_dscr(unsigned long val)
 {
-	int fd = -1, ret;
+	int err;
 	char buf[16];
 
-	if (fd == -1) {
-		fd = open(DSCR_DEFAULT, O_RDWR);
-		if (fd == -1) {
-			perror("open() failed");
-			exit(1);
-		}
-	}
 	sprintf(buf, "%lx\n", val);
-	ret = write(fd, buf, strlen(buf));
-	if (ret == -1) {
+
+	err = write_file(DSCR_DEFAULT, buf, strlen(buf));
+	if (err) {
 		perror("write() failed");
 		exit(1);
 	}
-	close(fd);
 }
 
 double uniform_deviate(int seed)
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
index f20d1c166d1e..c350f193830a 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
@@ -12,24 +12,13 @@
 
 static int check_cpu_dscr_default(char *file, unsigned long val)
 {
-	char buf[10];
-	int fd, rc;
+	char buf[10] = {0};
+	int rc;
 
-	fd = open(file, O_RDWR);
-	if (fd == -1) {
-		perror("open() failed");
-		return 1;
-	}
-
-	rc = read(fd, buf, sizeof(buf));
-	if (rc == -1) {
-		perror("read() failed");
-		close(fd);
-		return 1;
-	}
-	close(fd);
+	rc = read_file(file, buf, sizeof(buf) - 1, NULL);
+	if (rc)
+		return rc;
 
-	buf[rc] = '\0';
 	if (strtol(buf, NULL, 16) != val) {
 		printf("DSCR match failed: %ld (system) %ld (cpu)\n",
 					val, strtol(buf, NULL, 16));
diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
index e222a5858450..70885e5814a8 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -33,6 +33,8 @@ void *get_auxv_entry(int type);
 
 int pick_online_cpu(void);
 
+int read_file(const char *path, char *buf, size_t count, size_t *len);
+int write_file(const char *path, const char *buf, size_t count);
 int read_debugfs_file(char *debugfs_file, int *result);
 int write_debugfs_file(char *debugfs_file, int result);
 int read_sysfs_file(char *debugfs_file, char *result, size_t result_size);
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
index 095195a25687..fbc3d265155b 100644
--- a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
+++ b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
@@ -146,49 +146,37 @@ int gzip_header_blank(char *buf)
 /* Caller must free the allocated buffer return nonzero on error. */
 int read_alloc_input_file(char *fname, char **buf, size_t *bufsize)
 {
+	int err;
 	struct stat statbuf;
-	FILE *fp;
 	char *p;
 	size_t num_bytes;
 
 	if (stat(fname, &statbuf)) {
 		perror(fname);
-		return(-1);
-	}
-	fp = fopen(fname, "r");
-	if (fp == NULL) {
-		perror(fname);
-		return(-1);
+		return -1;
 	}
+
 	assert(NULL != (p = (char *) malloc(statbuf.st_size)));
-	num_bytes = fread(p, 1, statbuf.st_size, fp);
-	if (ferror(fp) || (num_bytes != statbuf.st_size)) {
+
+	err = read_file(fname, p, statbuf.st_size, &num_bytes);
+	if (err) {
 		perror(fname);
-		return(-1);
+		goto fail;
 	}
+
+	if (num_bytes != statbuf.st_size) {
+		fprintf(stderr, "Actual bytes != expected bytes\n");
+		err = -1;
+		goto fail;
+	}
+
 	*buf = p;
 	*bufsize = num_bytes;
 	return 0;
-}
 
-/* Returns nonzero on error */
-int write_output_file(char *fname, char *buf, size_t bufsize)
-{
-	FILE *fp;
-	size_t num_bytes;
-
-	fp = fopen(fname, "w");
-	if (fp == NULL) {
-		perror(fname);
-		return(-1);
-	}
-	num_bytes = fwrite(buf, 1, bufsize, fp);
-	if (ferror(fp) || (num_bytes != bufsize)) {
-		perror(fname);
-		return(-1);
-	}
-	fclose(fp);
-	return 0;
+fail:
+	free(p);
+	return err;
 }
 
 /*
@@ -399,7 +387,7 @@ int compress_file(int argc, char **argv, void *handle)
 	assert(FNAME_MAX > (strlen(argv[1]) + strlen(FEXT)));
 	strcpy(outname, argv[1]);
 	strcat(outname, FEXT);
-	if (write_output_file(outname, outbuf, dsttotlen)) {
+	if (write_file(outname, outbuf, dsttotlen)) {
 		fprintf(stderr, "write error: %s\n", outname);
 		exit(-1);
 	}
diff --git a/tools/testing/selftests/powerpc/pmu/lib.c b/tools/testing/selftests/powerpc/pmu/lib.c
index 88690b97b7b9..960915304a65 100644
--- a/tools/testing/selftests/powerpc/pmu/lib.c
+++ b/tools/testing/selftests/powerpc/pmu/lib.c
@@ -190,38 +190,23 @@ int parse_proc_maps(void)
 
 bool require_paranoia_below(int level)
 {
+	int err;
 	long current;
-	char *end, buf[16];
-	FILE *f;
-	bool rc;
-
-	rc = false;
-
-	f = fopen(PARANOID_PATH, "r");
-	if (!f) {
-		perror("fopen");
-		goto out;
-	}
+	char *end;
+	char buf[16] = {0};
 
-	if (!fgets(buf, sizeof(buf), f)) {
+	err = read_file(PARANOID_PATH, buf, sizeof(buf) - 1, NULL);
+	if (err) {
 		printf("Couldn't read " PARANOID_PATH "?\n");
-		goto out_close;
+		return false;
 	}
 
 	current = strtol(buf, &end, 10);
 
 	if (end == buf) {
 		printf("Couldn't parse " PARANOID_PATH "?\n");
-		goto out_close;
+		return false;
 	}
 
-	if (current >= level)
-		goto out_close;
-
-	rc = true;
-out_close:
-	fclose(f);
-out:
-	return rc;
+	return current < level;
 }
-
diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
index 4e8d0ce1ff58..f6f8596ce8e1 100644
--- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c
+++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
@@ -348,15 +348,11 @@ static int parent(struct shared_info *info, pid_t pid)
 
 static int write_core_pattern(const char *core_pattern)
 {
-	size_t len = strlen(core_pattern), ret;
-	FILE *f;
+	int err;
 
-	f = fopen(core_pattern_file, "w");
-	SKIP_IF_MSG(!f, "Try with root privileges");
-
-	ret = fwrite(core_pattern, 1, len, f);
-	fclose(f);
-	if (ret != len) {
+	err = write_file(core_pattern_file, core_pattern, strlen(core_pattern));
+	if (err) {
+		SKIP_IF_MSG(err == -EPERM, "Try with root privileges");
 		perror("Error writing to core_pattern file");
 		return TEST_FAIL;
 	}
@@ -366,8 +362,8 @@ static int write_core_pattern(const char *core_pattern)
 
 static int setup_core_pattern(char **core_pattern_, bool *changed_)
 {
-	FILE *f;
 	char *core_pattern;
+	size_t len;
 	int ret;
 
 	core_pattern = malloc(PATH_MAX);
@@ -376,22 +372,14 @@ static int setup_core_pattern(char **core_pattern_, bool *changed_)
 		return TEST_FAIL;
 	}
 
-	f = fopen(core_pattern_file, "r");
-	if (!f) {
-		perror("Error opening core_pattern file");
-		ret = TEST_FAIL;
-		goto out;
-	}
-
-	ret = fread(core_pattern, 1, PATH_MAX - 1, f);
-	fclose(f);
-	if (!ret) {
+	ret = read_file(core_pattern_file, core_pattern, PATH_MAX - 1, &len);
+	if (ret) {
 		perror("Error reading core_pattern file");
 		ret = TEST_FAIL;
 		goto out;
 	}
 
-	core_pattern[ret] = '\0';
+	core_pattern[len] = '\0';
 
 	/* Check whether we can predict the name of the core file. */
 	if (!strcmp(core_pattern, "core") || !strcmp(core_pattern, "core.%p"))
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
index 1f36ee1a909a..22ba13425b2c 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -26,34 +26,83 @@
 
 static char auxv[4096];
 
-int read_auxv(char *buf, ssize_t buf_size)
+int read_file(const char *path, char *buf, size_t count, size_t *len)
 {
-	ssize_t num;
-	int rc, fd;
+	ssize_t rc;
+	int fd;
+	int err;
+	char eof;
 
-	fd = open("/proc/self/auxv", O_RDONLY);
-	if (fd == -1) {
-		perror("open");
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
 		return -errno;
+
+	rc = read(fd, buf, count);
+	if (rc < 0) {
+		err = -errno;
+		goto out;
+	}
+
+	if (len)
+		*len = rc;
+
+	/* Overflow if there are still more bytes after filling the buffer */
+	if (rc == count) {
+		rc = read(fd, &eof, 1);
+		if (rc != 0) {
+			err = -EOVERFLOW;
+			goto out;
+		}
 	}
 
-	num = read(fd, buf, buf_size);
-	if (num < 0) {
-		perror("read");
-		rc = -EIO;
+	err = 0;
+
+out:
+	close(fd);
+	errno = -err;
+	return err;
+}
+
+int write_file(const char *path, const char *buf, size_t count)
+{
+	int fd;
+	int err;
+	ssize_t rc;
+
+	fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	if (fd < 0)
+		return -errno;
+
+	rc = write(fd, buf, count);
+	if (rc < 0) {
+		err = -errno;
 		goto out;
 	}
 
-	if (num > buf_size) {
-		printf("overflowed auxv buffer\n");
-		rc = -EOVERFLOW;
+	if (rc != count) {
+		err = -EOVERFLOW;
 		goto out;
 	}
 
-	rc = 0;
+	err = 0;
+
 out:
 	close(fd);
-	return rc;
+	errno = -err;
+	return err;
+}
+
+int read_auxv(char *buf, ssize_t buf_size)
+{
+	int err;
+
+	err = read_file("/proc/self/auxv", buf, buf_size, NULL);
+	if (err) {
+		perror("Error reading /proc/self/auxv");
+		return err;
+	}
+
+	return 0;
 }
 
 void *find_auxv_entry(int type, char *auxv)
@@ -142,65 +191,41 @@ bool is_ppc64le(void)
 int read_sysfs_file(char *fpath, char *result, size_t result_size)
 {
 	char path[PATH_MAX] = "/sys/";
-	int rc = -1, fd;
 
 	strncat(path, fpath, PATH_MAX - strlen(path) - 1);
 
-	if ((fd = open(path, O_RDONLY)) < 0)
-		return rc;
-
-	rc = read(fd, result, result_size);
-
-	close(fd);
-
-	if (rc < 0)
-		return rc;
-
-	return 0;
+	return read_file(path, result, result_size, NULL);
 }
 
 int read_debugfs_file(char *debugfs_file, int *result)
 {
-	int rc = -1, fd;
+	int err;
 	char path[PATH_MAX];
-	char value[16];
+	char value[16] = {0};
 
 	strcpy(path, "/sys/kernel/debug/");
 	strncat(path, debugfs_file, PATH_MAX - strlen(path) - 1);
 
-	if ((fd = open(path, O_RDONLY)) < 0)
-		return rc;
-
-	if ((rc = read(fd, value, sizeof(value))) < 0)
-		return rc;
+	err = read_file(path, value, sizeof(value) - 1, NULL);
+	if (err)
+		return err;
 
-	value[15] = 0;
 	*result = atoi(value);
-	close(fd);
 
 	return 0;
 }
 
 int write_debugfs_file(char *debugfs_file, int result)
 {
-	int rc = -1, fd;
 	char path[PATH_MAX];
 	char value[16];
 
 	strcpy(path, "/sys/kernel/debug/");
 	strncat(path, debugfs_file, PATH_MAX - strlen(path) - 1);
 
-	if ((fd = open(path, O_WRONLY)) < 0)
-		return rc;
-
 	snprintf(value, 16, "%d", result);
 
-	if ((rc = write(fd, value, strlen(value))) < 0)
-		return rc;
-
-	close(fd);
-
-	return 0;
+	return write_file(path, value, strlen(value));
 }
 
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
-- 
cgit 


From 121d340be9a17ed89d523c56203908c01e09a306 Mon Sep 17 00:00:00 2001
From: Benjamin Gray <bgray@linux.ibm.com>
Date: Fri, 3 Feb 2023 11:39:44 +1100
Subject: selftests/powerpc: Add read/write debugfs file, int

Debugfs files are not always integers, so make *_file return/write a
byte buffer, and *_int deal with int values specifically. This increases
consistency with the other file read/write helpers.

Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20230203003947.38033-3-bgray@linux.ibm.com
---
 tools/testing/selftests/powerpc/include/utils.h    |  6 ++--
 .../selftests/powerpc/security/entry_flush.c       | 12 ++++----
 .../testing/selftests/powerpc/security/rfi_flush.c | 12 ++++----
 .../selftests/powerpc/security/uaccess_flush.c     | 18 ++++++------
 tools/testing/selftests/powerpc/utils.c            | 34 ++++++++++++++--------
 5 files changed, 47 insertions(+), 35 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
index 70885e5814a8..de5e3790f397 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -35,8 +35,10 @@ int pick_online_cpu(void);
 
 int read_file(const char *path, char *buf, size_t count, size_t *len);
 int write_file(const char *path, const char *buf, size_t count);
-int read_debugfs_file(char *debugfs_file, int *result);
-int write_debugfs_file(char *debugfs_file, int result);
+int read_debugfs_file(const char *debugfs_file, char *buf, size_t count);
+int write_debugfs_file(const char *debugfs_file, const char *buf, size_t count);
+int read_debugfs_int(const char *debugfs_file, int *result);
+int write_debugfs_int(const char *debugfs_file, int result);
 int read_sysfs_file(char *debugfs_file, char *result, size_t result_size);
 int perf_event_open_counter(unsigned int type,
 			    unsigned long config, int group_fd);
diff --git a/tools/testing/selftests/powerpc/security/entry_flush.c b/tools/testing/selftests/powerpc/security/entry_flush.c
index 68ce377b205e..e01c573deadd 100644
--- a/tools/testing/selftests/powerpc/security/entry_flush.c
+++ b/tools/testing/selftests/powerpc/security/entry_flush.c
@@ -34,18 +34,18 @@ int entry_flush_test(void)
 	// The PMU event we use only works on Power7 or later
 	SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
 
-	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_orig) < 0) {
+	if (read_debugfs_int("powerpc/rfi_flush", &rfi_flush_orig) < 0) {
 		perror("Unable to read powerpc/rfi_flush debugfs file");
 		SKIP_IF(1);
 	}
 
-	if (read_debugfs_file("powerpc/entry_flush", &entry_flush_orig) < 0) {
+	if (read_debugfs_int("powerpc/entry_flush", &entry_flush_orig) < 0) {
 		perror("Unable to read powerpc/entry_flush debugfs file");
 		SKIP_IF(1);
 	}
 
 	if (rfi_flush_orig != 0) {
-		if (write_debugfs_file("powerpc/rfi_flush", 0) < 0) {
+		if (write_debugfs_int("powerpc/rfi_flush", 0) < 0) {
 			perror("error writing to powerpc/rfi_flush debugfs file");
 			FAIL_IF(1);
 		}
@@ -105,7 +105,7 @@ again:
 
 	if (entry_flush == entry_flush_orig) {
 		entry_flush = !entry_flush_orig;
-		if (write_debugfs_file("powerpc/entry_flush", entry_flush) < 0) {
+		if (write_debugfs_int("powerpc/entry_flush", entry_flush) < 0) {
 			perror("error writing to powerpc/entry_flush debugfs file");
 			return 1;
 		}
@@ -120,12 +120,12 @@ again:
 
 	set_dscr(0);
 
-	if (write_debugfs_file("powerpc/rfi_flush", rfi_flush_orig) < 0) {
+	if (write_debugfs_int("powerpc/rfi_flush", rfi_flush_orig) < 0) {
 		perror("unable to restore original value of powerpc/rfi_flush debugfs file");
 		return 1;
 	}
 
-	if (write_debugfs_file("powerpc/entry_flush", entry_flush_orig) < 0) {
+	if (write_debugfs_int("powerpc/entry_flush", entry_flush_orig) < 0) {
 		perror("unable to restore original value of powerpc/entry_flush debugfs file");
 		return 1;
 	}
diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c b/tools/testing/selftests/powerpc/security/rfi_flush.c
index f73484a6470f..6bedc86443a6 100644
--- a/tools/testing/selftests/powerpc/security/rfi_flush.c
+++ b/tools/testing/selftests/powerpc/security/rfi_flush.c
@@ -34,18 +34,18 @@ int rfi_flush_test(void)
 	// The PMU event we use only works on Power7 or later
 	SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
 
-	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_orig) < 0) {
+	if (read_debugfs_int("powerpc/rfi_flush", &rfi_flush_orig) < 0) {
 		perror("Unable to read powerpc/rfi_flush debugfs file");
 		SKIP_IF(1);
 	}
 
-	if (read_debugfs_file("powerpc/entry_flush", &entry_flush_orig) < 0) {
+	if (read_debugfs_int("powerpc/entry_flush", &entry_flush_orig) < 0) {
 		have_entry_flush = 0;
 	} else {
 		have_entry_flush = 1;
 
 		if (entry_flush_orig != 0) {
-			if (write_debugfs_file("powerpc/entry_flush", 0) < 0) {
+			if (write_debugfs_int("powerpc/entry_flush", 0) < 0) {
 				perror("error writing to powerpc/entry_flush debugfs file");
 				return 1;
 			}
@@ -105,7 +105,7 @@ again:
 
 	if (rfi_flush == rfi_flush_orig) {
 		rfi_flush = !rfi_flush_orig;
-		if (write_debugfs_file("powerpc/rfi_flush", rfi_flush) < 0) {
+		if (write_debugfs_int("powerpc/rfi_flush", rfi_flush) < 0) {
 			perror("error writing to powerpc/rfi_flush debugfs file");
 			return 1;
 		}
@@ -120,13 +120,13 @@ again:
 
 	set_dscr(0);
 
-	if (write_debugfs_file("powerpc/rfi_flush", rfi_flush_orig) < 0) {
+	if (write_debugfs_int("powerpc/rfi_flush", rfi_flush_orig) < 0) {
 		perror("unable to restore original value of powerpc/rfi_flush debugfs file");
 		return 1;
 	}
 
 	if (have_entry_flush) {
-		if (write_debugfs_file("powerpc/entry_flush", entry_flush_orig) < 0) {
+		if (write_debugfs_int("powerpc/entry_flush", entry_flush_orig) < 0) {
 			perror("unable to restore original value of powerpc/entry_flush "
 			       "debugfs file");
 			return 1;
diff --git a/tools/testing/selftests/powerpc/security/uaccess_flush.c b/tools/testing/selftests/powerpc/security/uaccess_flush.c
index cf80f960e38a..fcf23ea9b183 100644
--- a/tools/testing/selftests/powerpc/security/uaccess_flush.c
+++ b/tools/testing/selftests/powerpc/security/uaccess_flush.c
@@ -36,30 +36,30 @@ int uaccess_flush_test(void)
 	// The PMU event we use only works on Power7 or later
 	SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
 
-	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_orig) < 0) {
+	if (read_debugfs_int("powerpc/rfi_flush", &rfi_flush_orig) < 0) {
 		perror("Unable to read powerpc/rfi_flush debugfs file");
 		SKIP_IF(1);
 	}
 
-	if (read_debugfs_file("powerpc/entry_flush", &entry_flush_orig) < 0) {
+	if (read_debugfs_int("powerpc/entry_flush", &entry_flush_orig) < 0) {
 		perror("Unable to read powerpc/entry_flush debugfs file");
 		SKIP_IF(1);
 	}
 
-	if (read_debugfs_file("powerpc/uaccess_flush", &uaccess_flush_orig) < 0) {
+	if (read_debugfs_int("powerpc/uaccess_flush", &uaccess_flush_orig) < 0) {
 		perror("Unable to read powerpc/entry_flush debugfs file");
 		SKIP_IF(1);
 	}
 
 	if (rfi_flush_orig != 0) {
-		if (write_debugfs_file("powerpc/rfi_flush", 0) < 0) {
+		if (write_debugfs_int("powerpc/rfi_flush", 0) < 0) {
 			perror("error writing to powerpc/rfi_flush debugfs file");
 			FAIL_IF(1);
 		}
 	}
 
 	if (entry_flush_orig != 0) {
-		if (write_debugfs_file("powerpc/entry_flush", 0) < 0) {
+		if (write_debugfs_int("powerpc/entry_flush", 0) < 0) {
 			perror("error writing to powerpc/entry_flush debugfs file");
 			FAIL_IF(1);
 		}
@@ -119,7 +119,7 @@ again:
 
 	if (uaccess_flush == uaccess_flush_orig) {
 		uaccess_flush = !uaccess_flush_orig;
-		if (write_debugfs_file("powerpc/uaccess_flush", uaccess_flush) < 0) {
+		if (write_debugfs_int("powerpc/uaccess_flush", uaccess_flush) < 0) {
 			perror("error writing to powerpc/uaccess_flush debugfs file");
 			return 1;
 		}
@@ -134,17 +134,17 @@ again:
 
 	set_dscr(0);
 
-	if (write_debugfs_file("powerpc/rfi_flush", rfi_flush_orig) < 0) {
+	if (write_debugfs_int("powerpc/rfi_flush", rfi_flush_orig) < 0) {
 		perror("unable to restore original value of powerpc/rfi_flush debugfs file");
 		return 1;
 	}
 
-	if (write_debugfs_file("powerpc/entry_flush", entry_flush_orig) < 0) {
+	if (write_debugfs_int("powerpc/entry_flush", entry_flush_orig) < 0) {
 		perror("unable to restore original value of powerpc/entry_flush debugfs file");
 		return 1;
 	}
 
-	if (write_debugfs_file("powerpc/uaccess_flush", uaccess_flush_orig) < 0) {
+	if (write_debugfs_int("powerpc/uaccess_flush", uaccess_flush_orig) < 0) {
 		perror("unable to restore original value of powerpc/uaccess_flush debugfs file");
 		return 1;
 	}
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
index 22ba13425b2c..495299a79f50 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -105,6 +105,24 @@ int read_auxv(char *buf, ssize_t buf_size)
 	return 0;
 }
 
+int read_debugfs_file(const char *subpath, char *buf, size_t count)
+{
+	char path[PATH_MAX] = "/sys/kernel/debug/";
+
+	strncat(path, subpath, sizeof(path) - strlen(path) - 1);
+
+	return read_file(path, buf, count, NULL);
+}
+
+int write_debugfs_file(const char *subpath, const char *buf, size_t count)
+{
+	char path[PATH_MAX] = "/sys/kernel/debug/";
+
+	strncat(path, subpath, sizeof(path) - strlen(path) - 1);
+
+	return write_file(path, buf, count);
+}
+
 void *find_auxv_entry(int type, char *auxv)
 {
 	ElfW(auxv_t) *p;
@@ -197,16 +215,12 @@ int read_sysfs_file(char *fpath, char *result, size_t result_size)
 	return read_file(path, result, result_size, NULL);
 }
 
-int read_debugfs_file(char *debugfs_file, int *result)
+int read_debugfs_int(const char *debugfs_file, int *result)
 {
 	int err;
-	char path[PATH_MAX];
 	char value[16] = {0};
 
-	strcpy(path, "/sys/kernel/debug/");
-	strncat(path, debugfs_file, PATH_MAX - strlen(path) - 1);
-
-	err = read_file(path, value, sizeof(value) - 1, NULL);
+	err = read_debugfs_file(debugfs_file, value, sizeof(value) - 1);
 	if (err)
 		return err;
 
@@ -215,17 +229,13 @@ int read_debugfs_file(char *debugfs_file, int *result)
 	return 0;
 }
 
-int write_debugfs_file(char *debugfs_file, int result)
+int write_debugfs_int(const char *debugfs_file, int result)
 {
-	char path[PATH_MAX];
 	char value[16];
 
-	strcpy(path, "/sys/kernel/debug/");
-	strncat(path, debugfs_file, PATH_MAX - strlen(path) - 1);
-
 	snprintf(value, 16, "%d", result);
 
-	return write_file(path, value, strlen(value));
+	return write_debugfs_file(debugfs_file, value, strlen(value));
 }
 
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
-- 
cgit 


From d1bc05b7bf02f8635fe6c445f67d78f85234cbb7 Mon Sep 17 00:00:00 2001
From: Benjamin Gray <bgray@linux.ibm.com>
Date: Fri, 3 Feb 2023 11:39:45 +1100
Subject: selftests/powerpc: Parse long/unsigned long value safely

Often a file is expected to hold an integral value. Existing functions
will use a C stdlib function like atoi or strtol to parse the file.
These operations are error prone, with complicated error conditions
(atoi returns 0 if not a number, and is undefined behaviour if not in
range. strtol returns 0 if not a number, and LONG_MIN/MAX if not in
range + sets errno to ERANGE).

Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20230203003947.38033-4-bgray@linux.ibm.com
---
 tools/testing/selftests/powerpc/include/utils.h |   7 ++
 tools/testing/selftests/powerpc/pmu/lib.c       |   6 +-
 tools/testing/selftests/powerpc/utils.c         | 126 +++++++++++++++++++++++-
 3 files changed, 132 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
index de5e3790f397..7c1fa385824c 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -33,6 +33,13 @@ void *get_auxv_entry(int type);
 
 int pick_online_cpu(void);
 
+int parse_intmax(const char *buffer, size_t count, intmax_t *result, int base);
+int parse_uintmax(const char *buffer, size_t count, uintmax_t *result, int base);
+int parse_int(const char *buffer, size_t count, int *result, int base);
+int parse_uint(const char *buffer, size_t count, unsigned int *result, int base);
+int parse_long(const char *buffer, size_t count, long *result, int base);
+int parse_ulong(const char *buffer, size_t count, unsigned long *result, int base);
+
 int read_file(const char *path, char *buf, size_t count, size_t *len);
 int write_file(const char *path, const char *buf, size_t count);
 int read_debugfs_file(const char *debugfs_file, char *buf, size_t count);
diff --git a/tools/testing/selftests/powerpc/pmu/lib.c b/tools/testing/selftests/powerpc/pmu/lib.c
index 960915304a65..1cfc13a25aee 100644
--- a/tools/testing/selftests/powerpc/pmu/lib.c
+++ b/tools/testing/selftests/powerpc/pmu/lib.c
@@ -192,7 +192,6 @@ bool require_paranoia_below(int level)
 {
 	int err;
 	long current;
-	char *end;
 	char buf[16] = {0};
 
 	err = read_file(PARANOID_PATH, buf, sizeof(buf) - 1, NULL);
@@ -201,9 +200,8 @@ bool require_paranoia_below(int level)
 		return false;
 	}
 
-	current = strtol(buf, &end, 10);
-
-	if (end == buf) {
+	err = parse_long(buf, sizeof(buf), &current, 10);
+	if (err) {
 		printf("Couldn't parse " PARANOID_PATH "?\n");
 		return false;
 	}
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
index 495299a79f50..ddfd871881bf 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -8,6 +8,8 @@
 #include <elf.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
 #include <link.h>
 #include <sched.h>
 #include <stdio.h>
@@ -123,6 +125,126 @@ int write_debugfs_file(const char *subpath, const char *buf, size_t count)
 	return write_file(path, buf, count);
 }
 
+static int validate_int_parse(const char *buffer, size_t count, char *end)
+{
+	int err = 0;
+
+	/* Require at least one digit */
+	if (end == buffer) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Require all remaining characters be whitespace-ish */
+	for (; end < buffer + count; end++) {
+		if (*end == '\0')
+			break;
+
+		if (*end != ' ' && *end != '\n') {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+out:
+	errno = -err;
+	return err;
+}
+
+static int parse_bounded_int(const char *buffer, size_t count, intmax_t *result,
+			     int base, intmax_t min, intmax_t max)
+{
+	int err;
+	char *end;
+
+	errno = 0;
+	*result = strtoimax(buffer, &end, base);
+
+	if (errno)
+		return -errno;
+
+	err = validate_int_parse(buffer, count, end);
+	if (err)
+		goto out;
+
+	if (*result < min || *result > max)
+		err = -EOVERFLOW;
+
+out:
+	errno = -err;
+	return err;
+}
+
+static int parse_bounded_uint(const char *buffer, size_t count, uintmax_t *result,
+			      int base, uintmax_t max)
+{
+	int err = 0;
+	char *end;
+
+	errno = 0;
+	*result = strtoumax(buffer, &end, base);
+
+	if (errno)
+		return -errno;
+
+	err = validate_int_parse(buffer, count, end);
+	if (err)
+		goto out;
+
+	if (*result > max)
+		err = -EOVERFLOW;
+
+out:
+	errno = -err;
+	return err;
+}
+
+int parse_intmax(const char *buffer, size_t count, intmax_t *result, int base)
+{
+	return parse_bounded_int(buffer, count, result, base, INTMAX_MIN, INTMAX_MAX);
+}
+
+int parse_uintmax(const char *buffer, size_t count, uintmax_t *result, int base)
+{
+	return parse_bounded_uint(buffer, count, result, base, UINTMAX_MAX);
+}
+
+int parse_int(const char *buffer, size_t count, int *result, int base)
+{
+	intmax_t parsed;
+	int err = parse_bounded_int(buffer, count, &parsed, base, INT_MIN, INT_MAX);
+
+	*result = parsed;
+	return err;
+}
+
+int parse_uint(const char *buffer, size_t count, unsigned int *result, int base)
+{
+	uintmax_t parsed;
+	int err = parse_bounded_uint(buffer, count, &parsed, base, UINT_MAX);
+
+	*result = parsed;
+	return err;
+}
+
+int parse_long(const char *buffer, size_t count, long *result, int base)
+{
+	intmax_t parsed;
+	int err = parse_bounded_int(buffer, count, &parsed, base, LONG_MIN, LONG_MAX);
+
+	*result = parsed;
+	return err;
+}
+
+int parse_ulong(const char *buffer, size_t count, unsigned long *result, int base)
+{
+	uintmax_t parsed;
+	int err = parse_bounded_uint(buffer, count, &parsed, base, ULONG_MAX);
+
+	*result = parsed;
+	return err;
+}
+
 void *find_auxv_entry(int type, char *auxv)
 {
 	ElfW(auxv_t) *p;
@@ -224,9 +346,7 @@ int read_debugfs_int(const char *debugfs_file, int *result)
 	if (err)
 		return err;
 
-	*result = atoi(value);
-
-	return 0;
+	return parse_int(value, sizeof(value), result, 10);
 }
 
 int write_debugfs_int(const char *debugfs_file, int result)
-- 
cgit 


From 5c20de57888f0962e25a0eeec1a59c98056fc42e Mon Sep 17 00:00:00 2001
From: Benjamin Gray <bgray@linux.ibm.com>
Date: Fri, 3 Feb 2023 11:39:46 +1100
Subject: selftests/powerpc: Add {read,write}_{long,ulong}

Add helper functions to read and write (unsigned) long values directly
from/to files. One of the kernel interfaces uses hex strings, so we need
to allow passing a base too.

Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20230203003947.38033-5-bgray@linux.ibm.com
---
 tools/testing/selftests/powerpc/dscr/dscr.h        |  9 +--
 .../selftests/powerpc/dscr/dscr_sysfs_test.c       | 14 ++--
 tools/testing/selftests/powerpc/include/utils.h    |  4 ++
 tools/testing/selftests/powerpc/pmu/lib.c          |  9 +--
 tools/testing/selftests/powerpc/utils.c            | 81 ++++++++++++++++++++++
 5 files changed, 95 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/dscr/dscr.h b/tools/testing/selftests/powerpc/dscr/dscr.h
index aaa2b0d89f7e..2c54998d4715 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr.h
+++ b/tools/testing/selftests/powerpc/dscr/dscr.h
@@ -65,26 +65,21 @@ inline void set_dscr_usr(unsigned long val)
 unsigned long get_default_dscr(void)
 {
 	int err;
-	char buf[16] = {0};
 	unsigned long val;
 
-	err = read_file(DSCR_DEFAULT, buf, sizeof(buf) - 1, NULL);
+	err = read_ulong(DSCR_DEFAULT, &val, 16);
 	if (err) {
 		perror("read() failed");
 		exit(1);
 	}
-	sscanf(buf, "%lx", &val);
 	return val;
 }
 
 void set_default_dscr(unsigned long val)
 {
 	int err;
-	char buf[16];
 
-	sprintf(buf, "%lx\n", val);
-
-	err = write_file(DSCR_DEFAULT, buf, strlen(buf));
+	err = write_ulong(DSCR_DEFAULT, val, 16);
 	if (err) {
 		perror("write() failed");
 		exit(1);
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
index c350f193830a..4f1fef6198fc 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
@@ -12,16 +12,16 @@
 
 static int check_cpu_dscr_default(char *file, unsigned long val)
 {
-	char buf[10] = {0};
-	int rc;
+	unsigned long cpu_dscr;
+	int err;
 
-	rc = read_file(file, buf, sizeof(buf) - 1, NULL);
-	if (rc)
-		return rc;
+	err = read_ulong(file, &cpu_dscr, 16);
+	if (err)
+		return err;
 
-	if (strtol(buf, NULL, 16) != val) {
+	if (cpu_dscr != val) {
 		printf("DSCR match failed: %ld (system) %ld (cpu)\n",
-					val, strtol(buf, NULL, 16));
+					val, cpu_dscr);
 		return 1;
 	}
 	return 0;
diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
index 7c1fa385824c..311ddc124ebc 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -42,6 +42,10 @@ int parse_ulong(const char *buffer, size_t count, unsigned long *result, int bas
 
 int read_file(const char *path, char *buf, size_t count, size_t *len);
 int write_file(const char *path, const char *buf, size_t count);
+int read_long(const char *path, long *result, int base);
+int write_long(const char *path, long result, int base);
+int read_ulong(const char *path, unsigned long *result, int base);
+int write_ulong(const char *path, unsigned long result, int base);
 int read_debugfs_file(const char *debugfs_file, char *buf, size_t count);
 int write_debugfs_file(const char *debugfs_file, const char *buf, size_t count);
 int read_debugfs_int(const char *debugfs_file, int *result);
diff --git a/tools/testing/selftests/powerpc/pmu/lib.c b/tools/testing/selftests/powerpc/pmu/lib.c
index 1cfc13a25aee..719f94f10d41 100644
--- a/tools/testing/selftests/powerpc/pmu/lib.c
+++ b/tools/testing/selftests/powerpc/pmu/lib.c
@@ -192,15 +192,8 @@ bool require_paranoia_below(int level)
 {
 	int err;
 	long current;
-	char buf[16] = {0};
 
-	err = read_file(PARANOID_PATH, buf, sizeof(buf) - 1, NULL);
-	if (err) {
-		printf("Couldn't read " PARANOID_PATH "?\n");
-		return false;
-	}
-
-	err = parse_long(buf, sizeof(buf), &current, 10);
+	err = read_long(PARANOID_PATH, &current, 10);
 	if (err) {
 		printf("Couldn't parse " PARANOID_PATH "?\n");
 		return false;
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
index ddfd871881bf..1017f1738619 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -245,6 +245,87 @@ int parse_ulong(const char *buffer, size_t count, unsigned long *result, int bas
 	return err;
 }
 
+int read_long(const char *path, long *result, int base)
+{
+	int err;
+	char buffer[32] = {0};
+
+	err = read_file(path, buffer, sizeof(buffer) - 1, NULL);
+	if (err)
+		return err;
+
+	return parse_long(buffer, sizeof(buffer), result, base);
+}
+
+int read_ulong(const char *path, unsigned long *result, int base)
+{
+	int err;
+	char buffer[32] = {0};
+
+	err = read_file(path, buffer, sizeof(buffer) - 1, NULL);
+	if (err)
+		return err;
+
+	return parse_ulong(buffer, sizeof(buffer), result, base);
+}
+
+int write_long(const char *path, long result, int base)
+{
+	int err;
+	int len;
+	char buffer[32];
+
+	/* Decimal only for now: no format specifier for signed hex values */
+	if (base != 10) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	len = snprintf(buffer, sizeof(buffer), "%ld", result);
+	if (len < 0 || len >= sizeof(buffer)) {
+		err = -EOVERFLOW;
+		goto out;
+	}
+
+	err = write_file(path, buffer, len);
+
+out:
+	errno = -err;
+	return err;
+}
+
+int write_ulong(const char *path, unsigned long result, int base)
+{
+	int err;
+	int len;
+	char buffer[32];
+	char *fmt;
+
+	switch (base) {
+	case 10:
+		fmt = "%lu";
+		break;
+	case 16:
+		fmt = "%lx";
+		break;
+	default:
+		err = -EINVAL;
+		goto out;
+	}
+
+	len = snprintf(buffer, sizeof(buffer), fmt, result);
+	if (len < 0 || len >= sizeof(buffer)) {
+		err = -errno;
+		goto out;
+	}
+
+	err = write_file(path, buffer, len);
+
+out:
+	errno = -err;
+	return err;
+}
+
 void *find_auxv_entry(int type, char *auxv)
 {
 	ElfW(auxv_t) *p;
-- 
cgit 


From 8d7253dc447473dfcf3f09fb0fa2bd6f7d05b43b Mon Sep 17 00:00:00 2001
From: Benjamin Gray <bgray@linux.ibm.com>
Date: Fri, 3 Feb 2023 11:39:47 +1100
Subject: selftests/powerpc: Add automatically allocating read_file

A couple of tests roll their own auto-allocating file read logic.

Add a generic implementation and convert them to use it.

Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20230203003947.38033-6-bgray@linux.ibm.com
---
 tools/testing/selftests/powerpc/include/utils.h    |  1 +
 .../testing/selftests/powerpc/nx-gzip/gzfht_test.c | 38 +---------
 tools/testing/selftests/powerpc/syscalls/Makefile  |  2 +-
 .../selftests/powerpc/syscalls/rtas_filter.c       | 81 +++-------------------
 tools/testing/selftests/powerpc/utils.c            | 58 ++++++++++++++++
 5 files changed, 71 insertions(+), 109 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
index 311ddc124ebc..eed7dd7582b2 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -42,6 +42,7 @@ int parse_ulong(const char *buffer, size_t count, unsigned long *result, int bas
 
 int read_file(const char *path, char *buf, size_t count, size_t *len);
 int write_file(const char *path, const char *buf, size_t count);
+int read_file_alloc(const char *path, char **buf, size_t *len);
 int read_long(const char *path, long *result, int base);
 int write_long(const char *path, long result, int base);
 int read_ulong(const char *path, unsigned long *result, int base);
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
index fbc3d265155b..4de079923ccb 100644
--- a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
+++ b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
@@ -143,42 +143,6 @@ int gzip_header_blank(char *buf)
 	return i;
 }
 
-/* Caller must free the allocated buffer return nonzero on error. */
-int read_alloc_input_file(char *fname, char **buf, size_t *bufsize)
-{
-	int err;
-	struct stat statbuf;
-	char *p;
-	size_t num_bytes;
-
-	if (stat(fname, &statbuf)) {
-		perror(fname);
-		return -1;
-	}
-
-	assert(NULL != (p = (char *) malloc(statbuf.st_size)));
-
-	err = read_file(fname, p, statbuf.st_size, &num_bytes);
-	if (err) {
-		perror(fname);
-		goto fail;
-	}
-
-	if (num_bytes != statbuf.st_size) {
-		fprintf(stderr, "Actual bytes != expected bytes\n");
-		err = -1;
-		goto fail;
-	}
-
-	*buf = p;
-	*bufsize = num_bytes;
-	return 0;
-
-fail:
-	free(p);
-	return err;
-}
-
 /*
  * Z_SYNC_FLUSH as described in zlib.h.
  * Returns number of appended bytes
@@ -245,7 +209,7 @@ int compress_file(int argc, char **argv, void *handle)
 		fprintf(stderr, "usage: %s <fname>\n", argv[0]);
 		exit(-1);
 	}
-	if (read_alloc_input_file(argv[1], &inbuf, &inlen))
+	if (read_file_alloc(argv[1], &inbuf, &inlen))
 		exit(-1);
 	fprintf(stderr, "file %s read, %ld bytes\n", argv[1], inlen);
 
diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile
index b63f8459c704..54ff5cfffc63 100644
--- a/tools/testing/selftests/powerpc/syscalls/Makefile
+++ b/tools/testing/selftests/powerpc/syscalls/Makefile
@@ -6,4 +6,4 @@ CFLAGS += -I../../../../../usr/include
 top_srcdir = ../../../../..
 include ../../lib.mk
 
-$(TEST_GEN_PROGS): ../harness.c
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
diff --git a/tools/testing/selftests/powerpc/syscalls/rtas_filter.c b/tools/testing/selftests/powerpc/syscalls/rtas_filter.c
index 03b487f18d00..9b17780f0b18 100644
--- a/tools/testing/selftests/powerpc/syscalls/rtas_filter.c
+++ b/tools/testing/selftests/powerpc/syscalls/rtas_filter.c
@@ -8,6 +8,7 @@
 #include <byteswap.h>
 #include <stdint.h>
 #include <inttypes.h>
+#include <linux/limits.h>
 #include <stdio.h>
 #include <string.h>
 #include <sys/syscall.h>
@@ -50,70 +51,16 @@ struct region {
 	struct region *next;
 };
 
-int read_entire_file(int fd, char **buf, size_t *len)
-{
-	size_t buf_size = 0;
-	size_t off = 0;
-	int rc;
-
-	*buf = NULL;
-	do {
-		buf_size += BLOCK_SIZE;
-		if (*buf == NULL)
-			*buf = malloc(buf_size);
-		else
-			*buf = realloc(*buf, buf_size);
-
-		if (*buf == NULL)
-			return -ENOMEM;
-
-		rc = read(fd, *buf + off, BLOCK_SIZE);
-		if (rc < 0)
-			return -EIO;
-
-		off += rc;
-	} while (rc == BLOCK_SIZE);
-
-	if (len)
-		*len = off;
-
-	return 0;
-}
-
-static int open_prop_file(const char *prop_path, const char *prop_name, int *fd)
-{
-	char *path;
-	int len;
-
-	/* allocate enough for two string, a slash and trailing NULL */
-	len = strlen(prop_path) + strlen(prop_name) + 1 + 1;
-	path = malloc(len);
-	if (path == NULL)
-		return -ENOMEM;
-
-	snprintf(path, len, "%s/%s", prop_path, prop_name);
-
-	*fd = open(path, O_RDONLY);
-	free(path);
-	if (*fd < 0)
-		return -errno;
-
-	return 0;
-}
-
 static int get_property(const char *prop_path, const char *prop_name,
 			char **prop_val, size_t *prop_len)
 {
-	int rc, fd;
-
-	rc = open_prop_file(prop_path, prop_name, &fd);
-	if (rc)
-		return rc;
+	char path[PATH_MAX];
 
-	rc = read_entire_file(fd, prop_val, prop_len);
-	close(fd);
+	int len = snprintf(path, sizeof(path), "%s/%s", prop_path, prop_name);
+	if (len < 0 || len >= sizeof(path))
+		return -ENOMEM;
 
-	return rc;
+	return read_file_alloc(path, prop_val, prop_len);
 }
 
 int rtas_token(const char *call_name)
@@ -138,22 +85,14 @@ err:
 static int read_kregion_bounds(struct region *kregion)
 {
 	char *buf;
-	int fd;
-	int rc;
+	int err;
 
-	fd = open("/proc/ppc64/rtas/rmo_buffer", O_RDONLY);
-	if (fd < 0) {
-		printf("Could not open rmo_buffer file\n");
+	err = read_file_alloc("/proc/ppc64/rtas/rmo_buffer", &buf, NULL);
+	if (err) {
+		perror("Could not open rmo_buffer file");
 		return RTAS_IO_ASSERT;
 	}
 
-	rc = read_entire_file(fd, &buf, NULL);
-	close(fd);
-	if (rc) {
-		free(buf);
-		return rc;
-	}
-
 	sscanf(buf, "%" SCNx64 " %x", &kregion->addr, &kregion->size);
 	free(buf);
 
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
index 1017f1738619..7c8cfedb012a 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -65,6 +65,64 @@ out:
 	return err;
 }
 
+int read_file_alloc(const char *path, char **buf, size_t *len)
+{
+	size_t read_offset = 0;
+	size_t buffer_len = 0;
+	char *buffer = NULL;
+	int err;
+	int fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+
+	/*
+	 * We don't use stat & preallocate st_size because some non-files
+	 * report 0 file size. Instead just dynamically grow the buffer
+	 * as needed.
+	 */
+	while (1) {
+		ssize_t rc;
+
+		if (read_offset >= buffer_len / 2) {
+			char *next_buffer;
+
+			buffer_len = buffer_len ? buffer_len * 2 : 4096;
+			next_buffer = realloc(buffer, buffer_len);
+			if (!next_buffer) {
+				err = -errno;
+				goto out;
+			}
+			buffer = next_buffer;
+		}
+
+		rc = read(fd, buffer + read_offset, buffer_len - read_offset);
+		if (rc < 0) {
+			err = -errno;
+			goto out;
+		}
+
+		if (rc == 0)
+			break;
+
+		read_offset += rc;
+	}
+
+	*buf = buffer;
+	if (len)
+		*len = read_offset;
+
+	err = 0;
+
+out:
+	close(fd);
+	if (err)
+		free(buffer);
+	errno = -err;
+	return err;
+}
+
 int write_file(const char *path, const char *buf, size_t count)
 {
 	int fd;
-- 
cgit 


From 2531ba0e4ae67d6d0219400af27805fe52cd28e8 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 9 Feb 2023 15:37:35 +0100
Subject: tools/resolve_btfids: Pass HOSTCFLAGS as EXTRA_CFLAGS to prepare
 targets

Thorsten reported build issue with command line that defined extra
HOSTCFLAGS that were not passed into 'prepare' targets, but were
used to build resolve_btfids objects.

This results in build fail when these objects are linked together:

  /usr/bin/ld: /build.../tools/bpf/resolve_btfids//libbpf/libbpf.a(libbpf-in.o):
  relocation R_X86_64_32 against `.rodata.str1.1' can not be used when making a PIE \
  object; recompile with -fPIE

Fixing this by passing HOSTCFLAGS in EXTRA_CFLAGS as part of
HOST_OVERRIDES variable for prepare targets.

[1] https://lore.kernel.org/bpf/f7922132-6645-6316-5675-0ece4197bfff@leemhuis.info/

Fixes: 56a2df7615fa ("tools/resolve_btfids: Compile resolve_btfids as host program")
Reported-by: Thorsten Leemhuis <linux@leemhuis.info>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Thorsten Leemhuis <linux@leemhuis.info>
Acked-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/bpf/20230209143735.4112845-1-jolsa@kernel.org
---
 tools/bpf/resolve_btfids/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index 2abdd85b4a08..ac548a7baa73 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -19,7 +19,7 @@ endif
 
 # Overrides for the prepare step libraries.
 HOST_OVERRIDES := AR="$(HOSTAR)" CC="$(HOSTCC)" LD="$(HOSTLD)" ARCH="$(HOSTARCH)" \
-		  CROSS_COMPILE=""
+		  CROSS_COMPILE="" EXTRA_CFLAGS="$(HOSTCFLAGS)"
 
 RM      ?= rm
 HOSTCC  ?= gcc
-- 
cgit 


From c12e0d5f267d7eb45a2f8eaa9fd44eaa2871a95e Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 7 Feb 2023 19:44:58 +0100
Subject: self-tests: introduce self-tests for RPS default mask

Ensure that RPS default mask changes take place on
all newly created netns/devices and don't affect
existing ones.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile            |  1 +
 tools/testing/selftests/net/config              |  3 ++
 tools/testing/selftests/net/rps_default_mask.sh | 57 +++++++++++++++++++++++++
 3 files changed, 61 insertions(+)
 create mode 100755 tools/testing/selftests/net/rps_default_mask.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 951bd5342bc6..3364c548a23b 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -46,6 +46,7 @@ TEST_PROGS += stress_reuseport_listen.sh
 TEST_PROGS += l2_tos_ttl_inherit.sh
 TEST_PROGS += bind_bhash.sh
 TEST_PROGS += ip_local_port_range.sh
+TEST_PROGS += rps_default_mask.sh
 TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
 TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
 TEST_GEN_FILES =  socket nettest
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index bd89198cd817..cc9fd55ab869 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -3,6 +3,9 @@ CONFIG_NET_NS=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_TEST_BPF=m
 CONFIG_NUMA=y
+CONFIG_RPS=y
+CONFIG_SYSFS=y
+CONFIG_PROC_SYSCTL=y
 CONFIG_NET_VRF=y
 CONFIG_NET_L3_MASTER_DEV=y
 CONFIG_IPV6=y
diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh
new file mode 100755
index 000000000000..c81c0ac7ddfe
--- /dev/null
+++ b/tools/testing/selftests/net/rps_default_mask.sh
@@ -0,0 +1,57 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+readonly ksft_skip=4
+readonly cpus=$(nproc)
+ret=0
+
+[ $cpus -gt 2 ] || exit $ksft_skip
+
+readonly INITIAL_RPS_DEFAULT_MASK=$(cat /proc/sys/net/core/rps_default_mask)
+readonly NETNS="ns-$(mktemp -u XXXXXX)"
+
+setup() {
+	ip netns add "${NETNS}"
+	ip -netns "${NETNS}" link set lo up
+}
+
+cleanup() {
+	echo $INITIAL_RPS_DEFAULT_MASK > /proc/sys/net/core/rps_default_mask
+	ip netns del $NETNS
+}
+
+chk_rps() {
+	local rps_mask expected_rps_mask=$3
+	local dev_name=$2
+	local msg=$1
+
+	rps_mask=$(ip netns exec $NETNS cat /sys/class/net/$dev_name/queues/rx-0/rps_cpus)
+	printf "%-60s" "$msg"
+	if [ $rps_mask -eq $expected_rps_mask ]; then
+		echo "[ ok ]"
+	else
+		echo "[fail] expected $expected_rps_mask found $rps_mask"
+		ret=1
+	fi
+}
+
+trap cleanup EXIT
+
+echo 0 > /proc/sys/net/core/rps_default_mask
+setup
+chk_rps "empty rps_default_mask" lo 0
+cleanup
+
+echo 1 > /proc/sys/net/core/rps_default_mask
+setup
+chk_rps "non zero rps_default_mask" lo 1
+
+echo 3 > /proc/sys/net/core/rps_default_mask
+chk_rps "changing rps_default_mask dont affect existing netns" lo 1
+
+ip -n $NETNS link add type veth
+ip -n $NETNS link set dev veth0 up
+ip -n $NETNS link set dev veth1 up
+chk_rps "changing rps_default_mask affect newly created devices" veth0 3
+chk_rps "changing rps_default_mask affect newly created devices[II]" veth1 3
+exit $ret
-- 
cgit 


From c21a20d9d102ab809a992a6b056abbec4cd4c1cd Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Wed, 8 Feb 2023 18:14:07 +0100
Subject: selftests: fib_rule_tests: Test UDP and TCP connections with DSCP
 rules.

Add the fib_rule6_send and fib_rule4_send tests to verify that DSCP
values are properly taken into account when UDP or TCP sockets try to
connect().

Tests are done with nettest, which needs a new option to specify
the DS Field value of the socket being tested. This new option is
named '-Q', in reference to the similar option used by ping.

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/fib_rule_tests.sh | 128 +++++++++++++++++++++++++-
 tools/testing/selftests/net/nettest.c         |  51 +++++++++-
 2 files changed, 177 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
index c245476fa29d..63c3eaec8d30 100755
--- a/tools/testing/selftests/net/fib_rule_tests.sh
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -10,8 +10,10 @@ ret=0
 
 PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
 IP="ip -netns testns"
+IP_PEER="ip -netns peerns"
 
 RTABLE=100
+RTABLE_PEER=101
 GW_IP4=192.51.100.2
 SRC_IP=192.51.100.3
 GW_IP6=2001:db8:1::2
@@ -20,7 +22,9 @@ SRC_IP6=2001:db8:1::3
 DEV_ADDR=192.51.100.1
 DEV_ADDR6=2001:db8:1::1
 DEV=dummy0
-TESTS="fib_rule6 fib_rule4"
+TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect"
+
+SELFTEST_PATH=""
 
 log_test()
 {
@@ -52,6 +56,31 @@ log_section()
 	echo "######################################################################"
 }
 
+check_nettest()
+{
+	if which nettest > /dev/null 2>&1; then
+		return 0
+	fi
+
+	# Add the selftest directory to PATH if not already done
+	if [ "${SELFTEST_PATH}" = "" ]; then
+		SELFTEST_PATH="$(dirname $0)"
+		PATH="${PATH}:${SELFTEST_PATH}"
+
+		# Now retry with the new path
+		if which nettest > /dev/null 2>&1; then
+			return 0
+		fi
+
+		if [ "${ret}" -eq 0 ]; then
+			ret="${ksft_skip}"
+		fi
+		echo "nettest not found (try 'make -C ${SELFTEST_PATH} nettest')"
+	fi
+
+	return 1
+}
+
 setup()
 {
 	set -e
@@ -72,6 +101,39 @@ cleanup()
 	ip netns del testns
 }
 
+setup_peer()
+{
+	set -e
+
+	ip netns add peerns
+	$IP_PEER link set dev lo up
+
+	ip link add name veth0 netns testns type veth \
+		peer name veth1 netns peerns
+	$IP link set dev veth0 up
+	$IP_PEER link set dev veth1 up
+
+	$IP address add 192.0.2.10 peer 192.0.2.11/32 dev veth0
+	$IP_PEER address add 192.0.2.11 peer 192.0.2.10/32 dev veth1
+
+	$IP address add 2001:db8::10 peer 2001:db8::11/128 dev veth0 nodad
+	$IP_PEER address add 2001:db8::11 peer 2001:db8::10/128 dev veth1 nodad
+
+	$IP_PEER address add 198.51.100.11/32 dev lo
+	$IP route add table $RTABLE_PEER 198.51.100.11/32 via 192.0.2.11
+
+	$IP_PEER address add 2001:db8::1:11/128 dev lo
+	$IP route add table $RTABLE_PEER 2001:db8::1:11/128 via 2001:db8::11
+
+	set +e
+}
+
+cleanup_peer()
+{
+	$IP link del dev veth0
+	ip netns del peerns
+}
+
 fib_check_iproute_support()
 {
 	ip rule help 2>&1 | grep -q $1
@@ -190,6 +252,37 @@ fib_rule6_test()
 	fi
 }
 
+# Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly
+# taken into account when connecting the socket and when sending packets.
+fib_rule6_connect_test()
+{
+	local dsfield
+
+	if ! check_nettest; then
+		echo "SKIP: Could not run test without nettest tool"
+		return
+	fi
+
+	setup_peer
+	$IP -6 rule add dsfield 0x04 table $RTABLE_PEER
+
+	# Combine the base DS Field value (0x04) with all possible ECN values
+	# (Not-ECT: 0, ECT(1): 1, ECT(0): 2, CE: 3).
+	# The ECN bits shouldn't influence the result of the test.
+	for dsfield in 0x04 0x05 0x06 0x07; do
+		nettest -q -6 -B -t 5 -N testns -O peerns -U -D \
+			-Q "${dsfield}" -l 2001:db8::1:11 -r 2001:db8::1:11
+		log_test $? 0 "rule6 dsfield udp connect (dsfield ${dsfield})"
+
+		nettest -q -6 -B -t 5 -N testns -O peerns -Q "${dsfield}" \
+			-l 2001:db8::1:11 -r 2001:db8::1:11
+		log_test $? 0 "rule6 dsfield tcp connect (dsfield ${dsfield})"
+	done
+
+	$IP -6 rule del dsfield 0x04 table $RTABLE_PEER
+	cleanup_peer
+}
+
 fib_rule4_del()
 {
 	$IP rule del $1
@@ -296,6 +389,37 @@ fib_rule4_test()
 	fi
 }
 
+# Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken
+# into account when connecting the socket and when sending packets.
+fib_rule4_connect_test()
+{
+	local dsfield
+
+	if ! check_nettest; then
+		echo "SKIP: Could not run test without nettest tool"
+		return
+	fi
+
+	setup_peer
+	$IP -4 rule add dsfield 0x04 table $RTABLE_PEER
+
+	# Combine the base DS Field value (0x04) with all possible ECN values
+	# (Not-ECT: 0, ECT(1): 1, ECT(0): 2, CE: 3).
+	# The ECN bits shouldn't influence the result of the test.
+	for dsfield in 0x04 0x05 0x06 0x07; do
+		nettest -q -B -t 5 -N testns -O peerns -D -U -Q "${dsfield}" \
+			-l 198.51.100.11 -r 198.51.100.11
+		log_test $? 0 "rule4 dsfield udp connect (dsfield ${dsfield})"
+
+		nettest -q -B -t 5 -N testns -O peerns -Q "${dsfield}" \
+			-l 198.51.100.11 -r 198.51.100.11
+		log_test $? 0 "rule4 dsfield tcp connect (dsfield ${dsfield})"
+	done
+
+	$IP -4 rule del dsfield 0x04 table $RTABLE_PEER
+	cleanup_peer
+}
+
 run_fibrule_tests()
 {
 	log_section "IPv4 fib rule"
@@ -345,6 +469,8 @@ do
 	case $t in
 	fib_rule6_test|fib_rule6)		fib_rule6_test;;
 	fib_rule4_test|fib_rule4)		fib_rule4_test;;
+	fib_rule6_connect_test|fib_rule6_connect)	fib_rule6_connect_test;;
+	fib_rule4_connect_test|fib_rule4_connect)	fib_rule4_connect_test;;
 
 	help) echo "Test names: $TESTS"; exit 0;;
 
diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c
index 7900fa98eccb..ee9a72982705 100644
--- a/tools/testing/selftests/net/nettest.c
+++ b/tools/testing/selftests/net/nettest.c
@@ -87,6 +87,7 @@ struct sock_args {
 	int use_setsockopt;
 	int use_freebind;
 	int use_cmsg;
+	uint8_t dsfield;
 	const char *dev;
 	const char *server_dev;
 	int ifindex;
@@ -580,6 +581,36 @@ static int set_reuseaddr(int sd)
 	return rc;
 }
 
+static int set_dsfield(int sd, int version, int dsfield)
+{
+	if (!dsfield)
+		return 0;
+
+	switch (version) {
+	case AF_INET:
+		if (setsockopt(sd, SOL_IP, IP_TOS, &dsfield,
+			       sizeof(dsfield)) < 0) {
+			log_err_errno("setsockopt(IP_TOS)");
+			return -1;
+		}
+		break;
+
+	case AF_INET6:
+		if (setsockopt(sd, SOL_IPV6, IPV6_TCLASS, &dsfield,
+			       sizeof(dsfield)) < 0) {
+			log_err_errno("setsockopt(IPV6_TCLASS)");
+			return -1;
+		}
+		break;
+
+	default:
+		log_error("Invalid address family\n");
+		return -1;
+	}
+
+	return 0;
+}
+
 static int str_to_uint(const char *str, int min, int max, unsigned int *value)
 {
 	int number;
@@ -1317,6 +1348,9 @@ static int msock_init(struct sock_args *args, int server)
 		       (char *)&one, sizeof(one)) < 0)
 		log_err_errno("Setting SO_BROADCAST error");
 
+	if (set_dsfield(sd, AF_INET, args->dsfield) != 0)
+		goto out_err;
+
 	if (args->dev && bind_to_device(sd, args->dev) != 0)
 		goto out_err;
 	else if (args->use_setsockopt &&
@@ -1445,6 +1479,9 @@ static int lsock_init(struct sock_args *args)
 	if (set_reuseport(sd) != 0)
 		goto err;
 
+	if (set_dsfield(sd, args->version, args->dsfield) != 0)
+		goto err;
+
 	if (args->dev && bind_to_device(sd, args->dev) != 0)
 		goto err;
 	else if (args->use_setsockopt &&
@@ -1658,6 +1695,9 @@ static int connectsock(void *addr, socklen_t alen, struct sock_args *args)
 	if (set_reuseport(sd) != 0)
 		goto err;
 
+	if (set_dsfield(sd, args->version, args->dsfield) != 0)
+		goto err;
+
 	if (args->dev && bind_to_device(sd, args->dev) != 0)
 		goto err;
 	else if (args->use_setsockopt &&
@@ -1862,7 +1902,7 @@ static int ipc_parent(int cpid, int fd, struct sock_args *args)
 	return client_status;
 }
 
-#define GETOPT_STR  "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SUCi6xL:0:1:2:3:Fbqf"
+#define GETOPT_STR  "sr:l:c:Q:p:t:g:P:DRn:M:X:m:d:I:BN:O:SUCi6xL:0:1:2:3:Fbqf"
 #define OPT_FORCE_BIND_KEY_IFINDEX 1001
 #define OPT_NO_BIND_KEY_IFINDEX 1002
 
@@ -1893,6 +1933,8 @@ static void print_usage(char *prog)
 	"    -D|R          datagram (D) / raw (R) socket (default stream)\n"
 	"    -l addr       local address to bind to in server mode\n"
 	"    -c addr       local address to bind to in client mode\n"
+	"    -Q dsfield    DS Field value of the socket (the IP_TOS or\n"
+	"                  IPV6_TCLASS socket option)\n"
 	"    -x            configure XFRM policy on socket\n"
 	"\n"
 	"    -d dev        bind socket to given device name\n"
@@ -1971,6 +2013,13 @@ int main(int argc, char *argv[])
 			args.has_local_ip = 1;
 			args.client_local_addr_str = optarg;
 			break;
+		case 'Q':
+			if (str_to_uint(optarg, 0, 255, &tmp) != 0) {
+				fprintf(stderr, "Invalid DS Field\n");
+				return 1;
+			}
+			args.dsfield = tmp;
+			break;
 		case 'p':
 			if (str_to_uint(optarg, 1, 65535, &tmp) != 0) {
 				fprintf(stderr, "Invalid port\n");
-- 
cgit 


From 795deb3f97472942780283e2af8f38625e3329aa Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 10 Feb 2023 01:11:55 +0100
Subject: selftests/bpf: Quote host tools

Using HOSTCC="ccache clang" breaks building the tests, since, when it's
forwarded to e.g. bpftool, the child make sees HOSTCC=ccache and
"clang" is considered a target. Fix by quoting it, and also HOSTLD and
HOSTAR for consistency.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230210001210.395194-2-iii@linux.ibm.com
---
 tools/testing/selftests/bpf/Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index b2eb3201b85a..49bc1ba12d3a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -248,7 +248,7 @@ BPFTOOL ?= $(DEFAULT_BPFTOOL)
 $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
 		    $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool
 	$(Q)$(MAKE) $(submake_extras)  -C $(BPFTOOLDIR)			       \
-		    ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) 	       \
+		    ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" 	       \
 		    EXTRA_CFLAGS='-g -O0'				       \
 		    OUTPUT=$(HOST_BUILD_DIR)/bpftool/			       \
 		    LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/		       \
@@ -280,7 +280,8 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
 		| $(HOST_BUILD_DIR)/libbpf
 	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR)                             \
 		    EXTRA_CFLAGS='-g -O0' ARCH= CROSS_COMPILE=		       \
-		    OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \
+		    OUTPUT=$(HOST_BUILD_DIR)/libbpf/			       \
+		    CC="$(HOSTCC)" LD="$(HOSTLD)"			       \
 		    DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers
 endif
 
@@ -301,7 +302,7 @@ $(RESOLVE_BTFIDS): $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/resolve_btfids	\
 		       $(TOOLSDIR)/lib/ctype.c			\
 		       $(TOOLSDIR)/lib/str_error_r.c
 	$(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/resolve_btfids	\
-		CC=$(HOSTCC) LD=$(HOSTLD) AR=$(HOSTAR) \
+		CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" \
 		LIBBPF_INCLUDE=$(HOST_INCLUDE_DIR) \
 		OUTPUT=$(HOST_BUILD_DIR)/resolve_btfids/ BPFOBJ=$(HOST_BPFOBJ)
 
-- 
cgit 


From 585bf4640ebe9ea4730b5b603347b4ba39731546 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 10 Feb 2023 01:11:56 +0100
Subject: tools: runqslower: Add EXTRA_CFLAGS and EXTRA_LDFLAGS support

This makes it possible to add sanitizer flags.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230210001210.395194-3-iii@linux.ibm.com
---
 tools/bpf/runqslower/Makefile | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile
index 8b3d87b82b7a..47acf6936516 100644
--- a/tools/bpf/runqslower/Makefile
+++ b/tools/bpf/runqslower/Makefile
@@ -13,6 +13,8 @@ BPF_DESTDIR := $(BPFOBJ_OUTPUT)
 BPF_INCLUDE := $(BPF_DESTDIR)/include
 INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../include/uapi)
 CFLAGS := -g -Wall $(CLANG_CROSS_FLAGS)
+CFLAGS += $(EXTRA_CFLAGS)
+LDFLAGS += $(EXTRA_LDFLAGS)
 
 # Try to detect best kernel BTF source
 KERNEL_REL := $(shell uname -r)
-- 
cgit 


From 0589d16475ae69da30dde8d0064b3b834ee25310 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 10 Feb 2023 01:11:57 +0100
Subject: selftests/bpf: Split SAN_CFLAGS and SAN_LDFLAGS

Memory Sanitizer requires passing different options to CFLAGS and
LDFLAGS: besides the mandatory -fsanitize=memory, one needs to pass
header and library paths, and passing -L to a compilation step
triggers -Wunused-command-line-argument. So introduce a separate
variable for linker flags. Use $(SAN_CFLAGS) as a default in order to
avoid complicating the ASan usage.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230210001210.395194-4-iii@linux.ibm.com
---
 tools/testing/selftests/bpf/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 49bc1ba12d3a..9b5786ac676e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -22,10 +22,11 @@ endif
 
 BPF_GCC		?= $(shell command -v bpf-gcc;)
 SAN_CFLAGS	?=
+SAN_LDFLAGS	?= $(SAN_CFLAGS)
 CFLAGS += -g -O0 -rdynamic -Wall -Werror $(GENFLAGS) $(SAN_CFLAGS)	\
 	  -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)		\
 	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)
-LDFLAGS += $(SAN_CFLAGS)
+LDFLAGS += $(SAN_LDFLAGS)
 LDLIBS += -lelf -lz -lrt -lpthread
 
 # Silence some warnings when compiled with clang
-- 
cgit 


From 24a87b477c65c33841c0511d268f46a226271502 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 10 Feb 2023 01:11:58 +0100
Subject: selftests/bpf: Forward SAN_CFLAGS and SAN_LDFLAGS to runqslower and
 libbpf

To get useful results from the Memory Sanitizer, all code running in a
process needs to be instrumented. When building tests with other
sanitizers, it's not strictly necessary, but is also helpful.
So make sure runqslower and libbpf are compiled with SAN_CFLAGS and
linked with SAN_LDFLAGS.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230210001210.395194-5-iii@linux.ibm.com
---
 tools/testing/selftests/bpf/Makefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 9b5786ac676e..c4b5c44cdee2 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -215,7 +215,9 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
 		    OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF)     \
 		    BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/		       \
 		    BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf			       \
-		    BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) &&	       \
+		    BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR)		       \
+		    EXTRA_CFLAGS='-g -O0 $(SAN_CFLAGS)'			       \
+		    EXTRA_LDFLAGS='$(SAN_LDFLAGS)' &&			       \
 		    cp $(RUNQSLOWER_OUTPUT)runqslower $@
 
 TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL)
@@ -272,7 +274,8 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
 	   $(APIDIR)/linux/bpf.h					       \
 	   | $(BUILD_DIR)/libbpf
 	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \
-		    EXTRA_CFLAGS='-g -O0'				       \
+		    EXTRA_CFLAGS='-g -O0 $(SAN_CFLAGS)'			       \
+		    EXTRA_LDFLAGS='$(SAN_LDFLAGS)'			       \
 		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
 
 ifneq ($(BPFOBJ),$(HOST_BPFOBJ))
-- 
cgit 


From 907300c7a66b3b58fc0039402aa14e0b9f11aad6 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 10 Feb 2023 01:11:59 +0100
Subject: selftests/bpf: Attach to fopen()/fclose() in uprobe_autoattach

malloc() and free() may be completely replaced by sanitizers, use
fopen() and fclose() instead.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230210001210.395194-6-iii@linux.ibm.com
---
 .../testing/selftests/bpf/prog_tests/uprobe_autoattach.c | 14 ++++++++------
 .../testing/selftests/bpf/progs/test_uprobe_autoattach.c | 16 ++++++++--------
 2 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
index 82807def0d24..6558c857e620 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
@@ -16,10 +16,10 @@ static noinline int autoattach_trigger_func(int arg1, int arg2, int arg3,
 
 void test_uprobe_autoattach(void)
 {
+	const char *devnull_str = "/dev/null";
 	struct test_uprobe_autoattach *skel;
 	int trigger_ret;
-	size_t malloc_sz = 1;
-	char *mem;
+	FILE *devnull;
 
 	skel = test_uprobe_autoattach__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "skel_open"))
@@ -36,16 +36,18 @@ void test_uprobe_autoattach(void)
 	skel->bss->test_pid = getpid();
 
 	/* trigger & validate shared library u[ret]probes attached by name */
-	mem = malloc(malloc_sz);
+	devnull = fopen(devnull_str, "r");
 
 	ASSERT_EQ(skel->bss->uprobe_byname_parm1, 1, "check_uprobe_byname_parm1");
 	ASSERT_EQ(skel->bss->uprobe_byname_ran, 1, "check_uprobe_byname_ran");
 	ASSERT_EQ(skel->bss->uretprobe_byname_rc, trigger_ret, "check_uretprobe_byname_rc");
 	ASSERT_EQ(skel->bss->uretprobe_byname_ret, trigger_ret, "check_uretprobe_byname_ret");
 	ASSERT_EQ(skel->bss->uretprobe_byname_ran, 2, "check_uretprobe_byname_ran");
-	ASSERT_EQ(skel->bss->uprobe_byname2_parm1, malloc_sz, "check_uprobe_byname2_parm1");
+	ASSERT_EQ(skel->bss->uprobe_byname2_parm1, (__u64)(long)devnull_str,
+		  "check_uprobe_byname2_parm1");
 	ASSERT_EQ(skel->bss->uprobe_byname2_ran, 3, "check_uprobe_byname2_ran");
-	ASSERT_EQ(skel->bss->uretprobe_byname2_rc, mem, "check_uretprobe_byname2_rc");
+	ASSERT_EQ(skel->bss->uretprobe_byname2_rc, (__u64)(long)devnull,
+		  "check_uretprobe_byname2_rc");
 	ASSERT_EQ(skel->bss->uretprobe_byname2_ran, 4, "check_uretprobe_byname2_ran");
 
 	ASSERT_EQ(skel->bss->a[0], 1, "arg1");
@@ -67,7 +69,7 @@ void test_uprobe_autoattach(void)
 	ASSERT_EQ(skel->bss->a[7], 8, "arg8");
 #endif
 
-	free(mem);
+	fclose(devnull);
 cleanup:
 	test_uprobe_autoattach__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c
index 774ddeb45898..da4bf89d004c 100644
--- a/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c
+++ b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c
@@ -13,9 +13,9 @@ int uprobe_byname_ran = 0;
 int uretprobe_byname_rc = 0;
 int uretprobe_byname_ret = 0;
 int uretprobe_byname_ran = 0;
-size_t uprobe_byname2_parm1 = 0;
+u64 uprobe_byname2_parm1 = 0;
 int uprobe_byname2_ran = 0;
-char *uretprobe_byname2_rc = NULL;
+u64 uretprobe_byname2_rc = 0;
 int uretprobe_byname2_ran = 0;
 
 int test_pid;
@@ -88,28 +88,28 @@ int BPF_URETPROBE(handle_uretprobe_byname, int ret)
 }
 
 
-SEC("uprobe/libc.so.6:malloc")
-int handle_uprobe_byname2(struct pt_regs *ctx)
+SEC("uprobe/libc.so.6:fopen")
+int BPF_UPROBE(handle_uprobe_byname2, const char *pathname, const char *mode)
 {
 	int pid = bpf_get_current_pid_tgid() >> 32;
 
 	/* ignore irrelevant invocations */
 	if (test_pid != pid)
 		return 0;
-	uprobe_byname2_parm1 = PT_REGS_PARM1_CORE(ctx);
+	uprobe_byname2_parm1 = (u64)(long)pathname;
 	uprobe_byname2_ran = 3;
 	return 0;
 }
 
-SEC("uretprobe/libc.so.6:malloc")
-int handle_uretprobe_byname2(struct pt_regs *ctx)
+SEC("uretprobe/libc.so.6:fopen")
+int BPF_URETPROBE(handle_uretprobe_byname2, void *ret)
 {
 	int pid = bpf_get_current_pid_tgid() >> 32;
 
 	/* ignore irrelevant invocations */
 	if (test_pid != pid)
 		return 0;
-	uretprobe_byname2_rc = (char *)PT_REGS_RC_CORE(ctx);
+	uretprobe_byname2_rc = (u64)(long)ret;
 	uretprobe_byname2_ran = 4;
 	return 0;
 }
-- 
cgit 


From 202702e890a41412a7de970b84a970ba1d5001c9 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 10 Feb 2023 01:12:00 +0100
Subject: selftests/bpf: Attach to fopen()/fclose() in attach_probe

malloc() and free() may be completely replaced by sanitizers, use
fopen() and fclose() instead.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230210001210.395194-7-iii@linux.ibm.com
---
 tools/testing/selftests/bpf/prog_tests/attach_probe.c | 10 +++++-----
 tools/testing/selftests/bpf/progs/test_attach_probe.c | 11 ++++++-----
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c
index 9566d9d2f6ee..56374c8b5436 100644
--- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c
+++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c
@@ -33,8 +33,8 @@ void test_attach_probe(void)
 	struct test_attach_probe* skel;
 	ssize_t uprobe_offset, ref_ctr_offset;
 	struct bpf_link *uprobe_err_link;
+	FILE *devnull;
 	bool legacy;
-	char *mem;
 
 	/* Check if new-style kprobe/uprobe API is supported.
 	 * Kernels that support new FD-based kprobe and uprobe BPF attachment
@@ -147,7 +147,7 @@ void test_attach_probe(void)
 	/* test attach by name for a library function, using the library
 	 * as the binary argument. libc.so.6 will be resolved via dlopen()/dlinfo().
 	 */
-	uprobe_opts.func_name = "malloc";
+	uprobe_opts.func_name = "fopen";
 	uprobe_opts.retprobe = false;
 	skel->links.handle_uprobe_byname2 =
 			bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe_byname2,
@@ -157,7 +157,7 @@ void test_attach_probe(void)
 	if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname2, "attach_uprobe_byname2"))
 		goto cleanup;
 
-	uprobe_opts.func_name = "free";
+	uprobe_opts.func_name = "fclose";
 	uprobe_opts.retprobe = true;
 	skel->links.handle_uretprobe_byname2 =
 			bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe_byname2,
@@ -195,8 +195,8 @@ void test_attach_probe(void)
 	usleep(1);
 
 	/* trigger & validate shared library u[ret]probes attached by name */
-	mem = malloc(1);
-	free(mem);
+	devnull = fopen("/dev/null", "r");
+	fclose(devnull);
 
 	/* trigger & validate uprobe & uretprobe */
 	trigger_func();
diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c
index a1e45fec8938..3b5dc34d23e9 100644
--- a/tools/testing/selftests/bpf/progs/test_attach_probe.c
+++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c
@@ -92,18 +92,19 @@ int handle_uretprobe_byname(struct pt_regs *ctx)
 }
 
 SEC("uprobe")
-int handle_uprobe_byname2(struct pt_regs *ctx)
+int BPF_UPROBE(handle_uprobe_byname2, const char *pathname, const char *mode)
 {
-	unsigned int size = PT_REGS_PARM1(ctx);
+	char mode_buf[2] = {};
 
-	/* verify malloc size */
-	if (size == 1)
+	/* verify fopen mode */
+	bpf_probe_read_user(mode_buf, sizeof(mode_buf), mode);
+	if (mode_buf[0] == 'r' && mode_buf[1] == 0)
 		uprobe_byname2_res = 7;
 	return 0;
 }
 
 SEC("uretprobe")
-int handle_uretprobe_byname2(struct pt_regs *ctx)
+int BPF_URETPROBE(handle_uretprobe_byname2, void *ret)
 {
 	uretprobe_byname2_res = 8;
 	return 0;
-- 
cgit 


From 17bcd27a08a21397698edf143084d7c87ce17946 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 10 Feb 2023 01:12:01 +0100
Subject: libbpf: Fix alen calculation in libbpf_nla_dump_errormsg()

The code assumes that everything that comes after nlmsgerr are nlattrs.
When calculating their size, it does not account for the initial
nlmsghdr. This may lead to accessing uninitialized memory.

Fixes: bbf48c18ee0c ("libbpf: add error reporting in XDP")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230210001210.395194-8-iii@linux.ibm.com
---
 tools/lib/bpf/nlattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
index 3900d052ed19..975e265eab3b 100644
--- a/tools/lib/bpf/nlattr.c
+++ b/tools/lib/bpf/nlattr.c
@@ -178,7 +178,7 @@ int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh)
 		hlen += nlmsg_len(&err->msg);
 
 	attr = (struct nlattr *) ((void *) err + hlen);
-	alen = nlh->nlmsg_len - hlen;
+	alen = (void *)nlh + nlh->nlmsg_len - (void *)attr;
 
 	if (libbpf_nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen,
 			     extack_policy) != 0) {
-- 
cgit 


From 3d8f7ccaa611a743ae3a1e6f605346993d37c513 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 10 Feb 2023 01:06:45 -0800
Subject: tools/testing/cxl: Define a fixed volatile configuration to parse

Take two endpoints attached to the first switch on the first host-bridge
in the cxl_test topology and define a pre-initialized region. This is a
x2 interleave underneath a x1 CXL Window.

$ modprobe cxl_test
$ # cxl list -Ru
{
  "region":"region3",
  "resource":"0xf010000000",
  "size":"512.00 MiB (536.87 MB)",
  "interleave_ways":2,
  "interleave_granularity":4096,
  "decode_state":"commit"
}

Tested-by: Fan Ni <fan.ni@samsung.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/167602000547.1924368.11613151863880268868.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/core.h      |   3 -
 drivers/cxl/core/hdm.c       |   3 +-
 drivers/cxl/core/port.c      |   2 +
 drivers/cxl/cxl.h            |   2 +
 drivers/cxl/cxlmem.h         |   3 +
 tools/testing/cxl/test/cxl.c | 147 ++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 146 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 5eb873da5a30..479f01da6d35 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -57,9 +57,6 @@ resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
 resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled);
 extern struct rw_semaphore cxl_dpa_rwsem;
 
-bool is_switch_decoder(struct device *dev);
-struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev);
-
 int cxl_memdev_init(void);
 void cxl_memdev_exit(void);
 void cxl_mbox_init(void);
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 8c29026a4b9d..80eccae6ba9e 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -279,7 +279,7 @@ success:
 	return 0;
 }
 
-static int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 				resource_size_t base, resource_size_t len,
 				resource_size_t skipped)
 {
@@ -295,6 +295,7 @@ static int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 
 	return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled);
 }
+EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, CXL);
 
 resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled)
 {
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 59620528571a..b45d2796ef35 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -458,6 +458,7 @@ bool is_switch_decoder(struct device *dev)
 {
 	return is_root_decoder(dev) || dev->type == &cxl_decoder_switch_type;
 }
+EXPORT_SYMBOL_NS_GPL(is_switch_decoder, CXL);
 
 struct cxl_decoder *to_cxl_decoder(struct device *dev)
 {
@@ -485,6 +486,7 @@ struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev)
 		return NULL;
 	return container_of(dev, struct cxl_switch_decoder, cxld.dev);
 }
+EXPORT_SYMBOL_NS_GPL(to_cxl_switch_decoder, CXL);
 
 static void cxl_ep_release(struct cxl_ep *ep)
 {
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index c8ee4bb8cce6..2ac344235235 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -653,8 +653,10 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
 
 struct cxl_decoder *to_cxl_decoder(struct device *dev);
 struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev);
+struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev);
 struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev);
 bool is_root_decoder(struct device *dev);
+bool is_switch_decoder(struct device *dev);
 bool is_endpoint_decoder(struct device *dev);
 struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
 						unsigned int nr_targets,
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index c9da3c699a21..bf7d4c5c8612 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -81,6 +81,9 @@ static inline bool is_cxl_endpoint(struct cxl_port *port)
 }
 
 struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds);
+int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+			 resource_size_t base, resource_size_t len,
+			 resource_size_t skipped);
 
 static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
 					 struct cxl_memdev *cxlmd)
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 920bd969c554..5342f69d70d2 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -703,6 +703,142 @@ static int mock_decoder_reset(struct cxl_decoder *cxld)
 	return 0;
 }
 
+static void default_mock_decoder(struct cxl_decoder *cxld)
+{
+	cxld->hpa_range = (struct range){
+		.start = 0,
+		.end = -1,
+	};
+
+	cxld->interleave_ways = 1;
+	cxld->interleave_granularity = 256;
+	cxld->target_type = CXL_DECODER_EXPANDER;
+	cxld->commit = mock_decoder_commit;
+	cxld->reset = mock_decoder_reset;
+}
+
+static int first_decoder(struct device *dev, void *data)
+{
+	struct cxl_decoder *cxld;
+
+	if (!is_switch_decoder(dev))
+		return 0;
+	cxld = to_cxl_decoder(dev);
+	if (cxld->id == 0)
+		return 1;
+	return 0;
+}
+
+static void mock_init_hdm_decoder(struct cxl_decoder *cxld)
+{
+	struct acpi_cedt_cfmws *window = mock_cfmws[0];
+	struct platform_device *pdev = NULL;
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_switch_decoder *cxlsd;
+	struct cxl_port *port, *iter;
+	const int size = SZ_512M;
+	struct cxl_memdev *cxlmd;
+	struct cxl_dport *dport;
+	struct device *dev;
+	bool hb0 = false;
+	u64 base;
+	int i;
+
+	if (is_endpoint_decoder(&cxld->dev)) {
+		cxled = to_cxl_endpoint_decoder(&cxld->dev);
+		cxlmd = cxled_to_memdev(cxled);
+		WARN_ON(!dev_is_platform(cxlmd->dev.parent));
+		pdev = to_platform_device(cxlmd->dev.parent);
+
+		/* check is endpoint is attach to host-bridge0 */
+		port = cxled_to_port(cxled);
+		do {
+			if (port->uport == &cxl_host_bridge[0]->dev) {
+				hb0 = true;
+				break;
+			}
+			if (is_cxl_port(port->dev.parent))
+				port = to_cxl_port(port->dev.parent);
+			else
+				port = NULL;
+		} while (port);
+		port = cxled_to_port(cxled);
+	}
+
+	/*
+	 * The first decoder on the first 2 devices on the first switch
+	 * attached to host-bridge0 mock a fake / static RAM region. All
+	 * other decoders are default disabled. Given the round robin
+	 * assignment those devices are named cxl_mem.0, and cxl_mem.4.
+	 *
+	 * See 'cxl list -BMPu -m cxl_mem.0,cxl_mem.4'
+	 */
+	if (!hb0 || pdev->id % 4 || pdev->id > 4 || cxld->id > 0) {
+		default_mock_decoder(cxld);
+		return;
+	}
+
+	base = window->base_hpa;
+	cxld->hpa_range = (struct range) {
+		.start = base,
+		.end = base + size - 1,
+	};
+
+	cxld->interleave_ways = 2;
+	eig_to_granularity(window->granularity, &cxld->interleave_granularity);
+	cxld->target_type = CXL_DECODER_EXPANDER;
+	cxld->flags = CXL_DECODER_F_ENABLE;
+	cxled->state = CXL_DECODER_STATE_AUTO;
+	port->commit_end = cxld->id;
+	devm_cxl_dpa_reserve(cxled, 0, size / cxld->interleave_ways, 0);
+	cxld->commit = mock_decoder_commit;
+	cxld->reset = mock_decoder_reset;
+
+	/*
+	 * Now that endpoint decoder is set up, walk up the hierarchy
+	 * and setup the switch and root port decoders targeting @cxlmd.
+	 */
+	iter = port;
+	for (i = 0; i < 2; i++) {
+		dport = iter->parent_dport;
+		iter = dport->port;
+		dev = device_find_child(&iter->dev, NULL, first_decoder);
+		/*
+		 * Ancestor ports are guaranteed to be enumerated before
+		 * @port, and all ports have at least one decoder.
+		 */
+		if (WARN_ON(!dev))
+			continue;
+		cxlsd = to_cxl_switch_decoder(dev);
+		if (i == 0) {
+			/* put cxl_mem.4 second in the decode order */
+			if (pdev->id == 4)
+				cxlsd->target[1] = dport;
+			else
+				cxlsd->target[0] = dport;
+		} else
+			cxlsd->target[0] = dport;
+		cxld = &cxlsd->cxld;
+		cxld->target_type = CXL_DECODER_EXPANDER;
+		cxld->flags = CXL_DECODER_F_ENABLE;
+		iter->commit_end = 0;
+		/*
+		 * Switch targets 2 endpoints, while host bridge targets
+		 * one root port
+		 */
+		if (i == 0)
+			cxld->interleave_ways = 2;
+		else
+			cxld->interleave_ways = 1;
+		cxld->interleave_granularity = 256;
+		cxld->hpa_range = (struct range) {
+			.start = base,
+			.end = base + size - 1,
+		};
+		put_device(dev);
+	}
+}
+
 static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
 {
 	struct cxl_port *port = cxlhdm->port;
@@ -748,16 +884,7 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
 			cxld = &cxled->cxld;
 		}
 
-		cxld->hpa_range = (struct range) {
-			.start = 0,
-			.end = -1,
-		};
-
-		cxld->interleave_ways = min_not_zero(target_count, 1);
-		cxld->interleave_granularity = SZ_4K;
-		cxld->target_type = CXL_DECODER_EXPANDER;
-		cxld->commit = mock_decoder_commit;
-		cxld->reset = mock_decoder_reset;
+		mock_init_hdm_decoder(cxld);
 
 		if (target_count) {
 			rc = device_for_each_child(port->uport, &ctx,
-- 
cgit 


From 049139126ec7ab55b568b2935aa7db5e6657258c Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 9 Feb 2023 09:18:52 +0200
Subject: selftests: forwarding: Add MDB dump test cases

The kernel maintains three markers for the MDB dump:

1. The last bridge device from which the MDB was dumped.
2. The last MDB entry from which the MDB was dumped.
3. The last port-group entry that was dumped.

Add test cases for large scale MDB dump to make sure that all the
configured entries are dumped and that the markers are used correctly.

Specifically, create 2 bridges with 32 ports and add 256 MDB entries in
which all the ports are member of. Test that each bridge reports 8192
(256 * 32) permanent entries. Do that with IPv4, IPv6 and L2 MDB
entries.

On my system, MDB dump of the above is contained in about 50 netlink
messages.

Example output:

 # ./bridge_mdb.sh
 [...]
 INFO: # Large scale dump tests
 TEST: IPv4 large scale dump tests                                   [ OK ]
 TEST: IPv6 large scale dump tests                                   [ OK ]
 TEST: L2 large scale dump tests                                     [ OK ]
 [...]

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../testing/selftests/net/forwarding/bridge_mdb.sh | 99 ++++++++++++++++++++++
 1 file changed, 99 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
index b48867d8cadf..ae3f9462a2b6 100755
--- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh
@@ -742,10 +742,109 @@ cfg_test_port()
 	cfg_test_port_l2
 }
 
+ipv4_grps_get()
+{
+	local max_grps=$1; shift
+	local i
+
+	for i in $(seq 0 $((max_grps - 1))); do
+		echo "239.1.1.$i"
+	done
+}
+
+ipv6_grps_get()
+{
+	local max_grps=$1; shift
+	local i
+
+	for i in $(seq 0 $((max_grps - 1))); do
+		echo "ff0e::$(printf %x $i)"
+	done
+}
+
+l2_grps_get()
+{
+	local max_grps=$1; shift
+	local i
+
+	for i in $(seq 0 $((max_grps - 1))); do
+		echo "01:00:00:00:00:$(printf %02x $i)"
+	done
+}
+
+cfg_test_dump_common()
+{
+	local name=$1; shift
+	local fn=$1; shift
+	local max_bridges=2
+	local max_grps=256
+	local max_ports=32
+	local num_entries
+	local batch_file
+	local grp
+	local i j
+
+	RET=0
+
+	# Create net devices.
+	for i in $(seq 1 $max_bridges); do
+		ip link add name br-test${i} up type bridge vlan_filtering 1 \
+			mcast_snooping 1
+		for j in $(seq 1 $max_ports); do
+			ip link add name br-test${i}-du${j} up \
+				master br-test${i} type dummy
+		done
+	done
+
+	# Create batch file with MDB entries.
+	batch_file=$(mktemp)
+	for i in $(seq 1 $max_bridges); do
+		for j in $(seq 1 $max_ports); do
+			for grp in $($fn $max_grps); do
+				echo "mdb add dev br-test${i} \
+					port br-test${i}-du${j} grp $grp \
+					permanent vid 1" >> $batch_file
+			done
+		done
+	done
+
+	# Program the batch file and check for expected number of entries.
+	bridge -b $batch_file
+	for i in $(seq 1 $max_bridges); do
+		num_entries=$(bridge mdb show dev br-test${i} | \
+			grep "permanent" | wc -l)
+		[[ $num_entries -eq $((max_grps * max_ports)) ]]
+		check_err $? "Wrong number of entries in br-test${i}"
+	done
+
+	# Cleanup.
+	rm $batch_file
+	for i in $(seq 1 $max_bridges); do
+		ip link del dev br-test${i}
+		for j in $(seq $max_ports); do
+			ip link del dev br-test${i}-du${j}
+		done
+	done
+
+	log_test "$name large scale dump tests"
+}
+
+# Check large scale dump.
+cfg_test_dump()
+{
+	echo
+	log_info "# Large scale dump tests"
+
+	cfg_test_dump_common "IPv4" ipv4_grps_get
+	cfg_test_dump_common "IPv6" ipv6_grps_get
+	cfg_test_dump_common "L2" l2_grps_get
+}
+
 cfg_test()
 {
 	cfg_test_host
 	cfg_test_port
+	cfg_test_dump
 }
 
 __fwd_test_host_ip()
-- 
cgit 


From 443ed4c302fff6a26af980300463343a7adc9ee8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 30 Jan 2023 15:21:02 +0100
Subject: objtool: mem*() are not uaccess safe

For mysterious raisins I listed the new __asan_mem*() functions as
being uaccess safe, this is giving objtool fails on KASAN builds
because these functions call out to the actual __mem*() functions
which are not marked uaccess safe.

Removing it doesn't make the robots unhappy.

Fixes: 69d4c0d32186 ("entry, kasan, x86: Disallow overriding mem*() functions")
Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
Bisected-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230126182302.GA687063@paulmck-ThinkPad-P17-Gen-1
---
 tools/objtool/check.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 10b5bb4b7886..b118f588cd2b 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1083,9 +1083,6 @@ static const char *uaccess_safe_builtin[] = {
 	"__asan_store16_noabort",
 	"__kasan_check_read",
 	"__kasan_check_write",
-	"__asan_memset",
-	"__asan_memmove",
-	"__asan_memcpy",
 	/* KASAN in-line */
 	"__asan_report_load_n_noabort",
 	"__asan_report_load1_noabort",
-- 
cgit 


From ffb1b4a41016295e298409c9dbcacd55680bd6d4 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Fri, 10 Feb 2023 14:42:01 -0800
Subject: x86/unwind/orc: Add 'signal' field to ORC metadata

Add a 'signal' field which allows unwind hints to specify whether the
instruction pointer should be taken literally (like for most interrupts
and exceptions) rather than decremented (like for call stack return
addresses) when used to find the next ORC entry.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/d2c5ec4d83a45b513d8fd72fab59f1a8cfa46871.1676068346.git.jpoimboe@kernel.org
---
 arch/x86/include/asm/orc_types.h       |  4 +++-
 arch/x86/include/asm/unwind_hints.h    | 10 +++++-----
 arch/x86/kernel/unwind_orc.c           |  5 ++---
 include/linux/objtool.h                | 11 +++++++----
 tools/arch/x86/include/asm/orc_types.h |  4 +++-
 tools/include/linux/objtool.h          | 11 +++++++----
 tools/objtool/orc_dump.c               |  4 ++--
 7 files changed, 29 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index 5a2baf28a1dc..1343a62106de 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -57,12 +57,14 @@ struct orc_entry {
 	unsigned	sp_reg:4;
 	unsigned	bp_reg:4;
 	unsigned	type:2;
+	unsigned	signal:1;
 	unsigned	end:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	unsigned	bp_reg:4;
 	unsigned	sp_reg:4;
-	unsigned	unused:5;
+	unsigned	unused:4;
 	unsigned	end:1;
+	unsigned	signal:1;
 	unsigned	type:2;
 #endif
 } __packed;
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index f66fbe6537dd..e7c71750b309 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -15,7 +15,7 @@
 	UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1
 .endm
 
-.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1
 	.if \base == %rsp
 		.if \indirect
 			.set sp_reg, ORC_REG_SP_INDIRECT
@@ -45,11 +45,11 @@
 		.set type, UNWIND_HINT_TYPE_REGS
 	.endif
 
-	UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+	UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type signal=\signal
 .endm
 
-.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
-	UNWIND_HINT_REGS base=\base offset=\offset partial=1
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 signal=1
+	UNWIND_HINT_REGS base=\base offset=\offset partial=1 signal=\signal
 .endm
 
 .macro UNWIND_HINT_FUNC
@@ -67,7 +67,7 @@
 #else
 
 #define UNWIND_HINT_FUNC \
-	UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0)
+	UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0, 0)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index cdf6c6060170..37307b40f8da 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -484,6 +484,8 @@ bool unwind_next_frame(struct unwind_state *state)
 		goto the_end;
 	}
 
+	state->signal = orc->signal;
+
 	/* Find the previous frame's stack: */
 	switch (orc->sp_reg) {
 	case ORC_REG_SP:
@@ -563,7 +565,6 @@ bool unwind_next_frame(struct unwind_state *state)
 		state->sp = sp;
 		state->regs = NULL;
 		state->prev_regs = NULL;
-		state->signal = false;
 		break;
 
 	case UNWIND_HINT_TYPE_REGS:
@@ -587,7 +588,6 @@ bool unwind_next_frame(struct unwind_state *state)
 		state->regs = (struct pt_regs *)sp;
 		state->prev_regs = NULL;
 		state->full_regs = true;
-		state->signal = true;
 		break;
 
 	case UNWIND_HINT_TYPE_REGS_PARTIAL:
@@ -604,7 +604,6 @@ bool unwind_next_frame(struct unwind_state *state)
 			state->prev_regs = state->regs;
 		state->regs = (void *)sp - IRET_FRAME_OFFSET;
 		state->full_regs = false;
-		state->signal = true;
 		break;
 
 	default:
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 62c54ffbeeaa..9ac3df3fccf0 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -15,6 +15,7 @@ struct unwind_hint {
 	s16		sp_offset;
 	u8		sp_reg;
 	u8		type;
+	u8		signal;
 	u8		end;
 };
 #endif
@@ -49,7 +50,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)		\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end)	\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
 	/* struct unwind_hint */				\
@@ -57,6 +58,7 @@ struct unwind_hint {
 	".short " __stringify(sp_offset) "\n\t"			\
 	".byte " __stringify(sp_reg) "\n\t"			\
 	".byte " __stringify(type) "\n\t"			\
+	".byte " __stringify(signal) "\n\t"			\
 	".byte " __stringify(end) "\n\t"			\
 	".balign 4 \n\t"					\
 	".popsection\n\t"
@@ -129,7 +131,7 @@ struct unwind_hint {
  * the debuginfo as necessary.  It will also warn if it sees any
  * inconsistencies.
  */
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .Lunwind_hint_ip_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
@@ -137,6 +139,7 @@ struct unwind_hint {
 		.short \sp_offset
 		.byte \sp_reg
 		.byte \type
+		.byte \signal
 		.byte \end
 		.balign 4
 	.popsection
@@ -174,7 +177,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)	\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
 	"\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
 #define STACK_FRAME_NON_STANDARD_FP(func)
@@ -182,7 +185,7 @@ struct unwind_hint {
 #define ASM_REACHABLE
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h
index 5a2baf28a1dc..1343a62106de 100644
--- a/tools/arch/x86/include/asm/orc_types.h
+++ b/tools/arch/x86/include/asm/orc_types.h
@@ -57,12 +57,14 @@ struct orc_entry {
 	unsigned	sp_reg:4;
 	unsigned	bp_reg:4;
 	unsigned	type:2;
+	unsigned	signal:1;
 	unsigned	end:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	unsigned	bp_reg:4;
 	unsigned	sp_reg:4;
-	unsigned	unused:5;
+	unsigned	unused:4;
 	unsigned	end:1;
+	unsigned	signal:1;
 	unsigned	type:2;
 #endif
 } __packed;
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index 62c54ffbeeaa..9ac3df3fccf0 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -15,6 +15,7 @@ struct unwind_hint {
 	s16		sp_offset;
 	u8		sp_reg;
 	u8		type;
+	u8		signal;
 	u8		end;
 };
 #endif
@@ -49,7 +50,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)		\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end)	\
 	"987: \n\t"						\
 	".pushsection .discard.unwind_hints\n\t"		\
 	/* struct unwind_hint */				\
@@ -57,6 +58,7 @@ struct unwind_hint {
 	".short " __stringify(sp_offset) "\n\t"			\
 	".byte " __stringify(sp_reg) "\n\t"			\
 	".byte " __stringify(type) "\n\t"			\
+	".byte " __stringify(signal) "\n\t"			\
 	".byte " __stringify(end) "\n\t"			\
 	".balign 4 \n\t"					\
 	".popsection\n\t"
@@ -129,7 +131,7 @@ struct unwind_hint {
  * the debuginfo as necessary.  It will also warn if it sees any
  * inconsistencies.
  */
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .Lunwind_hint_ip_\@:
 	.pushsection .discard.unwind_hints
 		/* struct unwind_hint */
@@ -137,6 +139,7 @@ struct unwind_hint {
 		.short \sp_offset
 		.byte \sp_reg
 		.byte \type
+		.byte \signal
 		.byte \end
 		.balign 4
 	.popsection
@@ -174,7 +177,7 @@ struct unwind_hint {
 
 #ifndef __ASSEMBLY__
 
-#define UNWIND_HINT(sp_reg, sp_offset, type, end)	\
+#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
 	"\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
 #define STACK_FRAME_NON_STANDARD_FP(func)
@@ -182,7 +185,7 @@ struct unwind_hint {
 #define ASM_REACHABLE
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
 .endm
 .macro STACK_FRAME_NON_STANDARD func:req
 .endm
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 4f1211fec82c..2d8ebdcd1db3 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -211,8 +211,8 @@ int orc_dump(const char *_objname)
 
 		print_reg(orc[i].bp_reg, bswap_if_needed(&dummy_elf, orc[i].bp_offset));
 
-		printf(" type:%s end:%d\n",
-		       orc_type_name(orc[i].type), orc[i].end);
+		printf(" type:%s signal:%d end:%d\n",
+		       orc_type_name(orc[i].type), orc[i].signal, orc[i].end);
 	}
 
 	elf_end(elf);
-- 
cgit 


From f697cb00afa90b68748f246540270b5865c801ba Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 25 Nov 2022 07:32:48 +0100
Subject: x86/xen: mark xen_pv_play_dead() as __noreturn

Mark xen_pv_play_dead() and related to that xen_cpu_bringup_again()
as "__noreturn".

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20221125063248.30256-3-jgross@suse.com
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/smp.h    | 2 +-
 arch/x86/xen/smp_pv.c | 4 ++--
 tools/objtool/check.c | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index 6e90a312067b..22fb982ff971 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -21,7 +21,7 @@ void xen_smp_send_reschedule(int cpu);
 void xen_smp_send_call_function_ipi(const struct cpumask *mask);
 void xen_smp_send_call_function_single_ipi(int cpu);
 
-void xen_cpu_bringup_again(unsigned long stack);
+void __noreturn xen_cpu_bringup_again(unsigned long stack);
 
 struct xen_common_irq {
 	int irq;
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index a97b0505a483..a9cf8c8fa074 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -381,7 +381,7 @@ static void xen_pv_cpu_die(unsigned int cpu)
 	}
 }
 
-static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
+static void __noreturn xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
 {
 	play_dead_common();
 	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
@@ -400,7 +400,7 @@ static void xen_pv_cpu_die(unsigned int cpu)
 	BUG();
 }
 
-static void xen_pv_play_dead(void)
+static void __noreturn xen_pv_play_dead(void)
 {
 	BUG();
 }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 4b7c8b33069e..68e87b45caef 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -186,6 +186,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"snp_abort",
 		"stop_this_cpu",
 		"usercopy_abort",
+		"xen_cpu_bringup_again",
 		"xen_start_kernel",
 	};
 
-- 
cgit 


From f80f09b59fdd45753dd80ac623981ad00ece4c2d Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:27 -0500
Subject: selftests: dmabuf-heaps: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/dmabuf-heaps/Makefile      | 2 +-
 tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/dmabuf-heaps/Makefile b/tools/testing/selftests/dmabuf-heaps/Makefile
index 604b43ece15f..9e7e158d5fa3 100644
--- a/tools/testing/selftests/dmabuf-heaps/Makefile
+++ b/tools/testing/selftests/dmabuf-heaps/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-CFLAGS += -static -O3 -Wl,-no-as-needed -Wall
+CFLAGS += -static -O3 -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS = dmabuf-heap
 
diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
index 29af27acd40e..890a8236a8ba 100644
--- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
+++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
@@ -13,10 +13,9 @@
 #include <sys/types.h>
 
 #include <linux/dma-buf.h>
+#include <linux/dma-heap.h>
 #include <drm/drm.h>
 
-#include "../../../../include/uapi/linux/dma-heap.h"
-
 #define DEVPATH "/dev/dma_heap"
 
 static int check_vgem(int fd)
-- 
cgit 


From 07f0148aafe8c95a3a76cd59e9e75b4d78d1d31d Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:28 -0500
Subject: selftests: drivers: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/drivers/dma-buf/Makefile        | 2 +-
 tools/testing/selftests/drivers/s390x/uvdevice/Makefile | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/dma-buf/Makefile b/tools/testing/selftests/drivers/dma-buf/Makefile
index 79cb16b4e01a..441407bb0e80 100644
--- a/tools/testing/selftests/drivers/dma-buf/Makefile
+++ b/tools/testing/selftests/drivers/dma-buf/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -I../../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS := udmabuf
 
diff --git a/tools/testing/selftests/drivers/s390x/uvdevice/Makefile b/tools/testing/selftests/drivers/s390x/uvdevice/Makefile
index 891215a7dc8a..755d164384c4 100644
--- a/tools/testing/selftests/drivers/s390x/uvdevice/Makefile
+++ b/tools/testing/selftests/drivers/s390x/uvdevice/Makefile
@@ -11,10 +11,9 @@ else
 TEST_GEN_PROGS := test_uvdevice
 
 top_srcdir ?= ../../../../../..
-khdr_dir = $(top_srcdir)/usr/include
 LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
 
-CFLAGS += -Wall -Werror -static -I$(khdr_dir) -I$(LINUX_TOOL_ARCH_INCLUDE)
+CFLAGS += -Wall -Werror -static $(KHDR_INCLUDES) -I$(LINUX_TOOL_ARCH_INCLUDE)
 
 include ../../../lib.mk
 
-- 
cgit 


From c2d3cf3653a8ff6e4b402d55e7f84790ac08a8ad Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:29 -0500
Subject: selftests: filesystems: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org> # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/filesystems/Makefile          | 2 +-
 tools/testing/selftests/filesystems/binderfs/Makefile | 2 +-
 tools/testing/selftests/filesystems/epoll/Makefile    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 129880fb42d3..c647fd6a0446 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-CFLAGS += -I../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 TEST_GEN_PROGS := devpts_pts
 TEST_GEN_PROGS_EXTENDED := dnotify_test
 
diff --git a/tools/testing/selftests/filesystems/binderfs/Makefile b/tools/testing/selftests/filesystems/binderfs/Makefile
index 8af25ae96049..c2f7cef919c0 100644
--- a/tools/testing/selftests/filesystems/binderfs/Makefile
+++ b/tools/testing/selftests/filesystems/binderfs/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-CFLAGS += -I../../../../../usr/include/ -pthread
+CFLAGS += $(KHDR_INCLUDES) -pthread
 TEST_GEN_PROGS := binderfs_test
 
 binderfs_test: binderfs_test.c ../../kselftest.h ../../kselftest_harness.h
diff --git a/tools/testing/selftests/filesystems/epoll/Makefile b/tools/testing/selftests/filesystems/epoll/Makefile
index 78ae4aaf7141..0788a7dc8004 100644
--- a/tools/testing/selftests/filesystems/epoll/Makefile
+++ b/tools/testing/selftests/filesystems/epoll/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-CFLAGS += -I../../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 LDLIBS += -lpthread
 TEST_GEN_PROGS := epoll_wakeup_test
 
-- 
cgit 


From 24c55275ba0d538def2b1220002d0e808a85d50f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:30 -0500
Subject: selftests: futex: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/futex/functional/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
index 5a0e0df8de9b..a392d0917b4e 100644
--- a/tools/testing/selftests/futex/functional/Makefile
+++ b/tools/testing/selftests/futex/functional/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-INCLUDES := -I../include -I../../ -I../../../../../usr/include/
+INCLUDES := -I../include -I../../ $(KHDR_INCLUDES)
 CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) $(KHDR_INCLUDES)
 LDLIBS := -lpthread -lrt
 
-- 
cgit 


From 8bb9c1808628babcc7b99ec2439bf102379bd4ac Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:31 -0500
Subject: selftests: gpio: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/gpio/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile
index 616ed4019655..e0884390447d 100644
--- a/tools/testing/selftests/gpio/Makefile
+++ b/tools/testing/selftests/gpio/Makefile
@@ -3,6 +3,6 @@
 TEST_PROGS := gpio-mockup.sh gpio-sim.sh
 TEST_FILES := gpio-mockup-sysfs.sh
 TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev gpio-chip-info gpio-line-name
-CFLAGS += -O2 -g -Wall -I../../../../usr/include/ $(KHDR_INCLUDES)
+CFLAGS += -O2 -g -Wall $(KHDR_INCLUDES)
 
 include ../lib.mk
-- 
cgit 


From ecf9fdb5c2a9d63c732acccb6318feb73dd1589f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:32 -0500
Subject: selftests: ipc: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org> # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ipc/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ipc/Makefile b/tools/testing/selftests/ipc/Makefile
index 1c4448a843a4..50e9c299fc4a 100644
--- a/tools/testing/selftests/ipc/Makefile
+++ b/tools/testing/selftests/ipc/Makefile
@@ -10,7 +10,7 @@ ifeq ($(ARCH),x86_64)
 	CFLAGS := -DCONFIG_X86_64 -D__x86_64__
 endif
 
-CFLAGS += -I../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS := msgque
 
-- 
cgit 


From 5d74231a2caad259f6669d8d6112814cef6bcd60 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:33 -0500
Subject: selftests: kcmp: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/kcmp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile
index b4d39f6b5124..59a1e5379018 100644
--- a/tools/testing/selftests/kcmp/Makefile
+++ b/tools/testing/selftests/kcmp/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -I../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS := kcmp_test
 
-- 
cgit 


From f2f9592b736087f695230410fb8dc1afd3cafbbb Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:34 -0500
Subject: selftests: media_tests: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org> # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/media_tests/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/media_tests/Makefile b/tools/testing/selftests/media_tests/Makefile
index 60826d7d37d4..471d83e61d95 100644
--- a/tools/testing/selftests/media_tests/Makefile
+++ b/tools/testing/selftests/media_tests/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 #
-CFLAGS += -I../ -I../../../../usr/include/
+CFLAGS += -I../ $(KHDR_INCLUDES)
 TEST_GEN_PROGS := media_device_test media_device_open video_device_test
 
 include ../lib.mk
-- 
cgit 


From 498bb027726371ba4a94686d251f9be1d437573e Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:35 -0500
Subject: selftests: membarrier: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/membarrier/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile
index 34d1c81a2324..fc840e06ff56 100644
--- a/tools/testing/selftests/membarrier/Makefile
+++ b/tools/testing/selftests/membarrier/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -g -I../../../../usr/include/
+CFLAGS += -g $(KHDR_INCLUDES)
 LDLIBS += -lpthread
 
 TEST_GEN_PROGS := membarrier_test_single_thread \
-- 
cgit 


From 5d11f2d0eb39d2b5c5e8f05e1f650c4a4de69918 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:36 -0500
Subject: selftests: mount_setattr: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/mount_setattr/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mount_setattr/Makefile b/tools/testing/selftests/mount_setattr/Makefile
index 2250f7dcb81e..fde72df01b11 100644
--- a/tools/testing/selftests/mount_setattr/Makefile
+++ b/tools/testing/selftests/mount_setattr/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for mount selftests.
-CFLAGS = -g -I../../../../usr/include/ -Wall -O2 -pthread
+CFLAGS = -g $(KHDR_INCLUDES) -Wall -O2 -pthread
 
 TEST_GEN_FILES += mount_setattr_test
 
-- 
cgit 


From 65c68af0131bfef8e395c325735b6c40638cb931 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:37 -0500
Subject: selftests: move_mount_set_group: Fix incorrect kernel headers search
 path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/move_mount_set_group/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/move_mount_set_group/Makefile b/tools/testing/selftests/move_mount_set_group/Makefile
index 80c2d86812b0..94235846b6f9 100644
--- a/tools/testing/selftests/move_mount_set_group/Makefile
+++ b/tools/testing/selftests/move_mount_set_group/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for mount selftests.
-CFLAGS = -g -I../../../../usr/include/ -Wall -O2
+CFLAGS = -g $(KHDR_INCLUDES) -Wall -O2
 
 TEST_GEN_FILES += move_mount_set_group_test
 
-- 
cgit 


From 465cbb1b9a9fd5f6907adb2d761facaf1a46bfbe Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:39 -0500
Subject: selftests: perf_events: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/perf_events/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile
index fcafa5f0d34c..db93c4ff081a 100644
--- a/tools/testing/selftests/perf_events/Makefile
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include
+CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
 LDFLAGS += -lpthread
 
 TEST_GEN_PROGS := sigtrap_threads remove_on_exec
-- 
cgit 


From e81ff69f66969a16a98a2e0977c1860f1c182c74 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:40 -0500
Subject: selftests: pid_namespace: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/pid_namespace/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/pid_namespace/Makefile b/tools/testing/selftests/pid_namespace/Makefile
index edafaca1aeb3..9286a1d22cd3 100644
--- a/tools/testing/selftests/pid_namespace/Makefile
+++ b/tools/testing/selftests/pid_namespace/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-CFLAGS += -g -I../../../../usr/include/
+CFLAGS += -g $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS = regression_enomem
 
-- 
cgit 


From 3f7d71768795c386019f2295c1986d00035c9f0f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:41 -0500
Subject: selftests: pidfd: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org> # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/pidfd/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index 778b6cdc8aed..d731e3e76d5b 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -g -I../../../../usr/include/ -pthread -Wall
+CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall
 
 TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
 	pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
-- 
cgit 


From 01ede99e9de16e7a1ed689c99f41022aa878f2f4 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:43 -0500
Subject: selftests: ptp: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org> # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ptp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ptp/Makefile b/tools/testing/selftests/ptp/Makefile
index ef06de0898b7..eeab44cc6863 100644
--- a/tools/testing/selftests/ptp/Makefile
+++ b/tools/testing/selftests/ptp/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-CFLAGS += -I../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 TEST_PROGS := testptp
 LDLIBS += -lrt
 all: $(TEST_PROGS)
-- 
cgit 


From 2279bfc03211045c8f43a76b01889a5ca86acd5a Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:44 -0500
Subject: selftests: rseq: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org> # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/rseq/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index 215e1067f037..3a173e184566 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -4,7 +4,7 @@ ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
 CLANG_FLAGS += -no-integrated-as
 endif
 
-CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L$(OUTPUT) -Wl,-rpath=./ \
+CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -L$(OUTPUT) -Wl,-rpath=./ \
 	  $(CLANG_FLAGS)
 LDLIBS += -lpthread -ldl
 
-- 
cgit 


From 0d2cace5af50806a6b32ab73d367b80e46acda0f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:45 -0500
Subject: selftests: sched: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/sched/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sched/Makefile b/tools/testing/selftests/sched/Makefile
index 10c72f14fea9..099ee9213557 100644
--- a/tools/testing/selftests/sched/Makefile
+++ b/tools/testing/selftests/sched/Makefile
@@ -4,7 +4,7 @@ ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
 CLANG_FLAGS += -no-integrated-as
 endif
 
-CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/  -Wl,-rpath=./ \
+CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -Wl,-rpath=./ \
 	  $(CLANG_FLAGS)
 LDLIBS += -lpthread
 
-- 
cgit 


From 07d42dd854446ba3177ad7a217870a5b4edee165 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:46 -0500
Subject: selftests: seccomp: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/seccomp/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile
index f017c382c036..584fba487037 100644
--- a/tools/testing/selftests/seccomp/Makefile
+++ b/tools/testing/selftests/seccomp/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-CFLAGS += -Wl,-no-as-needed -Wall -isystem ../../../../usr/include/
+CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
 LDFLAGS += -lpthread
 LDLIBS += -lcap
 
-- 
cgit 


From 5ad0c8e42c13623bd996e19ce76f2596e16eb0db Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:47 -0500
Subject: selftests: sync: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/sync/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sync/Makefile b/tools/testing/selftests/sync/Makefile
index d0121a8a3523..df0f91bf6890 100644
--- a/tools/testing/selftests/sync/Makefile
+++ b/tools/testing/selftests/sync/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS += -O2 -g -std=gnu89 -pthread -Wall -Wextra
-CFLAGS += -I../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 LDFLAGS += -pthread
 
 .PHONY: all clean
-- 
cgit 


From f3886fd28987c119a98493f625cb9940b5f1c9a0 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:48 -0500
Subject: selftests: user_events: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org> # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/user_events/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/user_events/Makefile b/tools/testing/selftests/user_events/Makefile
index c765d8635d9a..87d54c640068 100644
--- a/tools/testing/selftests/user_events/Makefile
+++ b/tools/testing/selftests/user_events/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include
+CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
 LDLIBS += -lrt -lpthread -lm
 
 TEST_GEN_PROGS = ftrace_test dyn_test perf_test
-- 
cgit 


From 8eb3751c73bec746f61fb6bada60d1074d92b8c3 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:49 -0500
Subject: selftests: vm: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/vm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 89c14e41bd43..ac9366065fd2 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -25,7 +25,7 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/p
 # LDLIBS.
 MAKEFLAGS += --no-builtin-rules
 
-CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
+CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
 LDLIBS = -lrt -lpthread
 TEST_GEN_FILES = cow
 TEST_GEN_FILES += compaction_test
-- 
cgit 


From ac5ec90e94fe8eddb4499e51398640fa6a89d657 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:50 -0500
Subject: selftests: x86: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: <stable@vger.kernel.org>  # 5.18+
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/x86/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 0388c4d60af0..ca9374b56ead 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -34,7 +34,7 @@ BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
 BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
 BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
 
-CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
+CFLAGS := -O2 -g -std=gnu99 -pthread -Wall $(KHDR_INCLUDES)
 
 # call32_from_64 in thunks.S uses absolute addresses.
 ifeq ($(CAN_BUILD_WITH_NOPIE),1)
-- 
cgit 


From 0d7a91678aaacf8e2f532c3e04a276af06b42d1c Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:52 -0500
Subject: selftests: iommu: Use installed kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for installed kernel headers rather
than using kernel headers in include/uapi from the source kernel tree
kernel headers.

Remove bogus ../../../../include/ from the search path, because
kernel source headers are not needed by those user-space selftests, and
it causes issues because -I paths are searched before -isystem paths,
and conflicts for files appearing both in kernel sources and in uapi
headers with incompatible semantics (e.g. types.h).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/iommu/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile
index 7cb74d26f141..32c5fdfd0eef 100644
--- a/tools/testing/selftests/iommu/Makefile
+++ b/tools/testing/selftests/iommu/Makefile
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 CFLAGS += -Wall -O2 -Wno-unused-function
-CFLAGS += -I../../../../include/uapi/
-CFLAGS += -I../../../../include/
+CFLAGS += $(KHDR_INCLUDES)
 
 CFLAGS += -D_GNU_SOURCE
 
-- 
cgit 


From a24ebb4937034e3b9459a5faf353c26af27f62be Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:53 -0500
Subject: selftests: memfd: Use installed kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for installed kernel headers rather
than using kernel headers in include/uapi from the source kernel tree
kernel headers.

Remove bogus ../../../../include/ from the search path, because
kernel source headers are not needed by those user-space selftests, and
it causes issues because -I paths are searched before -isystem paths,
and conflicts for files appearing both in kernel sources and in uapi
headers with incompatible semantics (e.g. types.h).

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/memfd/Makefile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile
index 4da8b565fa32..163b6f68631c 100644
--- a/tools/testing/selftests/memfd/Makefile
+++ b/tools/testing/selftests/memfd/Makefile
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS += -D_FILE_OFFSET_BITS=64
-CFLAGS += -I../../../../include/uapi/
-CFLAGS += -I../../../../include/
-CFLAGS += -I../../../../usr/include/
+CFLAGS += $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS := memfd_test
 TEST_PROGS := run_fuse_test.sh run_hugetlbfs_test.sh
-- 
cgit 


From 4c983a14238d6d4e95c5783b9d93dd4d4aeccdd9 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:54 -0500
Subject: selftests: ptrace: Use installed kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for installed kernel headers rather
than using kernel headers in include/uapi from the source kernel tree
kernel headers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ptrace/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile
index 2f1f532c39db..96ffa94afb91 100644
--- a/tools/testing/selftests/ptrace/Makefile
+++ b/tools/testing/selftests/ptrace/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -std=c99 -pthread -iquote../../../../include/uapi -Wall
+CFLAGS += -std=c99 -pthread -Wall $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess
 
-- 
cgit 


From 5adbe55c8ba5c2d03add62386e4722d273e5b893 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:55 -0500
Subject: selftests: tdx: Use installed kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for installed kernel headers rather
than using kernel headers in include/uapi from the source kernel tree
kernel headers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/tdx/Makefile         | 2 +-
 tools/testing/selftests/tdx/tdx_guest_test.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/tdx/Makefile b/tools/testing/selftests/tdx/Makefile
index 8dd43517cd55..306e9c4d5ef7 100644
--- a/tools/testing/selftests/tdx/Makefile
+++ b/tools/testing/selftests/tdx/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-CFLAGS += -O3 -Wl,-no-as-needed -Wall -static
+CFLAGS += -O3 -Wl,-no-as-needed -Wall $(KHDR_INCLUDES) -static
 
 TEST_GEN_PROGS := tdx_guest_test
 
diff --git a/tools/testing/selftests/tdx/tdx_guest_test.c b/tools/testing/selftests/tdx/tdx_guest_test.c
index 2a2afd856798..81d8cb88ea1a 100644
--- a/tools/testing/selftests/tdx/tdx_guest_test.c
+++ b/tools/testing/selftests/tdx/tdx_guest_test.c
@@ -12,8 +12,8 @@
 #include <errno.h>
 #include <fcntl.h>
 
+#include <linux/tdx-guest.h>
 #include "../kselftest_harness.h"
-#include "../../../../include/uapi/linux/tdx-guest.h"
 
 #define TDX_GUEST_DEVNAME "/dev/tdx_guest"
 #define HEX_DUMP_SIZE 8
-- 
cgit 


From 4b225d4f067f544b7594ce8a18216e19f0c1d692 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Thu, 2 Feb 2023 12:56:21 +0000
Subject: selftests: Fix spelling mistake "allright" -> "all right"

There are two spelling mistakes in the test messages. Fix them.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/prctl/disable-tsc-ctxt-sw-stress-test.c | 2 +-
 tools/testing/selftests/prctl/disable-tsc-on-off-stress-test.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/prctl/disable-tsc-ctxt-sw-stress-test.c b/tools/testing/selftests/prctl/disable-tsc-ctxt-sw-stress-test.c
index 62a93cc61b7c..6d1a5ee8eb28 100644
--- a/tools/testing/selftests/prctl/disable-tsc-ctxt-sw-stress-test.c
+++ b/tools/testing/selftests/prctl/disable-tsc-ctxt-sw-stress-test.c
@@ -79,7 +79,7 @@ int main(void)
 {
 	int n_tasks = 100, i;
 
-	fprintf(stderr, "[No further output means we're allright]\n");
+	fprintf(stderr, "[No further output means we're all right]\n");
 
 	for (i=0; i<n_tasks; i++)
 		if (fork() == 0)
diff --git a/tools/testing/selftests/prctl/disable-tsc-on-off-stress-test.c b/tools/testing/selftests/prctl/disable-tsc-on-off-stress-test.c
index 79950f9a26fd..d39511eb9b01 100644
--- a/tools/testing/selftests/prctl/disable-tsc-on-off-stress-test.c
+++ b/tools/testing/selftests/prctl/disable-tsc-on-off-stress-test.c
@@ -83,7 +83,7 @@ int main(void)
 {
 	int n_tasks = 100, i;
 
-	fprintf(stderr, "[No further output means we're allright]\n");
+	fprintf(stderr, "[No further output means we're all right]\n");
 
 	for (i=0; i<n_tasks; i++)
 		if (fork() == 0)
-- 
cgit 


From 4ebe33398c40c1118b4d8546978036c0e0032d1b Mon Sep 17 00:00:00 2001
From: Guillaume Tucker <guillaume.tucker@collabora.com>
Date: Fri, 3 Feb 2023 16:26:03 +0100
Subject: selftests: find echo binary to use -ne options

Find the actual echo binary using $(which echo) and use it for
formatted output with -ne.  On some systems, the default echo command
doesn't handle the -e option and the output looks like this (arm64
build):

-ne Emit Tests for alsa

-ne Emit Tests for amd-pstate

-ne Emit Tests for arm64

This is for example the case with the KernelCI Docker images
e.g. kernelci/gcc-10:x86-kselftest-kernelci.  With the actual echo
binary (e.g. in /bin/echo), the output is formatted as expected (x86
build this time):

Emit Tests for alsa
Emit Tests for amd-pstate
Skipping non-existent dir: arm64

Only the install target is using "echo -ne" so keep the $ECHO variable
local to it.

Reported-by: "kernelci.org bot" <bot@kernelci.org>
Fixes: 3297a4df805d ("kselftests: Enable the echo command to print newlines in Makefile")
Signed-off-by: Guillaume Tucker <guillaume.tucker@collabora.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 41b649452560..9619d0f3b2ff 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -234,10 +234,11 @@ ifdef INSTALL_PATH
 	@# While building kselftest-list.text skip also non-existent TARGET dirs:
 	@# they could be the result of a build failure and should NOT be
 	@# included in the generated runlist.
+	ECHO=`which echo`; \
 	for TARGET in $(TARGETS); do \
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
-		[ ! -d $(INSTALL_PATH)/$$TARGET ] && echo "Skipping non-existent dir: $$TARGET" && continue; \
-		echo -ne "Emit Tests for $$TARGET\n"; \
+		[ ! -d $(INSTALL_PATH)/$$TARGET ] && $$ECHO "Skipping non-existent dir: $$TARGET" && continue; \
+		$$ECHO -ne "Emit Tests for $$TARGET\n"; \
 		$(MAKE) -s --no-print-directory OUTPUT=$$BUILD_TARGET COLLECTION=$$TARGET \
 			-C $$TARGET emit_tests >> $(TEST_LIST); \
 	done;
-- 
cgit 


From 787fccb321dd6c1c464bb11d952074edbde5de36 Mon Sep 17 00:00:00 2001
From: Tzung-Bi Shih <tzungbi@kernel.org>
Date: Fri, 3 Feb 2023 18:14:30 +0800
Subject: selftests: tpm2: remove redundant ord()

When testing with FLAG_DEBUG enabled client, it emits the following
error messages:

File "/root/tpm2/tpm2.py", line 347, in hex_dump
    d = [format(ord(x), '02x') for x in d]
File "/root/tpm2/tpm2.py", line 347, in <listcomp>
    d = [format(ord(x), '02x') for x in d]
TypeError: ord() expected string of length 1, but int found

The input of hex_dump() should be packed binary data.  Remove the
ord().

Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/tpm2/tpm2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/tpm2/tpm2.py b/tools/testing/selftests/tpm2/tpm2.py
index c7363c6764fc..bba8cb54548e 100644
--- a/tools/testing/selftests/tpm2/tpm2.py
+++ b/tools/testing/selftests/tpm2/tpm2.py
@@ -344,7 +344,7 @@ def get_algorithm(name):
 
 
 def hex_dump(d):
-    d = [format(ord(x), '02x') for x in d]
+    d = [format(x, '02x') for x in d]
     d = [d[i: i + 16] for i in range(0, len(d), 16)]
     d = [' '.join(x) for x in d]
     d = os.linesep.join(d)
-- 
cgit 


From 1e6b485c922fbedf41d5a9f4e6449c5aeb923a32 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Sun, 22 Jan 2023 08:32:50 +0900
Subject: selftests/ftrace: Fix bash specific "==" operator

Since commit a1d6cd88c897 ("selftests/ftrace: event_triggers: wait
longer for test_event_enable") introduced bash specific "=="
comparation operator, that test will fail when we run it on a
posix-shell. `checkbashisms` warned it as below.

possible bashism in ftrace/func_event_triggers.tc line 45 (should be 'b = a'):
        if [ "$e" == $val ]; then

This replaces it with "=".

Fixes: a1d6cd88c897 ("selftests/ftrace: event_triggers: wait longer for test_event_enable")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc
index 3eea2abf68f9..2ad7d4b501cc 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc
@@ -42,7 +42,7 @@ test_event_enabled() {
 
     while [ $check_times -ne 0 ]; do
 	e=`cat $EVENT_ENABLE`
-	if [ "$e" == $val ]; then
+	if [ "$e" = $val ]; then
 	    return 0
 	fi
 	sleep $SLEEP_TIME
-- 
cgit 


From 9e34fad00fc889abbb99d751a4c22cf2bded10df Mon Sep 17 00:00:00 2001
From: Guillaume Tucker <guillaume.tucker@collabora.com>
Date: Thu, 9 Feb 2023 09:55:36 +0100
Subject: selftests: use printf instead of echo -ne

Rather than trying to guess which implementation of "echo" to run with
support for "-ne" options, use "printf" instead of "echo -ne".  It
handles escape characters as a standard feature and it is widespread
among modern shells.

Reported-by: "kernelci.org bot" <bot@kernelci.org>
Suggested-by: David Laight <David.Laight@ACULAB.COM>
Fixes: 3297a4df805d ("kselftests: Enable the echo command to print newlines in Makefile")
Fixes: 79c16b1120fe ("selftests: find echo binary to use -ne options")
Signed-off-by: Guillaume Tucker <guillaume.tucker@collabora.com>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/Makefile | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 9619d0f3b2ff..06578963f4f1 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -234,11 +234,10 @@ ifdef INSTALL_PATH
 	@# While building kselftest-list.text skip also non-existent TARGET dirs:
 	@# they could be the result of a build failure and should NOT be
 	@# included in the generated runlist.
-	ECHO=`which echo`; \
 	for TARGET in $(TARGETS); do \
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
-		[ ! -d $(INSTALL_PATH)/$$TARGET ] && $$ECHO "Skipping non-existent dir: $$TARGET" && continue; \
-		$$ECHO -ne "Emit Tests for $$TARGET\n"; \
+		[ ! -d $(INSTALL_PATH)/$$TARGET ] && printf "Skipping non-existent dir: $$TARGET\n" && continue; \
+		printf "Emit Tests for $$TARGET\n"; \
 		$(MAKE) -s --no-print-directory OUTPUT=$$BUILD_TARGET COLLECTION=$$TARGET \
 			-C $$TARGET emit_tests >> $(TEST_LIST); \
 	done;
-- 
cgit 


From 6e81461b06b6a4a24bf97fca37d9b89f72ce8126 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Fri, 10 Feb 2023 17:20:57 -0700
Subject: selftests/ptp: Remove clean target from Makefile

Fix the following build warn removing unnecessary clean target
from the Makefile. lib.mk handles clean.

Makefile:10: warning: overriding recipe for target clean
../lib.mk:124: warning: ignoring old recipe for target clean

In addition, fix to use TEST_GEN_PROGS for generated test executables
and TES_PROGS for the shell script. Ger rid of all target as lib.mk
handles it.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ptp/Makefile | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ptp/Makefile b/tools/testing/selftests/ptp/Makefile
index eeab44cc6863..8f57f88ecadd 100644
--- a/tools/testing/selftests/ptp/Makefile
+++ b/tools/testing/selftests/ptp/Makefile
@@ -1,10 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS += $(KHDR_INCLUDES)
-TEST_PROGS := testptp
+TEST_GEN_PROGS := testptp
 LDLIBS += -lrt
-all: $(TEST_PROGS)
+TEST_PROGS = phc.sh
 
 include ../lib.mk
-
-clean:
-	rm -fr $(TEST_PROGS)
-- 
cgit 


From 0b0757244754ea1d0721195c824770f5576e119e Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Thu, 9 Feb 2023 00:12:11 +0100
Subject: selftests/bpf: Fix out-of-srctree build

Building BPF selftests out of srctree fails with:

  make: *** No rule to make target '/linux-build//ima_setup.sh', needed by 'ima_setup.sh'.  Stop.

The culprit is the rule that defines convenient shorthands like
"make test_progs", which builds $(OUTPUT)/test_progs. These shorthands
make sense only for binaries that are built though; scripts that live
in the source tree do not end up in $(OUTPUT).

Therefore drop $(TEST_PROGS) and $(TEST_PROGS_EXTENDED) from the rule.

The issue exists for a while, but it became a problem only after commit
d68ae4982cb7 ("selftests/bpf: Install all required files to run selftests"),
which added dependencies on these scripts.

Fixes: 03dcb78460c2 ("selftests/bpf: Add simple per-test targets to Makefile")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230208231211.283606-1-iii@linux.ibm.com
---
 tools/testing/selftests/bpf/Makefile | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index c4b5c44cdee2..f7771592a920 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -152,8 +152,6 @@ endif
 # NOTE: Semicolon at the end is critical to override lib.mk's default static
 # rule for binaries.
 $(notdir $(TEST_GEN_PROGS)						\
-	 $(TEST_PROGS)							\
-	 $(TEST_PROGS_EXTENDED)						\
 	 $(TEST_GEN_PROGS_EXTENDED)					\
 	 $(TEST_CUSTOM_PROGS)): %: $(OUTPUT)/% ;
 
-- 
cgit 


From 6a3cd3318ff65622415e34e8ee39d76331e7c869 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sun, 12 Feb 2023 01:27:07 -0800
Subject: bpf: Migrate release_on_unlock logic to non-owning ref semantics

This patch introduces non-owning reference semantics to the verifier,
specifically linked_list API kfunc handling. release_on_unlock logic for
refs is refactored - with small functional changes - to implement these
semantics, and bpf_list_push_{front,back} are migrated to use them.

When a list node is pushed to a list, the program still has a pointer to
the node:

  n = bpf_obj_new(typeof(*n));

  bpf_spin_lock(&l);
  bpf_list_push_back(&l, n);
  /* n still points to the just-added node */
  bpf_spin_unlock(&l);

What the verifier considers n to be after the push, and thus what can be
done with n, are changed by this patch.

Common properties both before/after this patch:
  * After push, n is only a valid reference to the node until end of
    critical section
  * After push, n cannot be pushed to any list
  * After push, the program can read the node's fields using n

Before:
  * After push, n retains the ref_obj_id which it received on
    bpf_obj_new, but the associated bpf_reference_state's
    release_on_unlock field is set to true
    * release_on_unlock field and associated logic is used to implement
      "n is only a valid ref until end of critical section"
  * After push, n cannot be written to, the node must be removed from
    the list before writing to its fields
  * After push, n is marked PTR_UNTRUSTED

After:
  * After push, n's ref is released and ref_obj_id set to 0. NON_OWN_REF
    type flag is added to reg's type, indicating that it's a non-owning
    reference.
    * NON_OWN_REF flag and logic is used to implement "n is only a
      valid ref until end of critical section"
  * n can be written to (except for special fields e.g. bpf_list_node,
    timer, ...)

Summary of specific implementation changes to achieve the above:

  * release_on_unlock field, ref_set_release_on_unlock helper, and logic
    to "release on unlock" based on that field are removed

  * The anonymous active_lock struct used by bpf_verifier_state is
    pulled out into a named struct bpf_active_lock.

  * NON_OWN_REF type flag is introduced along with verifier logic
    changes to handle non-owning refs

  * Helpers are added to use NON_OWN_REF flag to implement non-owning
    ref semantics as described above
    * invalidate_non_owning_refs - helper to clobber all non-owning refs
      matching a particular bpf_active_lock identity. Replaces
      release_on_unlock logic in process_spin_lock.
    * ref_set_non_owning - set NON_OWN_REF type flag after doing some
      sanity checking
    * ref_convert_owning_non_owning - convert owning reference w/
      specified ref_obj_id to non-owning references. Set NON_OWN_REF
      flag for each reg with that ref_obj_id and 0-out its ref_obj_id

  * Update linked_list selftests to account for minor semantic
    differences introduced by this patch
    * Writes to a release_on_unlock node ref are not allowed, while
      writes to non-owning reference pointees are. As a result the
      linked_list "write after push" failure tests are no longer scenarios
      that should fail.
    * The test##missing_lock##op and test##incorrect_lock##op
      macro-generated failure tests need to have a valid node argument in
      order to have the same error output as before. Otherwise
      verification will fail early and the expected error output won't be seen.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230212092715.1422619-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |   6 +
 include/linux/bpf_verifier.h                       |  38 +++--
 kernel/bpf/verifier.c                              | 168 +++++++++++++++------
 .../testing/selftests/bpf/prog_tests/linked_list.c |   2 -
 tools/testing/selftests/bpf/progs/linked_list.c    |   2 +-
 .../testing/selftests/bpf/progs/linked_list_fail.c | 100 +++++++-----
 6 files changed, 206 insertions(+), 110 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4385418118f6..8b5d0b4c4ada 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -181,6 +181,7 @@ enum btf_field_type {
 	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
 	BPF_LIST_HEAD  = (1 << 4),
 	BPF_LIST_NODE  = (1 << 5),
+	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD,
 };
 
 struct btf_field_kptr {
@@ -576,6 +577,11 @@ enum bpf_type_flag {
 	/* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */
 	MEM_RCU			= BIT(13 + BPF_BASE_TYPE_BITS),
 
+	/* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
+	 * Currently only valid for linked-list and rbtree nodes.
+	 */
+	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index aa83de1fe755..cf1bb1cf4a7b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -43,6 +43,22 @@ enum bpf_reg_liveness {
 	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
 };
 
+/* For every reg representing a map value or allocated object pointer,
+ * we consider the tuple of (ptr, id) for them to be unique in verifier
+ * context and conside them to not alias each other for the purposes of
+ * tracking lock state.
+ */
+struct bpf_active_lock {
+	/* This can either be reg->map_ptr or reg->btf. If ptr is NULL,
+	 * there's no active lock held, and other fields have no
+	 * meaning. If non-NULL, it indicates that a lock is held and
+	 * id member has the reg->id of the register which can be >= 0.
+	 */
+	void *ptr;
+	/* This will be reg->id */
+	u32 id;
+};
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
@@ -226,11 +242,6 @@ struct bpf_reference_state {
 	 * exiting a callback function.
 	 */
 	int callback_ref;
-	/* Mark the reference state to release the registers sharing the same id
-	 * on bpf_spin_unlock (for nodes that we will lose ownership to but are
-	 * safe to access inside the critical section).
-	 */
-	bool release_on_unlock;
 };
 
 /* state of the program:
@@ -331,21 +342,8 @@ struct bpf_verifier_state {
 	u32 branches;
 	u32 insn_idx;
 	u32 curframe;
-	/* For every reg representing a map value or allocated object pointer,
-	 * we consider the tuple of (ptr, id) for them to be unique in verifier
-	 * context and conside them to not alias each other for the purposes of
-	 * tracking lock state.
-	 */
-	struct {
-		/* This can either be reg->map_ptr or reg->btf. If ptr is NULL,
-		 * there's no active lock held, and other fields have no
-		 * meaning. If non-NULL, it indicates that a lock is held and
-		 * id member has the reg->id of the register which can be >= 0.
-		 */
-		void *ptr;
-		/* This will be reg->id */
-		u32 id;
-	} active_lock;
+
+	struct bpf_active_lock active_lock;
 	bool speculative;
 	bool active_rcu_lock;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 388245e8826e..f176bc15c879 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -190,6 +190,9 @@ struct bpf_verifier_stack_elem {
 
 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
 static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
+static int ref_set_non_owning(struct bpf_verifier_env *env,
+			      struct bpf_reg_state *reg);
 
 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
 {
@@ -457,6 +460,11 @@ static bool type_is_ptr_alloc_obj(u32 type)
 	return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
 }
 
+static bool type_is_non_owning_ref(u32 type)
+{
+	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
+}
+
 static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
 {
 	struct btf_record *rec = NULL;
@@ -1073,6 +1081,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 				verbose_a("id=%d", reg->id);
 			if (reg->ref_obj_id)
 				verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+			if (type_is_non_owning_ref(reg->type))
+				verbose_a("%s", "non_own_ref");
 			if (t != SCALAR_VALUE)
 				verbose_a("off=%d", reg->off);
 			if (type_is_pkt_pointer(t))
@@ -5052,7 +5062,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 			return -EACCES;
 		}
 
-		if (type_is_alloc(reg->type) && !reg->ref_obj_id) {
+		if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
+		    !reg->ref_obj_id) {
 			verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
 			return -EFAULT;
 		}
@@ -6042,9 +6053,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 			cur->active_lock.ptr = btf;
 		cur->active_lock.id = reg->id;
 	} else {
-		struct bpf_func_state *fstate = cur_func(env);
 		void *ptr;
-		int i;
 
 		if (map)
 			ptr = map;
@@ -6060,25 +6069,11 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
 			verbose(env, "bpf_spin_unlock of different lock\n");
 			return -EINVAL;
 		}
-		cur->active_lock.ptr = NULL;
-		cur->active_lock.id = 0;
 
-		for (i = fstate->acquired_refs - 1; i >= 0; i--) {
-			int err;
+		invalidate_non_owning_refs(env);
 
-			/* Complain on error because this reference state cannot
-			 * be freed before this point, as bpf_spin_lock critical
-			 * section does not allow functions that release the
-			 * allocated object immediately.
-			 */
-			if (!fstate->refs[i].release_on_unlock)
-				continue;
-			err = release_reference(env, fstate->refs[i].id);
-			if (err) {
-				verbose(env, "failed to release release_on_unlock reference");
-				return err;
-			}
-		}
+		cur->active_lock.ptr = NULL;
+		cur->active_lock.id = 0;
 	}
 	return 0;
 }
@@ -6546,6 +6541,23 @@ found:
 	return 0;
 }
 
+static struct btf_field *
+reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
+{
+	struct btf_field *field;
+	struct btf_record *rec;
+
+	rec = reg_btf_record(reg);
+	if (!rec)
+		return NULL;
+
+	field = btf_record_find(rec, off, fields);
+	if (!field)
+		return NULL;
+
+	return field;
+}
+
 int check_func_arg_reg_off(struct bpf_verifier_env *env,
 			   const struct bpf_reg_state *reg, int regno,
 			   enum bpf_arg_type arg_type)
@@ -6567,6 +6579,18 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 		 */
 		if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
 			return 0;
+
+		if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) {
+			if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT))
+				return __check_ptr_off_reg(env, reg, regno, true);
+
+			verbose(env, "R%d must have zero offset when passed to release func\n",
+				regno);
+			verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
+				kernel_type_name(reg->btf, reg->btf_id), reg->off);
+			return -EINVAL;
+		}
+
 		/* Doing check_ptr_off_reg check for the offset will catch this
 		 * because fixed_off_ok is false, but checking here allows us
 		 * to give the user a better error message.
@@ -6601,6 +6625,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	case PTR_TO_BTF_ID | PTR_TRUSTED:
 	case PTR_TO_BTF_ID | MEM_RCU:
 	case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
+	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
 		/* When referenced PTR_TO_BTF_ID is passed to release function,
 		 * its fixed offset must be 0. In the other cases, fixed offset
 		 * can be non-zero. This was already checked above. So pass
@@ -7363,6 +7388,17 @@ static int release_reference(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
+{
+	struct bpf_func_state *unused;
+	struct bpf_reg_state *reg;
+
+	bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+		if (type_is_non_owning_ref(reg->type))
+			__mark_reg_unknown(env, reg);
+	}));
+}
+
 static void clear_caller_saved_regs(struct bpf_verifier_env *env,
 				    struct bpf_reg_state *regs)
 {
@@ -8915,38 +8951,54 @@ static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id)
+static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
-	struct bpf_func_state *state = cur_func(env);
+	struct bpf_verifier_state *state = env->cur_state;
+
+	if (!state->active_lock.ptr) {
+		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
+		return -EFAULT;
+	}
+
+	if (type_flag(reg->type) & NON_OWN_REF) {
+		verbose(env, "verifier internal error: NON_OWN_REF already set\n");
+		return -EFAULT;
+	}
+
+	reg->type |= NON_OWN_REF;
+	return 0;
+}
+
+static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+	struct bpf_func_state *state, *unused;
 	struct bpf_reg_state *reg;
 	int i;
 
-	/* bpf_spin_lock only allows calling list_push and list_pop, no BPF
-	 * subprogs, no global functions. This means that the references would
-	 * not be released inside the critical section but they may be added to
-	 * the reference state, and the acquired_refs are never copied out for a
-	 * different frame as BPF to BPF calls don't work in bpf_spin_lock
-	 * critical sections.
-	 */
+	state = cur_func(env);
+
 	if (!ref_obj_id) {
-		verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n");
+		verbose(env, "verifier internal error: ref_obj_id is zero for "
+			     "owning -> non-owning conversion\n");
 		return -EFAULT;
 	}
+
 	for (i = 0; i < state->acquired_refs; i++) {
-		if (state->refs[i].id == ref_obj_id) {
-			if (state->refs[i].release_on_unlock) {
-				verbose(env, "verifier internal error: expected false release_on_unlock");
-				return -EFAULT;
+		if (state->refs[i].id != ref_obj_id)
+			continue;
+
+		/* Clear ref_obj_id here so release_reference doesn't clobber
+		 * the whole reg
+		 */
+		bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+			if (reg->ref_obj_id == ref_obj_id) {
+				reg->ref_obj_id = 0;
+				ref_set_non_owning(env, reg);
 			}
-			state->refs[i].release_on_unlock = true;
-			/* Now mark everyone sharing same ref_obj_id as untrusted */
-			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
-				if (reg->ref_obj_id == ref_obj_id)
-					reg->type |= PTR_UNTRUSTED;
-			}));
-			return 0;
-		}
+		}));
+		return 0;
 	}
+
 	verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
 	return -EFAULT;
 }
@@ -9081,7 +9133,6 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
 {
 	const struct btf_type *et, *t;
 	struct btf_field *field;
-	struct btf_record *rec;
 	u32 list_node_off;
 
 	if (meta->btf != btf_vmlinux ||
@@ -9098,9 +9149,8 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	rec = reg_btf_record(reg);
 	list_node_off = reg->off + reg->var_off.value;
-	field = btf_record_find(rec, list_node_off, BPF_LIST_NODE);
+	field = reg_find_field_offset(reg, list_node_off, BPF_LIST_NODE);
 	if (!field || field->offset != list_node_off) {
 		verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off);
 		return -EINVAL;
@@ -9126,8 +9176,8 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
 			btf_name_by_offset(field->graph_root.btf, et->name_off));
 		return -EINVAL;
 	}
-	/* Set arg#1 for expiration after unlock */
-	return ref_set_release_on_unlock(env, reg->ref_obj_id);
+
+	return 0;
 }
 
 static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta)
@@ -9406,11 +9456,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    int *insn_idx_p)
 {
 	const struct btf_type *t, *func, *func_proto, *ptr_type;
+	u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id;
 	struct bpf_reg_state *regs = cur_regs(env);
 	const char *func_name, *ptr_type_name;
 	bool sleepable, rcu_lock, rcu_unlock;
 	struct bpf_kfunc_call_arg_meta meta;
-	u32 i, nargs, func_id, ptr_type_id;
 	int err, insn_idx = *insn_idx_p;
 	const struct btf_param *args;
 	const struct btf_type *ret_t;
@@ -9505,6 +9555,24 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
+	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] ||
+	    meta.func_id == special_kfunc_list[KF_bpf_list_push_back]) {
+		release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+		err = ref_convert_owning_non_owning(env, release_ref_obj_id);
+		if (err) {
+			verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
+				func_name, func_id);
+			return err;
+		}
+
+		err = release_reference(env, release_ref_obj_id);
+		if (err) {
+			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+				func_name, func_id);
+			return err;
+		}
+	}
+
 	for (i = 0; i < CALLER_SAVED_REGS; i++)
 		mark_reg_not_init(env, regs, caller_saved[i]);
 
@@ -11825,8 +11893,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 		 */
 		if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
 			return;
-		if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off))
+		if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
+		    WARN_ON_ONCE(reg->off))
 			return;
+
 		if (is_null) {
 			reg->type = SCALAR_VALUE;
 			/* We don't need id and ref_obj_id from this point
diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c
index 9a7d4c47af63..2592b8aa5e41 100644
--- a/tools/testing/selftests/bpf/prog_tests/linked_list.c
+++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c
@@ -78,8 +78,6 @@ static struct {
 	{ "direct_write_head", "direct access to bpf_list_head is disallowed" },
 	{ "direct_read_node", "direct access to bpf_list_node is disallowed" },
 	{ "direct_write_node", "direct access to bpf_list_node is disallowed" },
-	{ "write_after_push_front", "only read is supported" },
-	{ "write_after_push_back", "only read is supported" },
 	{ "use_after_unlock_push_front", "invalid mem access 'scalar'" },
 	{ "use_after_unlock_push_back", "invalid mem access 'scalar'" },
 	{ "double_push_front", "arg#1 expected pointer to allocated object" },
diff --git a/tools/testing/selftests/bpf/progs/linked_list.c b/tools/testing/selftests/bpf/progs/linked_list.c
index 4ad88da5cda2..4fa4a9b01bde 100644
--- a/tools/testing/selftests/bpf/progs/linked_list.c
+++ b/tools/testing/selftests/bpf/progs/linked_list.c
@@ -260,7 +260,7 @@ int test_list_push_pop_multiple(struct bpf_spin_lock *lock, struct bpf_list_head
 {
 	int ret;
 
-	ret = list_push_pop_multiple(lock ,head, false);
+	ret = list_push_pop_multiple(lock, head, false);
 	if (ret)
 		return ret;
 	return list_push_pop_multiple(lock, head, true);
diff --git a/tools/testing/selftests/bpf/progs/linked_list_fail.c b/tools/testing/selftests/bpf/progs/linked_list_fail.c
index 1d9017240e19..69cdc07cba13 100644
--- a/tools/testing/selftests/bpf/progs/linked_list_fail.c
+++ b/tools/testing/selftests/bpf/progs/linked_list_fail.c
@@ -54,28 +54,44 @@
 		return 0;                                   \
 	}
 
-CHECK(kptr, push_front, &f->head);
-CHECK(kptr, push_back, &f->head);
 CHECK(kptr, pop_front, &f->head);
 CHECK(kptr, pop_back, &f->head);
 
-CHECK(global, push_front, &ghead);
-CHECK(global, push_back, &ghead);
 CHECK(global, pop_front, &ghead);
 CHECK(global, pop_back, &ghead);
 
-CHECK(map, push_front, &v->head);
-CHECK(map, push_back, &v->head);
 CHECK(map, pop_front, &v->head);
 CHECK(map, pop_back, &v->head);
 
-CHECK(inner_map, push_front, &iv->head);
-CHECK(inner_map, push_back, &iv->head);
 CHECK(inner_map, pop_front, &iv->head);
 CHECK(inner_map, pop_back, &iv->head);
 
 #undef CHECK
 
+#define CHECK(test, op, hexpr, nexpr)					\
+	SEC("?tc")							\
+	int test##_missing_lock_##op(void *ctx)				\
+	{								\
+		INIT;							\
+		void (*p)(void *, void *) = (void *)&bpf_list_##op;	\
+		p(hexpr, nexpr);					\
+		return 0;						\
+	}
+
+CHECK(kptr, push_front, &f->head, b);
+CHECK(kptr, push_back, &f->head, b);
+
+CHECK(global, push_front, &ghead, f);
+CHECK(global, push_back, &ghead, f);
+
+CHECK(map, push_front, &v->head, f);
+CHECK(map, push_back, &v->head, f);
+
+CHECK(inner_map, push_front, &iv->head, f);
+CHECK(inner_map, push_back, &iv->head, f);
+
+#undef CHECK
+
 #define CHECK(test, op, lexpr, hexpr)                       \
 	SEC("?tc")                                          \
 	int test##_incorrect_lock_##op(void *ctx)           \
@@ -108,11 +124,47 @@ CHECK(inner_map, pop_back, &iv->head);
 	CHECK(inner_map_global, op, &iv->lock, &ghead);        \
 	CHECK(inner_map_map, op, &iv->lock, &v->head);
 
-CHECK_OP(push_front);
-CHECK_OP(push_back);
 CHECK_OP(pop_front);
 CHECK_OP(pop_back);
 
+#undef CHECK
+#undef CHECK_OP
+
+#define CHECK(test, op, lexpr, hexpr, nexpr)				\
+	SEC("?tc")							\
+	int test##_incorrect_lock_##op(void *ctx)			\
+	{								\
+		INIT;							\
+		void (*p)(void *, void*) = (void *)&bpf_list_##op;	\
+		bpf_spin_lock(lexpr);					\
+		p(hexpr, nexpr);					\
+		return 0;						\
+	}
+
+#define CHECK_OP(op)							\
+	CHECK(kptr_kptr, op, &f1->lock, &f2->head, b);			\
+	CHECK(kptr_global, op, &f1->lock, &ghead, f);			\
+	CHECK(kptr_map, op, &f1->lock, &v->head, f);			\
+	CHECK(kptr_inner_map, op, &f1->lock, &iv->head, f);		\
+									\
+	CHECK(global_global, op, &glock2, &ghead, f);			\
+	CHECK(global_kptr, op, &glock, &f1->head, b);			\
+	CHECK(global_map, op, &glock, &v->head, f);			\
+	CHECK(global_inner_map, op, &glock, &iv->head, f);		\
+									\
+	CHECK(map_map, op, &v->lock, &v2->head, f);			\
+	CHECK(map_kptr, op, &v->lock, &f2->head, b);			\
+	CHECK(map_global, op, &v->lock, &ghead, f);			\
+	CHECK(map_inner_map, op, &v->lock, &iv->head, f);		\
+									\
+	CHECK(inner_map_inner_map, op, &iv->lock, &iv2->head, f);	\
+	CHECK(inner_map_kptr, op, &iv->lock, &f2->head, b);		\
+	CHECK(inner_map_global, op, &iv->lock, &ghead, f);		\
+	CHECK(inner_map_map, op, &iv->lock, &v->head, f);
+
+CHECK_OP(push_front);
+CHECK_OP(push_back);
+
 #undef CHECK
 #undef CHECK_OP
 #undef INIT
@@ -303,34 +355,6 @@ int direct_write_node(void *ctx)
 	return 0;
 }
 
-static __always_inline
-int write_after_op(void (*push_op)(void *head, void *node))
-{
-	struct foo *f;
-
-	f = bpf_obj_new(typeof(*f));
-	if (!f)
-		return 0;
-	bpf_spin_lock(&glock);
-	push_op(&ghead, &f->node);
-	f->data = 42;
-	bpf_spin_unlock(&glock);
-
-	return 0;
-}
-
-SEC("?tc")
-int write_after_push_front(void *ctx)
-{
-	return write_after_op((void *)bpf_list_push_front);
-}
-
-SEC("?tc")
-int write_after_push_back(void *ctx)
-{
-	return write_after_op((void *)bpf_list_push_back);
-}
-
 static __always_inline
 int use_after_unlock(void (*op)(void *head, void *node))
 {
-- 
cgit 


From a7151a8eaa292933cc934deb1dda0f5f51114c82 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Mon, 13 Feb 2023 10:21:43 -0700
Subject: selftests/sched: fix warn_unused_result build warns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the following warns by adding return check and error handling.

gcc -O2 -Wall -g -I./ -isystem .../tools/testing/selftests/../../../usr/include -Wl,-rpath=./      cs_prctl_test.c -lpthread -o .../tools/testing/selftests/sched/cs_prctl_test
cs_prctl_test.c: In function ‘create_processes’:
cs_prctl_test.c:187:17: warning: ignoring return value of ‘read’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
  187 |                 read(proc[i].pfd[0], &proc[i].thr_tids, sizeof(int) * proc[i].num_threads);
      |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
cs_prctl_test.c: In function ‘child_func_process’:
cs_prctl_test.c:159:9: warning: ignoring return value of ‘write’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
  159 |         write(ca->pfd[1], &ca->thr_tids, sizeof(int) * ca->num_threads);
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/sched/cs_prctl_test.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c
index 8109b17dc764..25e0d95d3713 100644
--- a/tools/testing/selftests/sched/cs_prctl_test.c
+++ b/tools/testing/selftests/sched/cs_prctl_test.c
@@ -27,6 +27,7 @@
 #include <sys/prctl.h>
 #include <unistd.h>
 #include <time.h>
+#include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -151,12 +152,17 @@ static void create_threads(int num_threads, int thr_tids[])
 static int child_func_process(void *arg)
 {
 	struct child_args *ca = (struct child_args *)arg;
+	int ret;
 
 	close(ca->pfd[0]);
 
 	create_threads(ca->num_threads, ca->thr_tids);
 
-	write(ca->pfd[1], &ca->thr_tids, sizeof(int) * ca->num_threads);
+	ret = write(ca->pfd[1], &ca->thr_tids, sizeof(int) * ca->num_threads);
+	if (ret == -1)
+		printf("write failed on pfd[%d] - error (%s)\n",
+			ca->pfd[1], strerror(errno));
+
 	close(ca->pfd[1]);
 
 	while (1)
@@ -169,7 +175,7 @@ static unsigned char child_func_process_stack[STACK_SIZE];
 void create_processes(int num_processes, int num_threads, struct child_args proc[])
 {
 	pid_t cpid;
-	int i;
+	int i, ret;
 
 	for (i = 0; i < num_processes; ++i) {
 		proc[i].num_threads = num_threads;
@@ -184,7 +190,10 @@ void create_processes(int num_processes, int num_threads, struct child_args proc
 	}
 
 	for (i = 0; i < num_processes; ++i) {
-		read(proc[i].pfd[0], &proc[i].thr_tids, sizeof(int) * proc[i].num_threads);
+		ret = read(proc[i].pfd[0], &proc[i].thr_tids, sizeof(int) * proc[i].num_threads);
+		if (ret == -1)
+			printf("read failed on proc[%d].pfd[0] error (%s)\n",
+				i, strerror(errno));
 		close(proc[i].pfd[0]);
 	}
 }
-- 
cgit 


From d8e45bf1aed2e5fddd8985b5bb1aaf774a97aba8 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Mon, 13 Feb 2023 11:20:07 -0700
Subject: selftests/mount_setattr: fix redefine struct mount_attr build error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the following build error due to redefining struct mount_attr by
removing duplicate define from mount_setattr_test.c

gcc -g -isystem .../tools/testing/selftests/../../../usr/include -Wall -O2 -pthread     mount_setattr_test.c  -o .../tools/testing/selftests/mount_setattr/mount_setattr_test
mount_setattr_test.c:107:8: error: redefinition of ‘struct mount_attr’
  107 | struct mount_attr {
      |        ^~~~~~~~~~
In file included from /usr/include/x86_64-linux-gnu/sys/mount.h:32,
                 from mount_setattr_test.c:10:
.../usr/include/linux/mount.h:129:8: note: originally defined here
  129 | struct mount_attr {
      |        ^~~~~~~~~~
make: *** [../lib.mk:145: .../tools/testing/selftests/mount_setattr/mount_setattr_test] Error 1

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/mount_setattr/mount_setattr_test.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
index 8c5fea68ae67..582669ca38e9 100644
--- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -103,13 +103,6 @@
 	#else
 		#define __NR_mount_setattr 442
 	#endif
-
-struct mount_attr {
-	__u64 attr_set;
-	__u64 attr_clr;
-	__u64 propagation;
-	__u64 userns_fd;
-};
 #endif
 
 #ifndef __NR_open_tree
-- 
cgit 


From aca5a0944c309c56267b0c1d001aa999ddb2fb1e Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Mon, 13 Feb 2023 14:52:47 -0700
Subject: selftests/mount_setattr: fix to make run_tests failure

make run_tests doesn't run the test. Fix Makefile to set TEST_GEN_PROGS
instead of TEST_GEN_FILES to fix the problem.

run_tests runs TEST_GEN_PROGS, TEST_CUSTOM_PROGS, and TEST_PROGS.
TEST_GEN_FILES is for files generated by tests.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/mount_setattr/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mount_setattr/Makefile b/tools/testing/selftests/mount_setattr/Makefile
index fde72df01b11..0c0d7b1234c1 100644
--- a/tools/testing/selftests/mount_setattr/Makefile
+++ b/tools/testing/selftests/mount_setattr/Makefile
@@ -2,6 +2,6 @@
 # Makefile for mount selftests.
 CFLAGS = -g $(KHDR_INCLUDES) -Wall -O2 -pthread
 
-TEST_GEN_FILES += mount_setattr_test
+TEST_GEN_PROGS := mount_setattr_test
 
 include ../lib.mk
-- 
cgit 


From 9c395c1b99bd23f74bc628fa000480c49593d17f Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 13 Feb 2023 16:40:10 -0800
Subject: bpf: Add basic bpf_rb_{root,node} support

This patch adds special BPF_RB_{ROOT,NODE} btf_field_types similar to
BPF_LIST_{HEAD,NODE}, adds the necessary plumbing to detect the new
types, and adds bpf_rb_root_free function for freeing bpf_rb_root in
map_values.

structs bpf_rb_root and bpf_rb_node are opaque types meant to
obscure structs rb_root_cached rb_node, respectively.

btf_struct_access will prevent BPF programs from touching these special
fields automatically now that they're recognized.

btf_check_and_fixup_fields now groups list_head and rb_root together as
"graph root" fields and {list,rb}_node as "graph node", and does same
ownership cycle checking as before. Note that this function does _not_
prevent ownership type mixups (e.g. rb_root owning list_node) - that's
handled by btf_parse_graph_root.

After this patch, a bpf program can have a struct bpf_rb_root in a
map_value, but not add anything to nor do anything useful with it.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230214004017.2534011-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  20 ++-
 include/uapi/linux/bpf.h                           |  11 ++
 kernel/bpf/btf.c                                   | 162 ++++++++++++++-------
 kernel/bpf/helpers.c                               |  40 +++++
 kernel/bpf/syscall.c                               |  28 ++--
 kernel/bpf/verifier.c                              |   5 +-
 tools/include/uapi/linux/bpf.h                     |  11 ++
 .../testing/selftests/bpf/prog_tests/linked_list.c |  12 +-
 8 files changed, 216 insertions(+), 73 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8b5d0b4c4ada..be34f7deb6c3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -181,7 +181,10 @@ enum btf_field_type {
 	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
 	BPF_LIST_HEAD  = (1 << 4),
 	BPF_LIST_NODE  = (1 << 5),
-	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD,
+	BPF_RB_ROOT    = (1 << 6),
+	BPF_RB_NODE    = (1 << 7),
+	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
+				 BPF_RB_NODE | BPF_RB_ROOT,
 };
 
 struct btf_field_kptr {
@@ -285,6 +288,10 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "bpf_list_head";
 	case BPF_LIST_NODE:
 		return "bpf_list_node";
+	case BPF_RB_ROOT:
+		return "bpf_rb_root";
+	case BPF_RB_NODE:
+		return "bpf_rb_node";
 	default:
 		WARN_ON_ONCE(1);
 		return "unknown";
@@ -305,6 +312,10 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_list_head);
 	case BPF_LIST_NODE:
 		return sizeof(struct bpf_list_node);
+	case BPF_RB_ROOT:
+		return sizeof(struct bpf_rb_root);
+	case BPF_RB_NODE:
+		return sizeof(struct bpf_rb_node);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -325,6 +336,10 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_list_head);
 	case BPF_LIST_NODE:
 		return __alignof__(struct bpf_list_node);
+	case BPF_RB_ROOT:
+		return __alignof__(struct bpf_rb_root);
+	case BPF_RB_NODE:
+		return __alignof__(struct bpf_rb_node);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -435,6 +450,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
 void bpf_timer_cancel_and_free(void *timer);
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock);
+void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
+		      struct bpf_spin_lock *spin_lock);
+
 
 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 17afd2b35ee5..1503f61336b6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6917,6 +6917,17 @@ struct bpf_list_node {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_rb_root {
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
+struct bpf_rb_node {
+	__u64 :64;
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 740bdb045b14..b9d1f5c4e316 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3324,12 +3324,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf,
 	return NULL;
 }
 
-static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,
-			      const struct btf_type *t, int comp_idx,
-			      u32 off, int sz, struct btf_field_info *info)
+static int
+btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
+		    const struct btf_type *t, int comp_idx, u32 off,
+		    int sz, struct btf_field_info *info,
+		    enum btf_field_type head_type)
 {
+	const char *node_field_name;
 	const char *value_type;
-	const char *list_node;
 	s32 id;
 
 	if (!__btf_type_is_struct(t))
@@ -3339,26 +3341,32 @@ static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,
 	value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
 	if (!value_type)
 		return -EINVAL;
-	list_node = strstr(value_type, ":");
-	if (!list_node)
+	node_field_name = strstr(value_type, ":");
+	if (!node_field_name)
 		return -EINVAL;
-	value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN);
+	value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN);
 	if (!value_type)
 		return -ENOMEM;
 	id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
 	kfree(value_type);
 	if (id < 0)
 		return id;
-	list_node++;
-	if (str_is_empty(list_node))
+	node_field_name++;
+	if (str_is_empty(node_field_name))
 		return -EINVAL;
-	info->type = BPF_LIST_HEAD;
+	info->type = head_type;
 	info->off = off;
 	info->graph_root.value_btf_id = id;
-	info->graph_root.node_name = list_node;
+	info->graph_root.node_name = node_field_name;
 	return BTF_FIELD_FOUND;
 }
 
+#define field_mask_test_name(field_type, field_type_str) \
+	if (field_mask & field_type && !strcmp(name, field_type_str)) { \
+		type = field_type;					\
+		goto end;						\
+	}
+
 static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
 			      int *align, int *sz)
 {
@@ -3382,18 +3390,11 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
 			goto end;
 		}
 	}
-	if (field_mask & BPF_LIST_HEAD) {
-		if (!strcmp(name, "bpf_list_head")) {
-			type = BPF_LIST_HEAD;
-			goto end;
-		}
-	}
-	if (field_mask & BPF_LIST_NODE) {
-		if (!strcmp(name, "bpf_list_node")) {
-			type = BPF_LIST_NODE;
-			goto end;
-		}
-	}
+	field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
+	field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
+	field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root");
+	field_mask_test_name(BPF_RB_NODE,   "bpf_rb_node");
+
 	/* Only return BPF_KPTR when all other types with matchable names fail */
 	if (field_mask & BPF_KPTR) {
 		type = BPF_KPTR_REF;
@@ -3406,6 +3407,8 @@ end:
 	return type;
 }
 
+#undef field_mask_test_name
+
 static int btf_find_struct_field(const struct btf *btf,
 				 const struct btf_type *t, u32 field_mask,
 				 struct btf_field_info *info, int info_cnt)
@@ -3438,6 +3441,7 @@ static int btf_find_struct_field(const struct btf *btf,
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
 		case BPF_LIST_NODE:
+		case BPF_RB_NODE:
 			ret = btf_find_struct(btf, member_type, off, sz, field_type,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3451,8 +3455,11 @@ static int btf_find_struct_field(const struct btf *btf,
 				return ret;
 			break;
 		case BPF_LIST_HEAD:
-			ret = btf_find_list_head(btf, t, member_type, i, off, sz,
-						 idx < info_cnt ? &info[idx] : &tmp);
+		case BPF_RB_ROOT:
+			ret = btf_find_graph_root(btf, t, member_type,
+						  i, off, sz,
+						  idx < info_cnt ? &info[idx] : &tmp,
+						  field_type);
 			if (ret < 0)
 				return ret;
 			break;
@@ -3499,6 +3506,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
 		case BPF_LIST_NODE:
+		case BPF_RB_NODE:
 			ret = btf_find_struct(btf, var_type, off, sz, field_type,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3512,8 +3520,11 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 				return ret;
 			break;
 		case BPF_LIST_HEAD:
-			ret = btf_find_list_head(btf, var, var_type, -1, off, sz,
-						 idx < info_cnt ? &info[idx] : &tmp);
+		case BPF_RB_ROOT:
+			ret = btf_find_graph_root(btf, var, var_type,
+						  -1, off, sz,
+						  idx < info_cnt ? &info[idx] : &tmp,
+						  field_type);
 			if (ret < 0)
 				return ret;
 			break;
@@ -3615,8 +3626,11 @@ end_btf:
 	return ret;
 }
 
-static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
-			       struct btf_field_info *info)
+static int btf_parse_graph_root(const struct btf *btf,
+				struct btf_field *field,
+				struct btf_field_info *info,
+				const char *node_type_name,
+				size_t node_type_align)
 {
 	const struct btf_type *t, *n = NULL;
 	const struct btf_member *member;
@@ -3638,13 +3652,13 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
 		n = btf_type_by_id(btf, member->type);
 		if (!__btf_type_is_struct(n))
 			return -EINVAL;
-		if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off)))
+		if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off)))
 			return -EINVAL;
 		offset = __btf_member_bit_offset(n, member);
 		if (offset % 8)
 			return -EINVAL;
 		offset /= 8;
-		if (offset % __alignof__(struct bpf_list_node))
+		if (offset % node_type_align)
 			return -EINVAL;
 
 		field->graph_root.btf = (struct btf *)btf;
@@ -3656,6 +3670,20 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
 	return 0;
 }
 
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+			       struct btf_field_info *info)
+{
+	return btf_parse_graph_root(btf, field, info, "bpf_list_node",
+					    __alignof__(struct bpf_list_node));
+}
+
+static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
+			     struct btf_field_info *info)
+{
+	return btf_parse_graph_root(btf, field, info, "bpf_rb_node",
+					    __alignof__(struct bpf_rb_node));
+}
+
 struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
 				    u32 field_mask, u32 value_size)
 {
@@ -3718,7 +3746,13 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			if (ret < 0)
 				goto end;
 			break;
+		case BPF_RB_ROOT:
+			ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]);
+			if (ret < 0)
+				goto end;
+			break;
 		case BPF_LIST_NODE:
+		case BPF_RB_NODE:
 			break;
 		default:
 			ret = -EFAULT;
@@ -3727,8 +3761,9 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 		rec->cnt++;
 	}
 
-	/* bpf_list_head requires bpf_spin_lock */
-	if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) {
+	/* bpf_{list_head, rb_node} require bpf_spin_lock */
+	if ((btf_record_has_field(rec, BPF_LIST_HEAD) ||
+	     btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) {
 		ret = -EINVAL;
 		goto end;
 	}
@@ -3739,22 +3774,28 @@ end:
 	return ERR_PTR(ret);
 }
 
+#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT)
+#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE)
+
 int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 {
 	int i;
 
-	/* There are two owning types, kptr_ref and bpf_list_head. The former
-	 * only supports storing kernel types, which can never store references
-	 * to program allocated local types, atleast not yet. Hence we only need
-	 * to ensure that bpf_list_head ownership does not form cycles.
+	/* There are three types that signify ownership of some other type:
+	 *  kptr_ref, bpf_list_head, bpf_rb_root.
+	 * kptr_ref only supports storing kernel types, which can't store
+	 * references to program allocated local types.
+	 *
+	 * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
+	 * does not form cycles.
 	 */
-	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD))
+	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))
 		return 0;
 	for (i = 0; i < rec->cnt; i++) {
 		struct btf_struct_meta *meta;
 		u32 btf_id;
 
-		if (!(rec->fields[i].type & BPF_LIST_HEAD))
+		if (!(rec->fields[i].type & GRAPH_ROOT_MASK))
 			continue;
 		btf_id = rec->fields[i].graph_root.value_btf_id;
 		meta = btf_find_struct_meta(btf, btf_id);
@@ -3762,39 +3803,47 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 			return -EFAULT;
 		rec->fields[i].graph_root.value_rec = meta->record;
 
-		if (!(rec->field_mask & BPF_LIST_NODE))
+		/* We need to set value_rec for all root types, but no need
+		 * to check ownership cycle for a type unless it's also a
+		 * node type.
+		 */
+		if (!(rec->field_mask & GRAPH_NODE_MASK))
 			continue;
 
 		/* We need to ensure ownership acyclicity among all types. The
 		 * proper way to do it would be to topologically sort all BTF
 		 * IDs based on the ownership edges, since there can be multiple
-		 * bpf_list_head in a type. Instead, we use the following
-		 * reasoning:
+		 * bpf_{list_head,rb_node} in a type. Instead, we use the
+		 * following resaoning:
 		 *
 		 * - A type can only be owned by another type in user BTF if it
-		 *   has a bpf_list_node.
+		 *   has a bpf_{list,rb}_node. Let's call these node types.
 		 * - A type can only _own_ another type in user BTF if it has a
-		 *   bpf_list_head.
+		 *   bpf_{list_head,rb_root}. Let's call these root types.
 		 *
-		 * We ensure that if a type has both bpf_list_head and
-		 * bpf_list_node, its element types cannot be owning types.
+		 * We ensure that if a type is both a root and node, its
+		 * element types cannot be root types.
 		 *
 		 * To ensure acyclicity:
 		 *
-		 * When A only has bpf_list_head, ownership chain can be:
+		 * When A is an root type but not a node, its ownership
+		 * chain can be:
 		 *	A -> B -> C
 		 * Where:
-		 * - B has both bpf_list_head and bpf_list_node.
-		 * - C only has bpf_list_node.
+		 * - A is an root, e.g. has bpf_rb_root.
+		 * - B is both a root and node, e.g. has bpf_rb_node and
+		 *   bpf_list_head.
+		 * - C is only an root, e.g. has bpf_list_node
 		 *
-		 * When A has both bpf_list_head and bpf_list_node, some other
-		 * type already owns it in the BTF domain, hence it can not own
-		 * another owning type through any of the bpf_list_head edges.
+		 * When A is both a root and node, some other type already
+		 * owns it in the BTF domain, hence it can not own
+		 * another root type through any of the ownership edges.
 		 *	A -> B
 		 * Where:
-		 * - B only has bpf_list_node.
+		 * - A is both an root and node.
+		 * - B is only an node.
 		 */
-		if (meta->record->field_mask & BPF_LIST_HEAD)
+		if (meta->record->field_mask & GRAPH_ROOT_MASK)
 			return -ELOOP;
 	}
 	return 0;
@@ -5256,6 +5305,8 @@ static const char *alloc_obj_fields[] = {
 	"bpf_spin_lock",
 	"bpf_list_head",
 	"bpf_list_node",
+	"bpf_rb_root",
+	"bpf_rb_node",
 };
 
 static struct btf_struct_metas *
@@ -5329,7 +5380,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 
 		type = &tab->types[tab->cnt];
 		type->btf_id = i;
-		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size);
+		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
+						  BPF_RB_ROOT | BPF_RB_NODE, t->size);
 		/* The record cannot be unset, treat it as an error if so */
 		if (IS_ERR_OR_NULL(record)) {
 			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 2dae44581922..192184b5156e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1772,6 +1772,46 @@ unlock:
 	}
 }
 
+/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
+ * 'rb_node *', so field name of rb_node within containing struct is not
+ * needed.
+ *
+ * Since bpf_rb_tree's node type has a corresponding struct btf_field with
+ * graph_root.node_offset, it's not necessary to know field name
+ * or type of node struct
+ */
+#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
+	for (pos = rb_first_postorder(root); \
+	    pos && ({ n = rb_next_postorder(pos); 1; }); \
+	    pos = n)
+
+void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
+		      struct bpf_spin_lock *spin_lock)
+{
+	struct rb_root_cached orig_root, *root = rb_root;
+	struct rb_node *pos, *n;
+	void *obj;
+
+	BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
+	BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
+
+	__bpf_spin_lock_irqsave(spin_lock);
+	orig_root = *root;
+	*root = RB_ROOT_CACHED;
+	__bpf_spin_unlock_irqrestore(spin_lock);
+
+	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
+		obj = pos;
+		obj -= field->graph_root.node_offset;
+
+		bpf_obj_free_fields(field->graph_root.value_rec, obj);
+
+		migrate_disable();
+		bpf_mem_free(&bpf_global_ma, obj);
+		migrate_enable();
+	}
+}
+
 __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in vmlinux BTF");
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cda8d00f3762..e3fcdc9836a6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -537,9 +537,6 @@ void btf_record_free(struct btf_record *rec)
 		return;
 	for (i = 0; i < rec->cnt; i++) {
 		switch (rec->fields[i].type) {
-		case BPF_SPIN_LOCK:
-		case BPF_TIMER:
-			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
 			if (rec->fields[i].kptr.module)
@@ -548,7 +545,11 @@ void btf_record_free(struct btf_record *rec)
 			break;
 		case BPF_LIST_HEAD:
 		case BPF_LIST_NODE:
-			/* Nothing to release for bpf_list_head */
+		case BPF_RB_ROOT:
+		case BPF_RB_NODE:
+		case BPF_SPIN_LOCK:
+		case BPF_TIMER:
+			/* Nothing to release */
 			break;
 		default:
 			WARN_ON_ONCE(1);
@@ -581,9 +582,6 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 	new_rec->cnt = 0;
 	for (i = 0; i < rec->cnt; i++) {
 		switch (fields[i].type) {
-		case BPF_SPIN_LOCK:
-		case BPF_TIMER:
-			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
 			btf_get(fields[i].kptr.btf);
@@ -594,7 +592,11 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 			break;
 		case BPF_LIST_HEAD:
 		case BPF_LIST_NODE:
-			/* Nothing to acquire for bpf_list_head */
+		case BPF_RB_ROOT:
+		case BPF_RB_NODE:
+		case BPF_SPIN_LOCK:
+		case BPF_TIMER:
+			/* Nothing to acquire */
 			break;
 		default:
 			ret = -EFAULT;
@@ -674,7 +676,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 				continue;
 			bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
 			break;
+		case BPF_RB_ROOT:
+			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+				continue;
+			bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
+			break;
 		case BPF_LIST_NODE:
+		case BPF_RB_NODE:
 			break;
 		default:
 			WARN_ON_ONCE(1);
@@ -1010,7 +1018,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 		return -EINVAL;
 
 	map->record = btf_parse_fields(btf, value_type,
-				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD,
+				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
+				       BPF_RB_ROOT,
 				       map->value_size);
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
@@ -1058,6 +1067,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 				}
 				break;
 			case BPF_LIST_HEAD:
+			case BPF_RB_ROOT:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_ARRAY) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f176bc15c879..4fd098851f43 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14703,9 +14703,10 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(prog);
 
-	if (btf_record_has_field(map->record, BPF_LIST_HEAD)) {
+	if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
+	    btf_record_has_field(map->record, BPF_RB_ROOT)) {
 		if (is_tracing_prog_type(prog_type)) {
-			verbose(env, "tracing progs cannot use bpf_list_head yet\n");
+			verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");
 			return -EINVAL;
 		}
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 17afd2b35ee5..1503f61336b6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6917,6 +6917,17 @@ struct bpf_list_node {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_rb_root {
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
+struct bpf_rb_node {
+	__u64 :64;
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c
index 2592b8aa5e41..c456b34a823a 100644
--- a/tools/testing/selftests/bpf/prog_tests/linked_list.c
+++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c
@@ -58,12 +58,12 @@ static struct {
 	TEST(inner_map, pop_front)
 	TEST(inner_map, pop_back)
 #undef TEST
-	{ "map_compat_kprobe", "tracing progs cannot use bpf_list_head yet" },
-	{ "map_compat_kretprobe", "tracing progs cannot use bpf_list_head yet" },
-	{ "map_compat_tp", "tracing progs cannot use bpf_list_head yet" },
-	{ "map_compat_perf", "tracing progs cannot use bpf_list_head yet" },
-	{ "map_compat_raw_tp", "tracing progs cannot use bpf_list_head yet" },
-	{ "map_compat_raw_tp_w", "tracing progs cannot use bpf_list_head yet" },
+	{ "map_compat_kprobe", "tracing progs cannot use bpf_{list_head,rb_root} yet" },
+	{ "map_compat_kretprobe", "tracing progs cannot use bpf_{list_head,rb_root} yet" },
+	{ "map_compat_tp", "tracing progs cannot use bpf_{list_head,rb_root} yet" },
+	{ "map_compat_perf", "tracing progs cannot use bpf_{list_head,rb_root} yet" },
+	{ "map_compat_raw_tp", "tracing progs cannot use bpf_{list_head,rb_root} yet" },
+	{ "map_compat_raw_tp_w", "tracing progs cannot use bpf_{list_head,rb_root} yet" },
 	{ "obj_type_id_oor", "local type ID argument must be in range [0, U32_MAX]" },
 	{ "obj_new_no_composite", "bpf_obj_new type ID argument must be of a struct" },
 	{ "obj_new_no_struct", "bpf_obj_new type ID argument must be of a struct" },
-- 
cgit 


From a40d3632436b1677a94c16e77be8da798ee9e12b Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 13 Feb 2023 16:40:14 -0800
Subject: bpf: Special verifier handling for bpf_rbtree_{remove, first}

Newly-added bpf_rbtree_{remove,first} kfuncs have some special properties
that require handling in the verifier:

  * both bpf_rbtree_remove and bpf_rbtree_first return the type containing
    the bpf_rb_node field, with the offset set to that field's offset,
    instead of a struct bpf_rb_node *
    * mark_reg_graph_node helper added in previous patch generalizes
      this logic, use it

  * bpf_rbtree_remove's node input is a node that's been inserted
    in the tree - a non-owning reference.

  * bpf_rbtree_remove must invalidate non-owning references in order to
    avoid aliasing issue. Use previously-added
    invalidate_non_owning_refs helper to mark this function as a
    non-owning ref invalidation point.

  * Unlike other functions, which convert one of their input arg regs to
    non-owning reference, bpf_rbtree_first takes no arguments and just
    returns a non-owning reference (possibly null)
    * For now verifier logic for this is special-cased instead of
      adding new kfunc flag.

This patch, along with the previous one, complete special verifier
handling for all rbtree API functions added in this series.

With functional verifier handling of rbtree_remove, under current
non-owning reference scheme, a node type with both bpf_{list,rb}_node
fields could cause the verifier to accept programs which remove such
nodes from collections they haven't been added to.

In order to prevent this, this patch adds a check to btf_parse_fields
which rejects structs with both bpf_{list,rb}_node fields. This is a
temporary measure that can be removed after "collection identity"
followup. See comment added in btf_parse_fields. A linked_list BTF test
exercising the new check is added in this patch as well.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230214004017.2534011-6-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c                                   | 24 ++++++++++++
 kernel/bpf/verifier.c                              | 43 ++++++++++++++++------
 .../testing/selftests/bpf/prog_tests/linked_list.c | 37 +++++++++++++++++++
 3 files changed, 92 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b9d1f5c4e316..6582735ef1fc 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3768,6 +3768,30 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 		goto end;
 	}
 
+	/* need collection identity for non-owning refs before allowing this
+	 *
+	 * Consider a node type w/ both list and rb_node fields:
+	 *   struct node {
+	 *     struct bpf_list_node l;
+	 *     struct bpf_rb_node r;
+	 *   }
+	 *
+	 * Used like so:
+	 *   struct node *n = bpf_obj_new(....);
+	 *   bpf_list_push_front(&list_head, &n->l);
+	 *   bpf_rbtree_remove(&rb_root, &n->r);
+	 *
+	 * It should not be possible to rbtree_remove the node since it hasn't
+	 * been added to a tree. But push_front converts n to a non-owning
+	 * reference, and rbtree_remove accepts the non-owning reference to
+	 * a type w/ bpf_rb_node field.
+	 */
+	if (btf_record_has_field(rec, BPF_LIST_NODE) &&
+	    btf_record_has_field(rec, BPF_RB_NODE)) {
+		ret = -EINVAL;
+		goto end;
+	}
+
 	return rec;
 end:
 	btf_record_free(rec);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 88c8edf67007..21e08c111702 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9682,14 +9682,26 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_RB_NODE:
-			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
-				verbose(env, "arg#%d expected pointer to allocated object\n", i);
-				return -EINVAL;
-			}
-			if (!reg->ref_obj_id) {
-				verbose(env, "allocated object must be referenced\n");
-				return -EINVAL;
+			if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
+				if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
+					verbose(env, "rbtree_remove node input must be non-owning ref\n");
+					return -EINVAL;
+				}
+				if (in_rbtree_lock_required_cb(env)) {
+					verbose(env, "rbtree_remove not allowed in rbtree cb\n");
+					return -EINVAL;
+				}
+			} else {
+				if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+					verbose(env, "arg#%d expected pointer to allocated object\n", i);
+					return -EINVAL;
+				}
+				if (!reg->ref_obj_id) {
+					verbose(env, "allocated object must be referenced\n");
+					return -EINVAL;
+				}
 			}
+
 			ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
 			if (ret < 0)
 				return ret;
@@ -9940,11 +9952,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				   meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
 				struct btf_field *field = meta.arg_list_head.field;
 
-				mark_reg_known_zero(env, regs, BPF_REG_0);
-				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
-				regs[BPF_REG_0].btf = field->graph_root.btf;
-				regs[BPF_REG_0].btf_id = field->graph_root.value_btf_id;
-				regs[BPF_REG_0].off = field->graph_root.node_offset;
+				mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+			} else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+				   meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+				struct btf_field *field = meta.arg_rbtree_root.field;
+
+				mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
 				mark_reg_known_zero(env, regs, BPF_REG_0);
 				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
@@ -10010,7 +10023,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			if (is_kfunc_ret_null(&meta))
 				regs[BPF_REG_0].id = id;
 			regs[BPF_REG_0].ref_obj_id = id;
+		} else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+			ref_set_non_owning(env, &regs[BPF_REG_0]);
 		}
+
+		if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove])
+			invalidate_non_owning_refs(env);
+
 		if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
 			regs[BPF_REG_0].id = ++env->id_gen;
 	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c
index c456b34a823a..0ed8132ce1c3 100644
--- a/tools/testing/selftests/bpf/prog_tests/linked_list.c
+++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c
@@ -715,6 +715,43 @@ static void test_btf(void)
 		btf__free(btf);
 		break;
 	}
+
+	while (test__start_subtest("btf: list_node and rb_node in same struct")) {
+		btf = init_btf();
+		if (!ASSERT_OK_PTR(btf, "init_btf"))
+			break;
+
+		id = btf__add_struct(btf, "bpf_rb_node", 24);
+		if (!ASSERT_EQ(id, 5, "btf__add_struct bpf_rb_node"))
+			break;
+		id = btf__add_struct(btf, "bar", 40);
+		if (!ASSERT_EQ(id, 6, "btf__add_struct bar"))
+			break;
+		err = btf__add_field(btf, "a", LIST_NODE, 0, 0);
+		if (!ASSERT_OK(err, "btf__add_field bar::a"))
+			break;
+		err = btf__add_field(btf, "c", 5, 128, 0);
+		if (!ASSERT_OK(err, "btf__add_field bar::c"))
+			break;
+
+		id = btf__add_struct(btf, "foo", 20);
+		if (!ASSERT_EQ(id, 7, "btf__add_struct foo"))
+			break;
+		err = btf__add_field(btf, "a", LIST_HEAD, 0, 0);
+		if (!ASSERT_OK(err, "btf__add_field foo::a"))
+			break;
+		err = btf__add_field(btf, "b", SPIN_LOCK, 128, 0);
+		if (!ASSERT_OK(err, "btf__add_field foo::b"))
+			break;
+		id = btf__add_decl_tag(btf, "contains:bar:a", 7, 0);
+		if (!ASSERT_EQ(id, 8, "btf__add_decl_tag contains:bar:a"))
+			break;
+
+		err = btf__load_into_kernel(btf);
+		ASSERT_EQ(err, -EINVAL, "check btf");
+		btf__free(btf);
+		break;
+	}
 }
 
 void test_linked_list(void)
-- 
cgit 


From c834df847ee60eeb678171eb0f1e59f611c62a99 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 13 Feb 2023 16:40:15 -0800
Subject: bpf: Add bpf_rbtree_{add,remove,first} decls to bpf_experimental.h

These kfuncs will be used by selftests in following patches

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230214004017.2534011-7-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/bpf_experimental.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 424f7bbbfe9b..dbd2c729781a 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -65,4 +65,28 @@ extern struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ks
  */
 extern struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
 
+/* Description
+ *	Remove 'node' from rbtree with root 'root'
+ * Returns
+ * 	Pointer to the removed node, or NULL if 'root' didn't contain 'node'
+ */
+extern struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+					     struct bpf_rb_node *node) __ksym;
+
+/* Description
+ *	Add 'node' to rbtree with root 'root' using comparator 'less'
+ * Returns
+ *	Nothing
+ */
+extern void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+			   bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) __ksym;
+
+/* Description
+ *	Return the first (leftmost) node in input tree
+ * Returns
+ *	Pointer to the node, which is _not_ removed from the tree. If the tree
+ *	contains no nodes, returns NULL.
+ */
+extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
+
 #endif
-- 
cgit 


From 215249f6adc0359e3546829e7ee622b5e309b0ad Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 13 Feb 2023 16:40:16 -0800
Subject: selftests/bpf: Add rbtree selftests

This patch adds selftests exercising the logic changed/added in the
previous patches in the series. A variety of successful and unsuccessful
rbtree usages are validated:

Success:
  * Add some nodes, let map_value bpf_rbtree_root destructor clean them
    up
  * Add some nodes, remove one using the non-owning ref leftover by
    successful rbtree_add() call
  * Add some nodes, remove one using the non-owning ref returned by
    rbtree_first() call

Failure:
  * BTF where bpf_rb_root owns bpf_list_node should fail to load
  * BTF where node of type X is added to tree containing nodes of type Y
    should fail to load
  * No calling rbtree api functions in 'less' callback for rbtree_add
  * No releasing lock in 'less' callback for rbtree_add
  * No removing a node which hasn't been added to any tree
  * No adding a node which has already been added to a tree
  * No escaping of non-owning references past their lock's
    critical section
  * No escaping of non-owning references past other invalidation points
    (rbtree_remove)

These tests mostly focus on rbtree-specific additions, but some of the
failure cases revalidate scenarios common to both linked_list and rbtree
which are covered in the former's tests. Better to be a bit redundant in
case linked_list and rbtree semantics deviate over time.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230214004017.2534011-8-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/rbtree.c    | 117 ++++++++
 tools/testing/selftests/bpf/progs/rbtree.c         | 176 +++++++++++
 .../bpf/progs/rbtree_btf_fail__add_wrong_type.c    |  52 ++++
 .../bpf/progs/rbtree_btf_fail__wrong_node_type.c   |  49 ++++
 tools/testing/selftests/bpf/progs/rbtree_fail.c    | 322 +++++++++++++++++++++
 5 files changed, 716 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/rbtree.c
 create mode 100644 tools/testing/selftests/bpf/progs/rbtree.c
 create mode 100644 tools/testing/selftests/bpf/progs/rbtree_btf_fail__add_wrong_type.c
 create mode 100644 tools/testing/selftests/bpf/progs/rbtree_btf_fail__wrong_node_type.c
 create mode 100644 tools/testing/selftests/bpf/progs/rbtree_fail.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/rbtree.c b/tools/testing/selftests/bpf/prog_tests/rbtree.c
new file mode 100644
index 000000000000..156fa95c42f6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/rbtree.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+
+#include "rbtree.skel.h"
+#include "rbtree_fail.skel.h"
+#include "rbtree_btf_fail__wrong_node_type.skel.h"
+#include "rbtree_btf_fail__add_wrong_type.skel.h"
+
+static void test_rbtree_add_nodes(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts,
+		    .data_in = &pkt_v4,
+		    .data_size_in = sizeof(pkt_v4),
+		    .repeat = 1,
+	);
+	struct rbtree *skel;
+	int ret;
+
+	skel = rbtree__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "rbtree__open_and_load"))
+		return;
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_add_nodes), &opts);
+	ASSERT_OK(ret, "rbtree_add_nodes run");
+	ASSERT_OK(opts.retval, "rbtree_add_nodes retval");
+	ASSERT_EQ(skel->data->less_callback_ran, 1, "rbtree_add_nodes less_callback_ran");
+
+	rbtree__destroy(skel);
+}
+
+static void test_rbtree_add_and_remove(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts,
+		    .data_in = &pkt_v4,
+		    .data_size_in = sizeof(pkt_v4),
+		    .repeat = 1,
+	);
+	struct rbtree *skel;
+	int ret;
+
+	skel = rbtree__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "rbtree__open_and_load"))
+		return;
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_add_and_remove), &opts);
+	ASSERT_OK(ret, "rbtree_add_and_remove");
+	ASSERT_OK(opts.retval, "rbtree_add_and_remove retval");
+	ASSERT_EQ(skel->data->removed_key, 5, "rbtree_add_and_remove first removed key");
+
+	rbtree__destroy(skel);
+}
+
+static void test_rbtree_first_and_remove(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts,
+		    .data_in = &pkt_v4,
+		    .data_size_in = sizeof(pkt_v4),
+		    .repeat = 1,
+	);
+	struct rbtree *skel;
+	int ret;
+
+	skel = rbtree__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "rbtree__open_and_load"))
+		return;
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.rbtree_first_and_remove), &opts);
+	ASSERT_OK(ret, "rbtree_first_and_remove");
+	ASSERT_OK(opts.retval, "rbtree_first_and_remove retval");
+	ASSERT_EQ(skel->data->first_data[0], 2, "rbtree_first_and_remove first rbtree_first()");
+	ASSERT_EQ(skel->data->removed_key, 1, "rbtree_first_and_remove first removed key");
+	ASSERT_EQ(skel->data->first_data[1], 4, "rbtree_first_and_remove second rbtree_first()");
+
+	rbtree__destroy(skel);
+}
+
+void test_rbtree_success(void)
+{
+	if (test__start_subtest("rbtree_add_nodes"))
+		test_rbtree_add_nodes();
+	if (test__start_subtest("rbtree_add_and_remove"))
+		test_rbtree_add_and_remove();
+	if (test__start_subtest("rbtree_first_and_remove"))
+		test_rbtree_first_and_remove();
+}
+
+#define BTF_FAIL_TEST(suffix)									\
+void test_rbtree_btf_fail__##suffix(void)							\
+{												\
+	struct rbtree_btf_fail__##suffix *skel;							\
+												\
+	skel = rbtree_btf_fail__##suffix##__open_and_load();					\
+	if (!ASSERT_ERR_PTR(skel,								\
+			    "rbtree_btf_fail__" #suffix "__open_and_load unexpected success"))	\
+		rbtree_btf_fail__##suffix##__destroy(skel);					\
+}
+
+#define RUN_BTF_FAIL_TEST(suffix)				\
+	if (test__start_subtest("rbtree_btf_fail__" #suffix))	\
+		test_rbtree_btf_fail__##suffix();
+
+BTF_FAIL_TEST(wrong_node_type);
+BTF_FAIL_TEST(add_wrong_type);
+
+void test_rbtree_btf_fail(void)
+{
+	RUN_BTF_FAIL_TEST(wrong_node_type);
+	RUN_BTF_FAIL_TEST(add_wrong_type);
+}
+
+void test_rbtree_fail(void)
+{
+	RUN_TESTS(rbtree_fail);
+}
diff --git a/tools/testing/selftests/bpf/progs/rbtree.c b/tools/testing/selftests/bpf/progs/rbtree.c
new file mode 100644
index 000000000000..e5db1a4287e5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/rbtree.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_experimental.h"
+
+struct node_data {
+	long key;
+	long data;
+	struct bpf_rb_node node;
+};
+
+long less_callback_ran = -1;
+long removed_key = -1;
+long first_data[2] = {-1, -1};
+
+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+private(A) struct bpf_spin_lock glock;
+private(A) struct bpf_rb_root groot __contains(node_data, node);
+
+static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
+{
+	struct node_data *node_a;
+	struct node_data *node_b;
+
+	node_a = container_of(a, struct node_data, node);
+	node_b = container_of(b, struct node_data, node);
+	less_callback_ran = 1;
+
+	return node_a->key < node_b->key;
+}
+
+static long __add_three(struct bpf_rb_root *root, struct bpf_spin_lock *lock)
+{
+	struct node_data *n, *m;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+	n->key = 5;
+
+	m = bpf_obj_new(typeof(*m));
+	if (!m) {
+		bpf_obj_drop(n);
+		return 2;
+	}
+	m->key = 1;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less);
+	bpf_rbtree_add(&groot, &m->node, less);
+	bpf_spin_unlock(&glock);
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 3;
+	n->key = 3;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less);
+	bpf_spin_unlock(&glock);
+	return 0;
+}
+
+SEC("tc")
+long rbtree_add_nodes(void *ctx)
+{
+	return __add_three(&groot, &glock);
+}
+
+SEC("tc")
+long rbtree_add_and_remove(void *ctx)
+{
+	struct bpf_rb_node *res = NULL;
+	struct node_data *n, *m;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		goto err_out;
+	n->key = 5;
+
+	m = bpf_obj_new(typeof(*m));
+	if (!m)
+		goto err_out;
+	m->key = 3;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less);
+	bpf_rbtree_add(&groot, &m->node, less);
+	res = bpf_rbtree_remove(&groot, &n->node);
+	bpf_spin_unlock(&glock);
+
+	n = container_of(res, struct node_data, node);
+	removed_key = n->key;
+
+	bpf_obj_drop(n);
+
+	return 0;
+err_out:
+	if (n)
+		bpf_obj_drop(n);
+	if (m)
+		bpf_obj_drop(m);
+	return 1;
+}
+
+SEC("tc")
+long rbtree_first_and_remove(void *ctx)
+{
+	struct bpf_rb_node *res = NULL;
+	struct node_data *n, *m, *o;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+	n->key = 3;
+	n->data = 4;
+
+	m = bpf_obj_new(typeof(*m));
+	if (!m)
+		goto err_out;
+	m->key = 5;
+	m->data = 6;
+
+	o = bpf_obj_new(typeof(*o));
+	if (!o)
+		goto err_out;
+	o->key = 1;
+	o->data = 2;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less);
+	bpf_rbtree_add(&groot, &m->node, less);
+	bpf_rbtree_add(&groot, &o->node, less);
+
+	res = bpf_rbtree_first(&groot);
+	if (!res) {
+		bpf_spin_unlock(&glock);
+		return 2;
+	}
+
+	o = container_of(res, struct node_data, node);
+	first_data[0] = o->data;
+
+	res = bpf_rbtree_remove(&groot, &o->node);
+	bpf_spin_unlock(&glock);
+
+	o = container_of(res, struct node_data, node);
+	removed_key = o->key;
+
+	bpf_obj_drop(o);
+
+	bpf_spin_lock(&glock);
+	res = bpf_rbtree_first(&groot);
+	if (!res) {
+		bpf_spin_unlock(&glock);
+		return 3;
+	}
+
+	o = container_of(res, struct node_data, node);
+	first_data[1] = o->data;
+	bpf_spin_unlock(&glock);
+
+	return 0;
+err_out:
+	if (n)
+		bpf_obj_drop(n);
+	if (m)
+		bpf_obj_drop(m);
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/rbtree_btf_fail__add_wrong_type.c b/tools/testing/selftests/bpf/progs/rbtree_btf_fail__add_wrong_type.c
new file mode 100644
index 000000000000..60079b202c07
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/rbtree_btf_fail__add_wrong_type.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_experimental.h"
+
+struct node_data {
+	int key;
+	int data;
+	struct bpf_rb_node node;
+};
+
+struct node_data2 {
+	int key;
+	struct bpf_rb_node node;
+	int data;
+};
+
+static bool less2(struct bpf_rb_node *a, const struct bpf_rb_node *b)
+{
+	struct node_data2 *node_a;
+	struct node_data2 *node_b;
+
+	node_a = container_of(a, struct node_data2, node);
+	node_b = container_of(b, struct node_data2, node);
+
+	return node_a->key < node_b->key;
+}
+
+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+private(A) struct bpf_spin_lock glock;
+private(A) struct bpf_rb_root groot __contains(node_data, node);
+
+SEC("tc")
+long rbtree_api_add__add_wrong_type(void *ctx)
+{
+	struct node_data2 *n;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less2);
+	bpf_spin_unlock(&glock);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/rbtree_btf_fail__wrong_node_type.c b/tools/testing/selftests/bpf/progs/rbtree_btf_fail__wrong_node_type.c
new file mode 100644
index 000000000000..340f97da1084
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/rbtree_btf_fail__wrong_node_type.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_experimental.h"
+
+/* BTF load should fail as bpf_rb_root __contains this type and points to
+ * 'node', but 'node' is not a bpf_rb_node
+ */
+struct node_data {
+	int key;
+	int data;
+	struct bpf_list_node node;
+};
+
+static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
+{
+	struct node_data *node_a;
+	struct node_data *node_b;
+
+	node_a = container_of(a, struct node_data, node);
+	node_b = container_of(b, struct node_data, node);
+
+	return node_a->key < node_b->key;
+}
+
+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+private(A) struct bpf_spin_lock glock;
+private(A) struct bpf_rb_root groot __contains(node_data, node);
+
+SEC("tc")
+long rbtree_api_add__wrong_node_type(void *ctx)
+{
+	struct node_data *n;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_first(&groot);
+	bpf_spin_unlock(&glock);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c
new file mode 100644
index 000000000000..bf3cba115897
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_experimental.h"
+#include "bpf_misc.h"
+
+struct node_data {
+	long key;
+	long data;
+	struct bpf_rb_node node;
+};
+
+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+private(A) struct bpf_spin_lock glock;
+private(A) struct bpf_rb_root groot __contains(node_data, node);
+private(A) struct bpf_rb_root groot2 __contains(node_data, node);
+
+static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
+{
+	struct node_data *node_a;
+	struct node_data *node_b;
+
+	node_a = container_of(a, struct node_data, node);
+	node_b = container_of(b, struct node_data, node);
+
+	return node_a->key < node_b->key;
+}
+
+SEC("?tc")
+__failure __msg("bpf_spin_lock at off=16 must be held for bpf_rb_root")
+long rbtree_api_nolock_add(void *ctx)
+{
+	struct node_data *n;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+
+	bpf_rbtree_add(&groot, &n->node, less);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("bpf_spin_lock at off=16 must be held for bpf_rb_root")
+long rbtree_api_nolock_remove(void *ctx)
+{
+	struct node_data *n;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less);
+	bpf_spin_unlock(&glock);
+
+	bpf_rbtree_remove(&groot, &n->node);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("bpf_spin_lock at off=16 must be held for bpf_rb_root")
+long rbtree_api_nolock_first(void *ctx)
+{
+	bpf_rbtree_first(&groot);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("rbtree_remove node input must be non-owning ref")
+long rbtree_api_remove_unadded_node(void *ctx)
+{
+	struct node_data *n, *m;
+	struct bpf_rb_node *res;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+
+	m = bpf_obj_new(typeof(*m));
+	if (!m) {
+		bpf_obj_drop(n);
+		return 1;
+	}
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less);
+
+	/* This remove should pass verifier */
+	res = bpf_rbtree_remove(&groot, &n->node);
+	n = container_of(res, struct node_data, node);
+
+	/* This remove shouldn't, m isn't in an rbtree */
+	res = bpf_rbtree_remove(&groot, &m->node);
+	m = container_of(res, struct node_data, node);
+	bpf_spin_unlock(&glock);
+
+	if (n)
+		bpf_obj_drop(n);
+	if (m)
+		bpf_obj_drop(m);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("Unreleased reference id=2 alloc_insn=11")
+long rbtree_api_remove_no_drop(void *ctx)
+{
+	struct bpf_rb_node *res;
+	struct node_data *n;
+
+	bpf_spin_lock(&glock);
+	res = bpf_rbtree_first(&groot);
+	if (!res)
+		goto unlock_err;
+
+	res = bpf_rbtree_remove(&groot, res);
+
+	n = container_of(res, struct node_data, node);
+	bpf_spin_unlock(&glock);
+
+	/* bpf_obj_drop(n) is missing here */
+	return 0;
+
+unlock_err:
+	bpf_spin_unlock(&glock);
+	return 1;
+}
+
+SEC("?tc")
+__failure __msg("arg#1 expected pointer to allocated object")
+long rbtree_api_add_to_multiple_trees(void *ctx)
+{
+	struct node_data *n;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less);
+
+	/* This add should fail since n already in groot's tree */
+	bpf_rbtree_add(&groot2, &n->node, less);
+	bpf_spin_unlock(&glock);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("rbtree_remove node input must be non-owning ref")
+long rbtree_api_add_release_unlock_escape(void *ctx)
+{
+	struct node_data *n;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less);
+	bpf_spin_unlock(&glock);
+
+	bpf_spin_lock(&glock);
+	/* After add() in previous critical section, n should be
+	 * release_on_unlock and released after previous spin_unlock,
+	 * so should not be possible to use it here
+	 */
+	bpf_rbtree_remove(&groot, &n->node);
+	bpf_spin_unlock(&glock);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("rbtree_remove node input must be non-owning ref")
+long rbtree_api_release_aliasing(void *ctx)
+{
+	struct node_data *n, *m, *o;
+	struct bpf_rb_node *res;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, less);
+	bpf_spin_unlock(&glock);
+
+	bpf_spin_lock(&glock);
+
+	/* m and o point to the same node,
+	 * but verifier doesn't know this
+	 */
+	res = bpf_rbtree_first(&groot);
+	if (!res)
+		return 1;
+	o = container_of(res, struct node_data, node);
+
+	res = bpf_rbtree_first(&groot);
+	if (!res)
+		return 1;
+	m = container_of(res, struct node_data, node);
+
+	bpf_rbtree_remove(&groot, &m->node);
+	/* This second remove shouldn't be possible. Retval of previous
+	 * remove returns owning reference to m, which is the same
+	 * node o's non-owning ref is pointing at
+	 *
+	 * In order to preserve property
+	 *   * owning ref must not be in rbtree
+	 *   * non-owning ref must be in rbtree
+	 *
+	 * o's ref must be invalidated after previous remove. Otherwise
+	 * we'd have non-owning ref to node that isn't in rbtree, and
+	 * verifier wouldn't be able to use type system to prevent remove
+	 * of ref that already isn't in any tree. Would have to do runtime
+	 * checks in that case.
+	 */
+	bpf_rbtree_remove(&groot, &o->node);
+
+	bpf_spin_unlock(&glock);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("rbtree_remove node input must be non-owning ref")
+long rbtree_api_first_release_unlock_escape(void *ctx)
+{
+	struct bpf_rb_node *res;
+	struct node_data *n;
+
+	bpf_spin_lock(&glock);
+	res = bpf_rbtree_first(&groot);
+	if (res)
+		n = container_of(res, struct node_data, node);
+	bpf_spin_unlock(&glock);
+
+	bpf_spin_lock(&glock);
+	/* After first() in previous critical section, n should be
+	 * release_on_unlock and released after previous spin_unlock,
+	 * so should not be possible to use it here
+	 */
+	bpf_rbtree_remove(&groot, &n->node);
+	bpf_spin_unlock(&glock);
+	return 0;
+}
+
+static bool less__bad_fn_call_add(struct bpf_rb_node *a, const struct bpf_rb_node *b)
+{
+	struct node_data *node_a;
+	struct node_data *node_b;
+
+	node_a = container_of(a, struct node_data, node);
+	node_b = container_of(b, struct node_data, node);
+	bpf_rbtree_add(&groot, &node_a->node, less);
+
+	return node_a->key < node_b->key;
+}
+
+static bool less__bad_fn_call_remove(struct bpf_rb_node *a, const struct bpf_rb_node *b)
+{
+	struct node_data *node_a;
+	struct node_data *node_b;
+
+	node_a = container_of(a, struct node_data, node);
+	node_b = container_of(b, struct node_data, node);
+	bpf_rbtree_remove(&groot, &node_a->node);
+
+	return node_a->key < node_b->key;
+}
+
+static bool less__bad_fn_call_first_unlock_after(struct bpf_rb_node *a, const struct bpf_rb_node *b)
+{
+	struct node_data *node_a;
+	struct node_data *node_b;
+
+	node_a = container_of(a, struct node_data, node);
+	node_b = container_of(b, struct node_data, node);
+	bpf_rbtree_first(&groot);
+	bpf_spin_unlock(&glock);
+
+	return node_a->key < node_b->key;
+}
+
+static __always_inline
+long add_with_cb(bool (cb)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
+{
+	struct node_data *n;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return 1;
+
+	bpf_spin_lock(&glock);
+	bpf_rbtree_add(&groot, &n->node, cb);
+	bpf_spin_unlock(&glock);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("arg#1 expected pointer to allocated object")
+long rbtree_api_add_bad_cb_bad_fn_call_add(void *ctx)
+{
+	return add_with_cb(less__bad_fn_call_add);
+}
+
+SEC("?tc")
+__failure __msg("rbtree_remove not allowed in rbtree cb")
+long rbtree_api_add_bad_cb_bad_fn_call_remove(void *ctx)
+{
+	return add_with_cb(less__bad_fn_call_remove);
+}
+
+SEC("?tc")
+__failure __msg("can't spin_{lock,unlock} in rbtree cb")
+long rbtree_api_add_bad_cb_bad_fn_call_first_unlock_after(void *ctx)
+{
+	return add_with_cb(less__bad_fn_call_first_unlock_after);
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit 


From 1f428356c38dcbe49fd2f1c488b41e88720ead92 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Tue, 7 Feb 2023 22:48:50 +0100
Subject: rtla: Add hwnoise tool

The hwnoise tool is a special mode for the osnoise top tool.

hwnoise dispatches the osnoise tracer and displays a summary of the noise.
The difference is that it runs the tracer with the OSNOISE_IRQ_DISABLE
option set, thus only allowing only hardware-related noise, resulting in
a simplified output. hwnoise has the same features of osnoise.

An example of the tool's output:

 # rtla hwnoise -c 1-11 -T 1 -d 10m -q
                                           Hardware-related Noise
 duration:   0 00:10:00 | time is in us
 CPU Period       Runtime        Noise  % CPU Aval   Max Noise   Max Single          HW          NMI
   1 #599       599000000          138    99.99997           3            3           4           74
   2 #599       599000000           85    99.99998           3            3           4           75
   3 #599       599000000           86    99.99998           4            3           6           75
   4 #599       599000000           81    99.99998           4            4           2           75
   5 #599       599000000           85    99.99998           2            2           2           75

Link: https://lkml.kernel.org/r/2d6f49a6f3a4f8b51b2c806458b1cff71ad4d014.1675805361.git.bristot@kernel.org

Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: Daniel Bristot de Oliveira <bristot@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/tracing/rtla/Makefile          |   2 +
 tools/tracing/rtla/src/osnoise.c     | 117 +++++++++++++++++++++++++++++++++++
 tools/tracing/rtla/src/osnoise.h     |   7 +++
 tools/tracing/rtla/src/osnoise_top.c |  84 ++++++++++++++++++++-----
 tools/tracing/rtla/src/rtla.c        |   4 ++
 5 files changed, 198 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 22e28b76f800..2456a399eb9a 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -119,6 +119,8 @@ install: doc_install
 	$(STRIP) $(DESTDIR)$(BINDIR)/rtla
 	@test ! -f $(DESTDIR)$(BINDIR)/osnoise || rm $(DESTDIR)$(BINDIR)/osnoise
 	ln -s rtla $(DESTDIR)$(BINDIR)/osnoise
+	@test ! -f $(DESTDIR)$(BINDIR)/hwnoise || rm $(DESTDIR)$(BINDIR)/hwnoise
+	ln -s rtla $(DESTDIR)$(BINDIR)/hwnoise
 	@test ! -f $(DESTDIR)$(BINDIR)/timerlat || rm $(DESTDIR)$(BINDIR)/timerlat
 	ln -s rtla $(DESTDIR)$(BINDIR)/timerlat
 
diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c
index 4dee343909b1..3ca7a3853943 100644
--- a/tools/tracing/rtla/src/osnoise.c
+++ b/tools/tracing/rtla/src/osnoise.c
@@ -734,6 +734,113 @@ void osnoise_put_tracing_thresh(struct osnoise_context *context)
 	context->orig_tracing_thresh = OSNOISE_OPTION_INIT_VAL;
 }
 
+static int osnoise_options_get_option(char *option)
+{
+	char *options = tracefs_instance_file_read(NULL, "osnoise/options", NULL);
+	char no_option[128];
+	int retval = 0;
+	char *opt;
+
+	if (!options)
+		return OSNOISE_OPTION_INIT_VAL;
+
+	/*
+	 * Check first if the option is disabled.
+	 */
+	snprintf(no_option, sizeof(no_option), "NO_%s", option);
+
+	opt = strstr(options, no_option);
+	if (opt)
+		goto out_free;
+
+	/*
+	 * Now that it is not disabled, if the string is there, it is
+	 * enabled. If the string is not there, the option does not exist.
+	 */
+	opt = strstr(options, option);
+	if (opt)
+		retval = 1;
+	else
+		retval = OSNOISE_OPTION_INIT_VAL;
+
+out_free:
+	free(options);
+	return retval;
+}
+
+static int osnoise_options_set_option(char *option, bool onoff)
+{
+	char no_option[128];
+
+	if (onoff)
+		return tracefs_instance_file_write(NULL, "osnoise/options", option);
+
+	snprintf(no_option, sizeof(no_option), "NO_%s", option);
+
+	return tracefs_instance_file_write(NULL, "osnoise/options", no_option);
+}
+
+static int osnoise_get_irq_disable(struct osnoise_context *context)
+{
+	if (context->opt_irq_disable != OSNOISE_OPTION_INIT_VAL)
+		return context->opt_irq_disable;
+
+	if (context->orig_opt_irq_disable != OSNOISE_OPTION_INIT_VAL)
+		return context->orig_opt_irq_disable;
+
+	context->orig_opt_irq_disable = osnoise_options_get_option("OSNOISE_IRQ_DISABLE");
+
+	return context->orig_opt_irq_disable;
+}
+
+int osnoise_set_irq_disable(struct osnoise_context *context, bool onoff)
+{
+	int opt_irq_disable = osnoise_get_irq_disable(context);
+	int retval;
+
+	if (opt_irq_disable == OSNOISE_OPTION_INIT_VAL)
+		return -1;
+
+	if (opt_irq_disable == onoff)
+		return 0;
+
+	retval = osnoise_options_set_option("OSNOISE_IRQ_DISABLE", onoff);
+	if (retval < 0)
+		return -1;
+
+	context->opt_irq_disable = onoff;
+
+	return 0;
+}
+
+static void osnoise_restore_irq_disable(struct osnoise_context *context)
+{
+	int retval;
+
+	if (context->orig_opt_irq_disable == OSNOISE_OPTION_INIT_VAL)
+		return;
+
+	if (context->orig_opt_irq_disable == context->opt_irq_disable)
+		goto out_done;
+
+	retval = osnoise_options_set_option("OSNOISE_IRQ_DISABLE", context->orig_opt_irq_disable);
+	if (retval < 0)
+		err_msg("Could not restore original OSNOISE_IRQ_DISABLE option\n");
+
+out_done:
+	context->orig_opt_irq_disable = OSNOISE_OPTION_INIT_VAL;
+}
+
+static void osnoise_put_irq_disable(struct osnoise_context *context)
+{
+	osnoise_restore_irq_disable(context);
+
+	if (context->orig_opt_irq_disable == OSNOISE_OPTION_INIT_VAL)
+		return;
+
+	context->orig_opt_irq_disable = OSNOISE_OPTION_INIT_VAL;
+}
+
 /*
  * enable_osnoise - enable osnoise tracer in the trace_instance
  */
@@ -798,6 +905,9 @@ struct osnoise_context *osnoise_context_alloc(void)
 	context->orig_tracing_thresh	= OSNOISE_OPTION_INIT_VAL;
 	context->tracing_thresh		= OSNOISE_OPTION_INIT_VAL;
 
+	context->orig_opt_irq_disable	= OSNOISE_OPTION_INIT_VAL;
+	context->opt_irq_disable	= OSNOISE_OPTION_INIT_VAL;
+
 	osnoise_get_context(context);
 
 	return context;
@@ -824,6 +934,7 @@ void osnoise_put_context(struct osnoise_context *context)
 	osnoise_put_timerlat_period_us(context);
 	osnoise_put_print_stack(context);
 	osnoise_put_tracing_thresh(context);
+	osnoise_put_irq_disable(context);
 
 	free(context);
 }
@@ -958,3 +1069,9 @@ usage:
 	osnoise_usage(1);
 	exit(1);
 }
+
+int hwnoise_main(int argc, char *argv[])
+{
+	osnoise_top_main(argc, argv);
+	exit(0);
+}
diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h
index 04a4384cc544..4dcf22ccd704 100644
--- a/tools/tracing/rtla/src/osnoise.h
+++ b/tools/tracing/rtla/src/osnoise.h
@@ -38,6 +38,10 @@ struct osnoise_context {
 	/* -1 as init value because 0 is disabled */
 	long long		orig_print_stack;
 	long long		print_stack;
+
+	/* -1 as init value because 0 is off */
+	int			orig_opt_irq_disable;
+	int			opt_irq_disable;
 };
 
 /*
@@ -79,6 +83,8 @@ void osnoise_restore_print_stack(struct osnoise_context *context);
 int osnoise_set_print_stack(struct osnoise_context *context,
 			    long long print_stack);
 
+int osnoise_set_irq_disable(struct osnoise_context *context, bool onoff);
+
 /*
  * osnoise_tool -  osnoise based tool definition.
  */
@@ -97,3 +103,4 @@ struct osnoise_tool *osnoise_init_trace_tool(char *tracer);
 int osnoise_hist_main(int argc, char *argv[]);
 int osnoise_top_main(int argc, char **argv);
 int osnoise_main(int argc, char **argv);
+int hwnoise_main(int argc, char **argv);
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 76479bfb2922..562f2e4b18c5 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -14,6 +14,11 @@
 #include "osnoise.h"
 #include "utils.h"
 
+enum osnoise_mode {
+	MODE_OSNOISE = 0,
+	MODE_HWNOISE
+};
+
 /*
  * osnoise top parameters
  */
@@ -32,6 +37,7 @@ struct osnoise_top_params {
 	int			set_sched;
 	struct sched_attr	sched_param;
 	struct trace_events	*events;
+	enum osnoise_mode	mode;
 };
 
 struct osnoise_top_cpu {
@@ -143,15 +149,23 @@ osnoise_top_handler(struct trace_seq *s, struct tep_record *record,
  */
 static void osnoise_top_header(struct osnoise_tool *top)
 {
+	struct osnoise_top_params *params = top->params;
 	struct trace_seq *s = top->trace.seq;
 	char duration[26];
 
 	get_duration(top->start_time, duration, sizeof(duration));
 
 	trace_seq_printf(s, "\033[2;37;40m");
-	trace_seq_printf(s, "                                          Operating System Noise");
-	trace_seq_printf(s, "                                     ");
-	trace_seq_printf(s, "                                     ");
+	trace_seq_printf(s, "                                          ");
+
+	if (params->mode == MODE_OSNOISE) {
+		trace_seq_printf(s, "Operating System Noise");
+		trace_seq_printf(s, "                                       ");
+	} else if (params->mode == MODE_HWNOISE) {
+		trace_seq_printf(s, "Hardware-related Noise");
+	}
+
+	trace_seq_printf(s, "                                   ");
 	trace_seq_printf(s, "\033[0;0;0m");
 	trace_seq_printf(s, "\n");
 
@@ -162,7 +176,14 @@ static void osnoise_top_header(struct osnoise_tool *top)
 	trace_seq_printf(s, "       Noise ");
 	trace_seq_printf(s, " %% CPU Aval ");
 	trace_seq_printf(s, "  Max Noise   Max Single ");
-	trace_seq_printf(s, "         HW          NMI          IRQ      Softirq       Thread");
+	trace_seq_printf(s, "         HW          NMI");
+
+	if (params->mode == MODE_HWNOISE)
+		goto eol;
+
+	trace_seq_printf(s, "          IRQ      Softirq       Thread");
+
+eol:
 	trace_seq_printf(s, "\033[0;0;0m");
 	trace_seq_printf(s, "\n");
 }
@@ -181,6 +202,7 @@ static void clear_terminal(struct trace_seq *seq)
  */
 static void osnoise_top_print(struct osnoise_tool *tool, int cpu)
 {
+	struct osnoise_top_params *params = tool->params;
 	struct trace_seq *s = tool->trace.seq;
 	struct osnoise_top_cpu *cpu_data;
 	struct osnoise_top_data *data;
@@ -205,6 +227,12 @@ static void osnoise_top_print(struct osnoise_tool *tool, int cpu)
 
 	trace_seq_printf(s, "%12llu ", cpu_data->hw_count);
 	trace_seq_printf(s, "%12llu ", cpu_data->nmi_count);
+
+	if (params->mode == MODE_HWNOISE) {
+		trace_seq_printf(s, "\n");
+		return;
+	}
+
 	trace_seq_printf(s, "%12llu ", cpu_data->irq_count);
 	trace_seq_printf(s, "%12llu ", cpu_data->softirq_count);
 	trace_seq_printf(s, "%12llu\n", cpu_data->thread_count);
@@ -241,12 +269,12 @@ osnoise_print_stats(struct osnoise_top_params *params, struct osnoise_tool *top)
 /*
  * osnoise_top_usage - prints osnoise top usage message
  */
-void osnoise_top_usage(char *usage)
+static void osnoise_top_usage(struct osnoise_top_params *params, char *usage)
 {
 	int i;
 
 	static const char * const msg[] = {
-		"  usage: rtla osnoise [top] [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
+		" [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
 		"	  [-T us] [-t[=file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\",
 		"	  [-c cpu-list] [-P priority]",
 		"",
@@ -277,9 +305,22 @@ void osnoise_top_usage(char *usage)
 	if (usage)
 		fprintf(stderr, "%s\n", usage);
 
-	fprintf(stderr, "rtla osnoise top: a per-cpu summary of the OS noise (version %s)\n",
+	if (params->mode == MODE_OSNOISE) {
+		fprintf(stderr,
+			"rtla osnoise top: a per-cpu summary of the OS noise (version %s)\n",
 			VERSION);
 
+		fprintf(stderr, "  usage: rtla osnoise [top]");
+	}
+
+	if (params->mode == MODE_HWNOISE) {
+		fprintf(stderr,
+			"rtla hwnoise: a summary of hardware-related noise (version %s)\n",
+			VERSION);
+
+		fprintf(stderr, "  usage: rtla hwnoise");
+	}
+
 	for (i = 0; msg[i]; i++)
 		fprintf(stderr, "%s\n", msg[i]);
 	exit(1);
@@ -299,6 +340,9 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv)
 	if (!params)
 		exit(1);
 
+	if (strcmp(argv[0], "hwnoise") == 0)
+		params->mode = MODE_HWNOISE;
+
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
@@ -345,7 +389,7 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv)
 		case 'c':
 			retval = parse_cpu_list(optarg, &params->monitored_cpus);
 			if (retval)
-				osnoise_top_usage("\nInvalid -c cpu list\n");
+				osnoise_top_usage(params, "\nInvalid -c cpu list\n");
 			params->cpus = optarg;
 			break;
 		case 'D':
@@ -354,7 +398,7 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv)
 		case 'd':
 			params->duration = parse_seconds_duration(optarg);
 			if (!params->duration)
-				osnoise_top_usage("Invalid -D duration\n");
+				osnoise_top_usage(params, "Invalid -D duration\n");
 			break;
 		case 'e':
 			tevent = trace_event_alloc(optarg);
@@ -370,17 +414,17 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv)
 			break;
 		case 'h':
 		case '?':
-			osnoise_top_usage(NULL);
+			osnoise_top_usage(params, NULL);
 			break;
 		case 'p':
 			params->period = get_llong_from_str(optarg);
 			if (params->period > 10000000)
-				osnoise_top_usage("Period longer than 10 s\n");
+				osnoise_top_usage(params, "Period longer than 10 s\n");
 			break;
 		case 'P':
 			retval = parse_prio(optarg, &params->sched_param);
 			if (retval == -1)
-				osnoise_top_usage("Invalid -P priority");
+				osnoise_top_usage(params, "Invalid -P priority");
 			params->set_sched = 1;
 			break;
 		case 'q':
@@ -389,7 +433,7 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv)
 		case 'r':
 			params->runtime = get_llong_from_str(optarg);
 			if (params->runtime < 100)
-				osnoise_top_usage("Runtime shorter than 100 us\n");
+				osnoise_top_usage(params, "Runtime shorter than 100 us\n");
 			break;
 		case 's':
 			params->stop_us = get_llong_from_str(optarg);
@@ -415,7 +459,7 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv)
 					exit(EXIT_FAILURE);
 				}
 			} else {
-				osnoise_top_usage("--trigger requires a previous -e\n");
+				osnoise_top_usage(params, "--trigger requires a previous -e\n");
 			}
 			break;
 		case '1': /* filter */
@@ -426,11 +470,11 @@ struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv)
 					exit(EXIT_FAILURE);
 				}
 			} else {
-				osnoise_top_usage("--filter requires a previous -e\n");
+				osnoise_top_usage(params, "--filter requires a previous -e\n");
 			}
 			break;
 		default:
-			osnoise_top_usage("Invalid option");
+			osnoise_top_usage(params, "Invalid option");
 		}
 	}
 
@@ -495,6 +539,14 @@ osnoise_top_apply_config(struct osnoise_tool *tool, struct osnoise_top_params *p
 		}
 	}
 
+	if (params->mode == MODE_HWNOISE) {
+		retval = osnoise_set_irq_disable(tool->context, 1);
+		if (retval) {
+			err_msg("Failed to set OSNOISE_IRQ_DISABLE option\n");
+			goto out_err;
+		}
+	}
+
 	return 0;
 
 out_err:
diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c
index 52e8f1825281..7635c70123ab 100644
--- a/tools/tracing/rtla/src/rtla.c
+++ b/tools/tracing/rtla/src/rtla.c
@@ -26,6 +26,7 @@ static void rtla_usage(int err)
 		"",
 		"  commands:",
 		"     osnoise  - gives information about the operating system noise (osnoise)",
+		"     hwnoise  - gives information about hardware-related noise",
 		"     timerlat - measures the timer irq and thread latency",
 		"",
 		NULL,
@@ -47,6 +48,9 @@ int run_command(int argc, char **argv, int start_position)
 	if (strcmp(argv[start_position], "osnoise") == 0) {
 		osnoise_main(argc-start_position, &argv[start_position]);
 		goto ran;
+	} else if (strcmp(argv[start_position], "hwnoise") == 0) {
+		hwnoise_main(argc-start_position, &argv[start_position]);
+		goto ran;
 	} else if (strcmp(argv[start_position], "timerlat") == 0) {
 		timerlat_main(argc-start_position, &argv[start_position]);
 		goto ran;
-- 
cgit 


From 8032cad1030279066ce4a1f82b76d0fe7eb578e2 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 13 Feb 2023 21:13:31 -0800
Subject: selftests/bpf: Clean up user_ringbuf, cgrp_kfunc, kfunc_dynptr_param
 tests

Clean up user_ringbuf, cgrp_kfunc, and kfunc_dynptr_param tests to use
the generic verification tester for checking verifier rejections.
The generic verification tester uses btf_decl_tag-based annotations
for verifying that the tests fail with the expected log messages.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Acked-by: David Vernet <void@manifault.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Link: https://lore.kernel.org/r/20230214051332.4007131-1-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/cgrp_kfunc.c  | 69 +--------------------
 .../selftests/bpf/prog_tests/kfunc_dynptr_param.c  | 72 +++++-----------------
 .../selftests/bpf/prog_tests/user_ringbuf.c        | 62 +------------------
 .../selftests/bpf/progs/cgrp_kfunc_failure.c       | 17 ++++-
 .../selftests/bpf/progs/test_kfunc_dynptr_param.c  |  4 ++
 .../selftests/bpf/progs/user_ringbuf_fail.c        | 31 +++++++---
 6 files changed, 58 insertions(+), 197 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c
index f3bb0e16e088..b3f7985c8504 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c
@@ -8,9 +8,6 @@
 #include "cgrp_kfunc_failure.skel.h"
 #include "cgrp_kfunc_success.skel.h"
 
-static size_t log_buf_sz = 1 << 20; /* 1 MB */
-static char obj_log_buf[1048576];
-
 static struct cgrp_kfunc_success *open_load_cgrp_kfunc_skel(void)
 {
 	struct cgrp_kfunc_success *skel;
@@ -89,65 +86,6 @@ static const char * const success_tests[] = {
 	"test_cgrp_get_ancestors",
 };
 
-static struct {
-	const char *prog_name;
-	const char *expected_err_msg;
-} failure_tests[] = {
-	{"cgrp_kfunc_acquire_untrusted", "Possibly NULL pointer passed to trusted arg0"},
-	{"cgrp_kfunc_acquire_fp", "arg#0 pointer type STRUCT cgroup must point"},
-	{"cgrp_kfunc_acquire_unsafe_kretprobe", "reg type unsupported for arg#0 function"},
-	{"cgrp_kfunc_acquire_trusted_walked", "R1 must be referenced or trusted"},
-	{"cgrp_kfunc_acquire_null", "Possibly NULL pointer passed to trusted arg0"},
-	{"cgrp_kfunc_acquire_unreleased", "Unreleased reference"},
-	{"cgrp_kfunc_get_non_kptr_param", "arg#0 expected pointer to map value"},
-	{"cgrp_kfunc_get_non_kptr_acquired", "arg#0 expected pointer to map value"},
-	{"cgrp_kfunc_get_null", "arg#0 expected pointer to map value"},
-	{"cgrp_kfunc_xchg_unreleased", "Unreleased reference"},
-	{"cgrp_kfunc_get_unreleased", "Unreleased reference"},
-	{"cgrp_kfunc_release_untrusted", "arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket"},
-	{"cgrp_kfunc_release_fp", "arg#0 pointer type STRUCT cgroup must point"},
-	{"cgrp_kfunc_release_null", "arg#0 is ptr_or_null_ expected ptr_ or socket"},
-	{"cgrp_kfunc_release_unacquired", "release kernel function bpf_cgroup_release expects"},
-};
-
-static void verify_fail(const char *prog_name, const char *expected_err_msg)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	struct cgrp_kfunc_failure *skel;
-	int err, i;
-
-	opts.kernel_log_buf = obj_log_buf;
-	opts.kernel_log_size = log_buf_sz;
-	opts.kernel_log_level = 1;
-
-	skel = cgrp_kfunc_failure__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "cgrp_kfunc_failure__open_opts"))
-		goto cleanup;
-
-	for (i = 0; i < ARRAY_SIZE(failure_tests); i++) {
-		struct bpf_program *prog;
-		const char *curr_name = failure_tests[i].prog_name;
-
-		prog = bpf_object__find_program_by_name(skel->obj, curr_name);
-		if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
-			goto cleanup;
-
-		bpf_program__set_autoload(prog, !strcmp(curr_name, prog_name));
-	}
-
-	err = cgrp_kfunc_failure__load(skel);
-	if (!ASSERT_ERR(err, "unexpected load success"))
-		goto cleanup;
-
-	if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) {
-		fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg);
-		fprintf(stderr, "Verifier output: %s\n", obj_log_buf);
-	}
-
-cleanup:
-	cgrp_kfunc_failure__destroy(skel);
-}
-
 void test_cgrp_kfunc(void)
 {
 	int i, err;
@@ -163,12 +101,7 @@ void test_cgrp_kfunc(void)
 		run_success_test(success_tests[i]);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(failure_tests); i++) {
-		if (!test__start_subtest(failure_tests[i].prog_name))
-			continue;
-
-		verify_fail(failure_tests[i].prog_name, failure_tests[i].expected_err_msg);
-	}
+	RUN_TESTS(cgrp_kfunc_failure);
 
 cleanup:
 	cleanup_cgroup_environment();
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
index 72800b1e8395..8cd298b78e44 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
@@ -10,17 +10,11 @@
 #include <test_progs.h>
 #include "test_kfunc_dynptr_param.skel.h"
 
-static size_t log_buf_sz = 1048576; /* 1 MB */
-static char obj_log_buf[1048576];
-
 static struct {
 	const char *prog_name;
-	const char *expected_verifier_err_msg;
 	int expected_runtime_err;
 } kfunc_dynptr_tests[] = {
-	{"not_valid_dynptr", "cannot pass in dynptr at an offset=-8", 0},
-	{"not_ptr_to_stack", "arg#0 expected pointer to stack or dynptr_ptr", 0},
-	{"dynptr_data_null", NULL, -EBADMSG},
+	{"dynptr_data_null", -EBADMSG},
 };
 
 static bool kfunc_not_supported;
@@ -38,29 +32,15 @@ static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt,
 	return 0;
 }
 
-static void verify_fail(const char *prog_name, const char *expected_err_msg)
+static bool has_pkcs7_kfunc_support(void)
 {
 	struct test_kfunc_dynptr_param *skel;
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
 	libbpf_print_fn_t old_print_cb;
-	struct bpf_program *prog;
 	int err;
 
-	opts.kernel_log_buf = obj_log_buf;
-	opts.kernel_log_size = log_buf_sz;
-	opts.kernel_log_level = 1;
-
-	skel = test_kfunc_dynptr_param__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "test_kfunc_dynptr_param__open_opts"))
-		goto cleanup;
-
-	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
-	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
-		goto cleanup;
-
-	bpf_program__set_autoload(prog, true);
-
-	bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize());
+	skel = test_kfunc_dynptr_param__open();
+	if (!ASSERT_OK_PTR(skel, "test_kfunc_dynptr_param__open"))
+		return false;
 
 	kfunc_not_supported = false;
 
@@ -72,26 +52,18 @@ static void verify_fail(const char *prog_name, const char *expected_err_msg)
 		fprintf(stderr,
 		  "%s:SKIP:bpf_verify_pkcs7_signature() kfunc not supported\n",
 		  __func__);
-		test__skip();
-		goto cleanup;
-	}
-
-	if (!ASSERT_ERR(err, "unexpected load success"))
-		goto cleanup;
-
-	if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) {
-		fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg);
-		fprintf(stderr, "Verifier output: %s\n", obj_log_buf);
+		test_kfunc_dynptr_param__destroy(skel);
+		return false;
 	}
 
-cleanup:
 	test_kfunc_dynptr_param__destroy(skel);
+
+	return true;
 }
 
 static void verify_success(const char *prog_name, int expected_runtime_err)
 {
 	struct test_kfunc_dynptr_param *skel;
-	libbpf_print_fn_t old_print_cb;
 	struct bpf_program *prog;
 	struct bpf_link *link;
 	__u32 next_id;
@@ -103,21 +75,7 @@ static void verify_success(const char *prog_name, int expected_runtime_err)
 
 	skel->bss->pid = getpid();
 
-	bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize());
-
-	kfunc_not_supported = false;
-
-	old_print_cb = libbpf_set_print(libbpf_print_cb);
 	err = test_kfunc_dynptr_param__load(skel);
-	libbpf_set_print(old_print_cb);
-
-	if (err < 0 && kfunc_not_supported) {
-		fprintf(stderr,
-		  "%s:SKIP:bpf_verify_pkcs7_signature() kfunc not supported\n",
-		  __func__);
-		test__skip();
-		goto cleanup;
-	}
 
 	if (!ASSERT_OK(err, "test_kfunc_dynptr_param__load"))
 		goto cleanup;
@@ -147,15 +105,15 @@ void test_kfunc_dynptr_param(void)
 {
 	int i;
 
+	if (!has_pkcs7_kfunc_support())
+		return;
+
 	for (i = 0; i < ARRAY_SIZE(kfunc_dynptr_tests); i++) {
 		if (!test__start_subtest(kfunc_dynptr_tests[i].prog_name))
 			continue;
 
-		if (kfunc_dynptr_tests[i].expected_verifier_err_msg)
-			verify_fail(kfunc_dynptr_tests[i].prog_name,
-			  kfunc_dynptr_tests[i].expected_verifier_err_msg);
-		else
-			verify_success(kfunc_dynptr_tests[i].prog_name,
-				kfunc_dynptr_tests[i].expected_runtime_err);
+		verify_success(kfunc_dynptr_tests[i].prog_name,
+			kfunc_dynptr_tests[i].expected_runtime_err);
 	}
+	RUN_TESTS(test_kfunc_dynptr_param);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c
index dae68de285b9..3a13e102c149 100644
--- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c
+++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c
@@ -19,8 +19,6 @@
 
 #include "../progs/test_user_ringbuf.h"
 
-static size_t log_buf_sz = 1 << 20; /* 1 MB */
-static char obj_log_buf[1048576];
 static const long c_sample_size = sizeof(struct sample) + BPF_RINGBUF_HDR_SZ;
 static const long c_ringbuf_size = 1 << 12; /* 1 small page */
 static const long c_max_entries = c_ringbuf_size / c_sample_size;
@@ -663,23 +661,6 @@ cleanup:
 	user_ringbuf_success__destroy(skel);
 }
 
-static struct {
-	const char *prog_name;
-	const char *expected_err_msg;
-} failure_tests[] = {
-	/* failure cases */
-	{"user_ringbuf_callback_bad_access1", "negative offset dynptr_ptr ptr"},
-	{"user_ringbuf_callback_bad_access2", "dereference of modified dynptr_ptr ptr"},
-	{"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"},
-	{"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"},
-	{"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"},
-	{"user_ringbuf_callback_discard_dynptr", "cannot release unowned const bpf_dynptr"},
-	{"user_ringbuf_callback_submit_dynptr", "cannot release unowned const bpf_dynptr"},
-	{"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"},
-	{"user_ringbuf_callback_reinit_dynptr_mem", "Dynptr has to be an uninitialized dynptr"},
-	{"user_ringbuf_callback_reinit_dynptr_ringbuf", "Dynptr has to be an uninitialized dynptr"},
-};
-
 #define SUCCESS_TEST(_func) { _func, #_func }
 
 static struct {
@@ -700,42 +681,6 @@ static struct {
 	SUCCESS_TEST(test_user_ringbuf_blocking_reserve),
 };
 
-static void verify_fail(const char *prog_name, const char *expected_err_msg)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	struct bpf_program *prog;
-	struct user_ringbuf_fail *skel;
-	int err;
-
-	opts.kernel_log_buf = obj_log_buf;
-	opts.kernel_log_size = log_buf_sz;
-	opts.kernel_log_level = 1;
-
-	skel = user_ringbuf_fail__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "dynptr_fail__open_opts"))
-		goto cleanup;
-
-	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
-	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
-		goto cleanup;
-
-	bpf_program__set_autoload(prog, true);
-
-	bpf_map__set_max_entries(skel->maps.user_ringbuf, getpagesize());
-
-	err = user_ringbuf_fail__load(skel);
-	if (!ASSERT_ERR(err, "unexpected load success"))
-		goto cleanup;
-
-	if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) {
-		fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg);
-		fprintf(stderr, "Verifier output: %s\n", obj_log_buf);
-	}
-
-cleanup:
-	user_ringbuf_fail__destroy(skel);
-}
-
 void test_user_ringbuf(void)
 {
 	int i;
@@ -747,10 +692,5 @@ void test_user_ringbuf(void)
 		success_tests[i].test_callback();
 	}
 
-	for (i = 0; i < ARRAY_SIZE(failure_tests); i++) {
-		if (!test__start_subtest(failure_tests[i].prog_name))
-			continue;
-
-		verify_fail(failure_tests[i].prog_name, failure_tests[i].expected_err_msg);
-	}
+	RUN_TESTS(user_ringbuf_fail);
 }
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index a1369b5ebcf8..4ad7fe24966d 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -5,6 +5,7 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_helpers.h>
 
+#include "bpf_misc.h"
 #include "cgrp_kfunc_common.h"
 
 char _license[] SEC("license") = "GPL";
@@ -28,6 +29,7 @@ static struct __cgrps_kfunc_map_value *insert_lookup_cgrp(struct cgroup *cgrp)
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired;
@@ -45,6 +47,7 @@ int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("arg#0 pointer type STRUCT cgroup must point")
 int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired, *stack_cgrp = (struct cgroup *)&path;
@@ -57,6 +60,7 @@ int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path)
 }
 
 SEC("kretprobe/cgroup_destroy_locked")
+__failure __msg("reg type unsupported for arg#0 function")
 int BPF_PROG(cgrp_kfunc_acquire_unsafe_kretprobe, struct cgroup *cgrp)
 {
 	struct cgroup *acquired;
@@ -69,6 +73,7 @@ int BPF_PROG(cgrp_kfunc_acquire_unsafe_kretprobe, struct cgroup *cgrp)
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("cgrp_kfunc_acquire_trusted_walked")
 int BPF_PROG(cgrp_kfunc_acquire_trusted_walked, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired;
@@ -80,8 +85,8 @@ int BPF_PROG(cgrp_kfunc_acquire_trusted_walked, struct cgroup *cgrp, const char
 	return 0;
 }
 
-
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int BPF_PROG(cgrp_kfunc_acquire_null, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired;
@@ -96,6 +101,7 @@ int BPF_PROG(cgrp_kfunc_acquire_null, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("Unreleased reference")
 int BPF_PROG(cgrp_kfunc_acquire_unreleased, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired;
@@ -108,6 +114,7 @@ int BPF_PROG(cgrp_kfunc_acquire_unreleased, struct cgroup *cgrp, const char *pat
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("arg#0 expected pointer to map value")
 int BPF_PROG(cgrp_kfunc_get_non_kptr_param, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *kptr;
@@ -123,6 +130,7 @@ int BPF_PROG(cgrp_kfunc_get_non_kptr_param, struct cgroup *cgrp, const char *pat
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("arg#0 expected pointer to map value")
 int BPF_PROG(cgrp_kfunc_get_non_kptr_acquired, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *kptr, *acquired;
@@ -141,6 +149,7 @@ int BPF_PROG(cgrp_kfunc_get_non_kptr_acquired, struct cgroup *cgrp, const char *
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("arg#0 expected pointer to map value")
 int BPF_PROG(cgrp_kfunc_get_null, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *kptr;
@@ -156,6 +165,7 @@ int BPF_PROG(cgrp_kfunc_get_null, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("Unreleased reference")
 int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *kptr;
@@ -175,6 +185,7 @@ int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("Unreleased reference")
 int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *kptr;
@@ -194,6 +205,7 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket")
 int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path)
 {
 	struct __cgrps_kfunc_map_value *v;
@@ -209,6 +221,7 @@ int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("arg#0 pointer type STRUCT cgroup must point")
 int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired = (struct cgroup *)&path;
@@ -220,6 +233,7 @@ int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("arg#0 is ptr_or_null_ expected ptr_ or socket")
 int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path)
 {
 	struct __cgrps_kfunc_map_value local, *v;
@@ -251,6 +265,7 @@ int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
+__failure __msg("release kernel function bpf_cgroup_release expects")
 int BPF_PROG(cgrp_kfunc_release_unacquired, struct cgroup *cgrp, const char *path)
 {
 	/* Cannot release trusted cgroup pointer which was not acquired. */
diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
index f4a8250329b2..2fbef3cc7ad8 100644
--- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
+++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
@@ -10,6 +10,7 @@
 #include <errno.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
 
 extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym;
 extern void bpf_key_put(struct bpf_key *key) __ksym;
@@ -19,6 +20,7 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr,
 
 struct {
 	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, 4096);
 } ringbuf SEC(".maps");
 
 struct {
@@ -33,6 +35,7 @@ int err, pid;
 char _license[] SEC("license") = "GPL";
 
 SEC("?lsm.s/bpf")
+__failure __msg("cannot pass in dynptr at an offset=-8")
 int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size)
 {
 	unsigned long val;
@@ -42,6 +45,7 @@ int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size)
 }
 
 SEC("?lsm.s/bpf")
+__failure __msg("arg#0 expected pointer to stack or dynptr_ptr")
 int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size)
 {
 	unsigned long val;
diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
index f3201dc69a60..03ee946c6bf7 100644
--- a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
+++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
@@ -16,6 +16,7 @@ struct sample {
 
 struct {
 	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
+	__uint(max_entries, 4096);
 } user_ringbuf SEC(".maps");
 
 struct {
@@ -39,7 +40,8 @@ bad_access1(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to read before the pointer.
  */
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("negative offset dynptr_ptr ptr")
 int user_ringbuf_callback_bad_access1(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, bad_access1, NULL, 0);
@@ -61,7 +63,8 @@ bad_access2(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to read past the end of the pointer.
  */
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("dereference of modified dynptr_ptr ptr")
 int user_ringbuf_callback_bad_access2(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, bad_access2, NULL, 0);
@@ -80,7 +83,8 @@ write_forbidden(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to write to that pointer.
  */
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'dynptr_ptr'")
 int user_ringbuf_callback_write_forbidden(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, write_forbidden, NULL, 0);
@@ -99,7 +103,8 @@ null_context_write(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to write to that pointer.
  */
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
 int user_ringbuf_callback_null_context_write(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, null_context_write, NULL, 0);
@@ -120,7 +125,8 @@ null_context_read(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to write to that pointer.
  */
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
 int user_ringbuf_callback_null_context_read(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, null_context_read, NULL, 0);
@@ -139,7 +145,8 @@ try_discard_dynptr(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to read past the end of the pointer.
  */
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("cannot release unowned const bpf_dynptr")
 int user_ringbuf_callback_discard_dynptr(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0);
@@ -158,7 +165,8 @@ try_submit_dynptr(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to read past the end of the pointer.
  */
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("cannot release unowned const bpf_dynptr")
 int user_ringbuf_callback_submit_dynptr(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0);
@@ -175,7 +183,8 @@ invalid_drain_callback_return(struct bpf_dynptr *dynptr, void *context)
 /* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
  * not be able to write to that pointer.
  */
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("At callback return the register R0 has value")
 int user_ringbuf_callback_invalid_return(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, invalid_drain_callback_return, NULL, 0);
@@ -197,14 +206,16 @@ try_reinit_dynptr_ringbuf(struct bpf_dynptr *dynptr, void *context)
 	return 0;
 }
 
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("Dynptr has to be an uninitialized dynptr")
 int user_ringbuf_callback_reinit_dynptr_mem(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, try_reinit_dynptr_mem, NULL, 0);
 	return 0;
 }
 
-SEC("?raw_tp/")
+SEC("?raw_tp")
+__failure __msg("Dynptr has to be an uninitialized dynptr")
 int user_ringbuf_callback_reinit_dynptr_ringbuf(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, try_reinit_dynptr_ringbuf, NULL, 0);
-- 
cgit 


From 50a7cedb150a628b54aa7f8ce1e922a72c773273 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 13 Feb 2023 21:13:32 -0800
Subject: selftests/bpf: Clean up dynptr prog_tests

Clean up prog_tests/dynptr.c by removing the unneeded "expected_err_msg"
in the dynptr_tests struct, which is a remnant from converting the fail
tests cases to use the generic verification tester.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230214051332.4007131-2-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/dynptr.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
index 7faaf6d9e0d4..b99264ec0d9c 100644
--- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
@@ -5,14 +5,10 @@
 #include "dynptr_fail.skel.h"
 #include "dynptr_success.skel.h"
 
-static struct {
-	const char *prog_name;
-	const char *expected_err_msg;
-} dynptr_tests[] = {
-	/* success cases */
-	{"test_read_write", NULL},
-	{"test_data_slice", NULL},
-	{"test_ringbuf", NULL},
+static const char * const success_tests[] = {
+	"test_read_write",
+	"test_data_slice",
+	"test_ringbuf",
 };
 
 static void verify_success(const char *prog_name)
@@ -53,11 +49,11 @@ void test_dynptr(void)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(dynptr_tests); i++) {
-		if (!test__start_subtest(dynptr_tests[i].prog_name))
+	for (i = 0; i < ARRAY_SIZE(success_tests); i++) {
+		if (!test__start_subtest(success_tests[i]))
 			continue;
 
-		verify_success(dynptr_tests[i].prog_name);
+		verify_success(success_tests[i]);
 	}
 
 	RUN_TESTS(dynptr_fail);
-- 
cgit 


From 0eb15a47bf437a268b69ab1a1a45fdf1f609b69f Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Tue, 14 Feb 2023 12:19:39 -0700
Subject: selftests/user_events: add a note about user_events.h dependency

This test depends on <linux/user_events.h> exported in uapi
The following commit removed user_events.h out of uapi:
commit 5cfff569cab8 ("tracing: Move user_events.h temporarily out
   of include/uapi")

This test will not compile until user_events.h is added back to uapi.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/user_events/Makefile | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/user_events/Makefile b/tools/testing/selftests/user_events/Makefile
index 87d54c640068..6b512b86aec3 100644
--- a/tools/testing/selftests/user_events/Makefile
+++ b/tools/testing/selftests/user_events/Makefile
@@ -2,6 +2,14 @@
 CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
 LDLIBS += -lrt -lpthread -lm
 
+# Note:
+# This test depends on <linux/user_events.h> exported in uapi
+# The following commit removed user_events.h out of uapi:
+# commit 5cfff569cab8bf544bab62c911c5d6efd5af5e05
+# tracing: Move user_events.h temporarily out of include/uapi
+# This test will not compile until user_events.h is added
+# back to uapi.
+
 TEST_GEN_PROGS = ftrace_test dyn_test perf_test
 
 TEST_FILES := settings
-- 
cgit 


From 59c3368b2e69eb7da7f271286a0bd80930dfc070 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 14 Feb 2023 11:41:13 -0800
Subject: cxl/port: Export cxl_dvsec_rr_decode() to cxl_port

Call cxl_dvsec_rr_decode() in the beginning of cxl_port_probe() and
preserve the decoded information in a local
'struct cxl_endpoint_dvsec_info'. This info can be passed to various
functions later on in order to support the HDM decoder emulation.
The invocation of cxl_dvsec_rr_decode() in cxl_hdm_decode_init() is
removed and a pointer to the 'struct cxl_endpoint_dvsec_info' is passed
in.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/167640367377.935665.2848747799651019676.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/pci.c        | 18 +++++++-----------
 drivers/cxl/cxl.h             | 14 ++++++++++++++
 drivers/cxl/cxlmem.h          | 12 ------------
 drivers/cxl/cxlpci.h          |  3 ++-
 drivers/cxl/port.c            | 20 +++++++++++++-------
 tools/testing/cxl/Kbuild      |  1 +
 tools/testing/cxl/test/mock.c | 21 +++++++++++++++++++--
 7 files changed, 56 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 52bf6b4d093e..948fa3724a0f 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -333,8 +333,8 @@ static bool __cxl_hdm_decode_init(struct cxl_dev_state *cxlds,
 	return true;
 }
 
-static int cxl_dvsec_rr_decode(struct device *dev, int d,
-			       struct cxl_endpoint_dvsec_info *info)
+int cxl_dvsec_rr_decode(struct device *dev, int d,
+			struct cxl_endpoint_dvsec_info *info)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	int hdm_count, rc, i, ranges = 0;
@@ -434,30 +434,26 @@ static int cxl_dvsec_rr_decode(struct device *dev, int d,
 
 	return 0;
 }
+EXPORT_SYMBOL_NS_GPL(cxl_dvsec_rr_decode, CXL);
 
 /**
  * cxl_hdm_decode_init() - Setup HDM decoding for the endpoint
  * @cxlds: Device state
  * @cxlhdm: Mapped HDM decoder Capability
+ * @info: Cached DVSEC range registers info
  *
  * Try to enable the endpoint's HDM Decoder Capability
  */
-int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm)
+int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
+			struct cxl_endpoint_dvsec_info *info)
 {
-	struct cxl_endpoint_dvsec_info info = { 0 };
 	struct device *dev = cxlds->dev;
-	int d = cxlds->cxl_dvsec;
-	int rc;
-
-	rc = cxl_dvsec_rr_decode(dev, d, &info);
-	if (rc < 0)
-		return rc;
 
 	/*
 	 * If DVSEC ranges are being used instead of HDM decoder registers there
 	 * is no use in trying to manage those.
 	 */
-	if (!__cxl_hdm_decode_init(cxlds, cxlhdm, &info)) {
+	if (!__cxl_hdm_decode_init(cxlds, cxlhdm, info)) {
 		dev_err(dev,
 			"Legacy range registers configuration prevents HDM operation.\n");
 		return -EBUSY;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 1b1cf459ac77..fc01ce96d326 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -630,10 +630,24 @@ int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map);
 int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld);
 int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint);
 
+/**
+ * struct cxl_endpoint_dvsec_info - Cached DVSEC info
+ * @mem_enabled: cached value of mem_enabled in the DVSEC, PCIE_DEVICE
+ * @ranges: Number of active HDM ranges this device uses.
+ * @dvsec_range: cached attributes of the ranges in the DVSEC, PCIE_DEVICE
+ */
+struct cxl_endpoint_dvsec_info {
+	bool mem_enabled;
+	int ranges;
+	struct range dvsec_range[2];
+};
+
 struct cxl_hdm;
 struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port);
 int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm);
 int devm_cxl_add_passthrough_decoder(struct cxl_port *port);
+int cxl_dvsec_rr_decode(struct device *dev, int dvsec,
+			struct cxl_endpoint_dvsec_info *info);
 
 bool is_cxl_region(struct device *dev);
 
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index ab138004f644..187a310780a9 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -181,18 +181,6 @@ static inline int cxl_mbox_cmd_rc2errno(struct cxl_mbox_cmd *mbox_cmd)
  */
 #define CXL_CAPACITY_MULTIPLIER SZ_256M
 
-/**
- * struct cxl_endpoint_dvsec_info - Cached DVSEC info
- * @mem_enabled: cached value of mem_enabled in the DVSEC, PCIE_DEVICE
- * @ranges: Number of active HDM ranges this device uses.
- * @dvsec_range: cached attributes of the ranges in the DVSEC, PCIE_DEVICE
- */
-struct cxl_endpoint_dvsec_info {
-	bool mem_enabled;
-	int ranges;
-	struct range dvsec_range[2];
-};
-
 /**
  * struct cxl_dev_state - The driver device state
  *
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 920909791bb9..430e23345a16 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -64,6 +64,7 @@ enum cxl_regloc_type {
 
 int devm_cxl_port_enumerate_dports(struct cxl_port *port);
 struct cxl_dev_state;
-int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm);
+int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
+			struct cxl_endpoint_dvsec_info *info);
 void read_cdat_data(struct cxl_port *port);
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 5453771bf330..9e09728b20d9 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -32,12 +32,21 @@ static void schedule_detach(void *cxlmd)
 
 static int cxl_port_probe(struct device *dev)
 {
+	struct cxl_endpoint_dvsec_info info = { 0 };
 	struct cxl_port *port = to_cxl_port(dev);
+	bool is_ep = is_cxl_endpoint(port);
+	struct cxl_dev_state *cxlds;
+	struct cxl_memdev *cxlmd;
 	struct cxl_hdm *cxlhdm;
 	int rc;
 
-
-	if (!is_cxl_endpoint(port)) {
+	if (is_ep) {
+		cxlmd = to_cxl_memdev(port->uport);
+		cxlds = cxlmd->cxlds;
+		rc = cxl_dvsec_rr_decode(cxlds->dev, cxlds->cxl_dvsec, &info);
+		if (rc < 0)
+			return rc;
+	} else {
 		rc = devm_cxl_port_enumerate_dports(port);
 		if (rc < 0)
 			return rc;
@@ -49,10 +58,7 @@ static int cxl_port_probe(struct device *dev)
 	if (IS_ERR(cxlhdm))
 		return PTR_ERR(cxlhdm);
 
-	if (is_cxl_endpoint(port)) {
-		struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport);
-		struct cxl_dev_state *cxlds = cxlmd->cxlds;
-
+	if (is_ep) {
 		/* Cache the data early to ensure is_visible() works */
 		read_cdat_data(port);
 
@@ -61,7 +67,7 @@ static int cxl_port_probe(struct device *dev)
 		if (rc)
 			return rc;
 
-		rc = cxl_hdm_decode_init(cxlds, cxlhdm);
+		rc = cxl_hdm_decode_init(cxlds, cxlhdm, &info);
 		if (rc)
 			return rc;
 
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 0805f08af8b3..012149ad5c1c 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -10,6 +10,7 @@ ldflags-y += --wrap=devm_cxl_add_passthrough_decoder
 ldflags-y += --wrap=devm_cxl_enumerate_decoders
 ldflags-y += --wrap=cxl_await_media_ready
 ldflags-y += --wrap=cxl_hdm_decode_init
+ldflags-y += --wrap=cxl_dvsec_rr_decode
 ldflags-y += --wrap=cxl_rcrb_to_component
 
 DRIVERS := ../../../drivers
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index 5dface08e0de..2a13f4722891 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -209,7 +209,8 @@ int __wrap_cxl_await_media_ready(struct cxl_dev_state *cxlds)
 EXPORT_SYMBOL_NS_GPL(__wrap_cxl_await_media_ready, CXL);
 
 int __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds,
-			       struct cxl_hdm *cxlhdm)
+			       struct cxl_hdm *cxlhdm,
+			       struct cxl_endpoint_dvsec_info *info)
 {
 	int rc = 0, index;
 	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
@@ -217,13 +218,29 @@ int __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds,
 	if (ops && ops->is_mock_dev(cxlds->dev))
 		rc = 0;
 	else
-		rc = cxl_hdm_decode_init(cxlds, cxlhdm);
+		rc = cxl_hdm_decode_init(cxlds, cxlhdm, info);
 	put_cxl_mock_ops(index);
 
 	return rc;
 }
 EXPORT_SYMBOL_NS_GPL(__wrap_cxl_hdm_decode_init, CXL);
 
+int __wrap_cxl_dvsec_rr_decode(struct device *dev, int dvsec,
+			       struct cxl_endpoint_dvsec_info *info)
+{
+	int rc = 0, index;
+	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
+
+	if (ops && ops->is_mock_dev(dev))
+		rc = 0;
+	else
+		rc = cxl_dvsec_rr_decode(dev, dvsec, info);
+	put_cxl_mock_ops(index);
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dvsec_rr_decode, CXL);
+
 resource_size_t __wrap_cxl_rcrb_to_component(struct device *dev,
 					     resource_size_t rcrb,
 					     enum cxl_rcrb which)
-- 
cgit 


From b777e9bec960a29374dc486d47784c73b7ac4cef Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 14 Feb 2023 11:41:24 -0800
Subject: cxl/hdm: Emulate HDM decoder from DVSEC range registers

In the case where HDM decoder register block exists but is not programmed
and at the same time the DVSEC range register range is active, populate the
CXL decoder object 'cxl_decoder' with info from DVSEC range registers.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/167640368454.935665.13806415120298330717.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/hdm.c        | 36 +++++++++++++++++++++++++++++++++---
 drivers/cxl/core/pci.c        |  2 +-
 drivers/cxl/cxl.h             |  3 ++-
 drivers/cxl/port.c            |  2 +-
 tools/testing/cxl/test/cxl.c  |  3 ++-
 tools/testing/cxl/test/mock.c |  7 ++++---
 tools/testing/cxl/test/mock.h |  3 ++-
 7 files changed, 45 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index dcc16d7cb8f3..c0f224454447 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -679,9 +679,34 @@ static int cxl_decoder_reset(struct cxl_decoder *cxld)
 	return 0;
 }
 
+static int cxl_setup_hdm_decoder_from_dvsec(struct cxl_port *port,
+					    struct cxl_decoder *cxld, int which,
+					    struct cxl_endpoint_dvsec_info *info)
+{
+	if (!is_cxl_endpoint(port))
+		return -EOPNOTSUPP;
+
+	if (!range_len(&info->dvsec_range[which]))
+		return -ENOENT;
+
+	cxld->target_type = CXL_DECODER_EXPANDER;
+	cxld->commit = NULL;
+	cxld->reset = NULL;
+	cxld->hpa_range = info->dvsec_range[which];
+
+	/*
+	 * Set the emulated decoder as locked pending additional support to
+	 * change the range registers at run time.
+	 */
+	cxld->flags |= CXL_DECODER_F_ENABLE | CXL_DECODER_F_LOCK;
+	port->commit_end = cxld->id;
+
+	return 0;
+}
+
 static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
 			    int *target_map, void __iomem *hdm, int which,
-			    u64 *dpa_base)
+			    u64 *dpa_base, struct cxl_endpoint_dvsec_info *info)
 {
 	struct cxl_endpoint_decoder *cxled = NULL;
 	u64 size, base, skip, dpa_size;
@@ -717,6 +742,9 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
 		.end = base + size - 1,
 	};
 
+	if (cxled && !committed && range_len(&info->dvsec_range[which]))
+		return cxl_setup_hdm_decoder_from_dvsec(port, cxld, which, info);
+
 	/* decoders are enabled if committed */
 	if (committed) {
 		cxld->flags |= CXL_DECODER_F_ENABLE;
@@ -790,7 +818,8 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
  * devm_cxl_enumerate_decoders - add decoder objects per HDM register set
  * @cxlhdm: Structure to populate with HDM capabilities
  */
-int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
+int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
+				struct cxl_endpoint_dvsec_info *info)
 {
 	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
 	struct cxl_port *port = cxlhdm->port;
@@ -842,7 +871,8 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
 			cxld = &cxlsd->cxld;
 		}
 
-		rc = init_hdm_decoder(port, cxld, target_map, hdm, i, &dpa_base);
+		rc = init_hdm_decoder(port, cxld, target_map, hdm, i,
+				      &dpa_base, info);
 		if (rc) {
 			put_device(&cxld->dev);
 			return rc;
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index d0b25481bdce..4df0b35c9b1a 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -426,7 +426,7 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
 	 * Decoder Capability Enable.
 	 */
 	if (info->mem_enabled)
-		return -EBUSY;
+		return 0;
 
 	rc = devm_cxl_enable_hdm(&port->dev, cxlhdm);
 	if (rc)
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index fc01ce96d326..fe9d75989c8a 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -644,7 +644,8 @@ struct cxl_endpoint_dvsec_info {
 
 struct cxl_hdm;
 struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port);
-int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm);
+int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
+				struct cxl_endpoint_dvsec_info *info);
 int devm_cxl_add_passthrough_decoder(struct cxl_port *port);
 int cxl_dvsec_rr_decode(struct device *dev, int dvsec,
 			struct cxl_endpoint_dvsec_info *info);
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 9e09728b20d9..d3a708e32565 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -78,7 +78,7 @@ static int cxl_port_probe(struct device *dev)
 		}
 	}
 
-	rc = devm_cxl_enumerate_decoders(cxlhdm);
+	rc = devm_cxl_enumerate_decoders(cxlhdm, &info);
 	if (rc) {
 		dev_err(dev, "Couldn't enumerate decoders (%d)\n", rc);
 		return rc;
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 30ee680d38ff..3b4916adf29c 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -701,7 +701,8 @@ static int mock_decoder_reset(struct cxl_decoder *cxld)
 	return 0;
 }
 
-static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
+static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
+				       struct cxl_endpoint_dvsec_info *info)
 {
 	struct cxl_port *port = cxlhdm->port;
 	struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index 2a13f4722891..3116c9f07c5d 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -162,16 +162,17 @@ int __wrap_devm_cxl_add_passthrough_decoder(struct cxl_port *port)
 }
 EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_passthrough_decoder, CXL);
 
-int __wrap_devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
+int __wrap_devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
+				       struct cxl_endpoint_dvsec_info *info)
 {
 	int rc, index;
 	struct cxl_port *port = cxlhdm->port;
 	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
 
 	if (ops && ops->is_mock_port(port->uport))
-		rc = ops->devm_cxl_enumerate_decoders(cxlhdm);
+		rc = ops->devm_cxl_enumerate_decoders(cxlhdm, info);
 	else
-		rc = devm_cxl_enumerate_decoders(cxlhdm);
+		rc = devm_cxl_enumerate_decoders(cxlhdm, info);
 	put_cxl_mock_ops(index);
 
 	return rc;
diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h
index ef33f159375e..e377ced5f1b3 100644
--- a/tools/testing/cxl/test/mock.h
+++ b/tools/testing/cxl/test/mock.h
@@ -25,7 +25,8 @@ struct cxl_mock_ops {
 	int (*devm_cxl_port_enumerate_dports)(struct cxl_port *port);
 	struct cxl_hdm *(*devm_cxl_setup_hdm)(struct cxl_port *port);
 	int (*devm_cxl_add_passthrough_decoder)(struct cxl_port *port);
-	int (*devm_cxl_enumerate_decoders)(struct cxl_hdm *hdm);
+	int (*devm_cxl_enumerate_decoders)(
+		struct cxl_hdm *hdm, struct cxl_endpoint_dvsec_info *info);
 };
 
 void register_cxl_mock_ops(struct cxl_mock_ops *ops);
-- 
cgit 


From 4474ce565ee4490fb4e6d8443b617a9d98ae10ff Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 14 Feb 2023 11:41:30 -0800
Subject: cxl/hdm: Create emulated cxl_hdm for devices that do not have HDM
 decoders

CXL rev3 spec 8.1.3

RCDs may not have HDM register blocks. Create a fake HDM with information
from the CXL PCIe DVSEC registers. The decoder count will be set to the
HDM count retrieved from the DVSEC cap register.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/167640368994.935665.15831225724059704620.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/hdm.c        | 58 +++++++++++++++++++++++++++++++++++--------
 drivers/cxl/core/pci.c        |  9 ++++---
 drivers/cxl/cxl.h             |  3 ++-
 drivers/cxl/port.c            |  2 +-
 tools/testing/cxl/test/cxl.c  |  3 ++-
 tools/testing/cxl/test/mock.c |  8 +++---
 tools/testing/cxl/test/mock.h |  3 ++-
 7 files changed, 66 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index c0f224454447..a49543f22dca 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -101,11 +101,34 @@ static int map_hdm_decoder_regs(struct cxl_port *port, void __iomem *crb,
 				      BIT(CXL_CM_CAP_CAP_ID_HDM));
 }
 
+static struct cxl_hdm *devm_cxl_setup_emulated_hdm(struct cxl_port *port,
+						   struct cxl_endpoint_dvsec_info *info)
+{
+	struct device *dev = &port->dev;
+	struct cxl_hdm *cxlhdm;
+
+	if (!info->mem_enabled)
+		return ERR_PTR(-ENODEV);
+
+	cxlhdm = devm_kzalloc(dev, sizeof(*cxlhdm), GFP_KERNEL);
+	if (!cxlhdm)
+		return ERR_PTR(-ENOMEM);
+
+	cxlhdm->port = port;
+	cxlhdm->decoder_count = info->ranges;
+	cxlhdm->target_count = info->ranges;
+	dev_set_drvdata(&port->dev, cxlhdm);
+
+	return cxlhdm;
+}
+
 /**
  * devm_cxl_setup_hdm - map HDM decoder component registers
  * @port: cxl_port to map
+ * @info: cached DVSEC range register info
  */
-struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port)
+struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port,
+				   struct cxl_endpoint_dvsec_info *info)
 {
 	struct device *dev = &port->dev;
 	struct cxl_hdm *cxlhdm;
@@ -119,6 +142,9 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port)
 	cxlhdm->port = port;
 	crb = ioremap(port->component_reg_phys, CXL_COMPONENT_REG_BLOCK_SIZE);
 	if (!crb) {
+		if (info->mem_enabled)
+			return devm_cxl_setup_emulated_hdm(port, info);
+
 		dev_err(dev, "No component registers mapped\n");
 		return ERR_PTR(-ENXIO);
 	}
@@ -814,19 +840,15 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
 	return 0;
 }
 
-/**
- * devm_cxl_enumerate_decoders - add decoder objects per HDM register set
- * @cxlhdm: Structure to populate with HDM capabilities
- */
-int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
-				struct cxl_endpoint_dvsec_info *info)
+static void cxl_settle_decoders(struct cxl_hdm *cxlhdm)
 {
 	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
-	struct cxl_port *port = cxlhdm->port;
-	int i, committed;
-	u64 dpa_base = 0;
+	int committed, i;
 	u32 ctrl;
 
+	if (!hdm)
+		return;
+
 	/*
 	 * Since the register resource was recently claimed via request_region()
 	 * be careful about trusting the "not-committed" status until the commit
@@ -843,6 +865,22 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
 	/* ensure that future checks of committed can be trusted */
 	if (committed != cxlhdm->decoder_count)
 		msleep(20);
+}
+
+/**
+ * devm_cxl_enumerate_decoders - add decoder objects per HDM register set
+ * @cxlhdm: Structure to populate with HDM capabilities
+ * @info: cached DVSEC range register info
+ */
+int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
+				struct cxl_endpoint_dvsec_info *info)
+{
+	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+	struct cxl_port *port = cxlhdm->port;
+	int i;
+	u64 dpa_base = 0;
+
+	cxl_settle_decoders(cxlhdm);
 
 	for (i = 0; i < cxlhdm->decoder_count; i++) {
 		int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 };
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 4df0b35c9b1a..4eb34dee7c95 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -378,16 +378,19 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
 	struct device *dev = cxlds->dev;
 	struct cxl_port *root;
 	int i, rc, allowed;
-	u32 global_ctrl;
+	u32 global_ctrl = 0;
 
-	global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);
+	if (hdm)
+		global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);
 
 	/*
 	 * If the HDM Decoder Capability is already enabled then assume
 	 * that some other agent like platform firmware set it up.
 	 */
-	if (global_ctrl & CXL_HDM_DECODER_ENABLE)
+	if (global_ctrl & CXL_HDM_DECODER_ENABLE || (!hdm && info->mem_enabled))
 		return devm_cxl_enable_mem(&port->dev, cxlds);
+	else if (!hdm)
+		return -ENODEV;
 
 	root = to_cxl_port(port->dev.parent);
 	while (!is_cxl_root(root) && is_cxl_port(root->dev.parent))
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index fe9d75989c8a..f8cbc5275451 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -643,7 +643,8 @@ struct cxl_endpoint_dvsec_info {
 };
 
 struct cxl_hdm;
-struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port);
+struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port,
+				   struct cxl_endpoint_dvsec_info *info);
 int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
 				struct cxl_endpoint_dvsec_info *info);
 int devm_cxl_add_passthrough_decoder(struct cxl_port *port);
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index d3a708e32565..9f9cc268b597 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -54,7 +54,7 @@ static int cxl_port_probe(struct device *dev)
 			return devm_cxl_add_passthrough_decoder(port);
 	}
 
-	cxlhdm = devm_cxl_setup_hdm(port);
+	cxlhdm = devm_cxl_setup_hdm(port, &info);
 	if (IS_ERR(cxlhdm))
 		return PTR_ERR(cxlhdm);
 
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 3b4916adf29c..94197abd44aa 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -618,7 +618,8 @@ static struct acpi_pci_root *mock_acpi_pci_find_root(acpi_handle handle)
 	return &mock_pci_root[host_bridge_index(adev)];
 }
 
-static struct cxl_hdm *mock_cxl_setup_hdm(struct cxl_port *port)
+static struct cxl_hdm *mock_cxl_setup_hdm(struct cxl_port *port,
+					  struct cxl_endpoint_dvsec_info *info)
 {
 	struct cxl_hdm *cxlhdm = devm_kzalloc(&port->dev, sizeof(*cxlhdm), GFP_KERNEL);
 
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index 3116c9f07c5d..c4e53f22e421 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -131,16 +131,18 @@ __wrap_nvdimm_bus_register(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register);
 
-struct cxl_hdm *__wrap_devm_cxl_setup_hdm(struct cxl_port *port)
+struct cxl_hdm *__wrap_devm_cxl_setup_hdm(struct cxl_port *port,
+					  struct cxl_endpoint_dvsec_info *info)
+
 {
 	int index;
 	struct cxl_hdm *cxlhdm;
 	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
 
 	if (ops && ops->is_mock_port(port->uport))
-		cxlhdm = ops->devm_cxl_setup_hdm(port);
+		cxlhdm = ops->devm_cxl_setup_hdm(port, info);
 	else
-		cxlhdm = devm_cxl_setup_hdm(port);
+		cxlhdm = devm_cxl_setup_hdm(port, info);
 	put_cxl_mock_ops(index);
 
 	return cxlhdm;
diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h
index e377ced5f1b3..bef8817b01f2 100644
--- a/tools/testing/cxl/test/mock.h
+++ b/tools/testing/cxl/test/mock.h
@@ -23,7 +23,8 @@ struct cxl_mock_ops {
 	bool (*is_mock_port)(struct device *dev);
 	bool (*is_mock_dev)(struct device *dev);
 	int (*devm_cxl_port_enumerate_dports)(struct cxl_port *port);
-	struct cxl_hdm *(*devm_cxl_setup_hdm)(struct cxl_port *port);
+	struct cxl_hdm *(*devm_cxl_setup_hdm)(
+		struct cxl_port *port, struct cxl_endpoint_dvsec_info *info);
 	int (*devm_cxl_add_passthrough_decoder)(struct cxl_port *port);
 	int (*devm_cxl_enumerate_decoders)(
 		struct cxl_hdm *hdm, struct cxl_endpoint_dvsec_info *info);
-- 
cgit 


From 4f11410bf6da87defe8fd59b0413f0d9f71744da Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 27 Jan 2023 08:57:42 -0500
Subject: selftests/powerpc: Fix incorrect kernel headers search path

Use $(KHDR_INCLUDES) as lookup path for kernel headers. This prevents
building against kernel headers from the build environment in scenarios
where kernel headers are installed into a specific output directory
(O=...).

Cc: stable@vger.kernel.org # v5.18+
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20230127135755.79929-22-mathieu.desnoyers@efficios.com
---
 tools/testing/selftests/powerpc/ptrace/Makefile   | 2 +-
 tools/testing/selftests/powerpc/security/Makefile | 2 +-
 tools/testing/selftests/powerpc/syscalls/Makefile | 2 +-
 tools/testing/selftests/powerpc/tm/Makefile       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile
index 2f02cb54224d..cbeeaeae8837 100644
--- a/tools/testing/selftests/powerpc/ptrace/Makefile
+++ b/tools/testing/selftests/powerpc/ptrace/Makefile
@@ -33,7 +33,7 @@ TESTS_64 := $(patsubst %,$(OUTPUT)/%,$(TESTS_64))
 $(TESTS_64): CFLAGS += -m64
 $(TM_TESTS): CFLAGS += -I../tm -mhtm
 
-CFLAGS += -I../../../../../usr/include -fno-pie
+CFLAGS += $(KHDR_INCLUDES) -fno-pie
 
 $(OUTPUT)/ptrace-gpr: ptrace-gpr.S
 $(OUTPUT)/ptrace-pkey $(OUTPUT)/core-pkey: LDLIBS += -pthread
diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile
index 7488315fd847..e0d979ab0204 100644
--- a/tools/testing/selftests/powerpc/security/Makefile
+++ b/tools/testing/selftests/powerpc/security/Makefile
@@ -5,7 +5,7 @@ TEST_PROGS := mitigation-patching.sh
 
 top_srcdir = ../../../../..
 
-CFLAGS += -I../../../../../usr/include
+CFLAGS += $(KHDR_INCLUDES)
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile
index 54ff5cfffc63..ee1740ddfb0c 100644
--- a/tools/testing/selftests/powerpc/syscalls/Makefile
+++ b/tools/testing/selftests/powerpc/syscalls/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 TEST_GEN_PROGS := ipc_unmuxed rtas_filter
 
-CFLAGS += -I../../../../../usr/include
+CFLAGS += $(KHDR_INCLUDES)
 
 top_srcdir = ../../../../..
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index 5881e97c73c1..3876805c2f31 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -17,7 +17,7 @@ $(TEST_GEN_PROGS): ../harness.c ../utils.c
 CFLAGS += -mhtm
 
 $(OUTPUT)/tm-syscall: tm-syscall-asm.S
-$(OUTPUT)/tm-syscall: CFLAGS += -I../../../../../usr/include
+$(OUTPUT)/tm-syscall: CFLAGS += $(KHDR_INCLUDES)
 $(OUTPUT)/tm-tmspr: CFLAGS += -pthread
 $(OUTPUT)/tm-vmx-unavail: CFLAGS += -pthread -m64
 $(OUTPUT)/tm-resched-dscr: ../pmu/lib.c
-- 
cgit 


From a29cbd76aaf63f5493e962aa2fbaadcdc4615143 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Thu, 2 Feb 2023 11:28:12 +0100
Subject: tools/lib/thermal: Fix thermal_sampling_exit()

thermal_sampling_init() suscribes to THERMAL_GENL_SAMPLING_GROUP_NAME group
so thermal_sampling_exit() should unsubscribe from the same group.

Fixes: 47c4b0de080a ("tools/lib/thermal: Add a thermal library")
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20230202102812.453357-1-vincent.guittot@linaro.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 tools/lib/thermal/sampling.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/thermal/sampling.c b/tools/lib/thermal/sampling.c
index ee818f4e9654..70577423a9f0 100644
--- a/tools/lib/thermal/sampling.c
+++ b/tools/lib/thermal/sampling.c
@@ -54,7 +54,7 @@ int thermal_sampling_fd(struct thermal_handler *th)
 thermal_error_t thermal_sampling_exit(struct thermal_handler *th)
 {
 	if (nl_unsubscribe_thermal(th->sk_sampling, th->cb_sampling,
-				   THERMAL_GENL_EVENT_GROUP_NAME))
+				   THERMAL_GENL_SAMPLING_GROUP_NAME))
 		return THERMAL_ERROR;
 
 	nl_thermal_disconnect(th->sk_sampling, th->cb_sampling);
-- 
cgit 


From ef1ab1657fda1fd38a082b5652a2bbc6d510ffff Mon Sep 17 00:00:00 2001
From: Vibhav Pant <vibhavp@gmail.com>
Date: Sat, 11 Feb 2023 13:49:35 +0530
Subject: tools/lib/thermal: Fix include path for libnl3 in pkg-config file.

Fixes pkg-config returning malformed CFLAGS for libthermal.

Signed-off-by: Vibhav Pant <vibhavp@gmail.com>
Link: https://lore.kernel.org/r/20230211081935.62690-1-vibhavp@gmail.com
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 tools/lib/thermal/libthermal.pc.template | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/thermal/libthermal.pc.template b/tools/lib/thermal/libthermal.pc.template
index 6f3769731b59..ac24d0ab17f5 100644
--- a/tools/lib/thermal/libthermal.pc.template
+++ b/tools/lib/thermal/libthermal.pc.template
@@ -9,4 +9,4 @@ Description: thermal library
 Requires: libnl-3.0 libnl-genl-3.0
 Version: @VERSION@
 Libs: -L${libdir} -lnl-genl-3 -lnl-3
-Cflags: -I${includedir} -I{include}/libnl3
+Cflags: -I${includedir} -I${include}/libnl3
-- 
cgit 


From 524581d1216411a807d34181cb880d991fcb4b96 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Wed, 15 Feb 2023 19:01:07 +0800
Subject: selftests/bpf: Fix build error for LoongArch

There exists build error when make -C tools/testing/selftests/bpf/
on LoongArch:

  BINARY   test_verifier
In file included from test_verifier.c:27:
tools/include/uapi/linux/bpf_perf_event.h:14:28: error: field 'regs' has incomplete type
   14 |         bpf_user_pt_regs_t regs;
      |                            ^~~~
make: *** [Makefile:577: tools/testing/selftests/bpf/test_verifier] Error 1
make: Leaving directory 'tools/testing/selftests/bpf'

Add missing uapi header for LoongArch to use the following definition:
typedef struct user_pt_regs bpf_user_pt_regs_t;

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Link: https://lore.kernel.org/r/1676458867-22052-1-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/asm/bpf_perf_event.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/include/uapi/asm/bpf_perf_event.h b/tools/include/uapi/asm/bpf_perf_event.h
index d7dfeab0d71a..ff52668abf8c 100644
--- a/tools/include/uapi/asm/bpf_perf_event.h
+++ b/tools/include/uapi/asm/bpf_perf_event.h
@@ -6,6 +6,8 @@
 #include "../../arch/s390/include/uapi/asm/bpf_perf_event.h"
 #elif defined(__riscv)
 #include "../../arch/riscv/include/uapi/asm/bpf_perf_event.h"
+#elif defined(__loongarch__)
+#include "../../arch/loongarch/include/uapi/asm/bpf_perf_event.h"
 #else
 #include <uapi/asm-generic/bpf_perf_event.h>
 #endif
-- 
cgit 


From 5e53e5c7edc6d69b8cb48b3b370cfe531e4b4132 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn@rivosinc.com>
Date: Tue, 14 Feb 2023 17:12:53 +0100
Subject: selftests/bpf: Cross-compile bpftool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the BPF selftests are cross-compiled, only the a host version of
bpftool is built. This version of bpftool is used on the host-side to
generate various intermediates, e.g., skeletons.

The test runners are also using bpftool, so the Makefile will symlink
bpftool from the selftest/bpf root, where the test runners will look
the tool:

  | $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/bootstrap/bpftool \
  |    $(OUTPUT)/$(if $2,$2/)bpftool

There are two problems for cross-compilation builds:

 1. There is no native (cross-compilation target) of bpftool
 2. The bootstrap/bpftool is never cross-compiled (by design)

Make sure that a native/cross-compiled version of bpftool is built,
and if CROSS_COMPILE is set, symlink the native/non-bootstrap version.

Acked-by: Quentin Monnet <quentin@isovalent.com>
Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20230214161253.183458-1-bjorn@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f7771592a920..521933bc15fe 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -156,8 +156,9 @@ $(notdir $(TEST_GEN_PROGS)						\
 	 $(TEST_CUSTOM_PROGS)): %: $(OUTPUT)/% ;
 
 # sort removes libbpf duplicates when not cross-building
-MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf	       \
-	       $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids      \
+MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf	\
+	       $(BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/bpftool		\
+	       $(HOST_BUILD_DIR)/resolve_btfids				\
 	       $(RUNQSLOWER_OUTPUT) $(INCLUDE_DIR))
 $(MAKE_DIRS):
 	$(call msg,MKDIR,,$@)
@@ -207,6 +208,14 @@ $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_tes
 	$(Q)cp bpf_testmod/bpf_testmod.ko $@
 
 DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool
+ifneq ($(CROSS_COMPILE),)
+CROSS_BPFTOOL := $(SCRATCH_DIR)/sbin/bpftool
+TRUNNER_BPFTOOL := $(CROSS_BPFTOOL)
+USE_BOOTSTRAP := ""
+else
+TRUNNER_BPFTOOL := $(DEFAULT_BPFTOOL)
+USE_BOOTSTRAP := "bootstrap/"
+endif
 
 $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
 	$(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower	       \
@@ -218,7 +227,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
 		    EXTRA_LDFLAGS='$(SAN_LDFLAGS)' &&			       \
 		    cp $(RUNQSLOWER_OUTPUT)runqslower $@
 
-TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL)
+TEST_GEN_PROGS_EXTENDED += $(TRUNNER_BPFTOOL)
 
 $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(BPFOBJ)
 
@@ -256,6 +265,18 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
 		    LIBBPF_DESTDIR=$(HOST_SCRATCH_DIR)/			       \
 		    prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install-bin
 
+ifneq ($(CROSS_COMPILE),)
+$(CROSS_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)	\
+		    $(BPFOBJ) | $(BUILD_DIR)/bpftool
+	$(Q)$(MAKE) $(submake_extras)  -C $(BPFTOOLDIR)				\
+		    ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE)			\
+		    EXTRA_CFLAGS='-g -O0'					\
+		    OUTPUT=$(BUILD_DIR)/bpftool/				\
+		    LIBBPF_OUTPUT=$(BUILD_DIR)/libbpf/				\
+		    LIBBPF_DESTDIR=$(SCRATCH_DIR)/				\
+		    prefix= DESTDIR=$(SCRATCH_DIR)/ install-bin
+endif
+
 all: docs
 
 docs:
@@ -521,11 +542,12 @@ endif
 $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS)			\
 			     $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ)		\
 			     $(RESOLVE_BTFIDS)				\
+			     $(TRUNNER_BPFTOOL)				\
 			     | $(TRUNNER_BINARY)-extras
 	$$(call msg,BINARY,,$$@)
 	$(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@
 	$(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@
-	$(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/bootstrap/bpftool \
+	$(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \
 		   $(OUTPUT)/$(if $2,$2/)bpftool
 
 endef
-- 
cgit 


From 62d101d5f422cde39b269f7eb4cbbe2f1e26f9d4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 14 Feb 2023 15:50:51 -0800
Subject: selftests/bpf: Fix map_kptr test.

The compiler is optimizing out majority of unref_ptr read/writes, so the test
wasn't testing much. For example, one could delete '__kptr' tag from
'struct prog_test_ref_kfunc __kptr *unref_ptr;' and the test would still "pass".

Convert it to volatile stores. Confirmed by comparing bpf asm before/after.

Fixes: 2cbc469a6fc3 ("selftests/bpf: Add C tests for kptr")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230214235051.22938-1-alexei.starovoitov@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/progs/map_kptr.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c
index eb8217803493..228ec45365a8 100644
--- a/tools/testing/selftests/bpf/progs/map_kptr.c
+++ b/tools/testing/selftests/bpf/progs/map_kptr.c
@@ -62,21 +62,23 @@ extern struct prog_test_ref_kfunc *
 bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **p, int a, int b) __ksym;
 extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym;
 
+#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val))
+
 static void test_kptr_unref(struct map_value *v)
 {
 	struct prog_test_ref_kfunc *p;
 
 	p = v->unref_ptr;
 	/* store untrusted_ptr_or_null_ */
-	v->unref_ptr = p;
+	WRITE_ONCE(v->unref_ptr, p);
 	if (!p)
 		return;
 	if (p->a + p->b > 100)
 		return;
 	/* store untrusted_ptr_ */
-	v->unref_ptr = p;
+	WRITE_ONCE(v->unref_ptr, p);
 	/* store NULL */
-	v->unref_ptr = NULL;
+	WRITE_ONCE(v->unref_ptr, NULL);
 }
 
 static void test_kptr_ref(struct map_value *v)
@@ -85,7 +87,7 @@ static void test_kptr_ref(struct map_value *v)
 
 	p = v->ref_ptr;
 	/* store ptr_or_null_ */
-	v->unref_ptr = p;
+	WRITE_ONCE(v->unref_ptr, p);
 	if (!p)
 		return;
 	if (p->a + p->b > 100)
@@ -99,7 +101,7 @@ static void test_kptr_ref(struct map_value *v)
 		return;
 	}
 	/* store ptr_ */
-	v->unref_ptr = p;
+	WRITE_ONCE(v->unref_ptr, p);
 	bpf_kfunc_call_test_release(p);
 
 	p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
-- 
cgit 


From ecdf985d7615356b78241fdb159c091830ed0380 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Wed, 15 Feb 2023 01:20:27 +0200
Subject: bpf: track immediate values written to stack by BPF_ST instruction

For aligned stack writes using BPF_ST instruction track stored values
in a same way BPF_STX is handled, e.g. make sure that the following
commands produce similar verifier knowledge:

  fp[-8] = 42;             r1 = 42;
                       fp[-8] = r1;

This covers two cases:
 - non-null values written to stack are stored as spill of fake
   registers;
 - null values written to stack are stored as STACK_ZERO marks.

Previously both cases above used STACK_MISC marks instead.

Some verifier test cases relied on the old logic to obtain STACK_MISC
marks for some stack values. These test cases are updated in the same
commit to avoid failures during bisect.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20230214232030.1502829-2-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                              |  18 +++-
 .../bpf/verifier/bounds_mix_sign_unsign.c          | 110 ++++++++++++---------
 2 files changed, 80 insertions(+), 48 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 21e08c111702..c28afae60874 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3473,6 +3473,11 @@ static void save_register_state(struct bpf_func_state *state,
 		scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
 }
 
+static bool is_bpf_st_mem(struct bpf_insn *insn)
+{
+	return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
+}
+
 /* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
@@ -3484,8 +3489,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 {
 	struct bpf_func_state *cur; /* state of the current function */
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
-	u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
+	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
 	struct bpf_reg_state *reg = NULL;
+	u32 dst_reg = insn->dst_reg;
 
 	err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
 	if (err)
@@ -3538,6 +3544,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 				return err;
 		}
 		save_register_state(state, spi, reg, size);
+	} else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
+		   insn->imm != 0 && env->bpf_capable) {
+		struct bpf_reg_state fake_reg = {};
+
+		__mark_reg_known(&fake_reg, (u32)insn->imm);
+		fake_reg.type = SCALAR_VALUE;
+		save_register_state(state, spi, &fake_reg, size);
 	} else if (reg && is_spillable_regtype(reg->type)) {
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
@@ -3572,7 +3585,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 			state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
 		/* when we zero initialize stack slots mark them as such */
-		if (reg && register_is_null(reg)) {
+		if ((reg && register_is_null(reg)) ||
+		    (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
 			/* backtracking doesn't work for STACK_ZERO yet. */
 			err = mark_chain_precision(env, value_regno);
 			if (err)
diff --git a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c
index c2aa6f26738b..bf82b923c5fe 100644
--- a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c
+++ b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c
@@ -1,13 +1,14 @@
 {
 	"bounds checks mixing signed and unsigned, positive bounds",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, 2),
 	BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 3),
@@ -17,20 +18,21 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, -1),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 3),
@@ -40,20 +42,21 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 2",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, -1),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 5),
@@ -65,20 +68,21 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 3",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, -1),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 4),
@@ -89,20 +93,21 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 4",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, 1),
 	BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
@@ -112,19 +117,20 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.result = ACCEPT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 5",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, -1),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 5),
@@ -135,17 +141,20 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 6",
 	.insns = {
+	BPF_MOV64_REG(BPF_REG_9, BPF_REG_1),
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
 	BPF_MOV64_IMM(BPF_REG_2, 0),
 	BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -512),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_6, -1),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_6, 5),
@@ -163,13 +172,14 @@
 {
 	"bounds checks mixing signed and unsigned, variant 7",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, 1024 * 1024 * 1024),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 3),
@@ -179,19 +189,20 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.result = ACCEPT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 8",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, -1),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2),
@@ -203,20 +214,21 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 9",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_LD_IMM64(BPF_REG_2, -9223372036854775808ULL),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2),
@@ -228,19 +240,20 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.result = ACCEPT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 10",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, 0),
 	BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2),
@@ -252,20 +265,21 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 11",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, -1),
 	BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2),
@@ -278,20 +292,21 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 12",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, -6),
 	BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2),
@@ -303,20 +318,21 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 13",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, 2),
 	BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2),
@@ -331,7 +347,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
@@ -340,13 +356,14 @@
 	.insns = {
 	BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
 		    offsetof(struct __sk_buff, mark)),
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, -1),
 	BPF_MOV64_IMM(BPF_REG_8, 2),
@@ -360,20 +377,21 @@
 	BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, -3),
 	BPF_JMP_IMM(BPF_JA, 0, 0, -7),
 	},
-	.fixup_map_hash_8b = { 4 },
+	.fixup_map_hash_8b = { 6 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
 {
 	"bounds checks mixing signed and unsigned, variant 15",
 	.insns = {
+	BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_LD_MAP_FD(BPF_REG_1, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
-	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, -8),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
 	BPF_MOV64_IMM(BPF_REG_2, -6),
 	BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2),
@@ -387,7 +405,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.fixup_map_hash_8b = { 3 },
+	.fixup_map_hash_8b = { 5 },
 	.errstr = "unbounded min value",
 	.result = REJECT,
 },
-- 
cgit 


From 1a24af65bb5fed673a9377e794ee3cf416fec64d Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Wed, 15 Feb 2023 01:20:28 +0200
Subject: selftests/bpf: check if verifier tracks constants spilled by
 BPF_ST_MEM

Check that verifier tracks the value of 'imm' spilled to stack by
BPF_ST_MEM instruction. Cover the following cases:
- write of non-zero constant to stack;
- write of a zero constant to stack.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20230214232030.1502829-3-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/verifier/bpf_st_mem.c | 37 +++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/verifier/bpf_st_mem.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c
new file mode 100644
index 000000000000..932903f9e585
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c
@@ -0,0 +1,37 @@
+{
+	"BPF_ST_MEM stack imm non-zero",
+	.insns = {
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 42),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, -42),
+	/* if value is tracked correctly R0 is zero */
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	/* Use prog type that requires return value in range [0, 1] */
+	.prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+	.expected_attach_type = BPF_SK_LOOKUP,
+	.runs = -1,
+},
+{
+	"BPF_ST_MEM stack imm zero",
+	.insns = {
+	/* mark stack 0000 0000 */
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	/* read and sum a few bytes */
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_10, -8),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+	BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_10, -4),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+	BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_10, -1),
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+	/* if value is tracked correctly R0 is zero */
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	/* Use prog type that requires return value in range [0, 1] */
+	.prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+	.expected_attach_type = BPF_SK_LOOKUP,
+	.runs = -1,
+},
-- 
cgit 


From 2a33c5a25ef4fb574e6744fe7636956b124ad78f Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Wed, 15 Feb 2023 01:20:30 +0200
Subject: selftests/bpf: check if BPF_ST with variable offset preserves
 STACK_ZERO

A test case to verify that variable offset BPF_ST instruction
preserves STACK_ZERO marks when writes zeros, e.g. in the following
situation:

  *(u64*)(r10 - 8) = 0   ; STACK_ZERO marks for fp[-8]
  r0 = random(-7, -1)    ; some random number in range of [-7, -1]
  r0 += r10              ; r0 is now variable offset pointer to stack
  *(u8*)(r0) = 0         ; BPF_ST writing zero, STACK_ZERO mark for
                         ; fp[-8] should be preserved.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20230214232030.1502829-5-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/verifier/bpf_st_mem.c | 30 +++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c
index 932903f9e585..3af2501082b2 100644
--- a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c
+++ b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c
@@ -35,3 +35,33 @@
 	.expected_attach_type = BPF_SK_LOOKUP,
 	.runs = -1,
 },
+{
+	"BPF_ST_MEM stack imm zero, variable offset",
+	.insns = {
+	/* set fp[-16], fp[-24] to zeros */
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -24, 0),
+	/* r0 = random value in range [-32, -15] */
+	BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+	BPF_JMP_IMM(BPF_JLE, BPF_REG_0, 16, 2),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 32),
+	/* fp[r0] = 0, make a variable offset write of zero,
+	 *             this should preserve zero marks on stack.
+	 */
+	BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_10),
+	BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0),
+	/* r0 = fp[-20], if variable offset write was tracked correctly
+	 *               r0 would be a known zero.
+	 */
+	BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_10, -20),
+	/* Would fail return code verification if r0 range is not tracked correctly. */
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	/* Use prog type that requires return value in range [0, 1] */
+	.prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+	.expected_attach_type = BPF_SK_LOOKUP,
+	.runs = -1,
+},
-- 
cgit 


From f88da2d46cc9a19b0c233285339659cae36c5d9a Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Wed, 15 Feb 2023 16:21:32 +0800
Subject: selftests/bpf: Add test case for element reuse in htab map

The reinitialization of spin-lock in map value after immediate reuse may
corrupt lookup with BPF_F_LOCK flag and result in hard lock-up, so add
one test case to demonstrate the problem.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230215082132.3856544-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/htab_reuse.c  | 101 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/htab_reuse.c     |  19 ++++
 2 files changed, 120 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/htab_reuse.c
 create mode 100644 tools/testing/selftests/bpf/progs/htab_reuse.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/htab_reuse.c b/tools/testing/selftests/bpf/prog_tests/htab_reuse.c
new file mode 100644
index 000000000000..a742dd994d60
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/htab_reuse.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdbool.h>
+#include <test_progs.h>
+#include "htab_reuse.skel.h"
+
+struct htab_op_ctx {
+	int fd;
+	int loop;
+	bool stop;
+};
+
+struct htab_val {
+	unsigned int lock;
+	unsigned int data;
+};
+
+static void *htab_lookup_fn(void *arg)
+{
+	struct htab_op_ctx *ctx = arg;
+	int i = 0;
+
+	while (i++ < ctx->loop && !ctx->stop) {
+		struct htab_val value;
+		unsigned int key;
+
+		/* Use BPF_F_LOCK to use spin-lock in map value. */
+		key = 7;
+		bpf_map_lookup_elem_flags(ctx->fd, &key, &value, BPF_F_LOCK);
+	}
+
+	return NULL;
+}
+
+static void *htab_update_fn(void *arg)
+{
+	struct htab_op_ctx *ctx = arg;
+	int i = 0;
+
+	while (i++ < ctx->loop && !ctx->stop) {
+		struct htab_val value;
+		unsigned int key;
+
+		key = 7;
+		value.lock = 0;
+		value.data = key;
+		bpf_map_update_elem(ctx->fd, &key, &value, BPF_F_LOCK);
+		bpf_map_delete_elem(ctx->fd, &key);
+
+		key = 24;
+		value.lock = 0;
+		value.data = key;
+		bpf_map_update_elem(ctx->fd, &key, &value, BPF_F_LOCK);
+		bpf_map_delete_elem(ctx->fd, &key);
+	}
+
+	return NULL;
+}
+
+void test_htab_reuse(void)
+{
+	unsigned int i, wr_nr = 1, rd_nr = 4;
+	pthread_t tids[wr_nr + rd_nr];
+	struct htab_reuse *skel;
+	struct htab_op_ctx ctx;
+	int err;
+
+	skel = htab_reuse__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "htab_reuse__open_and_load"))
+		return;
+
+	ctx.fd = bpf_map__fd(skel->maps.htab);
+	ctx.loop = 500;
+	ctx.stop = false;
+
+	memset(tids, 0, sizeof(tids));
+	for (i = 0; i < wr_nr; i++) {
+		err = pthread_create(&tids[i], NULL, htab_update_fn, &ctx);
+		if (!ASSERT_OK(err, "pthread_create")) {
+			ctx.stop = true;
+			goto reap;
+		}
+	}
+	for (i = 0; i < rd_nr; i++) {
+		err = pthread_create(&tids[i + wr_nr], NULL, htab_lookup_fn, &ctx);
+		if (!ASSERT_OK(err, "pthread_create")) {
+			ctx.stop = true;
+			goto reap;
+		}
+	}
+
+reap:
+	for (i = 0; i < wr_nr + rd_nr; i++) {
+		if (!tids[i])
+			continue;
+		pthread_join(tids[i], NULL);
+	}
+	htab_reuse__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/htab_reuse.c b/tools/testing/selftests/bpf/progs/htab_reuse.c
new file mode 100644
index 000000000000..7f7368cb3095
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/htab_reuse.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct htab_val {
+	struct bpf_spin_lock lock;
+	unsigned int data;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 64);
+	__type(key, unsigned int);
+	__type(value, struct htab_val);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+} htab SEC(".maps");
-- 
cgit 


From 4db98ab445c58bd26c303ef7a10ccd8f049acc22 Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Mon, 13 Feb 2023 09:15:13 +0000
Subject: selftest/bpf/benchs: Fix a typo in bpf_hashmap_full_update

To call the bpf_hashmap_full_update benchmark, one should say:

    bench bpf-hashmap-ful-update

The patch adds a missing 'l' to the benchmark name.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230213091519.1202813-2-aspsk@isovalent.com
---
 tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c      | 2 +-
 tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
index cec51e0ff4b8..44706acf632a 100644
--- a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
@@ -85,7 +85,7 @@ void hashmap_report_final(struct bench_res res[], int res_cnt)
 }
 
 const struct bench bench_bpf_hashmap_full_update = {
-	.name = "bpf-hashmap-ful-update",
+	.name = "bpf-hashmap-full-update",
 	.validate = validate,
 	.setup = setup,
 	.producer_thread = producer,
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh b/tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh
index 1e2de838f9fa..cd2efd3fdef3 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_bpf_hashmap_full_update.sh
@@ -6,6 +6,6 @@ source ./benchs/run_common.sh
 set -eufo pipefail
 
 nr_threads=`expr $(cat /proc/cpuinfo | grep "processor"| wc -l) - 1`
-summary=$($RUN_BENCH -p $nr_threads bpf-hashmap-ful-update)
+summary=$($RUN_BENCH -p $nr_threads bpf-hashmap-full-update)
 printf "$summary"
 printf "\n"
-- 
cgit 


From 2f1c59637fb17dbb2a725c3bd48e4d9d3809df89 Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Mon, 13 Feb 2023 09:15:14 +0000
Subject: selftest/bpf/benchs: Make a function static in
 bpf_hashmap_full_update

The hashmap_report_final callback function defined in the
benchs/bench_bpf_hashmap_full_update.c file should be static.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230213091519.1202813-3-aspsk@isovalent.com
---
 tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
index 44706acf632a..67f76415a362 100644
--- a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
@@ -68,7 +68,7 @@ static void setup(void)
 		bpf_map_update_elem(map_fd, &i, &i, BPF_ANY);
 }
 
-void hashmap_report_final(struct bench_res res[], int res_cnt)
+static void hashmap_report_final(struct bench_res res[], int res_cnt)
 {
 	unsigned int nr_cpus = bpf_num_possible_cpus();
 	int i;
-- 
cgit 


From 22ff7aeaa9e3d0533df613da3500db1ecf452253 Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Mon, 13 Feb 2023 09:15:15 +0000
Subject: selftest/bpf/benchs: Enhance argp parsing

To parse command line the bench utility uses the argp_parse() function. This
function takes as an argument a parent 'struct argp' structure which defines
common command line options and an array of children 'struct argp' structures
which defines additional command line options for particular benchmarks. This
implementation doesn't allow benchmarks to share option names, e.g., if two
benchmarks want to use, say, the --option option, then only one of them will
succeed (the first one encountered in the array).  This will be convenient if
same option names could be used in different benchmarks (with the same
semantics, e.g., --nr_loops=N).

Fix this by calling the argp_parse() function twice. The first call is the same
as it was before, with all children argps, and helps to find the benchmark name
and to print a combined help message if anything is wrong.  Given the name, we
can call the argp_parse the second time, but now the children array points only
to a correct benchmark thus always calling the correct parsers. (If there's no
a specific list of arguments, then only one call to argp_parse will be done.)

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230213091519.1202813-4-aspsk@isovalent.com
---
 tools/testing/selftests/bpf/bench.c                | 44 +++++++++++++++++-----
 tools/testing/selftests/bpf/bench.h                |  1 +
 .../selftests/bpf/benchs/bench_bloom_filter_map.c  |  5 +++
 .../testing/selftests/bpf/benchs/bench_bpf_loop.c  |  1 +
 .../selftests/bpf/benchs/bench_local_storage.c     |  3 ++
 .../benchs/bench_local_storage_rcu_tasks_trace.c   |  1 +
 .../testing/selftests/bpf/benchs/bench_ringbufs.c  |  4 ++
 tools/testing/selftests/bpf/benchs/bench_strncmp.c |  2 +
 8 files changed, 51 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index c1f20a147462..12c3b3ab84aa 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -287,10 +287,11 @@ static const struct argp_child bench_parsers[] = {
 	{},
 };
 
+/* Make pos_args global, so that we can run argp_parse twice, if necessary */
+static int pos_args;
+
 static error_t parse_arg(int key, char *arg, struct argp_state *state)
 {
-	static int pos_args;
-
 	switch (key) {
 	case 'v':
 		env.verbose = true;
@@ -359,7 +360,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	return 0;
 }
 
-static void parse_cmdline_args(int argc, char **argv)
+static void parse_cmdline_args_init(int argc, char **argv)
 {
 	static const struct argp argp = {
 		.options = opts,
@@ -369,9 +370,25 @@ static void parse_cmdline_args(int argc, char **argv)
 	};
 	if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
 		exit(1);
-	if (!env.list && !env.bench_name) {
-		argp_help(&argp, stderr, ARGP_HELP_DOC, "bench");
-		exit(1);
+}
+
+static void parse_cmdline_args_final(int argc, char **argv)
+{
+	struct argp_child bench_parsers[2] = {};
+	const struct argp argp = {
+		.options = opts,
+		.parser = parse_arg,
+		.doc = argp_program_doc,
+		.children = bench_parsers,
+	};
+
+	/* Parse arguments the second time with the correct set of parsers */
+	if (bench->argp) {
+		bench_parsers[0].argp = bench->argp;
+		bench_parsers[0].header = bench->name;
+		pos_args = 0;
+		if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
+			exit(1);
 	}
 }
 
@@ -531,15 +548,14 @@ static const struct bench *benchs[] = {
 	&bench_local_storage_tasks_trace,
 };
 
-static void setup_benchmark()
+static void find_benchmark(void)
 {
-	int i, err;
+	int i;
 
 	if (!env.bench_name) {
 		fprintf(stderr, "benchmark name is not specified\n");
 		exit(1);
 	}
-
 	for (i = 0; i < ARRAY_SIZE(benchs); i++) {
 		if (strcmp(benchs[i]->name, env.bench_name) == 0) {
 			bench = benchs[i];
@@ -550,6 +566,11 @@ static void setup_benchmark()
 		fprintf(stderr, "benchmark '%s' not found\n", env.bench_name);
 		exit(1);
 	}
+}
+
+static void setup_benchmark(void)
+{
+	int i, err;
 
 	printf("Setting up benchmark '%s'...\n", bench->name);
 
@@ -621,7 +642,7 @@ static void collect_measurements(long delta_ns) {
 
 int main(int argc, char **argv)
 {
-	parse_cmdline_args(argc, argv);
+	parse_cmdline_args_init(argc, argv);
 
 	if (env.list) {
 		int i;
@@ -633,6 +654,9 @@ int main(int argc, char **argv)
 		return 0;
 	}
 
+	find_benchmark();
+	parse_cmdline_args_final(argc, argv);
+
 	setup_benchmark();
 
 	setup_timer();
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
index d748255877e2..3c8afa0131a3 100644
--- a/tools/testing/selftests/bpf/bench.h
+++ b/tools/testing/selftests/bpf/bench.h
@@ -47,6 +47,7 @@ struct bench_res {
 
 struct bench {
 	const char *name;
+	const struct argp *argp;
 	void (*validate)(void);
 	void (*setup)(void);
 	void *(*producer_thread)(void *ctx);
diff --git a/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c b/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c
index 5bcb8a8cdeb2..7c8ccc108313 100644
--- a/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c
+++ b/tools/testing/selftests/bpf/benchs/bench_bloom_filter_map.c
@@ -428,6 +428,7 @@ static void *consumer(void *input)
 
 const struct bench bench_bloom_lookup = {
 	.name = "bloom-lookup",
+	.argp = &bench_bloom_map_argp,
 	.validate = validate,
 	.setup = bloom_lookup_setup,
 	.producer_thread = producer,
@@ -439,6 +440,7 @@ const struct bench bench_bloom_lookup = {
 
 const struct bench bench_bloom_update = {
 	.name = "bloom-update",
+	.argp = &bench_bloom_map_argp,
 	.validate = validate,
 	.setup = bloom_update_setup,
 	.producer_thread = producer,
@@ -450,6 +452,7 @@ const struct bench bench_bloom_update = {
 
 const struct bench bench_bloom_false_positive = {
 	.name = "bloom-false-positive",
+	.argp = &bench_bloom_map_argp,
 	.validate = validate,
 	.setup = false_positive_setup,
 	.producer_thread = producer,
@@ -461,6 +464,7 @@ const struct bench bench_bloom_false_positive = {
 
 const struct bench bench_hashmap_without_bloom = {
 	.name = "hashmap-without-bloom",
+	.argp = &bench_bloom_map_argp,
 	.validate = validate,
 	.setup = hashmap_no_bloom_setup,
 	.producer_thread = producer,
@@ -472,6 +476,7 @@ const struct bench bench_hashmap_without_bloom = {
 
 const struct bench bench_hashmap_with_bloom = {
 	.name = "hashmap-with-bloom",
+	.argp = &bench_bloom_map_argp,
 	.validate = validate,
 	.setup = hashmap_with_bloom_setup,
 	.producer_thread = producer,
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c b/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c
index d0a6572bfab6..d8a0394e10b1 100644
--- a/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c
@@ -95,6 +95,7 @@ static void setup(void)
 
 const struct bench bench_bpf_loop = {
 	.name = "bpf-loop",
+	.argp = &bench_bpf_loop_argp,
 	.validate = validate,
 	.setup = setup,
 	.producer_thread = producer,
diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage.c b/tools/testing/selftests/bpf/benchs/bench_local_storage.c
index 5a378c84e81f..d4b2817306d4 100644
--- a/tools/testing/selftests/bpf/benchs/bench_local_storage.c
+++ b/tools/testing/selftests/bpf/benchs/bench_local_storage.c
@@ -255,6 +255,7 @@ static void *producer(void *input)
  */
 const struct bench bench_local_storage_cache_seq_get = {
 	.name = "local-storage-cache-seq-get",
+	.argp = &bench_local_storage_argp,
 	.validate = validate,
 	.setup = local_storage_cache_get_setup,
 	.producer_thread = producer,
@@ -266,6 +267,7 @@ const struct bench bench_local_storage_cache_seq_get = {
 
 const struct bench bench_local_storage_cache_interleaved_get = {
 	.name = "local-storage-cache-int-get",
+	.argp = &bench_local_storage_argp,
 	.validate = validate,
 	.setup = local_storage_cache_get_interleaved_setup,
 	.producer_thread = producer,
@@ -277,6 +279,7 @@ const struct bench bench_local_storage_cache_interleaved_get = {
 
 const struct bench bench_local_storage_cache_hashmap_control = {
 	.name = "local-storage-cache-hashmap-control",
+	.argp = &bench_local_storage_argp,
 	.validate = validate,
 	.setup = hashmap_setup,
 	.producer_thread = producer,
diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c
index 43f109d93130..4f9401ecf09c 100644
--- a/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c
+++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c
@@ -271,6 +271,7 @@ static void report_final(struct bench_res res[], int res_cnt)
  */
 const struct bench bench_local_storage_tasks_trace = {
 	.name = "local-storage-tasks-trace",
+	.argp = &bench_local_storage_rcu_tasks_trace_argp,
 	.validate = validate,
 	.setup = local_storage_tasks_trace_setup,
 	.producer_thread = producer,
diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
index c2554f9695ff..fc91fdac4faa 100644
--- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
+++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
@@ -518,6 +518,7 @@ static void *perfbuf_custom_consumer(void *input)
 
 const struct bench bench_rb_libbpf = {
 	.name = "rb-libbpf",
+	.argp = &bench_ringbufs_argp,
 	.validate = bufs_validate,
 	.setup = ringbuf_libbpf_setup,
 	.producer_thread = bufs_sample_producer,
@@ -529,6 +530,7 @@ const struct bench bench_rb_libbpf = {
 
 const struct bench bench_rb_custom = {
 	.name = "rb-custom",
+	.argp = &bench_ringbufs_argp,
 	.validate = bufs_validate,
 	.setup = ringbuf_custom_setup,
 	.producer_thread = bufs_sample_producer,
@@ -540,6 +542,7 @@ const struct bench bench_rb_custom = {
 
 const struct bench bench_pb_libbpf = {
 	.name = "pb-libbpf",
+	.argp = &bench_ringbufs_argp,
 	.validate = bufs_validate,
 	.setup = perfbuf_libbpf_setup,
 	.producer_thread = bufs_sample_producer,
@@ -551,6 +554,7 @@ const struct bench bench_pb_libbpf = {
 
 const struct bench bench_pb_custom = {
 	.name = "pb-custom",
+	.argp = &bench_ringbufs_argp,
 	.validate = bufs_validate,
 	.setup = perfbuf_libbpf_setup,
 	.producer_thread = bufs_sample_producer,
diff --git a/tools/testing/selftests/bpf/benchs/bench_strncmp.c b/tools/testing/selftests/bpf/benchs/bench_strncmp.c
index 494b591c0289..d3fad2ba6916 100644
--- a/tools/testing/selftests/bpf/benchs/bench_strncmp.c
+++ b/tools/testing/selftests/bpf/benchs/bench_strncmp.c
@@ -140,6 +140,7 @@ static void strncmp_measure(struct bench_res *res)
 
 const struct bench bench_strncmp_no_helper = {
 	.name = "strncmp-no-helper",
+	.argp = &bench_strncmp_argp,
 	.validate = strncmp_validate,
 	.setup = strncmp_no_helper_setup,
 	.producer_thread = strncmp_producer,
@@ -151,6 +152,7 @@ const struct bench bench_strncmp_no_helper = {
 
 const struct bench bench_strncmp_helper = {
 	.name = "strncmp-helper",
+	.argp = &bench_strncmp_argp,
 	.validate = strncmp_validate,
 	.setup = strncmp_helper_setup,
 	.producer_thread = strncmp_producer,
-- 
cgit 


From 9644546260eac49348b2c0694b01bdf72c627194 Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Mon, 13 Feb 2023 09:15:16 +0000
Subject: selftest/bpf/benchs: Remove an unused header

The benchs/bench_bpf_hashmap_full_update.c doesn't set a custom argp,
so it shouldn't include the <argp.h> header.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230213091519.1202813-5-aspsk@isovalent.com
---
 tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
index 67f76415a362..75abe8137b6c 100644
--- a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2022 Bytedance */
 
-#include <argp.h>
 #include "bench.h"
 #include "bpf_hashmap_full_update_bench.skel.h"
 #include "bpf_util.h"
-- 
cgit 


From 90c22503cd8910c54a8cd4bfe5bb6873d9ba8e0b Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Mon, 13 Feb 2023 09:15:17 +0000
Subject: selftest/bpf/benchs: Make quiet option common

The "local-storage-tasks-trace" benchmark has a `--quiet` option. Move it to
the list of common options, so that the main code and other benchmarks can use
(new) env.quiet variable. Patch the run_bench_local_storage_rcu_tasks_trace.sh
helper script accordingly.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230213091519.1202813-6-aspsk@isovalent.com
---
 tools/testing/selftests/bpf/bench.c                       |  5 +++++
 tools/testing/selftests/bpf/bench.h                       |  1 +
 .../bpf/benchs/bench_local_storage_rcu_tasks_trace.c      | 15 +--------------
 .../bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh |  2 +-
 4 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 12c3b3ab84aa..23c24c346130 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -16,6 +16,7 @@ struct env env = {
 	.warmup_sec = 1,
 	.duration_sec = 5,
 	.affinity = false,
+	.quiet = false,
 	.consumer_cnt = 1,
 	.producer_cnt = 1,
 };
@@ -262,6 +263,7 @@ static const struct argp_option opts[] = {
 	{ "consumers", 'c', "NUM", 0, "Number of consumer threads"},
 	{ "verbose", 'v', NULL, 0, "Verbose debug output"},
 	{ "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"},
+	{ "quiet", 'q', NULL, 0, "Be more quiet"},
 	{ "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0,
 	  "Set of CPUs for producer threads; implies --affinity"},
 	{ "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0,
@@ -330,6 +332,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	case 'a':
 		env.affinity = true;
 		break;
+	case 'q':
+		env.quiet = true;
+		break;
 	case ARG_PROD_AFFINITY_SET:
 		env.affinity = true;
 		if (parse_num_list(arg, &env.prod_cpus.cpus,
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
index 3c8afa0131a3..402729c6a3ac 100644
--- a/tools/testing/selftests/bpf/bench.h
+++ b/tools/testing/selftests/bpf/bench.h
@@ -24,6 +24,7 @@ struct env {
 	bool verbose;
 	bool list;
 	bool affinity;
+	bool quiet;
 	int consumer_cnt;
 	int producer_cnt;
 	struct cpu_set prod_cpus;
diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c
index 4f9401ecf09c..d5eb5587f2aa 100644
--- a/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c
+++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c
@@ -12,17 +12,14 @@
 static struct {
 	__u32 nr_procs;
 	__u32 kthread_pid;
-	bool quiet;
 } args = {
 	.nr_procs = 1000,
 	.kthread_pid = 0,
-	.quiet = false,
 };
 
 enum {
 	ARG_NR_PROCS = 7000,
 	ARG_KTHREAD_PID = 7001,
-	ARG_QUIET = 7002,
 };
 
 static const struct argp_option opts[] = {
@@ -30,8 +27,6 @@ static const struct argp_option opts[] = {
 		"Set number of user processes to spin up"},
 	{ "kthread_pid", ARG_KTHREAD_PID, "PID", 0,
 		"Pid of rcu_tasks_trace kthread for ticks tracking"},
-	{ "quiet", ARG_QUIET, "{0,1}", 0,
-		"If true, don't report progress"},
 	{},
 };
 
@@ -56,14 +51,6 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 		}
 		args.kthread_pid = ret;
 		break;
-	case ARG_QUIET:
-		ret = strtol(arg, NULL, 10);
-		if (ret < 0 || ret > 1) {
-			fprintf(stderr, "invalid quiet %ld\n", ret);
-			argp_usage(state);
-		}
-		args.quiet = ret;
-		break;
 break;
 	default:
 		return ARGP_ERR_UNKNOWN;
@@ -230,7 +217,7 @@ static void report_progress(int iter, struct bench_res *res, long delta_ns)
 		exit(1);
 	}
 
-	if (args.quiet)
+	if (env.quiet)
 		return;
 
 	printf("Iter %d\t avg tasks_trace grace period latency\t%lf ns\n",
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh b/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh
index 5dac1f02892c..3e8a969f2096 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh
@@ -8,4 +8,4 @@ if [ -z $kthread_pid ]; then
 	exit 1
 fi
 
-./bench --nr_procs 15000 --kthread_pid $kthread_pid -d 600 --quiet 1 local-storage-tasks-trace
+./bench --nr_procs 15000 --kthread_pid $kthread_pid -d 600 --quiet local-storage-tasks-trace
-- 
cgit 


From a237dda05e9101404a634ac53ee65c8f8c8fce58 Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Mon, 13 Feb 2023 09:15:18 +0000
Subject: selftest/bpf/benchs: Print less if the quiet option is set

The bench utility will print

    Setting up benchmark '<bench-name>'...
    Benchmark '<bench-name>' started.

on startup to stdout. Suppress this output if --quiet option if given. This
makes it simpler to parse benchmark output by a script.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230213091519.1202813-7-aspsk@isovalent.com
---
 tools/testing/selftests/bpf/bench.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 23c24c346130..767ca679ee67 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -577,7 +577,8 @@ static void setup_benchmark(void)
 {
 	int i, err;
 
-	printf("Setting up benchmark '%s'...\n", bench->name);
+	if (!env.quiet)
+		printf("Setting up benchmark '%s'...\n", bench->name);
 
 	state.producers = calloc(env.producer_cnt, sizeof(*state.producers));
 	state.consumers = calloc(env.consumer_cnt, sizeof(*state.consumers));
@@ -623,7 +624,8 @@ static void setup_benchmark(void)
 					    next_cpu(&env.prod_cpus));
 	}
 
-	printf("Benchmark '%s' started.\n", bench->name);
+	if (!env.quiet)
+		printf("Benchmark '%s' started.\n", bench->name);
 }
 
 static pthread_mutex_t bench_done_mtx = PTHREAD_MUTEX_INITIALIZER;
-- 
cgit 


From f371f2dc53d107af25171f29c852a3908ee0afb6 Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Mon, 13 Feb 2023 09:15:19 +0000
Subject: selftest/bpf/benchs: Add benchmark for hashmap lookups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new benchmark which measures hashmap lookup operations speed.  A user can
control the following parameters of the benchmark:

    * key_size (max 1024): the key size to use
    * max_entries: the hashmap max entries
    * nr_entries: the number of entries to insert/lookup
    * nr_loops: the number of loops for the benchmark
    * map_flags The hashmap flags passed to BPF_MAP_CREATE

The BPF program performing the benchmarks calls two nested bpf_loop:

    bpf_loop(nr_loops/nr_entries)
            bpf_loop(nr_entries)
                     bpf_map_lookup()

So the nr_loops determines the number of actual map lookups. All lookups are
successful.

Example (the output is generated on a AMD Ryzen 9 3950X machine):

    for nr_entries in `seq 4096 4096 65536`; do echo -n "$((nr_entries*100/65536))% full: "; sudo ./bench -d2 -a bpf-hashmap-lookup --key_size=4 --nr_entries=$nr_entries --max_entries=65536 --nr_loops=1000000 --map_flags=0x40 | grep cpu; done
    6% full: cpu01: lookup 50.739M ± 0.018M events/sec (approximated from 32 samples of ~19ms)
    12% full: cpu01: lookup 47.751M ± 0.015M events/sec (approximated from 32 samples of ~20ms)
    18% full: cpu01: lookup 45.153M ± 0.013M events/sec (approximated from 32 samples of ~22ms)
    25% full: cpu01: lookup 43.826M ± 0.014M events/sec (approximated from 32 samples of ~22ms)
    31% full: cpu01: lookup 41.971M ± 0.012M events/sec (approximated from 32 samples of ~23ms)
    37% full: cpu01: lookup 41.034M ± 0.015M events/sec (approximated from 32 samples of ~24ms)
    43% full: cpu01: lookup 39.946M ± 0.012M events/sec (approximated from 32 samples of ~25ms)
    50% full: cpu01: lookup 38.256M ± 0.014M events/sec (approximated from 32 samples of ~26ms)
    56% full: cpu01: lookup 36.580M ± 0.018M events/sec (approximated from 32 samples of ~27ms)
    62% full: cpu01: lookup 36.252M ± 0.012M events/sec (approximated from 32 samples of ~27ms)
    68% full: cpu01: lookup 35.200M ± 0.012M events/sec (approximated from 32 samples of ~28ms)
    75% full: cpu01: lookup 34.061M ± 0.009M events/sec (approximated from 32 samples of ~29ms)
    81% full: cpu01: lookup 34.374M ± 0.010M events/sec (approximated from 32 samples of ~29ms)
    87% full: cpu01: lookup 33.244M ± 0.011M events/sec (approximated from 32 samples of ~30ms)
    93% full: cpu01: lookup 32.182M ± 0.013M events/sec (approximated from 32 samples of ~31ms)
    100% full: cpu01: lookup 31.497M ± 0.016M events/sec (approximated from 32 samples of ~31ms)

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230213091519.1202813-8-aspsk@isovalent.com
---
 tools/testing/selftests/bpf/Makefile               |   5 +-
 tools/testing/selftests/bpf/bench.c                |   4 +
 .../bpf/benchs/bench_bpf_hashmap_lookup.c          | 283 +++++++++++++++++++++
 .../selftests/bpf/progs/bpf_hashmap_lookup.c       |  63 +++++
 4 files changed, 354 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_hashmap_lookup.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 521933bc15fe..b677dcd0b77a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -638,6 +638,7 @@ $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
 $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h
 $(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h
 $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tasks_trace_bench.skel.h
+$(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -652,7 +653,9 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_strncmp.o \
 		 $(OUTPUT)/bench_bpf_hashmap_full_update.o \
 		 $(OUTPUT)/bench_local_storage.o \
-		 $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o
+		 $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o \
+		 $(OUTPUT)/bench_bpf_hashmap_lookup.o \
+		 #
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
 
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 767ca679ee67..0b2a53bb8460 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -277,6 +277,7 @@ extern struct argp bench_bpf_loop_argp;
 extern struct argp bench_local_storage_argp;
 extern struct argp bench_local_storage_rcu_tasks_trace_argp;
 extern struct argp bench_strncmp_argp;
+extern struct argp bench_hashmap_lookup_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -286,6 +287,7 @@ static const struct argp_child bench_parsers[] = {
 	{ &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 },
 	{ &bench_local_storage_rcu_tasks_trace_argp, 0,
 		"local_storage RCU Tasks Trace slowdown benchmark", 0 },
+	{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
 	{},
 };
 
@@ -512,6 +514,7 @@ extern const struct bench bench_local_storage_cache_seq_get;
 extern const struct bench bench_local_storage_cache_interleaved_get;
 extern const struct bench bench_local_storage_cache_hashmap_control;
 extern const struct bench bench_local_storage_tasks_trace;
+extern const struct bench bench_bpf_hashmap_lookup;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -551,6 +554,7 @@ static const struct bench *benchs[] = {
 	&bench_local_storage_cache_interleaved_get,
 	&bench_local_storage_cache_hashmap_control,
 	&bench_local_storage_tasks_trace,
+	&bench_bpf_hashmap_lookup,
 };
 
 static void find_benchmark(void)
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c
new file mode 100644
index 000000000000..8dbb02f75cff
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <sys/random.h>
+#include <argp.h>
+#include "bench.h"
+#include "bpf_hashmap_lookup.skel.h"
+#include "bpf_util.h"
+
+/* BPF triggering benchmarks */
+static struct ctx {
+	struct bpf_hashmap_lookup *skel;
+} ctx;
+
+/* only available to kernel, so define it here */
+#define BPF_MAX_LOOPS (1<<23)
+
+#define MAX_KEY_SIZE 1024 /* the size of the key map */
+
+static struct {
+	__u32 key_size;
+	__u32 map_flags;
+	__u32 max_entries;
+	__u32 nr_entries;
+	__u32 nr_loops;
+} args = {
+	.key_size = 4,
+	.map_flags = 0,
+	.max_entries = 1000,
+	.nr_entries = 500,
+	.nr_loops = 1000000,
+};
+
+enum {
+	ARG_KEY_SIZE = 8001,
+	ARG_MAP_FLAGS,
+	ARG_MAX_ENTRIES,
+	ARG_NR_ENTRIES,
+	ARG_NR_LOOPS,
+};
+
+static const struct argp_option opts[] = {
+	{ "key_size", ARG_KEY_SIZE, "KEY_SIZE", 0,
+	  "The hashmap key size (max 1024)"},
+	{ "map_flags", ARG_MAP_FLAGS, "MAP_FLAGS", 0,
+	  "The hashmap flags passed to BPF_MAP_CREATE"},
+	{ "max_entries", ARG_MAX_ENTRIES, "MAX_ENTRIES", 0,
+	  "The hashmap max entries"},
+	{ "nr_entries", ARG_NR_ENTRIES, "NR_ENTRIES", 0,
+	  "The number of entries to insert/lookup"},
+	{ "nr_loops", ARG_NR_LOOPS, "NR_LOOPS", 0,
+	  "The number of loops for the benchmark"},
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	long ret;
+
+	switch (key) {
+	case ARG_KEY_SIZE:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > MAX_KEY_SIZE) {
+			fprintf(stderr, "invalid key_size");
+			argp_usage(state);
+		}
+		args.key_size = ret;
+		break;
+	case ARG_MAP_FLAGS:
+		ret = strtol(arg, NULL, 0);
+		if (ret < 0 || ret > UINT_MAX) {
+			fprintf(stderr, "invalid map_flags");
+			argp_usage(state);
+		}
+		args.map_flags = ret;
+		break;
+	case ARG_MAX_ENTRIES:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > UINT_MAX) {
+			fprintf(stderr, "invalid max_entries");
+			argp_usage(state);
+		}
+		args.max_entries = ret;
+		break;
+	case ARG_NR_ENTRIES:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > UINT_MAX) {
+			fprintf(stderr, "invalid nr_entries");
+			argp_usage(state);
+		}
+		args.nr_entries = ret;
+		break;
+	case ARG_NR_LOOPS:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > BPF_MAX_LOOPS) {
+			fprintf(stderr, "invalid nr_loops: %ld (min=1 max=%u)\n",
+				ret, BPF_MAX_LOOPS);
+			argp_usage(state);
+		}
+		args.nr_loops = ret;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_hashmap_lookup_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+static void validate(void)
+{
+	if (env.consumer_cnt != 1) {
+		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+
+	if (args.nr_entries > args.max_entries) {
+		fprintf(stderr, "args.nr_entries is too big! (max %u, got %u)\n",
+			args.max_entries, args.nr_entries);
+		exit(1);
+	}
+}
+
+static void *producer(void *input)
+{
+	while (true) {
+		/* trigger the bpf program */
+		syscall(__NR_getpgid);
+	}
+	return NULL;
+}
+
+static void *consumer(void *input)
+{
+	return NULL;
+}
+
+static void measure(struct bench_res *res)
+{
+}
+
+static inline void patch_key(u32 i, u32 *key)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	*key = i + 1;
+#else
+	*key = __builtin_bswap32(i + 1);
+#endif
+	/* the rest of key is random */
+}
+
+static void setup(void)
+{
+	struct bpf_link *link;
+	int map_fd;
+	int ret;
+	int i;
+
+	setup_libbpf();
+
+	ctx.skel = bpf_hashmap_lookup__open();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	bpf_map__set_max_entries(ctx.skel->maps.hash_map_bench, args.max_entries);
+	bpf_map__set_key_size(ctx.skel->maps.hash_map_bench, args.key_size);
+	bpf_map__set_value_size(ctx.skel->maps.hash_map_bench, 8);
+	bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, args.map_flags);
+
+	ctx.skel->bss->nr_entries = args.nr_entries;
+	ctx.skel->bss->nr_loops = args.nr_loops / args.nr_entries;
+
+	if (args.key_size > 4) {
+		for (i = 1; i < args.key_size/4; i++)
+			ctx.skel->bss->key[i] = 2654435761 * i;
+	}
+
+	ret = bpf_hashmap_lookup__load(ctx.skel);
+	if (ret) {
+		bpf_hashmap_lookup__destroy(ctx.skel);
+		fprintf(stderr, "failed to load map: %s", strerror(-ret));
+		exit(1);
+	}
+
+	/* fill in the hash_map */
+	map_fd = bpf_map__fd(ctx.skel->maps.hash_map_bench);
+	for (u64 i = 0; i < args.nr_entries; i++) {
+		patch_key(i, ctx.skel->bss->key);
+		bpf_map_update_elem(map_fd, ctx.skel->bss->key, &i, BPF_ANY);
+	}
+
+	link = bpf_program__attach(ctx.skel->progs.benchmark);
+	if (!link) {
+		fprintf(stderr, "failed to attach program!\n");
+		exit(1);
+	}
+}
+
+static inline double events_from_time(u64 time)
+{
+	if (time)
+		return args.nr_loops * 1000000000llu / time / 1000000.0L;
+
+	return 0;
+}
+
+static int compute_events(u64 *times, double *events_mean, double *events_stddev, u64 *mean_time)
+{
+	int i, n = 0;
+
+	*events_mean = 0;
+	*events_stddev = 0;
+	*mean_time = 0;
+
+	for (i = 0; i < 32; i++) {
+		if (!times[i])
+			break;
+		*mean_time += times[i];
+		*events_mean += events_from_time(times[i]);
+		n += 1;
+	}
+	if (!n)
+		return 0;
+
+	*mean_time /= n;
+	*events_mean /= n;
+
+	if (n > 1) {
+		for (i = 0; i < n; i++) {
+			double events_i = *events_mean - events_from_time(times[i]);
+			*events_stddev += events_i * events_i / (n - 1);
+		}
+		*events_stddev = sqrt(*events_stddev);
+	}
+
+	return n;
+}
+
+static void hashmap_report_final(struct bench_res res[], int res_cnt)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	double events_mean, events_stddev;
+	u64 mean_time;
+	int i, n;
+
+	for (i = 0; i < nr_cpus; i++) {
+		n = compute_events(ctx.skel->bss->percpu_times[i], &events_mean,
+				   &events_stddev, &mean_time);
+		if (n == 0)
+			continue;
+
+		if (env.quiet) {
+			/* we expect only one cpu to be present */
+			if (env.affinity)
+				printf("%.3lf\n", events_mean);
+			else
+				printf("cpu%02d %.3lf\n", i, events_mean);
+		} else {
+			printf("cpu%02d: lookup %.3lfM ± %.3lfM events/sec"
+			       " (approximated from %d samples of ~%lums)\n",
+			       i, events_mean, 2*events_stddev,
+			       n, mean_time / 1000000);
+		}
+	}
+}
+
+const struct bench bench_bpf_hashmap_lookup = {
+	.name = "bpf-hashmap-lookup",
+	.argp = &bench_hashmap_lookup_argp,
+	.validate = validate,
+	.setup = setup,
+	.producer_thread = producer,
+	.consumer_thread = consumer,
+	.measure = measure,
+	.report_progress = NULL,
+	.report_final = hashmap_report_final,
+};
diff --git a/tools/testing/selftests/bpf/progs/bpf_hashmap_lookup.c b/tools/testing/selftests/bpf/progs/bpf_hashmap_lookup.c
new file mode 100644
index 000000000000..1eb74ddca414
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_hashmap_lookup.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+} hash_map_bench SEC(".maps");
+
+/* The number of slots to store times */
+#define NR_SLOTS 32
+#define NR_CPUS 256
+#define CPU_MASK (NR_CPUS-1)
+
+/* Configured by userspace */
+u64 nr_entries;
+u64 nr_loops;
+u32 __attribute__((__aligned__(8))) key[NR_CPUS];
+
+/* Filled by us */
+u64 __attribute__((__aligned__(256))) percpu_times_index[NR_CPUS];
+u64 __attribute__((__aligned__(256))) percpu_times[NR_CPUS][NR_SLOTS];
+
+static inline void patch_key(u32 i)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	key[0] = i + 1;
+#else
+	key[0] = __builtin_bswap32(i + 1);
+#endif
+	/* the rest of key is random and is configured by userspace */
+}
+
+static int lookup_callback(__u32 index, u32 *unused)
+{
+	patch_key(index);
+	return bpf_map_lookup_elem(&hash_map_bench, key) ? 0 : 1;
+}
+
+static int loop_lookup_callback(__u32 index, u32 *unused)
+{
+	return bpf_loop(nr_entries, lookup_callback, NULL, 0) ? 0 : 1;
+}
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int benchmark(void *ctx)
+{
+	u32 cpu = bpf_get_smp_processor_id();
+	u32 times_index;
+	u64 start_time;
+
+	times_index = percpu_times_index[cpu & CPU_MASK] % NR_SLOTS;
+	start_time = bpf_ktime_get_ns();
+	bpf_loop(nr_loops, loop_lookup_callback, NULL, 0);
+	percpu_times[cpu & CPU_MASK][times_index] = bpf_ktime_get_ns() - start_time;
+	percpu_times_index[cpu & CPU_MASK] += 1;
+	return 0;
+}
-- 
cgit 


From 6c20822fada1b8adb77fa450d03a0d449686a4a9 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 15 Feb 2023 19:54:40 +0100
Subject: bpf, test_run: fix &xdp_frame misplacement for LIVE_FRAMES
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

&xdp_buff and &xdp_frame are bound in a way that

xdp_buff->data_hard_start == xdp_frame

It's always the case and e.g. xdp_convert_buff_to_frame() relies on
this.
IOW, the following:

	for (u32 i = 0; i < 0xdead; i++) {
		xdpf = xdp_convert_buff_to_frame(&xdp);
		xdp_convert_frame_to_buff(xdpf, &xdp);
	}

shouldn't ever modify @xdpf's contents or the pointer itself.
However, "live packet" code wrongly treats &xdp_frame as part of its
context placed *before* the data_hard_start. With such flow,
data_hard_start is sizeof(*xdpf) off to the right and no longer points
to the XDP frame.

Instead of replacing `sizeof(ctx)` with `offsetof(ctx, xdpf)` in several
places and praying that there are no more miscalcs left somewhere in the
code, unionize ::frm with ::data in a flex array, so that both starts
pointing to the actual data_hard_start and the XDP frame actually starts
being a part of it, i.e. a part of the headroom, not the context.
A nice side effect is that the maximum frame size for this mode gets
increased by 40 bytes, as xdp_buff::frame_sz includes everything from
data_hard_start (-> includes xdpf already) to the end of XDP/skb shared
info.
Also update %MAX_PKT_SIZE accordingly in the selftests code. Leave it
hardcoded for 64 bit && 4k pages, it can be made more flexible later on.

Minor: align `&head->data` with how `head->frm` is assigned for
consistency.
Minor #2: rename 'frm' to 'frame' in &xdp_page_head while at it for
clarity.

(was found while testing XDP traffic generator on ice, which calls
 xdp_convert_frame_to_buff() for each XDP frame)

Fixes: b530e9e1063e ("bpf: Add "live packet" mode for XDP in BPF_PROG_RUN")
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20230215185440.4126672-1-aleksander.lobakin@intel.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 net/bpf/test_run.c                                 | 29 +++++++++++++++++-----
 .../selftests/bpf/prog_tests/xdp_do_redirect.c     |  7 +++---
 2 files changed, 27 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index b766a84c8536..1ab396a2b87f 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -97,8 +97,11 @@ reset:
 struct xdp_page_head {
 	struct xdp_buff orig_ctx;
 	struct xdp_buff ctx;
-	struct xdp_frame frm;
-	u8 data[];
+	union {
+		/* ::data_hard_start starts here */
+		DECLARE_FLEX_ARRAY(struct xdp_frame, frame);
+		DECLARE_FLEX_ARRAY(u8, data);
+	};
 };
 
 struct xdp_test_data {
@@ -116,6 +119,20 @@ struct xdp_test_data {
 #define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head))
 #define TEST_XDP_MAX_BATCH 256
 
+#if BITS_PER_LONG == 64 && PAGE_SIZE == SZ_4K
+/* tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c:%MAX_PKT_SIZE
+ * must be updated accordingly when any of these changes, otherwise BPF
+ * selftests will fail.
+ */
+#ifdef __s390x__
+#define TEST_MAX_PKT_SIZE 3216
+#else
+#define TEST_MAX_PKT_SIZE 3408
+#endif
+static_assert(SKB_WITH_OVERHEAD(TEST_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM) ==
+	      TEST_MAX_PKT_SIZE);
+#endif
+
 static void xdp_test_run_init_page(struct page *page, void *arg)
 {
 	struct xdp_page_head *head = phys_to_virt(page_to_phys(page));
@@ -132,8 +149,8 @@ static void xdp_test_run_init_page(struct page *page, void *arg)
 	headroom -= meta_len;
 
 	new_ctx = &head->ctx;
-	frm = &head->frm;
-	data = &head->data;
+	frm = head->frame;
+	data = head->data;
 	memcpy(data + headroom, orig_ctx->data_meta, frm_len);
 
 	xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq);
@@ -223,7 +240,7 @@ static void reset_ctx(struct xdp_page_head *head)
 	head->ctx.data = head->orig_ctx.data;
 	head->ctx.data_meta = head->orig_ctx.data_meta;
 	head->ctx.data_end = head->orig_ctx.data_end;
-	xdp_update_frame_from_buff(&head->ctx, &head->frm);
+	xdp_update_frame_from_buff(&head->ctx, head->frame);
 }
 
 static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
@@ -285,7 +302,7 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
 		head = phys_to_virt(page_to_phys(page));
 		reset_ctx(head);
 		ctx = &head->ctx;
-		frm = &head->frm;
+		frm = head->frame;
 		xdp->frame_cnt++;
 
 		act = bpf_prog_run_xdp(prog, ctx);
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
index 2666c84dbd01..7271a18ab3e2 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
@@ -65,12 +65,13 @@ static int attach_tc_prog(struct bpf_tc_hook *hook, int fd)
 }
 
 /* The maximum permissible size is: PAGE_SIZE - sizeof(struct xdp_page_head) -
- * sizeof(struct skb_shared_info) - XDP_PACKET_HEADROOM = 3368 bytes
+ * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) - XDP_PACKET_HEADROOM =
+ * 3408 bytes for 64-byte cacheline and 3216 for 256-byte one.
  */
 #if defined(__s390x__)
-#define MAX_PKT_SIZE 3176
+#define MAX_PKT_SIZE 3216
 #else
-#define MAX_PKT_SIZE 3368
+#define MAX_PKT_SIZE 3408
 #endif
 static void test_max_pkt_size(int fd)
 {
-- 
cgit 


From f58531716ced8975a4ade108ef4af35f98722af7 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 14 Feb 2023 10:52:37 +0100
Subject: selftests: forwarding: tc_actions: cleanup temporary files when test
 is aborted

remove temporary files created by 'mirred_egress_to_ingress_tcp' test
in the cleanup() handler. Also, change variable names to avoid clashing
with globals from lib.sh.

Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Link: https://lore.kernel.org/r/091649045a017fc00095ecbb75884e5681f7025f.1676368027.git.dcaratti@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/tc_actions.sh | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index 919c0dd9fe4b..a96cff8e7219 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -201,10 +201,10 @@ mirred_egress_to_ingress_test()
 
 mirred_egress_to_ingress_tcp_test()
 {
-	local tmpfile=$(mktemp) tmpfile1=$(mktemp)
+	mirred_e2i_tf1=$(mktemp) mirred_e2i_tf2=$(mktemp)
 
 	RET=0
-	dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$tmpfile
+	dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$mirred_e2i_tf1
 	tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
 		$tcflags ip_proto tcp src_ip 192.0.2.1 dst_ip 192.0.2.2 \
 			action ct commit nat src addr 192.0.2.2 pipe \
@@ -220,11 +220,11 @@ mirred_egress_to_ingress_tcp_test()
 		ip_proto icmp \
 			action drop
 
-	ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $tmpfile1  &
+	ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2  &
 	local rpid=$!
-	ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$tmpfile
+	ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1
 	wait -n $rpid
-	cmp -s $tmpfile $tmpfile1
+	cmp -s $mirred_e2i_tf1 $mirred_e2i_tf2
 	check_err $? "server output check failed"
 
 	$MZ $h1 -c 10 -p 64 -a $h1mac -b $h1mac -A 192.0.2.1 -B 192.0.2.1 \
@@ -241,7 +241,7 @@ mirred_egress_to_ingress_tcp_test()
 	tc filter del dev $h1 egress protocol ip pref 101 handle 101 flower
 	tc filter del dev $h1 ingress protocol ip pref 102 handle 102 flower
 
-	rm -f $tmpfile $tmpfile1
+	rm -f $mirred_e2i_tf1 $mirred_e2i_tf2
 	log_test "mirred_egress_to_ingress_tcp ($tcflags)"
 }
 
@@ -270,6 +270,8 @@ setup_prepare()
 
 cleanup()
 {
+	local tf
+
 	pre_cleanup
 
 	switch_destroy
@@ -280,6 +282,8 @@ cleanup()
 
 	ip link set $swp2 address $swp2origmac
 	ip link set $swp1 address $swp1origmac
+
+	for tf in $mirred_e2i_tf1 $mirred_e2i_tf2; do rm -f $tf; done
 }
 
 mirred_egress_redirect_test()
-- 
cgit 


From 051d442098421c28c7951625652f61b1e15c4bd5 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Tue, 14 Feb 2023 08:49:11 -0500
Subject: net/sched: Retire CBQ qdisc

While this amazing qdisc has served us well over the years it has not been
getting any tender love and care and has bitrotted over time.
It has become mostly a shooting target for syzkaller lately.
For this reason, we are retiring it. Goodbye CBQ - we loved you.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/sched/Kconfig                                  |   17 -
 net/sched/Makefile                                 |    1 -
 net/sched/sch_cbq.c                                | 1727 --------------------
 .../selftests/tc-testing/tc-tests/qdiscs/cbq.json  |  184 ---
 4 files changed, 1929 deletions(-)
 delete mode 100644 net/sched/sch_cbq.c
 delete mode 100644 tools/testing/selftests/tc-testing/tc-tests/qdiscs/cbq.json

(limited to 'tools')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 4f7b52f5a11c..960e30b6b746 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -45,23 +45,6 @@ if NET_SCHED
 
 comment "Queueing/Scheduling"
 
-config NET_SCH_CBQ
-	tristate "Class Based Queueing (CBQ)"
-	help
-	  Say Y here if you want to use the Class-Based Queueing (CBQ) packet
-	  scheduling algorithm. This algorithm classifies the waiting packets
-	  into a tree-like hierarchy of classes; the leaves of this tree are
-	  in turn scheduled by separate algorithms.
-
-	  See the top of <file:net/sched/sch_cbq.c> for more details.
-
-	  CBQ is a commonly used scheduler, so if you're unsure, you should
-	  say Y here. Then say Y to all the queueing algorithms below that you
-	  want to use as leaf disciplines.
-
-	  To compile this code as a module, choose M here: the
-	  module will be called sch_cbq.
-
 config NET_SCH_HTB
 	tristate "Hierarchical Token Bucket (HTB)"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 7911eec09837..9cc7d74b9c4a 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
 obj-$(CONFIG_NET_ACT_CT)	+= act_ct.o
 obj-$(CONFIG_NET_ACT_GATE)	+= act_gate.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
-obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
deleted file mode 100644
index 36db5f6782f2..000000000000
--- a/net/sched/sch_cbq.c
+++ /dev/null
@@ -1,1727 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/sched/sch_cbq.c	Class-Based Queueing discipline.
- *
- * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <net/netlink.h>
-#include <net/pkt_sched.h>
-#include <net/pkt_cls.h>
-
-
-/*	Class-Based Queueing (CBQ) algorithm.
-	=======================================
-
-	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
-		 Management Models for Packet Networks",
-		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
-
-		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
-
-		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
-		 Parameters", 1996
-
-		 [4] Sally Floyd and Michael Speer, "Experimental Results
-		 for Class-Based Queueing", 1998, not published.
-
-	-----------------------------------------------------------------------
-
-	Algorithm skeleton was taken from NS simulator cbq.cc.
-	If someone wants to check this code against the LBL version,
-	he should take into account that ONLY the skeleton was borrowed,
-	the implementation is different. Particularly:
-
-	--- The WRR algorithm is different. Our version looks more
-	reasonable (I hope) and works when quanta are allowed to be
-	less than MTU, which is always the case when real time classes
-	have small rates. Note, that the statement of [3] is
-	incomplete, delay may actually be estimated even if class
-	per-round allotment is less than MTU. Namely, if per-round
-	allotment is W*r_i, and r_1+...+r_k = r < 1
-
-	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
-
-	In the worst case we have IntServ estimate with D = W*r+k*MTU
-	and C = MTU*r. The proof (if correct at all) is trivial.
-
-
-	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
-	interpret some places, which look like wrong translations
-	from NS. Anyone is advised to find these differences
-	and explain to me, why I am wrong 8).
-
-	--- Linux has no EOI event, so that we cannot estimate true class
-	idle time. Workaround is to consider the next dequeue event
-	as sign that previous packet is finished. This is wrong because of
-	internal device queueing, but on a permanently loaded link it is true.
-	Moreover, combined with clock integrator, this scheme looks
-	very close to an ideal solution.  */
-
-struct cbq_sched_data;
-
-
-struct cbq_class {
-	struct Qdisc_class_common common;
-	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
-
-/* Parameters */
-	unsigned char		priority;	/* class priority */
-	unsigned char		priority2;	/* priority to be used after overlimit */
-	unsigned char		ewma_log;	/* time constant for idle time calculation */
-
-	u32			defmap;
-
-	/* Link-sharing scheduler parameters */
-	long			maxidle;	/* Class parameters: see below. */
-	long			offtime;
-	long			minidle;
-	u32			avpkt;
-	struct qdisc_rate_table	*R_tab;
-
-	/* General scheduler (WRR) parameters */
-	long			allot;
-	long			quantum;	/* Allotment per WRR round */
-	long			weight;		/* Relative allotment: see below */
-
-	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
-	struct cbq_class	*split;		/* Ptr to split node */
-	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
-	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
-	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
-						   parent otherwise */
-	struct cbq_class	*sibling;	/* Sibling chain */
-	struct cbq_class	*children;	/* Pointer to children chain */
-
-	struct Qdisc		*q;		/* Elementary queueing discipline */
-
-
-/* Variables */
-	unsigned char		cpriority;	/* Effective priority */
-	unsigned char		delayed;
-	unsigned char		level;		/* level of the class in hierarchy:
-						   0 for leaf classes, and maximal
-						   level of children + 1 for nodes.
-						 */
-
-	psched_time_t		last;		/* Last end of service */
-	psched_time_t		undertime;
-	long			avgidle;
-	long			deficit;	/* Saved deficit for WRR */
-	psched_time_t		penalized;
-	struct gnet_stats_basic_sync bstats;
-	struct gnet_stats_queue qstats;
-	struct net_rate_estimator __rcu *rate_est;
-	struct tc_cbq_xstats	xstats;
-
-	struct tcf_proto __rcu	*filter_list;
-	struct tcf_block	*block;
-
-	int			filters;
-
-	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
-};
-
-struct cbq_sched_data {
-	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
-	int			nclasses[TC_CBQ_MAXPRIO + 1];
-	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
-
-	struct cbq_class	link;
-
-	unsigned int		activemask;
-	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
-								   with backlog */
-
-#ifdef CONFIG_NET_CLS_ACT
-	struct cbq_class	*rx_class;
-#endif
-	struct cbq_class	*tx_class;
-	struct cbq_class	*tx_borrowed;
-	int			tx_len;
-	psched_time_t		now;		/* Cached timestamp */
-	unsigned int		pmask;
-
-	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
-						   started when CBQ has
-						   backlog, but cannot
-						   transmit just now */
-	psched_tdiff_t		wd_expires;
-	int			toplevel;
-	u32			hgenerator;
-};
-
-
-#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
-
-static inline struct cbq_class *
-cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
-{
-	struct Qdisc_class_common *clc;
-
-	clc = qdisc_class_find(&q->clhash, classid);
-	if (clc == NULL)
-		return NULL;
-	return container_of(clc, struct cbq_class, common);
-}
-
-#ifdef CONFIG_NET_CLS_ACT
-
-static struct cbq_class *
-cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
-{
-	struct cbq_class *cl;
-
-	for (cl = this->tparent; cl; cl = cl->tparent) {
-		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
-
-		if (new != NULL && new != this)
-			return new;
-	}
-	return NULL;
-}
-
-#endif
-
-/* Classify packet. The procedure is pretty complicated, but
- * it allows us to combine link sharing and priority scheduling
- * transparently.
- *
- * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
- * so that it resolves to split nodes. Then packets are classified
- * by logical priority, or a more specific classifier may be attached
- * to the split node.
- */
-
-static struct cbq_class *
-cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *head = &q->link;
-	struct cbq_class **defmap;
-	struct cbq_class *cl = NULL;
-	u32 prio = skb->priority;
-	struct tcf_proto *fl;
-	struct tcf_result res;
-
-	/*
-	 *  Step 1. If skb->priority points to one of our classes, use it.
-	 */
-	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
-	    (cl = cbq_class_lookup(q, prio)) != NULL)
-		return cl;
-
-	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	for (;;) {
-		int result = 0;
-		defmap = head->defaults;
-
-		fl = rcu_dereference_bh(head->filter_list);
-		/*
-		 * Step 2+n. Apply classifier.
-		 */
-		result = tcf_classify(skb, NULL, fl, &res, true);
-		if (!fl || result < 0)
-			goto fallback;
-		if (result == TC_ACT_SHOT)
-			return NULL;
-
-		cl = (void *)res.class;
-		if (!cl) {
-			if (TC_H_MAJ(res.classid))
-				cl = cbq_class_lookup(q, res.classid);
-			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
-				cl = defmap[TC_PRIO_BESTEFFORT];
-
-			if (cl == NULL)
-				goto fallback;
-		}
-		if (cl->level >= head->level)
-			goto fallback;
-#ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
-		case TC_ACT_QUEUED:
-		case TC_ACT_STOLEN:
-		case TC_ACT_TRAP:
-			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-			fallthrough;
-		case TC_ACT_RECLASSIFY:
-			return cbq_reclassify(skb, cl);
-		}
-#endif
-		if (cl->level == 0)
-			return cl;
-
-		/*
-		 * Step 3+n. If classifier selected a link sharing class,
-		 *	   apply agency specific classifier.
-		 *	   Repeat this procedure until we hit a leaf node.
-		 */
-		head = cl;
-	}
-
-fallback:
-	cl = head;
-
-	/*
-	 * Step 4. No success...
-	 */
-	if (TC_H_MAJ(prio) == 0 &&
-	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
-	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
-		return head;
-
-	return cl;
-}
-
-/*
- * A packet has just been enqueued on the empty class.
- * cbq_activate_class adds it to the tail of active class list
- * of its priority band.
- */
-
-static inline void cbq_activate_class(struct cbq_class *cl)
-{
-	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
-	int prio = cl->cpriority;
-	struct cbq_class *cl_tail;
-
-	cl_tail = q->active[prio];
-	q->active[prio] = cl;
-
-	if (cl_tail != NULL) {
-		cl->next_alive = cl_tail->next_alive;
-		cl_tail->next_alive = cl;
-	} else {
-		cl->next_alive = cl;
-		q->activemask |= (1<<prio);
-	}
-}
-
-/*
- * Unlink class from active chain.
- * Note that this same procedure is done directly in cbq_dequeue*
- * during round-robin procedure.
- */
-
-static void cbq_deactivate_class(struct cbq_class *this)
-{
-	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
-	int prio = this->cpriority;
-	struct cbq_class *cl;
-	struct cbq_class *cl_prev = q->active[prio];
-
-	do {
-		cl = cl_prev->next_alive;
-		if (cl == this) {
-			cl_prev->next_alive = cl->next_alive;
-			cl->next_alive = NULL;
-
-			if (cl == q->active[prio]) {
-				q->active[prio] = cl_prev;
-				if (cl == q->active[prio]) {
-					q->active[prio] = NULL;
-					q->activemask &= ~(1<<prio);
-					return;
-				}
-			}
-			return;
-		}
-	} while ((cl_prev = cl) != q->active[prio]);
-}
-
-static void
-cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
-{
-	int toplevel = q->toplevel;
-
-	if (toplevel > cl->level) {
-		psched_time_t now = psched_get_time();
-
-		do {
-			if (cl->undertime < now) {
-				q->toplevel = cl->level;
-				return;
-			}
-		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
-	}
-}
-
-static int
-cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
-	    struct sk_buff **to_free)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	int ret;
-	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
-
-#ifdef CONFIG_NET_CLS_ACT
-	q->rx_class = cl;
-#endif
-	if (cl == NULL) {
-		if (ret & __NET_XMIT_BYPASS)
-			qdisc_qstats_drop(sch);
-		__qdisc_drop(skb, to_free);
-		return ret;
-	}
-
-	ret = qdisc_enqueue(skb, cl->q, to_free);
-	if (ret == NET_XMIT_SUCCESS) {
-		sch->q.qlen++;
-		cbq_mark_toplevel(q, cl);
-		if (!cl->next_alive)
-			cbq_activate_class(cl);
-		return ret;
-	}
-
-	if (net_xmit_drop_count(ret)) {
-		qdisc_qstats_drop(sch);
-		cbq_mark_toplevel(q, cl);
-		cl->qstats.drops++;
-	}
-	return ret;
-}
-
-/* Overlimit action: penalize leaf class by adding offtime */
-static void cbq_overlimit(struct cbq_class *cl)
-{
-	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
-	psched_tdiff_t delay = cl->undertime - q->now;
-
-	if (!cl->delayed) {
-		delay += cl->offtime;
-
-		/*
-		 * Class goes to sleep, so that it will have no
-		 * chance to work avgidle. Let's forgive it 8)
-		 *
-		 * BTW cbq-2.0 has a crap in this
-		 * place, apparently they forgot to shift it by cl->ewma_log.
-		 */
-		if (cl->avgidle < 0)
-			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
-		if (cl->avgidle < cl->minidle)
-			cl->avgidle = cl->minidle;
-		if (delay <= 0)
-			delay = 1;
-		cl->undertime = q->now + delay;
-
-		cl->xstats.overactions++;
-		cl->delayed = 1;
-	}
-	if (q->wd_expires == 0 || q->wd_expires > delay)
-		q->wd_expires = delay;
-
-	/* Dirty work! We must schedule wakeups based on
-	 * real available rate, rather than leaf rate,
-	 * which may be tiny (even zero).
-	 */
-	if (q->toplevel == TC_CBQ_MAXLEVEL) {
-		struct cbq_class *b;
-		psched_tdiff_t base_delay = q->wd_expires;
-
-		for (b = cl->borrow; b; b = b->borrow) {
-			delay = b->undertime - q->now;
-			if (delay < base_delay) {
-				if (delay <= 0)
-					delay = 1;
-				base_delay = delay;
-			}
-		}
-
-		q->wd_expires = base_delay;
-	}
-}
-
-/*
- * It is mission critical procedure.
- *
- * We "regenerate" toplevel cutoff, if transmitting class
- * has backlog and it is not regulated. It is not part of
- * original CBQ description, but looks more reasonable.
- * Probably, it is wrong. This question needs further investigation.
- */
-
-static inline void
-cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
-		    struct cbq_class *borrowed)
-{
-	if (cl && q->toplevel >= borrowed->level) {
-		if (cl->q->q.qlen > 1) {
-			do {
-				if (borrowed->undertime == PSCHED_PASTPERFECT) {
-					q->toplevel = borrowed->level;
-					return;
-				}
-			} while ((borrowed = borrowed->borrow) != NULL);
-		}
-#if 0
-	/* It is not necessary now. Uncommenting it
-	   will save CPU cycles, but decrease fairness.
-	 */
-		q->toplevel = TC_CBQ_MAXLEVEL;
-#endif
-	}
-}
-
-static void
-cbq_update(struct cbq_sched_data *q)
-{
-	struct cbq_class *this = q->tx_class;
-	struct cbq_class *cl = this;
-	int len = q->tx_len;
-	psched_time_t now;
-
-	q->tx_class = NULL;
-	/* Time integrator. We calculate EOS time
-	 * by adding expected packet transmission time.
-	 */
-	now = q->now + L2T(&q->link, len);
-
-	for ( ; cl; cl = cl->share) {
-		long avgidle = cl->avgidle;
-		long idle;
-
-		_bstats_update(&cl->bstats, len, 1);
-
-		/*
-		 * (now - last) is total time between packet right edges.
-		 * (last_pktlen/rate) is "virtual" busy time, so that
-		 *
-		 *	idle = (now - last) - last_pktlen/rate
-		 */
-
-		idle = now - cl->last;
-		if ((unsigned long)idle > 128*1024*1024) {
-			avgidle = cl->maxidle;
-		} else {
-			idle -= L2T(cl, len);
-
-		/* true_avgidle := (1-W)*true_avgidle + W*idle,
-		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
-		 * cl->avgidle == true_avgidle/W,
-		 * hence:
-		 */
-			avgidle += idle - (avgidle>>cl->ewma_log);
-		}
-
-		if (avgidle <= 0) {
-			/* Overlimit or at-limit */
-
-			if (avgidle < cl->minidle)
-				avgidle = cl->minidle;
-
-			cl->avgidle = avgidle;
-
-			/* Calculate expected time, when this class
-			 * will be allowed to send.
-			 * It will occur, when:
-			 * (1-W)*true_avgidle + W*delay = 0, i.e.
-			 * idle = (1/W - 1)*(-true_avgidle)
-			 * or
-			 * idle = (1 - W)*(-cl->avgidle);
-			 */
-			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
-
-			/*
-			 * That is not all.
-			 * To maintain the rate allocated to the class,
-			 * we add to undertime virtual clock,
-			 * necessary to complete transmitted packet.
-			 * (len/phys_bandwidth has been already passed
-			 * to the moment of cbq_update)
-			 */
-
-			idle -= L2T(&q->link, len);
-			idle += L2T(cl, len);
-
-			cl->undertime = now + idle;
-		} else {
-			/* Underlimit */
-
-			cl->undertime = PSCHED_PASTPERFECT;
-			if (avgidle > cl->maxidle)
-				cl->avgidle = cl->maxidle;
-			else
-				cl->avgidle = avgidle;
-		}
-		if ((s64)(now - cl->last) > 0)
-			cl->last = now;
-	}
-
-	cbq_update_toplevel(q, this, q->tx_borrowed);
-}
-
-static inline struct cbq_class *
-cbq_under_limit(struct cbq_class *cl)
-{
-	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
-	struct cbq_class *this_cl = cl;
-
-	if (cl->tparent == NULL)
-		return cl;
-
-	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
-		cl->delayed = 0;
-		return cl;
-	}
-
-	do {
-		/* It is very suspicious place. Now overlimit
-		 * action is generated for not bounded classes
-		 * only if link is completely congested.
-		 * Though it is in agree with ancestor-only paradigm,
-		 * it looks very stupid. Particularly,
-		 * it means that this chunk of code will either
-		 * never be called or result in strong amplification
-		 * of burstiness. Dangerous, silly, and, however,
-		 * no another solution exists.
-		 */
-		cl = cl->borrow;
-		if (!cl) {
-			this_cl->qstats.overlimits++;
-			cbq_overlimit(this_cl);
-			return NULL;
-		}
-		if (cl->level > q->toplevel)
-			return NULL;
-	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
-
-	cl->delayed = 0;
-	return cl;
-}
-
-static inline struct sk_buff *
-cbq_dequeue_prio(struct Qdisc *sch, int prio)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *cl_tail, *cl_prev, *cl;
-	struct sk_buff *skb;
-	int deficit;
-
-	cl_tail = cl_prev = q->active[prio];
-	cl = cl_prev->next_alive;
-
-	do {
-		deficit = 0;
-
-		/* Start round */
-		do {
-			struct cbq_class *borrow = cl;
-
-			if (cl->q->q.qlen &&
-			    (borrow = cbq_under_limit(cl)) == NULL)
-				goto skip_class;
-
-			if (cl->deficit <= 0) {
-				/* Class exhausted its allotment per
-				 * this round. Switch to the next one.
-				 */
-				deficit = 1;
-				cl->deficit += cl->quantum;
-				goto next_class;
-			}
-
-			skb = cl->q->dequeue(cl->q);
-
-			/* Class did not give us any skb :-(
-			 * It could occur even if cl->q->q.qlen != 0
-			 * f.e. if cl->q == "tbf"
-			 */
-			if (skb == NULL)
-				goto skip_class;
-
-			cl->deficit -= qdisc_pkt_len(skb);
-			q->tx_class = cl;
-			q->tx_borrowed = borrow;
-			if (borrow != cl) {
-#ifndef CBQ_XSTATS_BORROWS_BYTES
-				borrow->xstats.borrows++;
-				cl->xstats.borrows++;
-#else
-				borrow->xstats.borrows += qdisc_pkt_len(skb);
-				cl->xstats.borrows += qdisc_pkt_len(skb);
-#endif
-			}
-			q->tx_len = qdisc_pkt_len(skb);
-
-			if (cl->deficit <= 0) {
-				q->active[prio] = cl;
-				cl = cl->next_alive;
-				cl->deficit += cl->quantum;
-			}
-			return skb;
-
-skip_class:
-			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
-				/* Class is empty or penalized.
-				 * Unlink it from active chain.
-				 */
-				cl_prev->next_alive = cl->next_alive;
-				cl->next_alive = NULL;
-
-				/* Did cl_tail point to it? */
-				if (cl == cl_tail) {
-					/* Repair it! */
-					cl_tail = cl_prev;
-
-					/* Was it the last class in this band? */
-					if (cl == cl_tail) {
-						/* Kill the band! */
-						q->active[prio] = NULL;
-						q->activemask &= ~(1<<prio);
-						if (cl->q->q.qlen)
-							cbq_activate_class(cl);
-						return NULL;
-					}
-
-					q->active[prio] = cl_tail;
-				}
-				if (cl->q->q.qlen)
-					cbq_activate_class(cl);
-
-				cl = cl_prev;
-			}
-
-next_class:
-			cl_prev = cl;
-			cl = cl->next_alive;
-		} while (cl_prev != cl_tail);
-	} while (deficit);
-
-	q->active[prio] = cl_prev;
-
-	return NULL;
-}
-
-static inline struct sk_buff *
-cbq_dequeue_1(struct Qdisc *sch)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct sk_buff *skb;
-	unsigned int activemask;
-
-	activemask = q->activemask & 0xFF;
-	while (activemask) {
-		int prio = ffz(~activemask);
-		activemask &= ~(1<<prio);
-		skb = cbq_dequeue_prio(sch, prio);
-		if (skb)
-			return skb;
-	}
-	return NULL;
-}
-
-static struct sk_buff *
-cbq_dequeue(struct Qdisc *sch)
-{
-	struct sk_buff *skb;
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	psched_time_t now;
-
-	now = psched_get_time();
-
-	if (q->tx_class)
-		cbq_update(q);
-
-	q->now = now;
-
-	for (;;) {
-		q->wd_expires = 0;
-
-		skb = cbq_dequeue_1(sch);
-		if (skb) {
-			qdisc_bstats_update(sch, skb);
-			sch->q.qlen--;
-			return skb;
-		}
-
-		/* All the classes are overlimit.
-		 *
-		 * It is possible, if:
-		 *
-		 * 1. Scheduler is empty.
-		 * 2. Toplevel cutoff inhibited borrowing.
-		 * 3. Root class is overlimit.
-		 *
-		 * Reset 2d and 3d conditions and retry.
-		 *
-		 * Note, that NS and cbq-2.0 are buggy, peeking
-		 * an arbitrary class is appropriate for ancestor-only
-		 * sharing, but not for toplevel algorithm.
-		 *
-		 * Our version is better, but slower, because it requires
-		 * two passes, but it is unavoidable with top-level sharing.
-		 */
-
-		if (q->toplevel == TC_CBQ_MAXLEVEL &&
-		    q->link.undertime == PSCHED_PASTPERFECT)
-			break;
-
-		q->toplevel = TC_CBQ_MAXLEVEL;
-		q->link.undertime = PSCHED_PASTPERFECT;
-	}
-
-	/* No packets in scheduler or nobody wants to give them to us :-(
-	 * Sigh... start watchdog timer in the last case.
-	 */
-
-	if (sch->q.qlen) {
-		qdisc_qstats_overlimit(sch);
-		if (q->wd_expires)
-			qdisc_watchdog_schedule(&q->watchdog,
-						now + q->wd_expires);
-	}
-	return NULL;
-}
-
-/* CBQ class maintenance routines */
-
-static void cbq_adjust_levels(struct cbq_class *this)
-{
-	if (this == NULL)
-		return;
-
-	do {
-		int level = 0;
-		struct cbq_class *cl;
-
-		cl = this->children;
-		if (cl) {
-			do {
-				if (cl->level > level)
-					level = cl->level;
-			} while ((cl = cl->sibling) != this->children);
-		}
-		this->level = level + 1;
-	} while ((this = this->tparent) != NULL);
-}
-
-static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
-{
-	struct cbq_class *cl;
-	unsigned int h;
-
-	if (q->quanta[prio] == 0)
-		return;
-
-	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
-			/* BUGGGG... Beware! This expression suffer of
-			 * arithmetic overflows!
-			 */
-			if (cl->priority == prio) {
-				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
-					q->quanta[prio];
-			}
-			if (cl->quantum <= 0 ||
-			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
-				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
-					cl->common.classid, cl->quantum);
-				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
-			}
-		}
-	}
-}
-
-static void cbq_sync_defmap(struct cbq_class *cl)
-{
-	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
-	struct cbq_class *split = cl->split;
-	unsigned int h;
-	int i;
-
-	if (split == NULL)
-		return;
-
-	for (i = 0; i <= TC_PRIO_MAX; i++) {
-		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
-			split->defaults[i] = NULL;
-	}
-
-	for (i = 0; i <= TC_PRIO_MAX; i++) {
-		int level = split->level;
-
-		if (split->defaults[i])
-			continue;
-
-		for (h = 0; h < q->clhash.hashsize; h++) {
-			struct cbq_class *c;
-
-			hlist_for_each_entry(c, &q->clhash.hash[h],
-					     common.hnode) {
-				if (c->split == split && c->level < level &&
-				    c->defmap & (1<<i)) {
-					split->defaults[i] = c;
-					level = c->level;
-				}
-			}
-		}
-	}
-}
-
-static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
-{
-	struct cbq_class *split = NULL;
-
-	if (splitid == 0) {
-		split = cl->split;
-		if (!split)
-			return;
-		splitid = split->common.classid;
-	}
-
-	if (split == NULL || split->common.classid != splitid) {
-		for (split = cl->tparent; split; split = split->tparent)
-			if (split->common.classid == splitid)
-				break;
-	}
-
-	if (split == NULL)
-		return;
-
-	if (cl->split != split) {
-		cl->defmap = 0;
-		cbq_sync_defmap(cl);
-		cl->split = split;
-		cl->defmap = def & mask;
-	} else
-		cl->defmap = (cl->defmap & ~mask) | (def & mask);
-
-	cbq_sync_defmap(cl);
-}
-
-static void cbq_unlink_class(struct cbq_class *this)
-{
-	struct cbq_class *cl, **clp;
-	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
-
-	qdisc_class_hash_remove(&q->clhash, &this->common);
-
-	if (this->tparent) {
-		clp = &this->sibling;
-		cl = *clp;
-		do {
-			if (cl == this) {
-				*clp = cl->sibling;
-				break;
-			}
-			clp = &cl->sibling;
-		} while ((cl = *clp) != this->sibling);
-
-		if (this->tparent->children == this) {
-			this->tparent->children = this->sibling;
-			if (this->sibling == this)
-				this->tparent->children = NULL;
-		}
-	} else {
-		WARN_ON(this->sibling != this);
-	}
-}
-
-static void cbq_link_class(struct cbq_class *this)
-{
-	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
-	struct cbq_class *parent = this->tparent;
-
-	this->sibling = this;
-	qdisc_class_hash_insert(&q->clhash, &this->common);
-
-	if (parent == NULL)
-		return;
-
-	if (parent->children == NULL) {
-		parent->children = this;
-	} else {
-		this->sibling = parent->children->sibling;
-		parent->children->sibling = this;
-	}
-}
-
-static void
-cbq_reset(struct Qdisc *sch)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *cl;
-	int prio;
-	unsigned int h;
-
-	q->activemask = 0;
-	q->pmask = 0;
-	q->tx_class = NULL;
-	q->tx_borrowed = NULL;
-	qdisc_watchdog_cancel(&q->watchdog);
-	q->toplevel = TC_CBQ_MAXLEVEL;
-	q->now = psched_get_time();
-
-	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
-		q->active[prio] = NULL;
-
-	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
-			qdisc_reset(cl->q);
-
-			cl->next_alive = NULL;
-			cl->undertime = PSCHED_PASTPERFECT;
-			cl->avgidle = cl->maxidle;
-			cl->deficit = cl->quantum;
-			cl->cpriority = cl->priority;
-		}
-	}
-}
-
-
-static void cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
-{
-	if (lss->change & TCF_CBQ_LSS_FLAGS) {
-		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
-		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
-	}
-	if (lss->change & TCF_CBQ_LSS_EWMA)
-		cl->ewma_log = lss->ewma_log;
-	if (lss->change & TCF_CBQ_LSS_AVPKT)
-		cl->avpkt = lss->avpkt;
-	if (lss->change & TCF_CBQ_LSS_MINIDLE)
-		cl->minidle = -(long)lss->minidle;
-	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
-		cl->maxidle = lss->maxidle;
-		cl->avgidle = lss->maxidle;
-	}
-	if (lss->change & TCF_CBQ_LSS_OFFTIME)
-		cl->offtime = lss->offtime;
-}
-
-static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
-{
-	q->nclasses[cl->priority]--;
-	q->quanta[cl->priority] -= cl->weight;
-	cbq_normalize_quanta(q, cl->priority);
-}
-
-static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
-{
-	q->nclasses[cl->priority]++;
-	q->quanta[cl->priority] += cl->weight;
-	cbq_normalize_quanta(q, cl->priority);
-}
-
-static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
-{
-	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
-
-	if (wrr->allot)
-		cl->allot = wrr->allot;
-	if (wrr->weight)
-		cl->weight = wrr->weight;
-	if (wrr->priority) {
-		cl->priority = wrr->priority - 1;
-		cl->cpriority = cl->priority;
-		if (cl->priority >= cl->priority2)
-			cl->priority2 = TC_CBQ_MAXPRIO - 1;
-	}
-
-	cbq_addprio(q, cl);
-	return 0;
-}
-
-static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
-{
-	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
-	return 0;
-}
-
-static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
-	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
-	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
-	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
-	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
-	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
-	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
-	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
-};
-
-static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
-			 struct nlattr *opt,
-			 struct netlink_ext_ack *extack)
-{
-	int err;
-
-	if (!opt) {
-		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
-		return -EINVAL;
-	}
-
-	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
-					  cbq_policy, extack);
-	if (err < 0)
-		return err;
-
-	if (tb[TCA_CBQ_WRROPT]) {
-		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
-
-		if (wrr->priority > TC_CBQ_MAXPRIO) {
-			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
-			err = -EINVAL;
-		}
-	}
-	return err;
-}
-
-static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
-		    struct netlink_ext_ack *extack)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct nlattr *tb[TCA_CBQ_MAX + 1];
-	struct tc_ratespec *r;
-	int err;
-
-	qdisc_watchdog_init(&q->watchdog, sch);
-
-	err = cbq_opt_parse(tb, opt, extack);
-	if (err < 0)
-		return err;
-
-	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
-		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
-		return -EINVAL;
-	}
-
-	r = nla_data(tb[TCA_CBQ_RATE]);
-
-	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
-	if (!q->link.R_tab)
-		return -EINVAL;
-
-	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
-	if (err)
-		goto put_rtab;
-
-	err = qdisc_class_hash_init(&q->clhash);
-	if (err < 0)
-		goto put_block;
-
-	q->link.sibling = &q->link;
-	q->link.common.classid = sch->handle;
-	q->link.qdisc = sch;
-	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-				      sch->handle, NULL);
-	if (!q->link.q)
-		q->link.q = &noop_qdisc;
-	else
-		qdisc_hash_add(q->link.q, true);
-
-	q->link.priority = TC_CBQ_MAXPRIO - 1;
-	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
-	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
-	q->link.allot = psched_mtu(qdisc_dev(sch));
-	q->link.quantum = q->link.allot;
-	q->link.weight = q->link.R_tab->rate.rate;
-
-	q->link.ewma_log = TC_CBQ_DEF_EWMA;
-	q->link.avpkt = q->link.allot/2;
-	q->link.minidle = -0x7FFFFFFF;
-
-	q->toplevel = TC_CBQ_MAXLEVEL;
-	q->now = psched_get_time();
-
-	cbq_link_class(&q->link);
-
-	if (tb[TCA_CBQ_LSSOPT])
-		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
-
-	cbq_addprio(q, &q->link);
-	return 0;
-
-put_block:
-	tcf_block_put(q->link.block);
-
-put_rtab:
-	qdisc_put_rtab(q->link.R_tab);
-	return err;
-}
-
-static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
-{
-	unsigned char *b = skb_tail_pointer(skb);
-
-	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
-		goto nla_put_failure;
-	return skb->len;
-
-nla_put_failure:
-	nlmsg_trim(skb, b);
-	return -1;
-}
-
-static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
-{
-	unsigned char *b = skb_tail_pointer(skb);
-	struct tc_cbq_lssopt opt;
-
-	opt.flags = 0;
-	if (cl->borrow == NULL)
-		opt.flags |= TCF_CBQ_LSS_BOUNDED;
-	if (cl->share == NULL)
-		opt.flags |= TCF_CBQ_LSS_ISOLATED;
-	opt.ewma_log = cl->ewma_log;
-	opt.level = cl->level;
-	opt.avpkt = cl->avpkt;
-	opt.maxidle = cl->maxidle;
-	opt.minidle = (u32)(-cl->minidle);
-	opt.offtime = cl->offtime;
-	opt.change = ~0;
-	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
-		goto nla_put_failure;
-	return skb->len;
-
-nla_put_failure:
-	nlmsg_trim(skb, b);
-	return -1;
-}
-
-static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
-{
-	unsigned char *b = skb_tail_pointer(skb);
-	struct tc_cbq_wrropt opt;
-
-	memset(&opt, 0, sizeof(opt));
-	opt.flags = 0;
-	opt.allot = cl->allot;
-	opt.priority = cl->priority + 1;
-	opt.cpriority = cl->cpriority + 1;
-	opt.weight = cl->weight;
-	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
-		goto nla_put_failure;
-	return skb->len;
-
-nla_put_failure:
-	nlmsg_trim(skb, b);
-	return -1;
-}
-
-static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
-{
-	unsigned char *b = skb_tail_pointer(skb);
-	struct tc_cbq_fopt opt;
-
-	if (cl->split || cl->defmap) {
-		opt.split = cl->split ? cl->split->common.classid : 0;
-		opt.defmap = cl->defmap;
-		opt.defchange = ~0;
-		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
-			goto nla_put_failure;
-	}
-	return skb->len;
-
-nla_put_failure:
-	nlmsg_trim(skb, b);
-	return -1;
-}
-
-static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
-{
-	if (cbq_dump_lss(skb, cl) < 0 ||
-	    cbq_dump_rate(skb, cl) < 0 ||
-	    cbq_dump_wrr(skb, cl) < 0 ||
-	    cbq_dump_fopt(skb, cl) < 0)
-		return -1;
-	return 0;
-}
-
-static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct nlattr *nest;
-
-	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
-	if (nest == NULL)
-		goto nla_put_failure;
-	if (cbq_dump_attr(skb, &q->link) < 0)
-		goto nla_put_failure;
-	return nla_nest_end(skb, nest);
-
-nla_put_failure:
-	nla_nest_cancel(skb, nest);
-	return -1;
-}
-
-static int
-cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-
-	q->link.xstats.avgidle = q->link.avgidle;
-	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
-}
-
-static int
-cbq_dump_class(struct Qdisc *sch, unsigned long arg,
-	       struct sk_buff *skb, struct tcmsg *tcm)
-{
-	struct cbq_class *cl = (struct cbq_class *)arg;
-	struct nlattr *nest;
-
-	if (cl->tparent)
-		tcm->tcm_parent = cl->tparent->common.classid;
-	else
-		tcm->tcm_parent = TC_H_ROOT;
-	tcm->tcm_handle = cl->common.classid;
-	tcm->tcm_info = cl->q->handle;
-
-	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
-	if (nest == NULL)
-		goto nla_put_failure;
-	if (cbq_dump_attr(skb, cl) < 0)
-		goto nla_put_failure;
-	return nla_nest_end(skb, nest);
-
-nla_put_failure:
-	nla_nest_cancel(skb, nest);
-	return -1;
-}
-
-static int
-cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
-	struct gnet_dump *d)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *cl = (struct cbq_class *)arg;
-	__u32 qlen;
-
-	cl->xstats.avgidle = cl->avgidle;
-	cl->xstats.undertime = 0;
-	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
-
-	if (cl->undertime != PSCHED_PASTPERFECT)
-		cl->xstats.undertime = cl->undertime - q->now;
-
-	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
-	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
-		return -1;
-
-	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
-}
-
-static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
-		     struct Qdisc **old, struct netlink_ext_ack *extack)
-{
-	struct cbq_class *cl = (struct cbq_class *)arg;
-
-	if (new == NULL) {
-		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-					cl->common.classid, extack);
-		if (new == NULL)
-			return -ENOBUFS;
-	}
-
-	*old = qdisc_replace(sch, new, &cl->q);
-	return 0;
-}
-
-static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
-{
-	struct cbq_class *cl = (struct cbq_class *)arg;
-
-	return cl->q;
-}
-
-static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
-{
-	struct cbq_class *cl = (struct cbq_class *)arg;
-
-	cbq_deactivate_class(cl);
-}
-
-static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-
-	return (unsigned long)cbq_class_lookup(q, classid);
-}
-
-static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-
-	WARN_ON(cl->filters);
-
-	tcf_block_put(cl->block);
-	qdisc_put(cl->q);
-	qdisc_put_rtab(cl->R_tab);
-	gen_kill_estimator(&cl->rate_est);
-	if (cl != &q->link)
-		kfree(cl);
-}
-
-static void cbq_destroy(struct Qdisc *sch)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct hlist_node *next;
-	struct cbq_class *cl;
-	unsigned int h;
-
-#ifdef CONFIG_NET_CLS_ACT
-	q->rx_class = NULL;
-#endif
-	/*
-	 * Filters must be destroyed first because we don't destroy the
-	 * classes from root to leafs which means that filters can still
-	 * be bound to classes which have been destroyed already. --TGR '04
-	 */
-	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
-			tcf_block_put(cl->block);
-			cl->block = NULL;
-		}
-	}
-	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
-					  common.hnode)
-			cbq_destroy_class(sch, cl);
-	}
-	qdisc_class_hash_destroy(&q->clhash);
-}
-
-static int
-cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
-		 unsigned long *arg, struct netlink_ext_ack *extack)
-{
-	int err;
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *cl = (struct cbq_class *)*arg;
-	struct nlattr *opt = tca[TCA_OPTIONS];
-	struct nlattr *tb[TCA_CBQ_MAX + 1];
-	struct cbq_class *parent;
-	struct qdisc_rate_table *rtab = NULL;
-
-	err = cbq_opt_parse(tb, opt, extack);
-	if (err < 0)
-		return err;
-
-	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
-		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
-		return -EOPNOTSUPP;
-	}
-
-	if (cl) {
-		/* Check parent */
-		if (parentid) {
-			if (cl->tparent &&
-			    cl->tparent->common.classid != parentid) {
-				NL_SET_ERR_MSG(extack, "Invalid parent id");
-				return -EINVAL;
-			}
-			if (!cl->tparent && parentid != TC_H_ROOT) {
-				NL_SET_ERR_MSG(extack, "Parent must be root");
-				return -EINVAL;
-			}
-		}
-
-		if (tb[TCA_CBQ_RATE]) {
-			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
-					      tb[TCA_CBQ_RTAB], extack);
-			if (rtab == NULL)
-				return -EINVAL;
-		}
-
-		if (tca[TCA_RATE]) {
-			err = gen_replace_estimator(&cl->bstats, NULL,
-						    &cl->rate_est,
-						    NULL,
-						    true,
-						    tca[TCA_RATE]);
-			if (err) {
-				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
-				qdisc_put_rtab(rtab);
-				return err;
-			}
-		}
-
-		/* Change class parameters */
-		sch_tree_lock(sch);
-
-		if (cl->next_alive != NULL)
-			cbq_deactivate_class(cl);
-
-		if (rtab) {
-			qdisc_put_rtab(cl->R_tab);
-			cl->R_tab = rtab;
-		}
-
-		if (tb[TCA_CBQ_LSSOPT])
-			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
-
-		if (tb[TCA_CBQ_WRROPT]) {
-			cbq_rmprio(q, cl);
-			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
-		}
-
-		if (tb[TCA_CBQ_FOPT])
-			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
-
-		if (cl->q->q.qlen)
-			cbq_activate_class(cl);
-
-		sch_tree_unlock(sch);
-
-		return 0;
-	}
-
-	if (parentid == TC_H_ROOT)
-		return -EINVAL;
-
-	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
-		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
-		return -EINVAL;
-	}
-
-	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
-			      extack);
-	if (rtab == NULL)
-		return -EINVAL;
-
-	if (classid) {
-		err = -EINVAL;
-		if (TC_H_MAJ(classid ^ sch->handle) ||
-		    cbq_class_lookup(q, classid)) {
-			NL_SET_ERR_MSG(extack, "Specified class not found");
-			goto failure;
-		}
-	} else {
-		int i;
-		classid = TC_H_MAKE(sch->handle, 0x8000);
-
-		for (i = 0; i < 0x8000; i++) {
-			if (++q->hgenerator >= 0x8000)
-				q->hgenerator = 1;
-			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
-				break;
-		}
-		err = -ENOSR;
-		if (i >= 0x8000) {
-			NL_SET_ERR_MSG(extack, "Unable to generate classid");
-			goto failure;
-		}
-		classid = classid|q->hgenerator;
-	}
-
-	parent = &q->link;
-	if (parentid) {
-		parent = cbq_class_lookup(q, parentid);
-		err = -EINVAL;
-		if (!parent) {
-			NL_SET_ERR_MSG(extack, "Failed to find parentid");
-			goto failure;
-		}
-	}
-
-	err = -ENOBUFS;
-	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
-	if (cl == NULL)
-		goto failure;
-
-	gnet_stats_basic_sync_init(&cl->bstats);
-	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
-	if (err) {
-		kfree(cl);
-		goto failure;
-	}
-
-	if (tca[TCA_RATE]) {
-		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
-					NULL, true, tca[TCA_RATE]);
-		if (err) {
-			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
-			tcf_block_put(cl->block);
-			kfree(cl);
-			goto failure;
-		}
-	}
-
-	cl->R_tab = rtab;
-	rtab = NULL;
-	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
-				  NULL);
-	if (!cl->q)
-		cl->q = &noop_qdisc;
-	else
-		qdisc_hash_add(cl->q, true);
-
-	cl->common.classid = classid;
-	cl->tparent = parent;
-	cl->qdisc = sch;
-	cl->allot = parent->allot;
-	cl->quantum = cl->allot;
-	cl->weight = cl->R_tab->rate.rate;
-
-	sch_tree_lock(sch);
-	cbq_link_class(cl);
-	cl->borrow = cl->tparent;
-	if (cl->tparent != &q->link)
-		cl->share = cl->tparent;
-	cbq_adjust_levels(parent);
-	cl->minidle = -0x7FFFFFFF;
-	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
-	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
-	if (cl->ewma_log == 0)
-		cl->ewma_log = q->link.ewma_log;
-	if (cl->maxidle == 0)
-		cl->maxidle = q->link.maxidle;
-	if (cl->avpkt == 0)
-		cl->avpkt = q->link.avpkt;
-	if (tb[TCA_CBQ_FOPT])
-		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
-	sch_tree_unlock(sch);
-
-	qdisc_class_hash_grow(sch, &q->clhash);
-
-	*arg = (unsigned long)cl;
-	return 0;
-
-failure:
-	qdisc_put_rtab(rtab);
-	return err;
-}
-
-static int cbq_delete(struct Qdisc *sch, unsigned long arg,
-		      struct netlink_ext_ack *extack)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *cl = (struct cbq_class *)arg;
-
-	if (cl->filters || cl->children || cl == &q->link)
-		return -EBUSY;
-
-	sch_tree_lock(sch);
-
-	qdisc_purge_queue(cl->q);
-
-	if (cl->next_alive)
-		cbq_deactivate_class(cl);
-
-	if (q->tx_borrowed == cl)
-		q->tx_borrowed = q->tx_class;
-	if (q->tx_class == cl) {
-		q->tx_class = NULL;
-		q->tx_borrowed = NULL;
-	}
-#ifdef CONFIG_NET_CLS_ACT
-	if (q->rx_class == cl)
-		q->rx_class = NULL;
-#endif
-
-	cbq_unlink_class(cl);
-	cbq_adjust_levels(cl->tparent);
-	cl->defmap = 0;
-	cbq_sync_defmap(cl);
-
-	cbq_rmprio(q, cl);
-	sch_tree_unlock(sch);
-
-	cbq_destroy_class(sch, cl);
-	return 0;
-}
-
-static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
-				       struct netlink_ext_ack *extack)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *cl = (struct cbq_class *)arg;
-
-	if (cl == NULL)
-		cl = &q->link;
-
-	return cl->block;
-}
-
-static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
-				     u32 classid)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *p = (struct cbq_class *)parent;
-	struct cbq_class *cl = cbq_class_lookup(q, classid);
-
-	if (cl) {
-		if (p && p->level <= cl->level)
-			return 0;
-		cl->filters++;
-		return (unsigned long)cl;
-	}
-	return 0;
-}
-
-static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
-{
-	struct cbq_class *cl = (struct cbq_class *)arg;
-
-	cl->filters--;
-}
-
-static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
-{
-	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct cbq_class *cl;
-	unsigned int h;
-
-	if (arg->stop)
-		return;
-
-	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
-			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
-				return;
-		}
-	}
-}
-
-static const struct Qdisc_class_ops cbq_class_ops = {
-	.graft		=	cbq_graft,
-	.leaf		=	cbq_leaf,
-	.qlen_notify	=	cbq_qlen_notify,
-	.find		=	cbq_find,
-	.change		=	cbq_change_class,
-	.delete		=	cbq_delete,
-	.walk		=	cbq_walk,
-	.tcf_block	=	cbq_tcf_block,
-	.bind_tcf	=	cbq_bind_filter,
-	.unbind_tcf	=	cbq_unbind_filter,
-	.dump		=	cbq_dump_class,
-	.dump_stats	=	cbq_dump_class_stats,
-};
-
-static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
-	.next		=	NULL,
-	.cl_ops		=	&cbq_class_ops,
-	.id		=	"cbq",
-	.priv_size	=	sizeof(struct cbq_sched_data),
-	.enqueue	=	cbq_enqueue,
-	.dequeue	=	cbq_dequeue,
-	.peek		=	qdisc_peek_dequeued,
-	.init		=	cbq_init,
-	.reset		=	cbq_reset,
-	.destroy	=	cbq_destroy,
-	.change		=	NULL,
-	.dump		=	cbq_dump,
-	.dump_stats	=	cbq_dump_stats,
-	.owner		=	THIS_MODULE,
-};
-
-static int __init cbq_module_init(void)
-{
-	return register_qdisc(&cbq_qdisc_ops);
-}
-static void __exit cbq_module_exit(void)
-{
-	unregister_qdisc(&cbq_qdisc_ops);
-}
-module_init(cbq_module_init)
-module_exit(cbq_module_exit)
-MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cbq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cbq.json
deleted file mode 100644
index 1ab21c83a122..000000000000
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cbq.json
+++ /dev/null
@@ -1,184 +0,0 @@
-[
-    {
-        "id": "3460",
-        "name": "Create CBQ with default setting",
-        "category": [
-            "qdisc",
-            "cbq"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root cbq bandwidth 10000 avpkt 9000",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc cbq 1: root refcnt [0-9]+ rate 10Kbit \\(bounded,isolated\\) prio no-transmit",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "0592",
-        "name": "Create CBQ with mpu",
-        "category": [
-            "qdisc",
-            "cbq"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root cbq bandwidth 10000 avpkt 9000 mpu 1000",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc cbq 1: root refcnt [0-9]+ rate 10Kbit \\(bounded,isolated\\) prio no-transmit",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "4684",
-        "name": "Create CBQ with valid cell num",
-        "category": [
-            "qdisc",
-            "cbq"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root cbq bandwidth 10000 avpkt 9000 cell 128",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc cbq 1: root refcnt [0-9]+ rate 10Kbit \\(bounded,isolated\\) prio no-transmit",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "4345",
-        "name": "Create CBQ with invalid cell num",
-        "category": [
-            "qdisc",
-            "cbq"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root cbq bandwidth 10000 avpkt 9000 cell 100",
-        "expExitCode": "1",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc cbq 1: root refcnt [0-9]+ rate 10Kbit \\(bounded,isolated\\) prio no-transmit",
-        "matchCount": "0",
-        "teardown": [
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "4525",
-        "name": "Create CBQ with valid ewma",
-        "category": [
-            "qdisc",
-            "cbq"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root cbq bandwidth 10000 avpkt 9000 ewma 16",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc cbq 1: root refcnt [0-9]+ rate 10Kbit \\(bounded,isolated\\) prio no-transmit",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "6784",
-        "name": "Create CBQ with invalid ewma",
-        "category": [
-            "qdisc",
-            "cbq"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root cbq bandwidth 10000 avpkt 9000 ewma 128",
-        "expExitCode": "1",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc cbq 1: root refcnt [0-9]+ rate 10Kbit \\(bounded,isolated\\) prio no-transmit",
-        "matchCount": "0",
-        "teardown": [
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "5468",
-        "name": "Delete CBQ with handle",
-        "category": [
-            "qdisc",
-            "cbq"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true",
-            "$TC qdisc add dev $DUMMY handle 1: root cbq bandwidth 10000 avpkt 9000"
-        ],
-        "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 1: root",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc cbq 1: root refcnt [0-9]+ rate 10Kbit \\(bounded,isolated\\) prio no-transmit",
-        "matchCount": "0",
-        "teardown": [
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "492a",
-        "name": "Show CBQ class",
-        "category": [
-            "qdisc",
-            "cbq"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root cbq bandwidth 10000 avpkt 9000",
-        "expExitCode": "0",
-        "verifyCmd": "$TC class show dev $DUMMY",
-        "matchPattern": "class cbq 1: root rate 10Kbit \\(bounded,isolated\\) prio no-transmit",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    }
-]
-- 
cgit 


From fb38306ceb9e770adfb5ffa6e3c64047b55f7a07 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Tue, 14 Feb 2023 08:49:12 -0500
Subject: net/sched: Retire ATM qdisc

The ATM qdisc has served us well over the years but has not been getting much
TLC due to lack of known users. Most recently it has become a shooting target
for syzkaller. For this reason, we are retiring it.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/sched/Kconfig                                  |  14 -
 net/sched/Makefile                                 |   1 -
 net/sched/sch_atm.c                                | 706 ---------------------
 .../selftests/tc-testing/tc-tests/qdiscs/atm.json  |  94 ---
 4 files changed, 815 deletions(-)
 delete mode 100644 net/sched/sch_atm.c
 delete mode 100644 tools/testing/selftests/tc-testing/tc-tests/qdiscs/atm.json

(limited to 'tools')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 960e30b6b746..869321db12f7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -68,20 +68,6 @@ config NET_SCH_HFSC
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_hfsc.
 
-config NET_SCH_ATM
-	tristate "ATM Virtual Circuits (ATM)"
-	depends on ATM
-	help
-	  Say Y here if you want to use the ATM pseudo-scheduler.  This
-	  provides a framework for invoking classifiers, which in turn
-	  select classes of this queuing discipline.  Each class maps
-	  the flow(s) it is handling to a given virtual circuit.
-
-	  See the top of <file:net/sched/sch_atm.c> for more details.
-
-	  To compile this code as a module, choose M here: the
-	  module will be called sch_atm.
-
 config NET_SCH_PRIO
 	tristate "Multi Band Priority Queueing (PRIO)"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 9cc7d74b9c4a..d2612b47530c 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -45,7 +45,6 @@ obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL)	+= sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO)	+= sch_prio.o
 obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
-obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
 obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
deleted file mode 100644
index 4a981ca90b0b..000000000000
--- a/net/sched/sch_atm.c
+++ /dev/null
@@ -1,706 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
-
-/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <linux/atmdev.h>
-#include <linux/atmclip.h>
-#include <linux/rtnetlink.h>
-#include <linux/file.h>		/* for fput */
-#include <net/netlink.h>
-#include <net/pkt_sched.h>
-#include <net/pkt_cls.h>
-
-/*
- * The ATM queuing discipline provides a framework for invoking classifiers
- * (aka "filters"), which in turn select classes of this queuing discipline.
- * Each class maps the flow(s) it is handling to a given VC. Multiple classes
- * may share the same VC.
- *
- * When creating a class, VCs are specified by passing the number of the open
- * socket descriptor by which the calling process references the VC. The kernel
- * keeps the VC open at least until all classes using it are removed.
- *
- * In this file, most functions are named atm_tc_* to avoid confusion with all
- * the atm_* in net/atm. This naming convention differs from what's used in the
- * rest of net/sched.
- *
- * Known bugs:
- *  - sometimes messes up the IP stack
- *  - any manipulations besides the few operations described in the README, are
- *    untested and likely to crash the system
- *  - should lock the flow while there is data in the queue (?)
- */
-
-#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
-
-struct atm_flow_data {
-	struct Qdisc_class_common common;
-	struct Qdisc		*q;	/* FIFO, TBF, etc. */
-	struct tcf_proto __rcu	*filter_list;
-	struct tcf_block	*block;
-	struct atm_vcc		*vcc;	/* VCC; NULL if VCC is closed */
-	void			(*old_pop)(struct atm_vcc *vcc,
-					   struct sk_buff *skb); /* chaining */
-	struct atm_qdisc_data	*parent;	/* parent qdisc */
-	struct socket		*sock;		/* for closing */
-	int			ref;		/* reference count */
-	struct gnet_stats_basic_sync	bstats;
-	struct gnet_stats_queue	qstats;
-	struct list_head	list;
-	struct atm_flow_data	*excess;	/* flow for excess traffic;
-						   NULL to set CLP instead */
-	int			hdr_len;
-	unsigned char		hdr[];		/* header data; MUST BE LAST */
-};
-
-struct atm_qdisc_data {
-	struct atm_flow_data	link;		/* unclassified skbs go here */
-	struct list_head	flows;		/* NB: "link" is also on this
-						   list */
-	struct tasklet_struct	task;		/* dequeue tasklet */
-};
-
-/* ------------------------- Class/flow operations ------------------------- */
-
-static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow;
-
-	list_for_each_entry(flow, &p->flows, list) {
-		if (flow->common.classid == classid)
-			return flow;
-	}
-	return NULL;
-}
-
-static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
-			struct Qdisc *new, struct Qdisc **old,
-			struct netlink_ext_ack *extack)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
-
-	pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
-		sch, p, flow, new, old);
-	if (list_empty(&flow->list))
-		return -EINVAL;
-	if (!new)
-		new = &noop_qdisc;
-	*old = flow->q;
-	flow->q = new;
-	if (*old)
-		qdisc_reset(*old);
-	return 0;
-}
-
-static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl)
-{
-	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-
-	pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow);
-	return flow ? flow->q : NULL;
-}
-
-static unsigned long atm_tc_find(struct Qdisc *sch, u32 classid)
-{
-	struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
-	struct atm_flow_data *flow;
-
-	pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid);
-	flow = lookup_flow(sch, classid);
-	pr_debug("%s: flow %p\n", __func__, flow);
-	return (unsigned long)flow;
-}
-
-static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
-					unsigned long parent, u32 classid)
-{
-	struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
-	struct atm_flow_data *flow;
-
-	pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid);
-	flow = lookup_flow(sch, classid);
-	if (flow)
-		flow->ref++;
-	pr_debug("%s: flow %p\n", __func__, flow);
-	return (unsigned long)flow;
-}
-
-/*
- * atm_tc_put handles all destructions, including the ones that are explicitly
- * requested (atm_tc_destroy, etc.). The assumption here is that we never drop
- * anything that still seems to be in use.
- */
-static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-
-	pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
-	if (--flow->ref)
-		return;
-	pr_debug("atm_tc_put: destroying\n");
-	list_del_init(&flow->list);
-	pr_debug("atm_tc_put: qdisc %p\n", flow->q);
-	qdisc_put(flow->q);
-	tcf_block_put(flow->block);
-	if (flow->sock) {
-		pr_debug("atm_tc_put: f_count %ld\n",
-			file_count(flow->sock->file));
-		flow->vcc->pop = flow->old_pop;
-		sockfd_put(flow->sock);
-	}
-	if (flow->excess)
-		atm_tc_put(sch, (unsigned long)flow->excess);
-	if (flow != &p->link)
-		kfree(flow);
-	/*
-	 * If flow == &p->link, the qdisc no longer works at this point and
-	 * needs to be removed. (By the caller of atm_tc_put.)
-	 */
-}
-
-static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
-{
-	struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
-
-	pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
-	VCC2FLOW(vcc)->old_pop(vcc, skb);
-	tasklet_schedule(&p->task);
-}
-
-static const u8 llc_oui_ip[] = {
-	0xaa,			/* DSAP: non-ISO */
-	0xaa,			/* SSAP: non-ISO */
-	0x03,			/* Ctrl: Unnumbered Information Command PDU */
-	0x00,			/* OUI: EtherType */
-	0x00, 0x00,
-	0x08, 0x00
-};				/* Ethertype IP (0800) */
-
-static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
-	[TCA_ATM_FD]		= { .type = NLA_U32 },
-	[TCA_ATM_EXCESS]	= { .type = NLA_U32 },
-};
-
-static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
-			 struct nlattr **tca, unsigned long *arg,
-			 struct netlink_ext_ack *extack)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
-	struct atm_flow_data *excess = NULL;
-	struct nlattr *opt = tca[TCA_OPTIONS];
-	struct nlattr *tb[TCA_ATM_MAX + 1];
-	struct socket *sock;
-	int fd, error, hdr_len;
-	void *hdr;
-
-	pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
-		"flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
-	/*
-	 * The concept of parents doesn't apply for this qdisc.
-	 */
-	if (parent && parent != TC_H_ROOT && parent != sch->handle)
-		return -EINVAL;
-	/*
-	 * ATM classes cannot be changed. In order to change properties of the
-	 * ATM connection, that socket needs to be modified directly (via the
-	 * native ATM API. In order to send a flow to a different VC, the old
-	 * class needs to be removed and a new one added. (This may be changed
-	 * later.)
-	 */
-	if (flow)
-		return -EBUSY;
-	if (opt == NULL)
-		return -EINVAL;
-
-	error = nla_parse_nested_deprecated(tb, TCA_ATM_MAX, opt, atm_policy,
-					    NULL);
-	if (error < 0)
-		return error;
-
-	if (!tb[TCA_ATM_FD])
-		return -EINVAL;
-	fd = nla_get_u32(tb[TCA_ATM_FD]);
-	pr_debug("atm_tc_change: fd %d\n", fd);
-	if (tb[TCA_ATM_HDR]) {
-		hdr_len = nla_len(tb[TCA_ATM_HDR]);
-		hdr = nla_data(tb[TCA_ATM_HDR]);
-	} else {
-		hdr_len = RFC1483LLC_LEN;
-		hdr = NULL;	/* default LLC/SNAP for IP */
-	}
-	if (!tb[TCA_ATM_EXCESS])
-		excess = NULL;
-	else {
-		excess = (struct atm_flow_data *)
-			atm_tc_find(sch, nla_get_u32(tb[TCA_ATM_EXCESS]));
-		if (!excess)
-			return -ENOENT;
-	}
-	pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n",
-		 opt->nla_type, nla_len(opt), hdr_len);
-	sock = sockfd_lookup(fd, &error);
-	if (!sock)
-		return error;	/* f_count++ */
-	pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file));
-	if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
-		error = -EPROTOTYPE;
-		goto err_out;
-	}
-	/* @@@ should check if the socket is really operational or we'll crash
-	   on vcc->send */
-	if (classid) {
-		if (TC_H_MAJ(classid ^ sch->handle)) {
-			pr_debug("atm_tc_change: classid mismatch\n");
-			error = -EINVAL;
-			goto err_out;
-		}
-	} else {
-		int i;
-		unsigned long cl;
-
-		for (i = 1; i < 0x8000; i++) {
-			classid = TC_H_MAKE(sch->handle, 0x8000 | i);
-			cl = atm_tc_find(sch, classid);
-			if (!cl)
-				break;
-		}
-	}
-	pr_debug("atm_tc_change: new id %x\n", classid);
-	flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL);
-	pr_debug("atm_tc_change: flow %p\n", flow);
-	if (!flow) {
-		error = -ENOBUFS;
-		goto err_out;
-	}
-
-	error = tcf_block_get(&flow->block, &flow->filter_list, sch,
-			      extack);
-	if (error) {
-		kfree(flow);
-		goto err_out;
-	}
-
-	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
-				    extack);
-	if (!flow->q)
-		flow->q = &noop_qdisc;
-	pr_debug("atm_tc_change: qdisc %p\n", flow->q);
-	flow->sock = sock;
-	flow->vcc = ATM_SD(sock);	/* speedup */
-	flow->vcc->user_back = flow;
-	pr_debug("atm_tc_change: vcc %p\n", flow->vcc);
-	flow->old_pop = flow->vcc->pop;
-	flow->parent = p;
-	flow->vcc->pop = sch_atm_pop;
-	flow->common.classid = classid;
-	flow->ref = 1;
-	flow->excess = excess;
-	list_add(&flow->list, &p->link.list);
-	flow->hdr_len = hdr_len;
-	if (hdr)
-		memcpy(flow->hdr, hdr, hdr_len);
-	else
-		memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip));
-	*arg = (unsigned long)flow;
-	return 0;
-err_out:
-	sockfd_put(sock);
-	return error;
-}
-
-static int atm_tc_delete(struct Qdisc *sch, unsigned long arg,
-			 struct netlink_ext_ack *extack)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
-
-	pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
-	if (list_empty(&flow->list))
-		return -EINVAL;
-	if (rcu_access_pointer(flow->filter_list) || flow == &p->link)
-		return -EBUSY;
-	/*
-	 * Reference count must be 2: one for "keepalive" (set at class
-	 * creation), and one for the reference held when calling delete.
-	 */
-	if (flow->ref < 2) {
-		pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
-		return -EINVAL;
-	}
-	if (flow->ref > 2)
-		return -EBUSY;	/* catch references via excess, etc. */
-	atm_tc_put(sch, arg);
-	return 0;
-}
-
-static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow;
-
-	pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
-	if (walker->stop)
-		return;
-	list_for_each_entry(flow, &p->flows, list) {
-		if (!tc_qdisc_stats_dump(sch, (unsigned long)flow, walker))
-			break;
-	}
-}
-
-static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl,
-					  struct netlink_ext_ack *extack)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-
-	pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
-	return flow ? flow->block : p->link.block;
-}
-
-/* --------------------------- Qdisc operations ---------------------------- */
-
-static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
-			  struct sk_buff **to_free)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow;
-	struct tcf_result res;
-	int result;
-	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-
-	pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
-	result = TC_ACT_OK;	/* be nice to gcc */
-	flow = NULL;
-	if (TC_H_MAJ(skb->priority) != sch->handle ||
-	    !(flow = (struct atm_flow_data *)atm_tc_find(sch, skb->priority))) {
-		struct tcf_proto *fl;
-
-		list_for_each_entry(flow, &p->flows, list) {
-			fl = rcu_dereference_bh(flow->filter_list);
-			if (fl) {
-				result = tcf_classify(skb, NULL, fl, &res, true);
-				if (result < 0)
-					continue;
-				if (result == TC_ACT_SHOT)
-					goto done;
-
-				flow = (struct atm_flow_data *)res.class;
-				if (!flow)
-					flow = lookup_flow(sch, res.classid);
-				goto drop;
-			}
-		}
-		flow = NULL;
-done:
-		;
-	}
-	if (!flow) {
-		flow = &p->link;
-	} else {
-		if (flow->vcc)
-			ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
-		/*@@@ looks good ... but it's not supposed to work :-) */
-#ifdef CONFIG_NET_CLS_ACT
-		switch (result) {
-		case TC_ACT_QUEUED:
-		case TC_ACT_STOLEN:
-		case TC_ACT_TRAP:
-			__qdisc_drop(skb, to_free);
-			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-		case TC_ACT_SHOT:
-			__qdisc_drop(skb, to_free);
-			goto drop;
-		case TC_ACT_RECLASSIFY:
-			if (flow->excess)
-				flow = flow->excess;
-			else
-				ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP;
-			break;
-		}
-#endif
-	}
-
-	ret = qdisc_enqueue(skb, flow->q, to_free);
-	if (ret != NET_XMIT_SUCCESS) {
-drop: __maybe_unused
-		if (net_xmit_drop_count(ret)) {
-			qdisc_qstats_drop(sch);
-			if (flow)
-				flow->qstats.drops++;
-		}
-		return ret;
-	}
-	/*
-	 * Okay, this may seem weird. We pretend we've dropped the packet if
-	 * it goes via ATM. The reason for this is that the outer qdisc
-	 * expects to be able to q->dequeue the packet later on if we return
-	 * success at this place. Also, sch->q.qdisc needs to reflect whether
-	 * there is a packet egligible for dequeuing or not. Note that the
-	 * statistics of the outer qdisc are necessarily wrong because of all
-	 * this. There's currently no correct solution for this.
-	 */
-	if (flow == &p->link) {
-		sch->q.qlen++;
-		return NET_XMIT_SUCCESS;
-	}
-	tasklet_schedule(&p->task);
-	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-}
-
-/*
- * Dequeue packets and send them over ATM. Note that we quite deliberately
- * avoid checking net_device's flow control here, simply because sch_atm
- * uses its own channels, which have nothing to do with any CLIP/LANE/or
- * non-ATM interfaces.
- */
-
-static void sch_atm_dequeue(struct tasklet_struct *t)
-{
-	struct atm_qdisc_data *p = from_tasklet(p, t, task);
-	struct Qdisc *sch = qdisc_from_priv(p);
-	struct atm_flow_data *flow;
-	struct sk_buff *skb;
-
-	pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
-	list_for_each_entry(flow, &p->flows, list) {
-		if (flow == &p->link)
-			continue;
-		/*
-		 * If traffic is properly shaped, this won't generate nasty
-		 * little bursts. Otherwise, it may ... (but that's okay)
-		 */
-		while ((skb = flow->q->ops->peek(flow->q))) {
-			if (!atm_may_send(flow->vcc, skb->truesize))
-				break;
-
-			skb = qdisc_dequeue_peeked(flow->q);
-			if (unlikely(!skb))
-				break;
-
-			qdisc_bstats_update(sch, skb);
-			bstats_update(&flow->bstats, skb);
-			pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
-			/* remove any LL header somebody else has attached */
-			skb_pull(skb, skb_network_offset(skb));
-			if (skb_headroom(skb) < flow->hdr_len) {
-				struct sk_buff *new;
-
-				new = skb_realloc_headroom(skb, flow->hdr_len);
-				dev_kfree_skb(skb);
-				if (!new)
-					continue;
-				skb = new;
-			}
-			pr_debug("sch_atm_dequeue: ip %p, data %p\n",
-				 skb_network_header(skb), skb->data);
-			ATM_SKB(skb)->vcc = flow->vcc;
-			memcpy(skb_push(skb, flow->hdr_len), flow->hdr,
-			       flow->hdr_len);
-			refcount_add(skb->truesize,
-				   &sk_atm(flow->vcc)->sk_wmem_alloc);
-			/* atm.atm_options are already set by atm_tc_enqueue */
-			flow->vcc->send(flow->vcc, skb);
-		}
-	}
-}
-
-static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct sk_buff *skb;
-
-	pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
-	tasklet_schedule(&p->task);
-	skb = qdisc_dequeue_peeked(p->link.q);
-	if (skb)
-		sch->q.qlen--;
-	return skb;
-}
-
-static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-
-	pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p);
-
-	return p->link.q->ops->peek(p->link.q);
-}
-
-static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt,
-		       struct netlink_ext_ack *extack)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	int err;
-
-	pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
-	INIT_LIST_HEAD(&p->flows);
-	INIT_LIST_HEAD(&p->link.list);
-	gnet_stats_basic_sync_init(&p->link.bstats);
-	list_add(&p->link.list, &p->flows);
-	p->link.q = qdisc_create_dflt(sch->dev_queue,
-				      &pfifo_qdisc_ops, sch->handle, extack);
-	if (!p->link.q)
-		p->link.q = &noop_qdisc;
-	pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
-	p->link.vcc = NULL;
-	p->link.sock = NULL;
-	p->link.common.classid = sch->handle;
-	p->link.ref = 1;
-
-	err = tcf_block_get(&p->link.block, &p->link.filter_list, sch,
-			    extack);
-	if (err)
-		return err;
-
-	tasklet_setup(&p->task, sch_atm_dequeue);
-	return 0;
-}
-
-static void atm_tc_reset(struct Qdisc *sch)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow;
-
-	pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);
-	list_for_each_entry(flow, &p->flows, list)
-		qdisc_reset(flow->q);
-}
-
-static void atm_tc_destroy(struct Qdisc *sch)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow, *tmp;
-
-	pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
-	list_for_each_entry(flow, &p->flows, list) {
-		tcf_block_put(flow->block);
-		flow->block = NULL;
-	}
-
-	list_for_each_entry_safe(flow, tmp, &p->flows, list) {
-		if (flow->ref > 1)
-			pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
-		atm_tc_put(sch, (unsigned long)flow);
-	}
-	tasklet_kill(&p->task);
-}
-
-static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
-			     struct sk_buff *skb, struct tcmsg *tcm)
-{
-	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-	struct nlattr *nest;
-
-	pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
-		sch, p, flow, skb, tcm);
-	if (list_empty(&flow->list))
-		return -EINVAL;
-	tcm->tcm_handle = flow->common.classid;
-	tcm->tcm_info = flow->q->handle;
-
-	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
-	if (nest == NULL)
-		goto nla_put_failure;
-
-	if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr))
-		goto nla_put_failure;
-	if (flow->vcc) {
-		struct sockaddr_atmpvc pvc;
-		int state;
-
-		memset(&pvc, 0, sizeof(pvc));
-		pvc.sap_family = AF_ATMPVC;
-		pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
-		pvc.sap_addr.vpi = flow->vcc->vpi;
-		pvc.sap_addr.vci = flow->vcc->vci;
-		if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc))
-			goto nla_put_failure;
-		state = ATM_VF2VS(flow->vcc->flags);
-		if (nla_put_u32(skb, TCA_ATM_STATE, state))
-			goto nla_put_failure;
-	}
-	if (flow->excess) {
-		if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->common.classid))
-			goto nla_put_failure;
-	} else {
-		if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))
-			goto nla_put_failure;
-	}
-	return nla_nest_end(skb, nest);
-
-nla_put_failure:
-	nla_nest_cancel(skb, nest);
-	return -1;
-}
-static int
-atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
-			struct gnet_dump *d)
-{
-	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
-
-	if (gnet_stats_copy_basic(d, NULL, &flow->bstats, true) < 0 ||
-	    gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
-		return -1;
-
-	return 0;
-}
-
-static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
-	return 0;
-}
-
-static const struct Qdisc_class_ops atm_class_ops = {
-	.graft		= atm_tc_graft,
-	.leaf		= atm_tc_leaf,
-	.find		= atm_tc_find,
-	.change		= atm_tc_change,
-	.delete		= atm_tc_delete,
-	.walk		= atm_tc_walk,
-	.tcf_block	= atm_tc_tcf_block,
-	.bind_tcf	= atm_tc_bind_filter,
-	.unbind_tcf	= atm_tc_put,
-	.dump		= atm_tc_dump_class,
-	.dump_stats	= atm_tc_dump_class_stats,
-};
-
-static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
-	.cl_ops		= &atm_class_ops,
-	.id		= "atm",
-	.priv_size	= sizeof(struct atm_qdisc_data),
-	.enqueue	= atm_tc_enqueue,
-	.dequeue	= atm_tc_dequeue,
-	.peek		= atm_tc_peek,
-	.init		= atm_tc_init,
-	.reset		= atm_tc_reset,
-	.destroy	= atm_tc_destroy,
-	.dump		= atm_tc_dump,
-	.owner		= THIS_MODULE,
-};
-
-static int __init atm_init(void)
-{
-	return register_qdisc(&atm_qdisc_ops);
-}
-
-static void __exit atm_exit(void)
-{
-	unregister_qdisc(&atm_qdisc_ops);
-}
-
-module_init(atm_init)
-module_exit(atm_exit)
-MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/atm.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/atm.json
deleted file mode 100644
index f5bc8670a67d..000000000000
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/atm.json
+++ /dev/null
@@ -1,94 +0,0 @@
-[
-    {
-        "id": "7628",
-        "name": "Create ATM with default setting",
-        "category": [
-            "qdisc",
-            "atm"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root atm",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc atm 1: root refcnt",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "390a",
-        "name": "Delete ATM with valid handle",
-        "category": [
-            "qdisc",
-            "atm"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true",
-            "$TC qdisc add dev $DUMMY handle 1: root atm"
-        ],
-        "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 1: root",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc atm 1: root refcnt",
-        "matchCount": "0",
-        "teardown": [
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "32a0",
-        "name": "Show ATM class",
-        "category": [
-            "qdisc",
-            "atm"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root atm",
-        "expExitCode": "0",
-        "verifyCmd": "$TC class show dev $DUMMY",
-        "matchPattern": "class atm 1: parent 1:",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "6310",
-        "name": "Dump ATM stats",
-        "category": [
-            "qdisc",
-            "atm"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root atm",
-        "expExitCode": "0",
-        "verifyCmd": "$TC -s qdisc show dev $DUMMY",
-        "matchPattern": "qdisc atm 1: root refcnt",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    }
-]
-- 
cgit 


From bbe77c14ee6185a61ba6d5e435c1cbb489d2a9ed Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Tue, 14 Feb 2023 08:49:13 -0500
Subject: net/sched: Retire dsmark qdisc

The dsmark qdisc has served us well over the years for diffserv but has not
been getting much attention due to other more popular approaches to do diffserv
services. Most recently it has become a shooting target for syzkaller. For this
reason, we are retiring it.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/sched/Kconfig                                  |  11 -
 net/sched/Makefile                                 |   1 -
 net/sched/sch_dsmark.c                             | 518 ---------------------
 .../tc-testing/tc-tests/qdiscs/dsmark.json         | 140 ------
 4 files changed, 670 deletions(-)
 delete mode 100644 net/sched/sch_dsmark.c
 delete mode 100644 tools/testing/selftests/tc-testing/tc-tests/qdiscs/dsmark.json

(limited to 'tools')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 869321db12f7..d909f289a67f 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -192,17 +192,6 @@ config NET_SCH_GRED
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_gred.
 
-config NET_SCH_DSMARK
-	tristate "Differentiated Services marker (DSMARK)"
-	help
-	  Say Y if you want to schedule packets according to the
-	  Differentiated Services architecture proposed in RFC 2475.
-	  Technical information on this method, with pointers to associated
-	  RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
-
-	  To compile this code as a module, choose M here: the
-	  module will be called sch_dsmark.
-
 config NET_SCH_NETEM
 	tristate "Network emulator (NETEM)"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index d2612b47530c..0852e989af96 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -38,7 +38,6 @@ obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o
 obj-$(CONFIG_NET_SCH_INGRESS)	+= sch_ingress.o
-obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o
 obj-$(CONFIG_NET_SCH_SFB)	+= sch_sfb.o
 obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
 obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
deleted file mode 100644
index 401ffaf87d62..000000000000
--- a/net/sched/sch_dsmark.c
+++ /dev/null
@@ -1,518 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* net/sched/sch_dsmark.c - Differentiated Services field marker */
-
-/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
-
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
-#include <linux/bitops.h>
-#include <net/pkt_sched.h>
-#include <net/pkt_cls.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <asm/byteorder.h>
-
-/*
- * classid	class		marking
- * -------	-----		-------
- *   n/a	  0		n/a
- *   x:0	  1		use entry [0]
- *   ...	 ...		...
- *   x:y y>0	 y+1		use entry [y]
- *   ...	 ...		...
- * x:indices-1	indices		use entry [indices-1]
- *   ...	 ...		...
- *   x:y	 y+1		use entry [y & (indices-1)]
- *   ...	 ...		...
- * 0xffff	0x10000		use entry [indices-1]
- */
-
-
-#define NO_DEFAULT_INDEX	(1 << 16)
-
-struct mask_value {
-	u8			mask;
-	u8			value;
-};
-
-struct dsmark_qdisc_data {
-	struct Qdisc		*q;
-	struct tcf_proto __rcu	*filter_list;
-	struct tcf_block	*block;
-	struct mask_value	*mv;
-	u16			indices;
-	u8			set_tc_index;
-	u32			default_index;	/* index range is 0...0xffff */
-#define DSMARK_EMBEDDED_SZ	16
-	struct mask_value	embedded[DSMARK_EMBEDDED_SZ];
-};
-
-static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
-{
-	return index <= p->indices && index > 0;
-}
-
-/* ------------------------- Class/flow operations ------------------------- */
-
-static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
-			struct Qdisc *new, struct Qdisc **old,
-			struct netlink_ext_ack *extack)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
-	pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n",
-		 __func__, sch, p, new, old);
-
-	if (new == NULL) {
-		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-					sch->handle, NULL);
-		if (new == NULL)
-			new = &noop_qdisc;
-	}
-
-	*old = qdisc_replace(sch, new, &p->q);
-	return 0;
-}
-
-static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-	return p->q;
-}
-
-static unsigned long dsmark_find(struct Qdisc *sch, u32 classid)
-{
-	return TC_H_MIN(classid) + 1;
-}
-
-static unsigned long dsmark_bind_filter(struct Qdisc *sch,
-					unsigned long parent, u32 classid)
-{
-	pr_debug("%s(sch %p,[qdisc %p],classid %x)\n",
-		 __func__, sch, qdisc_priv(sch), classid);
-
-	return dsmark_find(sch, classid);
-}
-
-static void dsmark_unbind_filter(struct Qdisc *sch, unsigned long cl)
-{
-}
-
-static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
-	[TCA_DSMARK_INDICES]		= { .type = NLA_U16 },
-	[TCA_DSMARK_DEFAULT_INDEX]	= { .type = NLA_U16 },
-	[TCA_DSMARK_SET_TC_INDEX]	= { .type = NLA_FLAG },
-	[TCA_DSMARK_MASK]		= { .type = NLA_U8 },
-	[TCA_DSMARK_VALUE]		= { .type = NLA_U8 },
-};
-
-static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
-			 struct nlattr **tca, unsigned long *arg,
-			 struct netlink_ext_ack *extack)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-	struct nlattr *opt = tca[TCA_OPTIONS];
-	struct nlattr *tb[TCA_DSMARK_MAX + 1];
-	int err = -EINVAL;
-
-	pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
-		 __func__, sch, p, classid, parent, *arg);
-
-	if (!dsmark_valid_index(p, *arg)) {
-		err = -ENOENT;
-		goto errout;
-	}
-
-	if (!opt)
-		goto errout;
-
-	err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt,
-					  dsmark_policy, NULL);
-	if (err < 0)
-		goto errout;
-
-	if (tb[TCA_DSMARK_VALUE])
-		p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]);
-
-	if (tb[TCA_DSMARK_MASK])
-		p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
-
-	err = 0;
-
-errout:
-	return err;
-}
-
-static int dsmark_delete(struct Qdisc *sch, unsigned long arg,
-			 struct netlink_ext_ack *extack)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
-	if (!dsmark_valid_index(p, arg))
-		return -EINVAL;
-
-	p->mv[arg - 1].mask = 0xff;
-	p->mv[arg - 1].value = 0;
-
-	return 0;
-}
-
-static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-	int i;
-
-	pr_debug("%s(sch %p,[qdisc %p],walker %p)\n",
-		 __func__, sch, p, walker);
-
-	if (walker->stop)
-		return;
-
-	for (i = 0; i < p->indices; i++) {
-		if (p->mv[i].mask == 0xff && !p->mv[i].value) {
-			walker->count++;
-			continue;
-		}
-		if (!tc_qdisc_stats_dump(sch, i + 1, walker))
-			break;
-	}
-}
-
-static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl,
-					  struct netlink_ext_ack *extack)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
-	return p->block;
-}
-
-/* --------------------------- Qdisc operations ---------------------------- */
-
-static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
-			  struct sk_buff **to_free)
-{
-	unsigned int len = qdisc_pkt_len(skb);
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-	int err;
-
-	pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
-
-	if (p->set_tc_index) {
-		int wlen = skb_network_offset(skb);
-
-		switch (skb_protocol(skb, true)) {
-		case htons(ETH_P_IP):
-			wlen += sizeof(struct iphdr);
-			if (!pskb_may_pull(skb, wlen) ||
-			    skb_try_make_writable(skb, wlen))
-				goto drop;
-
-			skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
-				& ~INET_ECN_MASK;
-			break;
-
-		case htons(ETH_P_IPV6):
-			wlen += sizeof(struct ipv6hdr);
-			if (!pskb_may_pull(skb, wlen) ||
-			    skb_try_make_writable(skb, wlen))
-				goto drop;
-
-			skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
-				& ~INET_ECN_MASK;
-			break;
-		default:
-			skb->tc_index = 0;
-			break;
-		}
-	}
-
-	if (TC_H_MAJ(skb->priority) == sch->handle)
-		skb->tc_index = TC_H_MIN(skb->priority);
-	else {
-		struct tcf_result res;
-		struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
-		int result = tcf_classify(skb, NULL, fl, &res, false);
-
-		pr_debug("result %d class 0x%04x\n", result, res.classid);
-
-		switch (result) {
-#ifdef CONFIG_NET_CLS_ACT
-		case TC_ACT_QUEUED:
-		case TC_ACT_STOLEN:
-		case TC_ACT_TRAP:
-			__qdisc_drop(skb, to_free);
-			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
-
-		case TC_ACT_SHOT:
-			goto drop;
-#endif
-		case TC_ACT_OK:
-			skb->tc_index = TC_H_MIN(res.classid);
-			break;
-
-		default:
-			if (p->default_index != NO_DEFAULT_INDEX)
-				skb->tc_index = p->default_index;
-			break;
-		}
-	}
-
-	err = qdisc_enqueue(skb, p->q, to_free);
-	if (err != NET_XMIT_SUCCESS) {
-		if (net_xmit_drop_count(err))
-			qdisc_qstats_drop(sch);
-		return err;
-	}
-
-	sch->qstats.backlog += len;
-	sch->q.qlen++;
-
-	return NET_XMIT_SUCCESS;
-
-drop:
-	qdisc_drop(skb, sch, to_free);
-	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-}
-
-static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-	struct sk_buff *skb;
-	u32 index;
-
-	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
-	skb = qdisc_dequeue_peeked(p->q);
-	if (skb == NULL)
-		return NULL;
-
-	qdisc_bstats_update(sch, skb);
-	qdisc_qstats_backlog_dec(sch, skb);
-	sch->q.qlen--;
-
-	index = skb->tc_index & (p->indices - 1);
-	pr_debug("index %d->%d\n", skb->tc_index, index);
-
-	switch (skb_protocol(skb, true)) {
-	case htons(ETH_P_IP):
-		ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask,
-				    p->mv[index].value);
-			break;
-	case htons(ETH_P_IPV6):
-		ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask,
-				    p->mv[index].value);
-			break;
-	default:
-		/*
-		 * Only complain if a change was actually attempted.
-		 * This way, we can send non-IP traffic through dsmark
-		 * and don't need yet another qdisc as a bypass.
-		 */
-		if (p->mv[index].mask != 0xff || p->mv[index].value)
-			pr_warn("%s: unsupported protocol %d\n",
-				__func__, ntohs(skb_protocol(skb, true)));
-		break;
-	}
-
-	return skb;
-}
-
-static struct sk_buff *dsmark_peek(struct Qdisc *sch)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
-	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
-	return p->q->ops->peek(p->q);
-}
-
-static int dsmark_init(struct Qdisc *sch, struct nlattr *opt,
-		       struct netlink_ext_ack *extack)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-	struct nlattr *tb[TCA_DSMARK_MAX + 1];
-	int err = -EINVAL;
-	u32 default_index = NO_DEFAULT_INDEX;
-	u16 indices;
-	int i;
-
-	pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);
-
-	if (!opt)
-		goto errout;
-
-	err = tcf_block_get(&p->block, &p->filter_list, sch, extack);
-	if (err)
-		return err;
-
-	err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt,
-					  dsmark_policy, NULL);
-	if (err < 0)
-		goto errout;
-
-	err = -EINVAL;
-	if (!tb[TCA_DSMARK_INDICES])
-		goto errout;
-	indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
-
-	if (hweight32(indices) != 1)
-		goto errout;
-
-	if (tb[TCA_DSMARK_DEFAULT_INDEX])
-		default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);
-
-	if (indices <= DSMARK_EMBEDDED_SZ)
-		p->mv = p->embedded;
-	else
-		p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL);
-	if (!p->mv) {
-		err = -ENOMEM;
-		goto errout;
-	}
-	for (i = 0; i < indices; i++) {
-		p->mv[i].mask = 0xff;
-		p->mv[i].value = 0;
-	}
-	p->indices = indices;
-	p->default_index = default_index;
-	p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
-
-	p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle,
-				 NULL);
-	if (p->q == NULL)
-		p->q = &noop_qdisc;
-	else
-		qdisc_hash_add(p->q, true);
-
-	pr_debug("%s: qdisc %p\n", __func__, p->q);
-
-	err = 0;
-errout:
-	return err;
-}
-
-static void dsmark_reset(struct Qdisc *sch)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
-	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-	if (p->q)
-		qdisc_reset(p->q);
-}
-
-static void dsmark_destroy(struct Qdisc *sch)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-
-	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
-
-	tcf_block_put(p->block);
-	qdisc_put(p->q);
-	if (p->mv != p->embedded)
-		kfree(p->mv);
-}
-
-static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
-			     struct sk_buff *skb, struct tcmsg *tcm)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-	struct nlattr *opts = NULL;
-
-	pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);
-
-	if (!dsmark_valid_index(p, cl))
-		return -EINVAL;
-
-	tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
-	tcm->tcm_info = p->q->handle;
-
-	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
-	if (opts == NULL)
-		goto nla_put_failure;
-	if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) ||
-	    nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value))
-		goto nla_put_failure;
-
-	return nla_nest_end(skb, opts);
-
-nla_put_failure:
-	nla_nest_cancel(skb, opts);
-	return -EMSGSIZE;
-}
-
-static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
-	struct dsmark_qdisc_data *p = qdisc_priv(sch);
-	struct nlattr *opts = NULL;
-
-	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
-	if (opts == NULL)
-		goto nla_put_failure;
-	if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
-		goto nla_put_failure;
-
-	if (p->default_index != NO_DEFAULT_INDEX &&
-	    nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index))
-		goto nla_put_failure;
-
-	if (p->set_tc_index &&
-	    nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX))
-		goto nla_put_failure;
-
-	return nla_nest_end(skb, opts);
-
-nla_put_failure:
-	nla_nest_cancel(skb, opts);
-	return -EMSGSIZE;
-}
-
-static const struct Qdisc_class_ops dsmark_class_ops = {
-	.graft		=	dsmark_graft,
-	.leaf		=	dsmark_leaf,
-	.find		=	dsmark_find,
-	.change		=	dsmark_change,
-	.delete		=	dsmark_delete,
-	.walk		=	dsmark_walk,
-	.tcf_block	=	dsmark_tcf_block,
-	.bind_tcf	=	dsmark_bind_filter,
-	.unbind_tcf	=	dsmark_unbind_filter,
-	.dump		=	dsmark_dump_class,
-};
-
-static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
-	.next		=	NULL,
-	.cl_ops		=	&dsmark_class_ops,
-	.id		=	"dsmark",
-	.priv_size	=	sizeof(struct dsmark_qdisc_data),
-	.enqueue	=	dsmark_enqueue,
-	.dequeue	=	dsmark_dequeue,
-	.peek		=	dsmark_peek,
-	.init		=	dsmark_init,
-	.reset		=	dsmark_reset,
-	.destroy	=	dsmark_destroy,
-	.change		=	NULL,
-	.dump		=	dsmark_dump,
-	.owner		=	THIS_MODULE,
-};
-
-static int __init dsmark_module_init(void)
-{
-	return register_qdisc(&dsmark_qdisc_ops);
-}
-
-static void __exit dsmark_module_exit(void)
-{
-	unregister_qdisc(&dsmark_qdisc_ops);
-}
-
-module_init(dsmark_module_init)
-module_exit(dsmark_module_exit)
-
-MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/dsmark.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/dsmark.json
deleted file mode 100644
index c030795f9c37..000000000000
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/dsmark.json
+++ /dev/null
@@ -1,140 +0,0 @@
-[
-    {
-        "id": "6345",
-        "name": "Create DSMARK with default setting",
-        "category": [
-            "qdisc",
-            "dsmark"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root dsmark indices 1024",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc dsmark 1: root refcnt [0-9]+ indices 0x0400",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "3462",
-        "name": "Create DSMARK with default_index setting",
-        "category": [
-            "qdisc",
-            "dsmark"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root dsmark indices 1024 default_index 512",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc dsmark 1: root refcnt [0-9]+ indices 0x0400 default_index 0x0200",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "ca95",
-        "name": "Create DSMARK with set_tc_index flag",
-        "category": [
-            "qdisc",
-            "dsmark"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root dsmark indices 1024 set_tc_index",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc dsmark 1: root refcnt [0-9]+ indices 0x0400 set_tc_index",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "a950",
-        "name": "Create DSMARK with multiple setting",
-        "category": [
-            "qdisc",
-            "dsmark"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root dsmark indices 1024 default_index 1024 set_tc_index",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc dsmark 1: root refcnt [0-9]+ indices 0x0400 default_index 0x0400 set_tc_index",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "4092",
-        "name": "Delete DSMARK with handle",
-        "category": [
-            "qdisc",
-            "dsmark"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true",
-            "$TC qdisc add dev $DUMMY handle 1: root dsmark indices 1024 default_index 1024"
-        ],
-        "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 1: root",
-        "expExitCode": "0",
-        "verifyCmd": "$TC qdisc show dev $DUMMY",
-        "matchPattern": "qdisc dsmark 1: root refcnt [0-9]+ indices 0x0400",
-        "matchCount": "0",
-        "teardown": [
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    },
-    {
-        "id": "5930",
-        "name": "Show DSMARK class",
-        "category": [
-            "qdisc",
-            "dsmark"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$IP link add dev $DUMMY type dummy || /bin/true"
-        ],
-        "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root dsmark indices 1024",
-        "expExitCode": "0",
-        "verifyCmd": "$TC class show dev $DUMMY",
-        "matchPattern": "class dsmark 1:",
-        "matchCount": "0",
-        "teardown": [
-            "$TC qdisc del dev $DUMMY handle 1: root",
-            "$IP link del dev $DUMMY type dummy"
-        ]
-    }
-]
-- 
cgit 


From 8c710f75256bb3cf05ac7b1672c82b92c43f3d28 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Tue, 14 Feb 2023 08:49:14 -0500
Subject: net/sched: Retire tcindex classifier

The tcindex classifier has served us well for about a quarter of a century
but has not been getting much TLC due to lack of known users. Most recently
it has become easy prey to syzkaller. For this reason, we are retiring it.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tc_wrapper.h                           |   5 -
 net/sched/Kconfig                                  |  11 -
 net/sched/Makefile                                 |   1 -
 net/sched/cls_tcindex.c                            | 716 ---------------------
 .../tc-testing/tc-tests/filters/tcindex.json       | 227 -------
 5 files changed, 960 deletions(-)
 delete mode 100644 net/sched/cls_tcindex.c
 delete mode 100644 tools/testing/selftests/tc-testing/tc-tests/filters/tcindex.json

(limited to 'tools')

diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h
index d323fffb839a..8ba241760d0a 100644
--- a/include/net/tc_wrapper.h
+++ b/include/net/tc_wrapper.h
@@ -154,7 +154,6 @@ TC_INDIRECT_FILTER_DECLARE(mall_classify);
 TC_INDIRECT_FILTER_DECLARE(route4_classify);
 TC_INDIRECT_FILTER_DECLARE(rsvp_classify);
 TC_INDIRECT_FILTER_DECLARE(rsvp6_classify);
-TC_INDIRECT_FILTER_DECLARE(tcindex_classify);
 TC_INDIRECT_FILTER_DECLARE(u32_classify);
 
 static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -207,10 +206,6 @@ static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	if (tp->classify == rsvp6_classify)
 		return rsvp6_classify(skb, tp, res);
 #endif
-#if IS_BUILTIN(CONFIG_NET_CLS_TCINDEX)
-	if (tp->classify == tcindex_classify)
-		return tcindex_classify(skb, tp, res);
-#endif
 
 skip:
 	return tp->classify(skb, tp, res);
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index d909f289a67f..111883853f08 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -468,17 +468,6 @@ config NET_CLS_BASIC
 	  To compile this code as a module, choose M here: the
 	  module will be called cls_basic.
 
-config NET_CLS_TCINDEX
-	tristate "Traffic-Control Index (TCINDEX)"
-	select NET_CLS
-	help
-	  Say Y here if you want to be able to classify packets based on
-	  traffic control indices. You will want this feature if you want
-	  to implement Differentiated Services together with DSMARK.
-
-	  To compile this code as a module, choose M here: the
-	  module will be called cls_tcindex.
-
 config NET_CLS_ROUTE4
 	tristate "Routing decision (ROUTE)"
 	depends on INET
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 0852e989af96..ea236d258c16 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -68,7 +68,6 @@ obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
 obj-$(CONFIG_NET_CLS_RSVP)	+= cls_rsvp.o
-obj-$(CONFIG_NET_CLS_TCINDEX)	+= cls_tcindex.o
 obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o
 obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o
 obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
deleted file mode 100644
index ee2a050c887b..000000000000
--- a/net/sched/cls_tcindex.c
+++ /dev/null
@@ -1,716 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * net/sched/cls_tcindex.c	Packet classifier for skb->tc_index
- *
- * Written 1998,1999 by Werner Almesberger, EPFL ICA
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/refcount.h>
-#include <net/act_api.h>
-#include <net/netlink.h>
-#include <net/pkt_cls.h>
-#include <net/sch_generic.h>
-#include <net/tc_wrapper.h>
-
-/*
- * Passing parameters to the root seems to be done more awkwardly than really
- * necessary. At least, u32 doesn't seem to use such dirty hacks. To be
- * verified. FIXME.
- */
-
-#define PERFECT_HASH_THRESHOLD	64	/* use perfect hash if not bigger */
-#define DEFAULT_HASH_SIZE	64	/* optimized for diffserv */
-
-
-struct tcindex_data;
-
-struct tcindex_filter_result {
-	struct tcf_exts		exts;
-	struct tcf_result	res;
-	struct tcindex_data	*p;
-	struct rcu_work		rwork;
-};
-
-struct tcindex_filter {
-	u16 key;
-	struct tcindex_filter_result result;
-	struct tcindex_filter __rcu *next;
-	struct rcu_work rwork;
-};
-
-
-struct tcindex_data {
-	struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
-	struct tcindex_filter __rcu **h; /* imperfect hash; */
-	struct tcf_proto *tp;
-	u16 mask;		/* AND key with mask */
-	u32 shift;		/* shift ANDed key to the right */
-	u32 hash;		/* hash table size; 0 if undefined */
-	u32 alloc_hash;		/* allocated size */
-	u32 fall_through;	/* 0: only classify if explicit match */
-	refcount_t refcnt;	/* a temporary refcnt for perfect hash */
-	struct rcu_work rwork;
-};
-
-static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
-{
-	return tcf_exts_has_actions(&r->exts) || r->res.classid;
-}
-
-static void tcindex_data_get(struct tcindex_data *p)
-{
-	refcount_inc(&p->refcnt);
-}
-
-static void tcindex_data_put(struct tcindex_data *p)
-{
-	if (refcount_dec_and_test(&p->refcnt)) {
-		kfree(p->perfect);
-		kfree(p->h);
-		kfree(p);
-	}
-}
-
-static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
-						    u16 key)
-{
-	if (p->perfect) {
-		struct tcindex_filter_result *f = p->perfect + key;
-
-		return tcindex_filter_is_set(f) ? f : NULL;
-	} else if (p->h) {
-		struct tcindex_filter __rcu **fp;
-		struct tcindex_filter *f;
-
-		fp = &p->h[key % p->hash];
-		for (f = rcu_dereference_bh_rtnl(*fp);
-		     f;
-		     fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
-			if (f->key == key)
-				return &f->result;
-	}
-
-	return NULL;
-}
-
-TC_INDIRECT_SCOPE int tcindex_classify(struct sk_buff *skb,
-				       const struct tcf_proto *tp,
-				       struct tcf_result *res)
-{
-	struct tcindex_data *p = rcu_dereference_bh(tp->root);
-	struct tcindex_filter_result *f;
-	int key = (skb->tc_index & p->mask) >> p->shift;
-
-	pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
-		 skb, tp, res, p);
-
-	f = tcindex_lookup(p, key);
-	if (!f) {
-		struct Qdisc *q = tcf_block_q(tp->chain->block);
-
-		if (!p->fall_through)
-			return -1;
-		res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key);
-		res->class = 0;
-		pr_debug("alg 0x%x\n", res->classid);
-		return 0;
-	}
-	*res = f->res;
-	pr_debug("map 0x%x\n", res->classid);
-
-	return tcf_exts_exec(skb, &f->exts, res);
-}
-
-
-static void *tcindex_get(struct tcf_proto *tp, u32 handle)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter_result *r;
-
-	pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
-	if (p->perfect && handle >= p->alloc_hash)
-		return NULL;
-	r = tcindex_lookup(p, handle);
-	return r && tcindex_filter_is_set(r) ? r : NULL;
-}
-
-static int tcindex_init(struct tcf_proto *tp)
-{
-	struct tcindex_data *p;
-
-	pr_debug("tcindex_init(tp %p)\n", tp);
-	p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL);
-	if (!p)
-		return -ENOMEM;
-
-	p->mask = 0xffff;
-	p->hash = DEFAULT_HASH_SIZE;
-	p->fall_through = 1;
-	refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */
-
-	rcu_assign_pointer(tp->root, p);
-	return 0;
-}
-
-static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
-{
-	tcf_exts_destroy(&r->exts);
-	tcf_exts_put_net(&r->exts);
-	tcindex_data_put(r->p);
-}
-
-static void tcindex_destroy_rexts_work(struct work_struct *work)
-{
-	struct tcindex_filter_result *r;
-
-	r = container_of(to_rcu_work(work),
-			 struct tcindex_filter_result,
-			 rwork);
-	rtnl_lock();
-	__tcindex_destroy_rexts(r);
-	rtnl_unlock();
-}
-
-static void __tcindex_destroy_fexts(struct tcindex_filter *f)
-{
-	tcf_exts_destroy(&f->result.exts);
-	tcf_exts_put_net(&f->result.exts);
-	kfree(f);
-}
-
-static void tcindex_destroy_fexts_work(struct work_struct *work)
-{
-	struct tcindex_filter *f = container_of(to_rcu_work(work),
-						struct tcindex_filter,
-						rwork);
-
-	rtnl_lock();
-	__tcindex_destroy_fexts(f);
-	rtnl_unlock();
-}
-
-static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
-			  bool rtnl_held, struct netlink_ext_ack *extack)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter_result *r = arg;
-	struct tcindex_filter __rcu **walk;
-	struct tcindex_filter *f = NULL;
-
-	pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p);
-	if (p->perfect) {
-		if (!r->res.class)
-			return -ENOENT;
-	} else {
-		int i;
-
-		for (i = 0; i < p->hash; i++) {
-			walk = p->h + i;
-			for (f = rtnl_dereference(*walk); f;
-			     walk = &f->next, f = rtnl_dereference(*walk)) {
-				if (&f->result == r)
-					goto found;
-			}
-		}
-		return -ENOENT;
-
-found:
-		rcu_assign_pointer(*walk, rtnl_dereference(f->next));
-	}
-	tcf_unbind_filter(tp, &r->res);
-	/* all classifiers are required to call tcf_exts_destroy() after rcu
-	 * grace period, since converted-to-rcu actions are relying on that
-	 * in cleanup() callback
-	 */
-	if (f) {
-		if (tcf_exts_get_net(&f->result.exts))
-			tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work);
-		else
-			__tcindex_destroy_fexts(f);
-	} else {
-		tcindex_data_get(p);
-
-		if (tcf_exts_get_net(&r->exts))
-			tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
-		else
-			__tcindex_destroy_rexts(r);
-	}
-
-	*last = false;
-	return 0;
-}
-
-static void tcindex_destroy_work(struct work_struct *work)
-{
-	struct tcindex_data *p = container_of(to_rcu_work(work),
-					      struct tcindex_data,
-					      rwork);
-
-	tcindex_data_put(p);
-}
-
-static inline int
-valid_perfect_hash(struct tcindex_data *p)
-{
-	return  p->hash > (p->mask >> p->shift);
-}
-
-static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
-	[TCA_TCINDEX_HASH]		= { .type = NLA_U32 },
-	[TCA_TCINDEX_MASK]		= { .type = NLA_U16 },
-	[TCA_TCINDEX_SHIFT]		= { .type = NLA_U32 },
-	[TCA_TCINDEX_FALL_THROUGH]	= { .type = NLA_U32 },
-	[TCA_TCINDEX_CLASSID]		= { .type = NLA_U32 },
-};
-
-static int tcindex_filter_result_init(struct tcindex_filter_result *r,
-				      struct tcindex_data *p,
-				      struct net *net)
-{
-	memset(r, 0, sizeof(*r));
-	r->p = p;
-	return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT,
-			     TCA_TCINDEX_POLICE);
-}
-
-static void tcindex_free_perfect_hash(struct tcindex_data *cp);
-
-static void tcindex_partial_destroy_work(struct work_struct *work)
-{
-	struct tcindex_data *p = container_of(to_rcu_work(work),
-					      struct tcindex_data,
-					      rwork);
-
-	rtnl_lock();
-	if (p->perfect)
-		tcindex_free_perfect_hash(p);
-	kfree(p);
-	rtnl_unlock();
-}
-
-static void tcindex_free_perfect_hash(struct tcindex_data *cp)
-{
-	int i;
-
-	for (i = 0; i < cp->hash; i++)
-		tcf_exts_destroy(&cp->perfect[i].exts);
-	kfree(cp->perfect);
-}
-
-static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
-{
-	int i, err = 0;
-
-	cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result),
-			      GFP_KERNEL | __GFP_NOWARN);
-	if (!cp->perfect)
-		return -ENOMEM;
-
-	for (i = 0; i < cp->hash; i++) {
-		err = tcf_exts_init(&cp->perfect[i].exts, net,
-				    TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
-		if (err < 0)
-			goto errout;
-		cp->perfect[i].p = cp;
-	}
-
-	return 0;
-
-errout:
-	tcindex_free_perfect_hash(cp);
-	return err;
-}
-
-static int
-tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
-		  u32 handle, struct tcindex_data *p,
-		  struct tcindex_filter_result *r, struct nlattr **tb,
-		  struct nlattr *est, u32 flags, struct netlink_ext_ack *extack)
-{
-	struct tcindex_filter_result new_filter_result;
-	struct tcindex_data *cp = NULL, *oldp;
-	struct tcindex_filter *f = NULL; /* make gcc behave */
-	struct tcf_result cr = {};
-	int err, balloc = 0;
-	struct tcf_exts e;
-
-	err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
-	if (err < 0)
-		return err;
-	err = tcf_exts_validate(net, tp, tb, est, &e, flags, extack);
-	if (err < 0)
-		goto errout;
-
-	err = -ENOMEM;
-	/* tcindex_data attributes must look atomic to classifier/lookup so
-	 * allocate new tcindex data and RCU assign it onto root. Keeping
-	 * perfect hash and hash pointers from old data.
-	 */
-	cp = kzalloc(sizeof(*cp), GFP_KERNEL);
-	if (!cp)
-		goto errout;
-
-	cp->mask = p->mask;
-	cp->shift = p->shift;
-	cp->hash = p->hash;
-	cp->alloc_hash = p->alloc_hash;
-	cp->fall_through = p->fall_through;
-	cp->tp = tp;
-	refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */
-
-	if (tb[TCA_TCINDEX_HASH])
-		cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
-
-	if (tb[TCA_TCINDEX_MASK])
-		cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
-
-	if (tb[TCA_TCINDEX_SHIFT]) {
-		cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
-		if (cp->shift > 16) {
-			err = -EINVAL;
-			goto errout;
-		}
-	}
-	if (!cp->hash) {
-		/* Hash not specified, use perfect hash if the upper limit
-		 * of the hashing index is below the threshold.
-		 */
-		if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
-			cp->hash = (cp->mask >> cp->shift) + 1;
-		else
-			cp->hash = DEFAULT_HASH_SIZE;
-	}
-
-	if (p->perfect) {
-		int i;
-
-		if (tcindex_alloc_perfect_hash(net, cp) < 0)
-			goto errout;
-		cp->alloc_hash = cp->hash;
-		for (i = 0; i < min(cp->hash, p->hash); i++)
-			cp->perfect[i].res = p->perfect[i].res;
-		balloc = 1;
-	}
-	cp->h = p->h;
-
-	err = tcindex_filter_result_init(&new_filter_result, cp, net);
-	if (err < 0)
-		goto errout_alloc;
-	if (r)
-		cr = r->res;
-
-	err = -EBUSY;
-
-	/* Hash already allocated, make sure that we still meet the
-	 * requirements for the allocated hash.
-	 */
-	if (cp->perfect) {
-		if (!valid_perfect_hash(cp) ||
-		    cp->hash > cp->alloc_hash)
-			goto errout_alloc;
-	} else if (cp->h && cp->hash != cp->alloc_hash) {
-		goto errout_alloc;
-	}
-
-	err = -EINVAL;
-	if (tb[TCA_TCINDEX_FALL_THROUGH])
-		cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
-
-	if (!cp->perfect && !cp->h)
-		cp->alloc_hash = cp->hash;
-
-	/* Note: this could be as restrictive as if (handle & ~(mask >> shift))
-	 * but then, we'd fail handles that may become valid after some future
-	 * mask change. While this is extremely unlikely to ever matter,
-	 * the check below is safer (and also more backwards-compatible).
-	 */
-	if (cp->perfect || valid_perfect_hash(cp))
-		if (handle >= cp->alloc_hash)
-			goto errout_alloc;
-
-
-	err = -ENOMEM;
-	if (!cp->perfect && !cp->h) {
-		if (valid_perfect_hash(cp)) {
-			if (tcindex_alloc_perfect_hash(net, cp) < 0)
-				goto errout_alloc;
-			balloc = 1;
-		} else {
-			struct tcindex_filter __rcu **hash;
-
-			hash = kcalloc(cp->hash,
-				       sizeof(struct tcindex_filter *),
-				       GFP_KERNEL);
-
-			if (!hash)
-				goto errout_alloc;
-
-			cp->h = hash;
-			balloc = 2;
-		}
-	}
-
-	if (cp->perfect)
-		r = cp->perfect + handle;
-	else
-		r = tcindex_lookup(cp, handle) ? : &new_filter_result;
-
-	if (r == &new_filter_result) {
-		f = kzalloc(sizeof(*f), GFP_KERNEL);
-		if (!f)
-			goto errout_alloc;
-		f->key = handle;
-		f->next = NULL;
-		err = tcindex_filter_result_init(&f->result, cp, net);
-		if (err < 0) {
-			kfree(f);
-			goto errout_alloc;
-		}
-	}
-
-	if (tb[TCA_TCINDEX_CLASSID]) {
-		cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
-		tcf_bind_filter(tp, &cr, base);
-	}
-
-	oldp = p;
-	r->res = cr;
-	tcf_exts_change(&r->exts, &e);
-
-	rcu_assign_pointer(tp->root, cp);
-
-	if (r == &new_filter_result) {
-		struct tcindex_filter *nfp;
-		struct tcindex_filter __rcu **fp;
-
-		f->result.res = r->res;
-		tcf_exts_change(&f->result.exts, &r->exts);
-
-		fp = cp->h + (handle % cp->hash);
-		for (nfp = rtnl_dereference(*fp);
-		     nfp;
-		     fp = &nfp->next, nfp = rtnl_dereference(*fp))
-				; /* nothing */
-
-		rcu_assign_pointer(*fp, f);
-	} else {
-		tcf_exts_destroy(&new_filter_result.exts);
-	}
-
-	if (oldp)
-		tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work);
-	return 0;
-
-errout_alloc:
-	if (balloc == 1)
-		tcindex_free_perfect_hash(cp);
-	else if (balloc == 2)
-		kfree(cp->h);
-	tcf_exts_destroy(&new_filter_result.exts);
-errout:
-	kfree(cp);
-	tcf_exts_destroy(&e);
-	return err;
-}
-
-static int
-tcindex_change(struct net *net, struct sk_buff *in_skb,
-	       struct tcf_proto *tp, unsigned long base, u32 handle,
-	       struct nlattr **tca, void **arg, u32 flags,
-	       struct netlink_ext_ack *extack)
-{
-	struct nlattr *opt = tca[TCA_OPTIONS];
-	struct nlattr *tb[TCA_TCINDEX_MAX + 1];
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter_result *r = *arg;
-	int err;
-
-	pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
-	    "p %p,r %p,*arg %p\n",
-	    tp, handle, tca, arg, opt, p, r, *arg);
-
-	if (!opt)
-		return 0;
-
-	err = nla_parse_nested_deprecated(tb, TCA_TCINDEX_MAX, opt,
-					  tcindex_policy, NULL);
-	if (err < 0)
-		return err;
-
-	return tcindex_set_parms(net, tp, base, handle, p, r, tb,
-				 tca[TCA_RATE], flags, extack);
-}
-
-static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker,
-			 bool rtnl_held)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter *f, *next;
-	int i;
-
-	pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
-	if (p->perfect) {
-		for (i = 0; i < p->hash; i++) {
-			if (!p->perfect[i].res.class)
-				continue;
-			if (!tc_cls_stats_dump(tp, walker, p->perfect + i))
-				return;
-		}
-	}
-	if (!p->h)
-		return;
-	for (i = 0; i < p->hash; i++) {
-		for (f = rtnl_dereference(p->h[i]); f; f = next) {
-			next = rtnl_dereference(f->next);
-			if (!tc_cls_stats_dump(tp, walker, &f->result))
-				return;
-		}
-	}
-}
-
-static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held,
-			    struct netlink_ext_ack *extack)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	int i;
-
-	pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
-
-	if (p->perfect) {
-		for (i = 0; i < p->hash; i++) {
-			struct tcindex_filter_result *r = p->perfect + i;
-
-			/* tcf_queue_work() does not guarantee the ordering we
-			 * want, so we have to take this refcnt temporarily to
-			 * ensure 'p' is freed after all tcindex_filter_result
-			 * here. Imperfect hash does not need this, because it
-			 * uses linked lists rather than an array.
-			 */
-			tcindex_data_get(p);
-
-			tcf_unbind_filter(tp, &r->res);
-			if (tcf_exts_get_net(&r->exts))
-				tcf_queue_work(&r->rwork,
-					       tcindex_destroy_rexts_work);
-			else
-				__tcindex_destroy_rexts(r);
-		}
-	}
-
-	for (i = 0; p->h && i < p->hash; i++) {
-		struct tcindex_filter *f, *next;
-		bool last;
-
-		for (f = rtnl_dereference(p->h[i]); f; f = next) {
-			next = rtnl_dereference(f->next);
-			tcindex_delete(tp, &f->result, &last, rtnl_held, NULL);
-		}
-	}
-
-	tcf_queue_work(&p->rwork, tcindex_destroy_work);
-}
-
-
-static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh,
-			struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
-{
-	struct tcindex_data *p = rtnl_dereference(tp->root);
-	struct tcindex_filter_result *r = fh;
-	struct nlattr *nest;
-
-	pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n",
-		 tp, fh, skb, t, p, r);
-	pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
-
-	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
-	if (nest == NULL)
-		goto nla_put_failure;
-
-	if (!fh) {
-		t->tcm_handle = ~0; /* whatever ... */
-		if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
-		    nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
-		    nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
-		    nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
-			goto nla_put_failure;
-		nla_nest_end(skb, nest);
-	} else {
-		if (p->perfect) {
-			t->tcm_handle = r - p->perfect;
-		} else {
-			struct tcindex_filter *f;
-			struct tcindex_filter __rcu **fp;
-			int i;
-
-			t->tcm_handle = 0;
-			for (i = 0; !t->tcm_handle && i < p->hash; i++) {
-				fp = &p->h[i];
-				for (f = rtnl_dereference(*fp);
-				     !t->tcm_handle && f;
-				     fp = &f->next, f = rtnl_dereference(*fp)) {
-					if (&f->result == r)
-						t->tcm_handle = f->key;
-				}
-			}
-		}
-		pr_debug("handle = %d\n", t->tcm_handle);
-		if (r->res.class &&
-		    nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
-			goto nla_put_failure;
-
-		if (tcf_exts_dump(skb, &r->exts) < 0)
-			goto nla_put_failure;
-		nla_nest_end(skb, nest);
-
-		if (tcf_exts_dump_stats(skb, &r->exts) < 0)
-			goto nla_put_failure;
-	}
-
-	return skb->len;
-
-nla_put_failure:
-	nla_nest_cancel(skb, nest);
-	return -1;
-}
-
-static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl,
-			       void *q, unsigned long base)
-{
-	struct tcindex_filter_result *r = fh;
-
-	tc_cls_bind_class(classid, cl, q, &r->res, base);
-}
-
-static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
-	.kind		=	"tcindex",
-	.classify	=	tcindex_classify,
-	.init		=	tcindex_init,
-	.destroy	=	tcindex_destroy,
-	.get		=	tcindex_get,
-	.change		=	tcindex_change,
-	.delete		=	tcindex_delete,
-	.walk		=	tcindex_walk,
-	.dump		=	tcindex_dump,
-	.bind_class	=	tcindex_bind_class,
-	.owner		=	THIS_MODULE,
-};
-
-static int __init init_tcindex(void)
-{
-	return register_tcf_proto_ops(&cls_tcindex_ops);
-}
-
-static void __exit exit_tcindex(void)
-{
-	unregister_tcf_proto_ops(&cls_tcindex_ops);
-}
-
-module_init(init_tcindex)
-module_exit(exit_tcindex)
-MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tcindex.json b/tools/testing/selftests/tc-testing/tc-tests/filters/tcindex.json
deleted file mode 100644
index 44901db70376..000000000000
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/tcindex.json
+++ /dev/null
@@ -1,227 +0,0 @@
-[
-    {
-        "id": "8293",
-        "name": "Add tcindex filter with default action",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex classid 1:1",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip tcindex",
-        "matchPattern": "^filter parent ffff: protocol ip pref 1 tcindex chain 0 handle 0x0001 classid 1:1",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "7281",
-        "name": "Add tcindex filter with hash size and pass action",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex hash 32 fall_through classid 1:1 action pass",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip tcindex",
-        "matchPattern": "^filter parent ffff: protocol ip pref.*tcindex chain [0-9]+ handle 0x0001 classid 1:1.*action order [0-9]+: gact action pass",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "b294",
-        "name": "Add tcindex filter with mask shift and reclassify action",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex hash 32 mask 1 shift 2 fall_through classid 1:1 action reclassify",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip tcindex",
-        "matchPattern": "^filter parent ffff: protocol ip pref.*tcindex chain [0-9]+ handle 0x0001 classid 1:1.*action order [0-9]+: gact action reclassify",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "0532",
-        "name": "Add tcindex filter with pass_on and continue actions",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex hash 32 mask 1 shift 2 pass_on classid 1:1 action continue",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip tcindex",
-        "matchPattern": "^filter parent ffff: protocol ip pref.*tcindex chain [0-9]+ handle 0x0001 classid 1:1.*action order [0-9]+: gact action continue",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "d473",
-        "name": "Add tcindex filter with pipe action",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex hash 32 mask 1 shift 2 fall_through classid 1:1 action pipe",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip tcindex",
-        "matchPattern": "^filter parent ffff: protocol ip pref.*tcindex chain [0-9]+ handle 0x0001 classid 1:1.*action order [0-9]+: gact action pipe",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "2940",
-        "name": "Add tcindex filter with miltiple actions",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 7 tcindex hash 32 mask 1 shift 2 fall_through classid 1:1 action skbedit mark 7 pipe action gact drop",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 7 protocol ip tcindex",
-        "matchPattern": "^filter parent ffff: protocol ip pref 7 tcindex.*handle 0x0001.*action.*skbedit.*mark 7 pipe.*action.*gact action drop",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "1893",
-        "name": "List tcindex filters",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress",
-            "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex classid 1:1",
-            "$TC filter add dev $DEV1 parent ffff: handle 2 protocol ip prio 1 tcindex classid 1:1"
-        ],
-        "cmdUnderTest": "$TC filter show dev $DEV1 parent ffff:",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "handle 0x000[0-9]+ classid 1:1",
-        "matchCount": "2",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "2041",
-        "name": "Change tcindex filter with pass action",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress",
-            "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex classid 1:1 action drop"
-        ],
-        "cmdUnderTest": "$TC filter change dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex classid 1:1 action pass",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip tcindex",
-        "matchPattern": "handle 0x0001 classid 1:1.*action order [0-9]+: gact action pass",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "9203",
-        "name": "Replace tcindex filter with pass action",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress",
-            "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex classid 1:1 action drop"
-        ],
-        "cmdUnderTest": "$TC filter replace dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex classid 1:1 action pass",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip tcindex",
-        "matchPattern": "handle 0x0001 classid 1:1.*action order [0-9]+: gact action pass",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "7957",
-        "name": "Delete tcindex filter with drop action",
-        "category": [
-            "filter",
-            "tcindex"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress",
-            "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex classid 1:1 action drop"
-        ],
-        "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: handle 1 protocol ip prio 1 tcindex classid 1:1 action drop",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip tcindex",
-        "matchPattern": "handle 0x0001 classid 1:1.*action order [0-9]+: gact action drop",
-        "matchCount": "0",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    }
-]
-- 
cgit 


From 265b4da82dbf5df04bee5a5d46b7474b1aaf326a Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Tue, 14 Feb 2023 08:49:15 -0500
Subject: net/sched: Retire rsvp classifier

The rsvp classifier has served us well for about a quarter of a century but has
has not been getting much maintenance attention due to lack of known users.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tc_wrapper.h                           |  10 -
 net/sched/Kconfig                                  |  28 -
 net/sched/Makefile                                 |   2 -
 net/sched/cls_rsvp.c                               |  26 -
 net/sched/cls_rsvp.h                               | 764 ---------------------
 net/sched/cls_rsvp6.c                              |  26 -
 .../tc-testing/tc-tests/filters/rsvp.json          | 203 ------
 7 files changed, 1059 deletions(-)
 delete mode 100644 net/sched/cls_rsvp.c
 delete mode 100644 net/sched/cls_rsvp.h
 delete mode 100644 net/sched/cls_rsvp6.c
 delete mode 100644 tools/testing/selftests/tc-testing/tc-tests/filters/rsvp.json

(limited to 'tools')

diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h
index 8ba241760d0a..a6d481b5bcbc 100644
--- a/include/net/tc_wrapper.h
+++ b/include/net/tc_wrapper.h
@@ -152,8 +152,6 @@ TC_INDIRECT_FILTER_DECLARE(flow_classify);
 TC_INDIRECT_FILTER_DECLARE(fw_classify);
 TC_INDIRECT_FILTER_DECLARE(mall_classify);
 TC_INDIRECT_FILTER_DECLARE(route4_classify);
-TC_INDIRECT_FILTER_DECLARE(rsvp_classify);
-TC_INDIRECT_FILTER_DECLARE(rsvp6_classify);
 TC_INDIRECT_FILTER_DECLARE(u32_classify);
 
 static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -198,14 +196,6 @@ static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	if (tp->classify == route4_classify)
 		return route4_classify(skb, tp, res);
 #endif
-#if IS_BUILTIN(CONFIG_NET_CLS_RSVP)
-	if (tp->classify == rsvp_classify)
-		return rsvp_classify(skb, tp, res);
-#endif
-#if IS_BUILTIN(CONFIG_NET_CLS_RSVP6)
-	if (tp->classify == rsvp6_classify)
-		return rsvp6_classify(skb, tp, res);
-#endif
 
 skip:
 	return tp->classify(skb, tp, res);
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 111883853f08..4b95cb1ac435 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -513,34 +513,6 @@ config CLS_U32_MARK
 	help
 	  Say Y here to be able to use netfilter marks as u32 key.
 
-config NET_CLS_RSVP
-	tristate "IPv4 Resource Reservation Protocol (RSVP)"
-	select NET_CLS
-	help
-	  The Resource Reservation Protocol (RSVP) permits end systems to
-	  request a minimum and maximum data flow rate for a connection; this
-	  is important for real time data such as streaming sound or video.
-
-	  Say Y here if you want to be able to classify outgoing packets based
-	  on their RSVP requests.
-
-	  To compile this code as a module, choose M here: the
-	  module will be called cls_rsvp.
-
-config NET_CLS_RSVP6
-	tristate "IPv6 Resource Reservation Protocol (RSVP6)"
-	select NET_CLS
-	help
-	  The Resource Reservation Protocol (RSVP) permits end systems to
-	  request a minimum and maximum data flow rate for a connection; this
-	  is important for real time data such as streaming sound or video.
-
-	  Say Y here if you want to be able to classify outgoing packets based
-	  on their RSVP requests and you are using the IPv6 protocol.
-
-	  To compile this code as a module, choose M here: the
-	  module will be called cls_rsvp6.
-
 config NET_CLS_FLOW
 	tristate "Flow classifier"
 	select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ea236d258c16..b5fd49641d91 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -67,8 +67,6 @@ obj-$(CONFIG_NET_SCH_TAPRIO)	+= sch_taprio.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
-obj-$(CONFIG_NET_CLS_RSVP)	+= cls_rsvp.o
-obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o
 obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o
 obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o
 obj-$(CONFIG_NET_CLS_CGROUP)	+= cls_cgroup.o
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
deleted file mode 100644
index 03d8619bd9c6..000000000000
--- a/net/sched/cls_rsvp.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/sched/cls_rsvp.c	Special RSVP packet classifier for IPv4.
- *
- * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <net/ip.h>
-#include <net/netlink.h>
-#include <net/act_api.h>
-#include <net/pkt_cls.h>
-#include <net/tc_wrapper.h>
-
-#define RSVP_DST_LEN	1
-#define RSVP_ID		"rsvp"
-#define RSVP_OPS	cls_rsvp_ops
-#define RSVP_CLS	rsvp_classify
-
-#include "cls_rsvp.h"
-MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
deleted file mode 100644
index 869efba9f834..000000000000
--- a/net/sched/cls_rsvp.h
+++ /dev/null
@@ -1,764 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * net/sched/cls_rsvp.h	Template file for RSVPv[46] classifiers.
- *
- * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-/*
-   Comparing to general packet classification problem,
-   RSVP needs only several relatively simple rules:
-
-   * (dst, protocol) are always specified,
-     so that we are able to hash them.
-   * src may be exact, or may be wildcard, so that
-     we can keep a hash table plus one wildcard entry.
-   * source port (or flow label) is important only if src is given.
-
-   IMPLEMENTATION.
-
-   We use a two level hash table: The top level is keyed by
-   destination address and protocol ID, every bucket contains a list
-   of "rsvp sessions", identified by destination address, protocol and
-   DPI(="Destination Port ID"): triple (key, mask, offset).
-
-   Every bucket has a smaller hash table keyed by source address
-   (cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
-   Every bucket is again a list of "RSVP flows", selected by
-   source address and SPI(="Source Port ID" here rather than
-   "security parameter index"): triple (key, mask, offset).
-
-
-   NOTE 1. All the packets with IPv6 extension headers (but AH and ESP)
-   and all fragmented packets go to the best-effort traffic class.
-
-
-   NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires
-   only one "Generalized Port Identifier". So that for classic
-   ah, esp (and udp,tcp) both *pi should coincide or one of them
-   should be wildcard.
-
-   At first sight, this redundancy is just a waste of CPU
-   resources. But DPI and SPI add the possibility to assign different
-   priorities to GPIs. Look also at note 4 about tunnels below.
-
-
-   NOTE 3. One complication is the case of tunneled packets.
-   We implement it as following: if the first lookup
-   matches a special session with "tunnelhdr" value not zero,
-   flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
-   In this case, we pull tunnelhdr bytes and restart lookup
-   with tunnel ID added to the list of keys. Simple and stupid 8)8)
-   It's enough for PIMREG and IPIP.
-
-
-   NOTE 4. Two GPIs make it possible to parse even GRE packets.
-   F.e. DPI can select ETH_P_IP (and necessary flags to make
-   tunnelhdr correct) in GRE protocol field and SPI matches
-   GRE key. Is it not nice? 8)8)
-
-
-   Well, as result, despite its simplicity, we get a pretty
-   powerful classification engine.  */
-
-
-struct rsvp_head {
-	u32			tmap[256/32];
-	u32			hgenerator;
-	u8			tgenerator;
-	struct rsvp_session __rcu *ht[256];
-	struct rcu_head		rcu;
-};
-
-struct rsvp_session {
-	struct rsvp_session __rcu	*next;
-	__be32				dst[RSVP_DST_LEN];
-	struct tc_rsvp_gpi		dpi;
-	u8				protocol;
-	u8				tunnelid;
-	/* 16 (src,sport) hash slots, and one wildcard source slot */
-	struct rsvp_filter __rcu	*ht[16 + 1];
-	struct rcu_head			rcu;
-};
-
-
-struct rsvp_filter {
-	struct rsvp_filter __rcu	*next;
-	__be32				src[RSVP_DST_LEN];
-	struct tc_rsvp_gpi		spi;
-	u8				tunnelhdr;
-
-	struct tcf_result		res;
-	struct tcf_exts			exts;
-
-	u32				handle;
-	struct rsvp_session		*sess;
-	struct rcu_work			rwork;
-};
-
-static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
-{
-	unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
-
-	h ^= h>>16;
-	h ^= h>>8;
-	return (h ^ protocol ^ tunnelid) & 0xFF;
-}
-
-static inline unsigned int hash_src(__be32 *src)
-{
-	unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
-
-	h ^= h>>16;
-	h ^= h>>8;
-	h ^= h>>4;
-	return h & 0xF;
-}
-
-#define RSVP_APPLY_RESULT()				\
-{							\
-	int r = tcf_exts_exec(skb, &f->exts, res);	\
-	if (r < 0)					\
-		continue;				\
-	else if (r > 0)					\
-		return r;				\
-}
-
-TC_INDIRECT_SCOPE int RSVP_CLS(struct sk_buff *skb, const struct tcf_proto *tp,
-			       struct tcf_result *res)
-{
-	struct rsvp_head *head = rcu_dereference_bh(tp->root);
-	struct rsvp_session *s;
-	struct rsvp_filter *f;
-	unsigned int h1, h2;
-	__be32 *dst, *src;
-	u8 protocol;
-	u8 tunnelid = 0;
-	u8 *xprt;
-#if RSVP_DST_LEN == 4
-	struct ipv6hdr *nhptr;
-
-	if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
-		return -1;
-	nhptr = ipv6_hdr(skb);
-#else
-	struct iphdr *nhptr;
-
-	if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
-		return -1;
-	nhptr = ip_hdr(skb);
-#endif
-restart:
-
-#if RSVP_DST_LEN == 4
-	src = &nhptr->saddr.s6_addr32[0];
-	dst = &nhptr->daddr.s6_addr32[0];
-	protocol = nhptr->nexthdr;
-	xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
-#else
-	src = &nhptr->saddr;
-	dst = &nhptr->daddr;
-	protocol = nhptr->protocol;
-	xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
-	if (ip_is_fragment(nhptr))
-		return -1;
-#endif
-
-	h1 = hash_dst(dst, protocol, tunnelid);
-	h2 = hash_src(src);
-
-	for (s = rcu_dereference_bh(head->ht[h1]); s;
-	     s = rcu_dereference_bh(s->next)) {
-		if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
-		    protocol == s->protocol &&
-		    !(s->dpi.mask &
-		      (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
-#if RSVP_DST_LEN == 4
-		    dst[0] == s->dst[0] &&
-		    dst[1] == s->dst[1] &&
-		    dst[2] == s->dst[2] &&
-#endif
-		    tunnelid == s->tunnelid) {
-
-			for (f = rcu_dereference_bh(s->ht[h2]); f;
-			     f = rcu_dereference_bh(f->next)) {
-				if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
-				    !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
-#if RSVP_DST_LEN == 4
-				    &&
-				    src[0] == f->src[0] &&
-				    src[1] == f->src[1] &&
-				    src[2] == f->src[2]
-#endif
-				    ) {
-					*res = f->res;
-					RSVP_APPLY_RESULT();
-
-matched:
-					if (f->tunnelhdr == 0)
-						return 0;
-
-					tunnelid = f->res.classid;
-					nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
-					goto restart;
-				}
-			}
-
-			/* And wildcard bucket... */
-			for (f = rcu_dereference_bh(s->ht[16]); f;
-			     f = rcu_dereference_bh(f->next)) {
-				*res = f->res;
-				RSVP_APPLY_RESULT();
-				goto matched;
-			}
-			return -1;
-		}
-	}
-	return -1;
-}
-
-static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h)
-{
-	struct rsvp_head *head = rtnl_dereference(tp->root);
-	struct rsvp_session *s;
-	struct rsvp_filter __rcu **ins;
-	struct rsvp_filter *pins;
-	unsigned int h1 = h & 0xFF;
-	unsigned int h2 = (h >> 8) & 0xFF;
-
-	for (s = rtnl_dereference(head->ht[h1]); s;
-	     s = rtnl_dereference(s->next)) {
-		for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ;
-		     ins = &pins->next, pins = rtnl_dereference(*ins)) {
-			if (pins->handle == h) {
-				RCU_INIT_POINTER(n->next, pins->next);
-				rcu_assign_pointer(*ins, n);
-				return;
-			}
-		}
-	}
-
-	/* Something went wrong if we are trying to replace a non-existent
-	 * node. Mind as well halt instead of silently failing.
-	 */
-	BUG_ON(1);
-}
-
-static void *rsvp_get(struct tcf_proto *tp, u32 handle)
-{
-	struct rsvp_head *head = rtnl_dereference(tp->root);
-	struct rsvp_session *s;
-	struct rsvp_filter *f;
-	unsigned int h1 = handle & 0xFF;
-	unsigned int h2 = (handle >> 8) & 0xFF;
-
-	if (h2 > 16)
-		return NULL;
-
-	for (s = rtnl_dereference(head->ht[h1]); s;
-	     s = rtnl_dereference(s->next)) {
-		for (f = rtnl_dereference(s->ht[h2]); f;
-		     f = rtnl_dereference(f->next)) {
-			if (f->handle == handle)
-				return f;
-		}
-	}
-	return NULL;
-}
-
-static int rsvp_init(struct tcf_proto *tp)
-{
-	struct rsvp_head *data;
-
-	data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
-	if (data) {
-		rcu_assign_pointer(tp->root, data);
-		return 0;
-	}
-	return -ENOBUFS;
-}
-
-static void __rsvp_delete_filter(struct rsvp_filter *f)
-{
-	tcf_exts_destroy(&f->exts);
-	tcf_exts_put_net(&f->exts);
-	kfree(f);
-}
-
-static void rsvp_delete_filter_work(struct work_struct *work)
-{
-	struct rsvp_filter *f = container_of(to_rcu_work(work),
-					     struct rsvp_filter,
-					     rwork);
-	rtnl_lock();
-	__rsvp_delete_filter(f);
-	rtnl_unlock();
-}
-
-static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
-{
-	tcf_unbind_filter(tp, &f->res);
-	/* all classifiers are required to call tcf_exts_destroy() after rcu
-	 * grace period, since converted-to-rcu actions are relying on that
-	 * in cleanup() callback
-	 */
-	if (tcf_exts_get_net(&f->exts))
-		tcf_queue_work(&f->rwork, rsvp_delete_filter_work);
-	else
-		__rsvp_delete_filter(f);
-}
-
-static void rsvp_destroy(struct tcf_proto *tp, bool rtnl_held,
-			 struct netlink_ext_ack *extack)
-{
-	struct rsvp_head *data = rtnl_dereference(tp->root);
-	int h1, h2;
-
-	if (data == NULL)
-		return;
-
-	for (h1 = 0; h1 < 256; h1++) {
-		struct rsvp_session *s;
-
-		while ((s = rtnl_dereference(data->ht[h1])) != NULL) {
-			RCU_INIT_POINTER(data->ht[h1], s->next);
-
-			for (h2 = 0; h2 <= 16; h2++) {
-				struct rsvp_filter *f;
-
-				while ((f = rtnl_dereference(s->ht[h2])) != NULL) {
-					rcu_assign_pointer(s->ht[h2], f->next);
-					rsvp_delete_filter(tp, f);
-				}
-			}
-			kfree_rcu(s, rcu);
-		}
-	}
-	kfree_rcu(data, rcu);
-}
-
-static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last,
-		       bool rtnl_held, struct netlink_ext_ack *extack)
-{
-	struct rsvp_head *head = rtnl_dereference(tp->root);
-	struct rsvp_filter *nfp, *f = arg;
-	struct rsvp_filter __rcu **fp;
-	unsigned int h = f->handle;
-	struct rsvp_session __rcu **sp;
-	struct rsvp_session *nsp, *s = f->sess;
-	int i, h1;
-
-	fp = &s->ht[(h >> 8) & 0xFF];
-	for (nfp = rtnl_dereference(*fp); nfp;
-	     fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
-		if (nfp == f) {
-			RCU_INIT_POINTER(*fp, f->next);
-			rsvp_delete_filter(tp, f);
-
-			/* Strip tree */
-
-			for (i = 0; i <= 16; i++)
-				if (s->ht[i])
-					goto out;
-
-			/* OK, session has no flows */
-			sp = &head->ht[h & 0xFF];
-			for (nsp = rtnl_dereference(*sp); nsp;
-			     sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
-				if (nsp == s) {
-					RCU_INIT_POINTER(*sp, s->next);
-					kfree_rcu(s, rcu);
-					goto out;
-				}
-			}
-
-			break;
-		}
-	}
-
-out:
-	*last = true;
-	for (h1 = 0; h1 < 256; h1++) {
-		if (rcu_access_pointer(head->ht[h1])) {
-			*last = false;
-			break;
-		}
-	}
-
-	return 0;
-}
-
-static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
-{
-	struct rsvp_head *data = rtnl_dereference(tp->root);
-	int i = 0xFFFF;
-
-	while (i-- > 0) {
-		u32 h;
-
-		if ((data->hgenerator += 0x10000) == 0)
-			data->hgenerator = 0x10000;
-		h = data->hgenerator|salt;
-		if (!rsvp_get(tp, h))
-			return h;
-	}
-	return 0;
-}
-
-static int tunnel_bts(struct rsvp_head *data)
-{
-	int n = data->tgenerator >> 5;
-	u32 b = 1 << (data->tgenerator & 0x1F);
-
-	if (data->tmap[n] & b)
-		return 0;
-	data->tmap[n] |= b;
-	return 1;
-}
-
-static void tunnel_recycle(struct rsvp_head *data)
-{
-	struct rsvp_session __rcu **sht = data->ht;
-	u32 tmap[256/32];
-	int h1, h2;
-
-	memset(tmap, 0, sizeof(tmap));
-
-	for (h1 = 0; h1 < 256; h1++) {
-		struct rsvp_session *s;
-		for (s = rtnl_dereference(sht[h1]); s;
-		     s = rtnl_dereference(s->next)) {
-			for (h2 = 0; h2 <= 16; h2++) {
-				struct rsvp_filter *f;
-
-				for (f = rtnl_dereference(s->ht[h2]); f;
-				     f = rtnl_dereference(f->next)) {
-					if (f->tunnelhdr == 0)
-						continue;
-					data->tgenerator = f->res.classid;
-					tunnel_bts(data);
-				}
-			}
-		}
-	}
-
-	memcpy(data->tmap, tmap, sizeof(tmap));
-}
-
-static u32 gen_tunnel(struct rsvp_head *data)
-{
-	int i, k;
-
-	for (k = 0; k < 2; k++) {
-		for (i = 255; i > 0; i--) {
-			if (++data->tgenerator == 0)
-				data->tgenerator = 1;
-			if (tunnel_bts(data))
-				return data->tgenerator;
-		}
-		tunnel_recycle(data);
-	}
-	return 0;
-}
-
-static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
-	[TCA_RSVP_CLASSID]	= { .type = NLA_U32 },
-	[TCA_RSVP_DST]		= { .len = RSVP_DST_LEN * sizeof(u32) },
-	[TCA_RSVP_SRC]		= { .len = RSVP_DST_LEN * sizeof(u32) },
-	[TCA_RSVP_PINFO]	= { .len = sizeof(struct tc_rsvp_pinfo) },
-};
-
-static int rsvp_change(struct net *net, struct sk_buff *in_skb,
-		       struct tcf_proto *tp, unsigned long base,
-		       u32 handle, struct nlattr **tca,
-		       void **arg, u32 flags,
-		       struct netlink_ext_ack *extack)
-{
-	struct rsvp_head *data = rtnl_dereference(tp->root);
-	struct rsvp_filter *f, *nfp;
-	struct rsvp_filter __rcu **fp;
-	struct rsvp_session *nsp, *s;
-	struct rsvp_session __rcu **sp;
-	struct tc_rsvp_pinfo *pinfo = NULL;
-	struct nlattr *opt = tca[TCA_OPTIONS];
-	struct nlattr *tb[TCA_RSVP_MAX + 1];
-	struct tcf_exts e;
-	unsigned int h1, h2;
-	__be32 *dst;
-	int err;
-
-	if (opt == NULL)
-		return handle ? -EINVAL : 0;
-
-	err = nla_parse_nested_deprecated(tb, TCA_RSVP_MAX, opt, rsvp_policy,
-					  NULL);
-	if (err < 0)
-		return err;
-
-	err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE);
-	if (err < 0)
-		return err;
-	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, flags,
-				extack);
-	if (err < 0)
-		goto errout2;
-
-	f = *arg;
-	if (f) {
-		/* Node exists: adjust only classid */
-		struct rsvp_filter *n;
-
-		if (f->handle != handle && handle)
-			goto errout2;
-
-		n = kmemdup(f, sizeof(*f), GFP_KERNEL);
-		if (!n) {
-			err = -ENOMEM;
-			goto errout2;
-		}
-
-		err = tcf_exts_init(&n->exts, net, TCA_RSVP_ACT,
-				    TCA_RSVP_POLICE);
-		if (err < 0) {
-			kfree(n);
-			goto errout2;
-		}
-
-		if (tb[TCA_RSVP_CLASSID]) {
-			n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
-			tcf_bind_filter(tp, &n->res, base);
-		}
-
-		tcf_exts_change(&n->exts, &e);
-		rsvp_replace(tp, n, handle);
-		return 0;
-	}
-
-	/* Now more serious part... */
-	err = -EINVAL;
-	if (handle)
-		goto errout2;
-	if (tb[TCA_RSVP_DST] == NULL)
-		goto errout2;
-
-	err = -ENOBUFS;
-	f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
-	if (f == NULL)
-		goto errout2;
-
-	err = tcf_exts_init(&f->exts, net, TCA_RSVP_ACT, TCA_RSVP_POLICE);
-	if (err < 0)
-		goto errout;
-	h2 = 16;
-	if (tb[TCA_RSVP_SRC]) {
-		memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
-		h2 = hash_src(f->src);
-	}
-	if (tb[TCA_RSVP_PINFO]) {
-		pinfo = nla_data(tb[TCA_RSVP_PINFO]);
-		f->spi = pinfo->spi;
-		f->tunnelhdr = pinfo->tunnelhdr;
-	}
-	if (tb[TCA_RSVP_CLASSID])
-		f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
-
-	dst = nla_data(tb[TCA_RSVP_DST]);
-	h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
-
-	err = -ENOMEM;
-	if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
-		goto errout;
-
-	if (f->tunnelhdr) {
-		err = -EINVAL;
-		if (f->res.classid > 255)
-			goto errout;
-
-		err = -ENOMEM;
-		if (f->res.classid == 0 &&
-		    (f->res.classid = gen_tunnel(data)) == 0)
-			goto errout;
-	}
-
-	for (sp = &data->ht[h1];
-	     (s = rtnl_dereference(*sp)) != NULL;
-	     sp = &s->next) {
-		if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
-		    pinfo && pinfo->protocol == s->protocol &&
-		    memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
-#if RSVP_DST_LEN == 4
-		    dst[0] == s->dst[0] &&
-		    dst[1] == s->dst[1] &&
-		    dst[2] == s->dst[2] &&
-#endif
-		    pinfo->tunnelid == s->tunnelid) {
-
-insert:
-			/* OK, we found appropriate session */
-
-			fp = &s->ht[h2];
-
-			f->sess = s;
-			if (f->tunnelhdr == 0)
-				tcf_bind_filter(tp, &f->res, base);
-
-			tcf_exts_change(&f->exts, &e);
-
-			fp = &s->ht[h2];
-			for (nfp = rtnl_dereference(*fp); nfp;
-			     fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
-				__u32 mask = nfp->spi.mask & f->spi.mask;
-
-				if (mask != f->spi.mask)
-					break;
-			}
-			RCU_INIT_POINTER(f->next, nfp);
-			rcu_assign_pointer(*fp, f);
-
-			*arg = f;
-			return 0;
-		}
-	}
-
-	/* No session found. Create new one. */
-
-	err = -ENOBUFS;
-	s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL);
-	if (s == NULL)
-		goto errout;
-	memcpy(s->dst, dst, sizeof(s->dst));
-
-	if (pinfo) {
-		s->dpi = pinfo->dpi;
-		s->protocol = pinfo->protocol;
-		s->tunnelid = pinfo->tunnelid;
-	}
-	sp = &data->ht[h1];
-	for (nsp = rtnl_dereference(*sp); nsp;
-	     sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
-		if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask)
-			break;
-	}
-	RCU_INIT_POINTER(s->next, nsp);
-	rcu_assign_pointer(*sp, s);
-
-	goto insert;
-
-errout:
-	tcf_exts_destroy(&f->exts);
-	kfree(f);
-errout2:
-	tcf_exts_destroy(&e);
-	return err;
-}
-
-static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg,
-		      bool rtnl_held)
-{
-	struct rsvp_head *head = rtnl_dereference(tp->root);
-	unsigned int h, h1;
-
-	if (arg->stop)
-		return;
-
-	for (h = 0; h < 256; h++) {
-		struct rsvp_session *s;
-
-		for (s = rtnl_dereference(head->ht[h]); s;
-		     s = rtnl_dereference(s->next)) {
-			for (h1 = 0; h1 <= 16; h1++) {
-				struct rsvp_filter *f;
-
-				for (f = rtnl_dereference(s->ht[h1]); f;
-				     f = rtnl_dereference(f->next)) {
-					if (!tc_cls_stats_dump(tp, arg, f))
-						return;
-				}
-			}
-		}
-	}
-}
-
-static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh,
-		     struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
-{
-	struct rsvp_filter *f = fh;
-	struct rsvp_session *s;
-	struct nlattr *nest;
-	struct tc_rsvp_pinfo pinfo;
-
-	if (f == NULL)
-		return skb->len;
-	s = f->sess;
-
-	t->tcm_handle = f->handle;
-
-	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
-	if (nest == NULL)
-		goto nla_put_failure;
-
-	if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst))
-		goto nla_put_failure;
-	pinfo.dpi = s->dpi;
-	pinfo.spi = f->spi;
-	pinfo.protocol = s->protocol;
-	pinfo.tunnelid = s->tunnelid;
-	pinfo.tunnelhdr = f->tunnelhdr;
-	pinfo.pad = 0;
-	if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo))
-		goto nla_put_failure;
-	if (f->res.classid &&
-	    nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid))
-		goto nla_put_failure;
-	if (((f->handle >> 8) & 0xFF) != 16 &&
-	    nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))
-		goto nla_put_failure;
-
-	if (tcf_exts_dump(skb, &f->exts) < 0)
-		goto nla_put_failure;
-
-	nla_nest_end(skb, nest);
-
-	if (tcf_exts_dump_stats(skb, &f->exts) < 0)
-		goto nla_put_failure;
-	return skb->len;
-
-nla_put_failure:
-	nla_nest_cancel(skb, nest);
-	return -1;
-}
-
-static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
-			    unsigned long base)
-{
-	struct rsvp_filter *f = fh;
-
-	tc_cls_bind_class(classid, cl, q, &f->res, base);
-}
-
-static struct tcf_proto_ops RSVP_OPS __read_mostly = {
-	.kind		=	RSVP_ID,
-	.classify	=	RSVP_CLS,
-	.init		=	rsvp_init,
-	.destroy	=	rsvp_destroy,
-	.get		=	rsvp_get,
-	.change		=	rsvp_change,
-	.delete		=	rsvp_delete,
-	.walk		=	rsvp_walk,
-	.dump		=	rsvp_dump,
-	.bind_class	=	rsvp_bind_class,
-	.owner		=	THIS_MODULE,
-};
-
-static int __init init_rsvp(void)
-{
-	return register_tcf_proto_ops(&RSVP_OPS);
-}
-
-static void __exit exit_rsvp(void)
-{
-	unregister_tcf_proto_ops(&RSVP_OPS);
-}
-
-module_init(init_rsvp)
-module_exit(exit_rsvp)
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
deleted file mode 100644
index e627cc32d633..000000000000
--- a/net/sched/cls_rsvp6.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/sched/cls_rsvp6.c	Special RSVP packet classifier for IPv6.
- *
- * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/ipv6.h>
-#include <linux/skbuff.h>
-#include <net/act_api.h>
-#include <net/pkt_cls.h>
-#include <net/netlink.h>
-#include <net/tc_wrapper.h>
-
-#define RSVP_DST_LEN	4
-#define RSVP_ID		"rsvp6"
-#define RSVP_OPS	cls_rsvp6_ops
-#define RSVP_CLS rsvp6_classify
-
-#include "cls_rsvp.h"
-MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/rsvp.json b/tools/testing/selftests/tc-testing/tc-tests/filters/rsvp.json
deleted file mode 100644
index bdcbaa4c5663..000000000000
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/rsvp.json
+++ /dev/null
@@ -1,203 +0,0 @@
-[
-    {
-        "id": "2141",
-        "name": "Add rsvp filter with tcp proto and specific IP address",
-        "category": [
-            "filter",
-            "rsvp"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto tcp session 198.168.10.64",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "^filter protocol ip pref [0-9]+ rsvp chain [0-9]+ fh 0x.*session 198.168.10.64 ipproto tcp",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "5267",
-        "name": "Add rsvp filter with udp proto and specific IP address",
-        "category": [
-            "filter",
-            "rsvp"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto udp session 1.1.1.1",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "^filter protocol ip pref [0-9]+ rsvp chain [0-9]+ fh 0x.*session 1.1.1.1 ipproto udp",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "2819",
-        "name": "Add rsvp filter with src ip and src port",
-        "category": [
-            "filter",
-            "rsvp"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto udp session 1.1.1.1 sender 2.2.2.2/5021 classid 1:1",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "^filter protocol ip pref [0-9]+ rsvp chain [0-9]+ fh 0x.*flowid 1:1 session 1.1.1.1 ipproto udp sender  2.2.2.2/5021",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "c967",
-        "name": "Add rsvp filter with tunnelid and continue action",
-        "category": [
-            "filter",
-            "rsvp"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto udp session 1.1.1.1 tunnelid 2 classid 1:1 action continue",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "^filter protocol ip pref [0-9]+ rsvp chain [0-9]+ fh 0x.*flowid 1:1 session 1.1.1.1 ipproto udp tunnelid 2.*action order [0-9]+: gact action continue",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "5463",
-        "name": "Add rsvp filter with tunnel and pipe action",
-        "category": [
-            "filter",
-            "rsvp"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto udp session 1.1.1.1 tunnel 2 skip 1 action pipe",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "^filter protocol ip pref [0-9]+ rsvp chain [0-9]+ fh 0x.*tunnel 2 skip 1 session 1.1.1.1 ipproto udp.*action order [0-9]+: gact action pipe",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "2332",
-        "name": "Add rsvp filter with miltiple actions",
-        "category": [
-            "filter",
-            "rsvp"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ip prio 7 rsvp ipproto udp session 1.1.1.1 classid 1:1 action skbedit mark 7 pipe action gact drop",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "^filter protocol ip pref [0-9]+ rsvp chain [0-9]+ fh 0x.*flowid 1:1 session 1.1.1.1 ipproto udp.*action order [0-9]+: skbedit  mark 7 pipe.*action order [0-9]+: gact action drop",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "8879",
-        "name": "Add rsvp filter with tunnel and skp flag",
-        "category": [
-            "filter",
-            "rsvp"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress"
-        ],
-        "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto udp session 1.1.1.1 tunnel 2 skip 1 action pipe",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "^filter protocol ip pref [0-9]+ rsvp chain [0-9]+ fh 0x.*tunnel 2 skip 1 session 1.1.1.1 ipproto udp.*action order [0-9]+: gact action pipe",
-        "matchCount": "1",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "8261",
-        "name": "List rsvp filters",
-        "category": [
-            "filter",
-            "rsvp"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress",
-            "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto udp session 1.1.1.1/1234 classid 1:1",
-            "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto tcp session 2.2.2.2/1234 classid 2:1"
-        ],
-        "cmdUnderTest": "$TC filter show dev $DEV1 parent ffff:",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "^filter protocol ip pref [0-9]+ rsvp chain [0-9]+ fh",
-        "matchCount": "2",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    },
-    {
-        "id": "8989",
-        "name": "Delete rsvp filter",
-        "category": [
-            "filter",
-            "rsvp"
-        ],
-        "plugins": {
-            "requires": "nsPlugin"
-        },
-        "setup": [
-            "$TC qdisc add dev $DEV1 ingress",
-            "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto udp session 1.1.1.1/1234 tunnelid 9 classid 2:1"
-        ],
-        "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: protocol ip prio 1 rsvp ipproto udp session 1.1.1.1/1234 tunnelid 9 classid 2:1",
-        "expExitCode": "0",
-        "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
-        "matchPattern": "filter protocol ip pref [0-9]+ rsvp chain [0-9]+ fh 0x.*flowid 2:1 session 1.1.1.1/1234 ipproto udp tunnelid 9",
-        "matchCount": "0",
-        "teardown": [
-            "$TC qdisc del dev $DEV1 ingress"
-        ]
-    }
-]
-- 
cgit 


From 5198cb408fcfc0b49c418314570423efe2217754 Mon Sep 17 00:00:00 2001
From: Andrea Mayer <andrea.mayer@uniroma2.it>
Date: Wed, 15 Feb 2023 14:46:59 +0100
Subject: selftests: seg6: add selftest for PSP flavor in SRv6 End behavior

This selftest is designed for testing the PSP flavor in SRv6 End behavior.
It instantiates a virtual network composed of several nodes: hosts and
SRv6 routers. Each node is realized using a network namespace that is
properly interconnected to others through veth pairs.
The test makes use of the SRv6 End behavior and of the PSP flavor needed
for removing the SRH from the IPv6 header at the penultimate node.

The correct execution of the behavior is verified through reachability
tests carried out between hosts.

Signed-off-by: Andrea Mayer <andrea.mayer@uniroma2.it>
Signed-off-by: Paolo Lungaroni <paolo.lungaroni@uniroma2.it>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/Makefile               |   1 +
 .../testing/selftests/net/srv6_end_flavors_test.sh | 869 +++++++++++++++++++++
 2 files changed, 870 insertions(+)
 create mode 100755 tools/testing/selftests/net/srv6_end_flavors_test.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 3364c548a23b..6cd8993454d7 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -38,6 +38,7 @@ TEST_PROGS += srv6_end_dt6_l3vpn_test.sh
 TEST_PROGS += srv6_hencap_red_l3vpn_test.sh
 TEST_PROGS += srv6_hl2encap_red_l2vpn_test.sh
 TEST_PROGS += srv6_end_next_csid_l3vpn_test.sh
+TEST_PROGS += srv6_end_flavors_test.sh
 TEST_PROGS += vrf_strict_mode_test.sh
 TEST_PROGS += arp_ndisc_evict_nocarrier.sh
 TEST_PROGS += ndisc_unsolicited_na_test.sh
diff --git a/tools/testing/selftests/net/srv6_end_flavors_test.sh b/tools/testing/selftests/net/srv6_end_flavors_test.sh
new file mode 100755
index 000000000000..50563443a4ad
--- /dev/null
+++ b/tools/testing/selftests/net/srv6_end_flavors_test.sh
@@ -0,0 +1,869 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# author: Andrea Mayer <andrea.mayer@uniroma2.it>
+# author: Paolo Lungaroni <paolo.lungaroni@uniroma2.it>
+#
+# This script is designed to test the support for "flavors" in the SRv6 End
+# behavior.
+#
+# Flavors defined in RFC8986 [1] represent additional operations that can modify
+# or extend the existing SRv6 End, End.X and End.T behaviors. For the sake of
+# convenience, we report the list of flavors described in [1] hereafter:
+#   - Penultimate Segment Pop (PSP);
+#   - Ultimate Segment Pop (USP);
+#   - Ultimate Segment Decapsulation (USD).
+#
+# The End, End.X, and End.T behaviors can support these flavors either
+# individually or in combinations.
+# Currently in this selftest we consider only the PSP flavor for the SRv6 End
+# behavior. However, it is possible to extend the script as soon as other
+# flavors will be supported in the kernel.
+#
+# The purpose of the PSP flavor consists in instructing the penultimate node
+# listed in the SRv6 policy to remove (i.e. pop) the outermost SRH from the IPv6
+# header.
+# A PSP enabled SRv6 End behavior instance processes the SRH by:
+#  - decrementing the Segment Left (SL) value from 1 to 0;
+#  - copying the last SID from the SID List into the IPv6 Destination Address
+#    (DA);
+#  - removing the SRH from the extension headers following the IPv6 header.
+#
+# Once the SRH is removed, the IPv6 packet is forwarded to the destination using
+# the IPv6 DA updated during the PSP operation (i.e. the IPv6 DA corresponding
+# to the last SID carried by the removed SRH).
+#
+# Although the PSP flavor can be set for any SRv6 End behavior instance on any
+# SR node, it will be active only on such behaviors bound to a penultimate SID
+# for a given SRv6 policy.
+#                                                SL=2 SL=1 SL=0
+#                                                  |    |    |
+# For example, given the SRv6 policy (SID List := <X,   Y,   Z>):
+#  - a PSP enabled SRv6 End behavior bound to SID Y will apply the PSP operation
+#    as Segment Left (SL) is 1, corresponding to the Penultimate Segment of the
+#    SID List;
+#  - a PSP enabled SRv6 End behavior bound to SID X will *NOT* apply the PSP
+#    operation as the Segment Left is 2. This behavior instance will apply the
+#    "standard" End packet processing, ignoring the configured PSP flavor at
+#    all.
+#
+# [1] RFC8986: https://datatracker.ietf.org/doc/html/rfc8986
+#
+# Network topology
+# ================
+#
+# The network topology used in this selftest is depicted hereafter, composed by
+# two hosts (hs-1, hs-2) and four routers (rt-1, rt-2, rt-3, rt-4).
+# Hosts hs-1 and hs-2 are connected to routers rt-1 and rt-2, respectively,
+# allowing them to communicate with each other.
+# Traffic exchanged between hs-1 and hs-2 can follow different network paths.
+# The network operator, through specific SRv6 Policies can steer traffic to one
+# path rather than another. In this selftest this is implemented as follows:
+#
+#   i) The SRv6 H.Insert behavior applies SRv6 Policies on traffic received by
+#      connected hosts. It pushes the Segment Routing Header (SRH) after the
+#      IPv6 header. The SRH contains the SID List (i.e. SRv6 Policy) needed for
+#      steering traffic across the segments/waypoints specified in that list;
+#
+#  ii) The SRv6 End behavior advances the active SID in the SID List carried by
+#      the SRH;
+#
+# iii) The PSP enabled SRv6 End behavior is used to remove the SRH when such
+#      behavior is configured on a node bound to the Penultimate Segment carried
+#      by the SID List.
+#
+#                cafe::1                      cafe::2
+#              +--------+                   +--------+
+#              |        |                   |        |
+#              |  hs-1  |                   |  hs-2  |
+#              |        |                   |        |
+#              +---+----+                   +--- +---+
+#     cafe::/64    |                             |      cafe::/64
+#                  |                             |
+#              +---+----+                   +----+---+
+#              |        |  fcf0:0:1:2::/64  |        |
+#              |  rt-1  +-------------------+  rt-2  |
+#              |        |                   |        |
+#              +---+----+                   +----+---+
+#                  |      .               .      |
+#                  |  fcf0:0:1:3::/64   .        |
+#                  |          .       .          |
+#                  |            .   .            |
+#  fcf0:0:1:4::/64 |              .              | fcf0:0:2:3::/64
+#                  |            .   .            |
+#                  |          .       .          |
+#                  |  fcf0:0:2:4::/64   .        |
+#                  |      .               .      |
+#              +---+----+                   +----+---+
+#              |        |                   |        |
+#              |  rt-4  +-------------------+  rt-3  |
+#              |        |  fcf0:0:3:4::/64  |        |
+#              +---+----+                   +----+---+
+#
+# Every fcf0:0:x:y::/64 network interconnects the SRv6 routers rt-x with rt-y in
+# the IPv6 operator network.
+#
+#
+# Local SID table
+# ===============
+#
+# Each SRv6 router is configured with a Local SID table in which SIDs are
+# stored. Considering the given SRv6 router rt-x, at least two SIDs are
+# configured in the Local SID table:
+#
+#   Local SID table for SRv6 router rt-x
+#   +---------------------------------------------------------------------+
+#   |fcff:x::e is associated with the SRv6 End behavior                   |
+#   |fcff:x::ef1 is associated with the SRv6 End behavior with PSP flavor |
+#   +---------------------------------------------------------------------+
+#
+# The fcff::/16 prefix is reserved by the operator for the SIDs. Reachability of
+# SIDs is ensured by proper configuration of the IPv6 operator's network and
+# SRv6 routers.
+#
+#
+# SRv6 Policies
+# =============
+#
+# An SRv6 ingress router applies different SRv6 Policies to the traffic received
+# from connected hosts on the basis of the destination addresses.
+# In case of SRv6 H.Insert behavior, the SRv6 Policy enforcement consists of
+# pushing the SRH (carrying a given SID List) after the existing IPv6 header.
+# Note that in the inserting mode, there is no encapsulation at all.
+#
+#   Before applying an SRv6 Policy using the SRv6 H.Insert behavior
+#   +------+---------+
+#   | IPv6 | Payload |
+#   +------+---------+
+#
+#   After applying an SRv6 Policy using the SRv6 H.Insert behavior
+#   +------+-----+---------+
+#   | IPv6 | SRH | Payload |
+#   +------+-----+---------+
+#
+# Traffic from hs-1 to hs-2
+# -------------------------
+#
+# Packets generated from hs-1 and directed towards hs-2 are
+# handled by rt-1 which applies the following SRv6 Policy:
+#
+#   i.a) IPv6 traffic, SID List=fcff:3::e,fcff:4::ef1,fcff:2::ef1,cafe::2
+#
+# Router rt-1 is configured to enforce the Policy (i.a) through the SRv6
+# H.Insert behavior which pushes the SRH after the existing IPv6 header. This
+# Policy steers the traffic from hs-1 across rt-3, rt-4, rt-2 and finally to the
+# destination hs-2.
+#
+# As the packet reaches the router rt-3, the SRv6 End behavior bound to SID
+# fcff:3::e is triggered. The behavior updates the Segment Left (from SL=3 to
+# SL=2) in the SRH, the IPv6 DA with fcff:4::ef1 and forwards the packet to the
+# next router on the path, i.e. rt-4.
+#
+# When router rt-4 receives the packet, the PSP enabled SRv6 End behavior bound
+# to SID fcff:4::ef1 is executed. Since the SL=2, the PSP operation is *NOT*
+# kicked in and the behavior applies the default End processing: the Segment
+# Left is decreased (from SL=2 to SL=1), the IPv6 DA is updated with the SID
+# fcff:2::ef1 and the packet is forwarded to router rt-2.
+#
+# The PSP enabled SRv6 End behavior on rt-2 is associated with SID fcff:2::ef1
+# and is executed as the packet is received. Because SL=1, the behavior applies
+# the PSP processing on the packet as follows: i) SL is decreased, i.e. from
+# SL=1 to SL=0; ii) last SID (cafe::2) is copied into the IPv6 DA; iii) the
+# outermost SRH is removed from the extension headers following the IPv6 header.
+# Once the PSP processing is completed, the packet is forwarded to the host hs-2
+# (destination).
+#
+# Traffic from hs-2 to hs-1
+# -------------------------
+#
+# Packets generated from hs-2 and directed to hs-1 are handled by rt-2 which
+# applies the following SRv6 Policy:
+#
+#   i.b) IPv6 traffic, SID List=fcff:1::ef1,cafe::1
+#
+# Router rt-2 is configured to enforce the Policy (i.b) through the SRv6
+# H.Insert behavior which pushes the SRH after the existing IPv6 header. This
+# Policy steers the traffic from hs-2 across rt-1 and finally to the
+# destination hs-1
+#
+#
+# When the router rt-1 receives the packet, the PSP enabled SRv6 End behavior
+# associated with the SID fcff:1::ef1 is triggered. Since the SL=1,
+# the PSP operation takes place: i) the SL is decremented; ii) the IPv6 DA is
+# set with the last SID; iii) the SRH is removed from the extension headers
+# after the IPv6 header. At this point, the packet with IPv6 DA=cafe::1 is sent
+# to the destination, i.e. hs-1.
+
+# Kselftest framework requirement - SKIP code is 4.
+readonly ksft_skip=4
+
+readonly RDMSUFF="$(mktemp -u XXXXXXXX)"
+readonly DUMMY_DEVNAME="dum0"
+readonly RT2HS_DEVNAME="veth1"
+readonly LOCALSID_TABLE_ID=90
+readonly IPv6_RT_NETWORK=fcf0:0
+readonly IPv6_HS_NETWORK=cafe
+readonly IPv6_TESTS_ADDR=2001:db8::1
+readonly LOCATOR_SERVICE=fcff
+readonly END_FUNC=000e
+readonly END_PSP_FUNC=0ef1
+
+PING_TIMEOUT_SEC=4
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+
+# IDs of routers and hosts are initialized during the setup of the testing
+# network
+ROUTERS=''
+HOSTS=''
+
+SETUP_ERR=1
+
+ret=${ksft_skip}
+nsuccess=0
+nfail=0
+
+log_test()
+{
+	local rc="$1"
+	local expected="$2"
+	local msg="$3"
+
+	if [ "${rc}" -eq "${expected}" ]; then
+		nsuccess=$((nsuccess+1))
+		printf "\n    TEST: %-60s  [ OK ]\n" "${msg}"
+	else
+		ret=1
+		nfail=$((nfail+1))
+		printf "\n    TEST: %-60s  [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			read a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+print_log_test_results()
+{
+	printf "\nTests passed: %3d\n" "${nsuccess}"
+	printf "Tests failed: %3d\n"   "${nfail}"
+
+	# when a test fails, the value of 'ret' is set to 1 (error code).
+	# Conversely, when all tests are passed successfully, the 'ret' value
+	# is set to 0 (success code).
+	if [ "${ret}" -ne 1 ]; then
+		ret=0
+	fi
+}
+
+log_section()
+{
+	echo
+	echo "################################################################################"
+	echo "TEST SECTION: $*"
+	echo "################################################################################"
+}
+
+test_command_or_ksft_skip()
+{
+	local cmd="$1"
+
+	if [ ! -x "$(command -v "${cmd}")" ]; then
+		echo "SKIP: Could not run test without \"${cmd}\" tool";
+		exit "${ksft_skip}"
+	fi
+}
+
+get_nodename()
+{
+	local name="$1"
+
+	echo "${name}-${RDMSUFF}"
+}
+
+get_rtname()
+{
+	local rtid="$1"
+
+	get_nodename "rt-${rtid}"
+}
+
+get_hsname()
+{
+	local hsid="$1"
+
+	get_nodename "hs-${hsid}"
+}
+
+__create_namespace()
+{
+	local name="$1"
+
+	ip netns add "${name}"
+}
+
+create_router()
+{
+	local rtid="$1"
+	local nsname
+
+	nsname="$(get_rtname "${rtid}")"
+
+	__create_namespace "${nsname}"
+}
+
+create_host()
+{
+	local hsid="$1"
+	local nsname
+
+	nsname="$(get_hsname "${hsid}")"
+
+	__create_namespace "${nsname}"
+}
+
+cleanup()
+{
+	local nsname
+	local i
+
+	# destroy routers
+	for i in ${ROUTERS}; do
+		nsname="$(get_rtname "${i}")"
+
+		ip netns del "${nsname}" &>/dev/null || true
+	done
+
+	# destroy hosts
+	for i in ${HOSTS}; do
+		nsname="$(get_hsname "${i}")"
+
+		ip netns del "${nsname}" &>/dev/null || true
+	done
+
+	# check whether the setup phase was completed successfully or not. In
+	# case of an error during the setup phase of the testing environment,
+	# the selftest is considered as "skipped".
+	if [ "${SETUP_ERR}" -ne 0 ]; then
+		echo "SKIP: Setting up the testing environment failed"
+		exit "${ksft_skip}"
+	fi
+
+	exit "${ret}"
+}
+
+add_link_rt_pairs()
+{
+	local rt="$1"
+	local rt_neighs="$2"
+	local neigh
+	local nsname
+	local neigh_nsname
+
+	nsname="$(get_rtname "${rt}")"
+
+	for neigh in ${rt_neighs}; do
+		neigh_nsname="$(get_rtname "${neigh}")"
+
+		ip link add "veth-rt-${rt}-${neigh}" netns "${nsname}" \
+			type veth peer name "veth-rt-${neigh}-${rt}" \
+			netns "${neigh_nsname}"
+	done
+}
+
+get_network_prefix()
+{
+	local rt="$1"
+	local neigh="$2"
+	local p="${rt}"
+	local q="${neigh}"
+
+	if [ "${p}" -gt "${q}" ]; then
+		p="${q}"; q="${rt}"
+	fi
+
+	echo "${IPv6_RT_NETWORK}:${p}:${q}"
+}
+
+# Given the description of a router <id:op> as an input, the function returns
+# the <id> token which represents the ID of the router.
+# i.e. input: "12:psp"
+#      output: "12"
+__get_srv6_rtcfg_id()
+{
+	local element="$1"
+
+	echo "${element}" | cut -d':' -f1
+}
+
+# Given the description of a router <id:op> as an input, the function returns
+# the <op> token which represents the operation (e.g. End behavior with or
+# withouth flavors) configured for the node.
+
+# Note that when the operation represents an End behavior with a list of
+# flavors, the output is the ordered version of that list.
+# i.e. input: "5:usp,psp,usd"
+#      output: "psp,usd,usp"
+__get_srv6_rtcfg_op()
+{
+	local element="$1"
+
+	# return the lexicographically ordered flavors
+	echo "${element}" | cut -d':' -f2 | sed 's/,/\n/g' | sort | \
+		xargs | sed 's/ /,/g'
+}
+
+# Setup the basic networking for the routers
+setup_rt_networking()
+{
+	local rt="$1"
+	local rt_neighs="$2"
+	local nsname
+	local net_prefix
+	local devname
+	local neigh
+
+	nsname="$(get_rtname "${rt}")"
+
+	for neigh in ${rt_neighs}; do
+		devname="veth-rt-${rt}-${neigh}"
+
+		net_prefix="$(get_network_prefix "${rt}" "${neigh}")"
+
+		ip -netns "${nsname}" addr \
+			add "${net_prefix}::${rt}/64" dev "${devname}" nodad
+
+		ip -netns "${nsname}" link set "${devname}" up
+	done
+
+	ip -netns "${nsname}" link set lo up
+
+	ip -netns "${nsname}" link add ${DUMMY_DEVNAME} type dummy
+	ip -netns "${nsname}" link set ${DUMMY_DEVNAME} up
+
+	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
+	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
+	ip netns exec "${nsname}" sysctl -wq net.ipv6.conf.all.forwarding=1
+}
+
+# Setup local SIDs for an SRv6 router
+setup_rt_local_sids()
+{
+	local rt="$1"
+	local rt_neighs="$2"
+	local net_prefix
+	local devname
+	local nsname
+	local neigh
+
+	nsname="$(get_rtname "${rt}")"
+
+	for neigh in ${rt_neighs}; do
+		devname="veth-rt-${rt}-${neigh}"
+
+		net_prefix="$(get_network_prefix "${rt}" "${neigh}")"
+
+		# set underlay network routes for SIDs reachability
+		ip -netns "${nsname}" -6 route \
+			add "${LOCATOR_SERVICE}:${neigh}::/32" \
+			table "${LOCALSID_TABLE_ID}" \
+			via "${net_prefix}::${neigh}" dev "${devname}"
+	done
+
+	# Local End behavior (note that "dev" is a dummy interface chosen for
+	# the sake of simplicity).
+	ip -netns "${nsname}" -6 route \
+		add "${LOCATOR_SERVICE}:${rt}::${END_FUNC}" \
+		table "${LOCALSID_TABLE_ID}" \
+		encap seg6local action End dev "${DUMMY_DEVNAME}"
+
+
+	# all SIDs start with a common locator. Routes and SRv6 Endpoint
+	# behavior instaces are grouped together in the 'localsid' table.
+	ip -netns "${nsname}" -6 rule \
+		add to "${LOCATOR_SERVICE}::/16" \
+		lookup "${LOCALSID_TABLE_ID}" prio 999
+
+	# set default routes to unreachable
+	ip -netns "${nsname}" -6 route \
+		add unreachable default metric 4278198272 \
+		dev "${DUMMY_DEVNAME}"
+}
+
+# This helper function builds and installs the SID List (i.e. SRv6 Policy)
+# to be applied on incoming packets at the ingress node. Moreover, it
+# configures the SRv6 nodes specified in the SID List to process the traffic
+# according to the operations required by the Policy itself.
+# args:
+#  $1 - destination host (i.e. cafe::x host)
+#  $2 - SRv6 router configured for enforcing the SRv6 Policy
+#  $3 - compact way to represent a list of SRv6 routers with their operations
+#       (i.e. behaviors) that each of them needs to perform. Every <nodeid:op>
+#       element constructs a SID that is associated with the behavior <op> on
+#       the <nodeid> node. The list of such elements forms an SRv6 Policy.
+__setup_rt_policy()
+{
+	local dst="$1"
+	local encap_rt="$2"
+	local policy_rts="$3"
+	local behavior_cfg
+	local in_nsname
+	local rt_nsname
+	local policy=''
+	local function
+	local fullsid
+	local op_type
+	local node
+	local n
+
+	in_nsname="$(get_rtname "${encap_rt}")"
+
+	for n in ${policy_rts}; do
+		node="$(__get_srv6_rtcfg_id "${n}")"
+		op_type="$(__get_srv6_rtcfg_op "${n}")"
+		rt_nsname="$(get_rtname "${node}")"
+
+		case "${op_type}" in
+		"noflv")
+			policy="${policy}${LOCATOR_SERVICE}:${node}::${END_FUNC},"
+			function="${END_FUNC}"
+			behavior_cfg="End"
+			;;
+
+		"psp")
+			policy="${policy}${LOCATOR_SERVICE}:${node}::${END_PSP_FUNC},"
+			function="${END_PSP_FUNC}"
+			behavior_cfg="End flavors psp"
+			;;
+
+		*)
+			break
+			;;
+		esac
+
+		fullsid="${LOCATOR_SERVICE}:${node}::${function}"
+
+		# add SRv6 Endpoint behavior to the selected router
+		if ! ip -netns "${rt_nsname}" -6 route get "${fullsid}" \
+			&>/dev/null; then
+			ip -netns "${rt_nsname}" -6 route \
+				add "${fullsid}" \
+				table "${LOCALSID_TABLE_ID}" \
+				encap seg6local action ${behavior_cfg} \
+				dev "${DUMMY_DEVNAME}"
+		fi
+	done
+
+	# we need to remove the trailing comma to avoid inserting an empty
+	# address (::0) in the SID List.
+	policy="${policy%,}"
+
+	# add SRv6 policy to incoming traffic sent by connected hosts
+	ip -netns "${in_nsname}" -6 route \
+		add "${IPv6_HS_NETWORK}::${dst}" \
+		encap seg6 mode inline segs "${policy}" \
+		dev "${DUMMY_DEVNAME}"
+
+	ip -netns "${in_nsname}" -6 neigh \
+		add proxy "${IPv6_HS_NETWORK}::${dst}" \
+		dev "${RT2HS_DEVNAME}"
+}
+
+# see __setup_rt_policy
+setup_rt_policy_ipv6()
+{
+	__setup_rt_policy "$1" "$2" "$3"
+}
+
+setup_hs()
+{
+	local hs="$1"
+	local rt="$2"
+	local hsname
+	local rtname
+
+	hsname="$(get_hsname "${hs}")"
+	rtname="$(get_rtname "${rt}")"
+
+	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.all.accept_dad=0
+	ip netns exec "${hsname}" sysctl -wq net.ipv6.conf.default.accept_dad=0
+
+	ip -netns "${hsname}" link add veth0 type veth \
+		peer name "${RT2HS_DEVNAME}" netns "${rtname}"
+
+	ip -netns "${hsname}" addr \
+		add "${IPv6_HS_NETWORK}::${hs}/64" dev veth0 nodad
+
+	ip -netns "${hsname}" link set veth0 up
+	ip -netns "${hsname}" link set lo up
+
+	ip -netns "${rtname}" addr \
+		add "${IPv6_HS_NETWORK}::254/64" dev "${RT2HS_DEVNAME}" nodad
+
+	ip -netns "${rtname}" link set "${RT2HS_DEVNAME}" up
+
+	ip netns exec "${rtname}" \
+		sysctl -wq net.ipv6.conf."${RT2HS_DEVNAME}".proxy_ndp=1
+}
+
+setup()
+{
+	local i
+
+	# create routers
+	ROUTERS="1 2 3 4"; readonly ROUTERS
+	for i in ${ROUTERS}; do
+		create_router "${i}"
+	done
+
+	# create hosts
+	HOSTS="1 2"; readonly HOSTS
+	for i in ${HOSTS}; do
+		create_host "${i}"
+	done
+
+	# set up the links for connecting routers
+	add_link_rt_pairs 1 "2 3 4"
+	add_link_rt_pairs 2 "3 4"
+	add_link_rt_pairs 3 "4"
+
+	# set up the basic connectivity of routers and routes required for
+	# reachability of SIDs.
+	setup_rt_networking 1 "2 3 4"
+	setup_rt_networking 2 "1 3 4"
+	setup_rt_networking 3 "1 2 4"
+	setup_rt_networking 4 "1 2 3"
+
+	# set up the hosts connected to routers
+	setup_hs 1 1
+	setup_hs 2 2
+
+	# set up default SRv6 Endpoints (i.e. SRv6 End behavior)
+	setup_rt_local_sids 1 "2 3 4"
+	setup_rt_local_sids 2 "1 3 4"
+	setup_rt_local_sids 3 "1 2 4"
+	setup_rt_local_sids 4 "1 2 3"
+
+	# set up SRv6 policies
+	# create a connection between hosts hs-1 and hs-2.
+	# The path between hs-1 and hs-2 traverses SRv6 aware routers.
+	# For each direction two path are chosen:
+	#
+	# Direction hs-1 -> hs-2 (PSP flavor)
+	#  - rt-1 (SRv6 H.Insert policy)
+	#  - rt-3 (SRv6 End behavior)
+	#  - rt-4 (SRv6 End flavor PSP with SL>1, acting as End behavior)
+	#  - rt-2 (SRv6 End flavor PSP with SL=1)
+	#
+	# Direction hs-2 -> hs-1 (PSP flavor)
+	#  - rt-2 (SRv6 H.Insert policy)
+	#  - rt-1 (SRv6 End flavor PSP with SL=1)
+	setup_rt_policy_ipv6 2 1 "3:noflv 4:psp 2:psp"
+	setup_rt_policy_ipv6 1 2 "1:psp"
+
+	# testing environment was set up successfully
+	SETUP_ERR=0
+}
+
+check_rt_connectivity()
+{
+	local rtsrc="$1"
+	local rtdst="$2"
+	local prefix
+	local rtsrc_nsname
+
+	rtsrc_nsname="$(get_rtname "${rtsrc}")"
+
+	prefix="$(get_network_prefix "${rtsrc}" "${rtdst}")"
+
+	ip netns exec "${rtsrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
+		"${prefix}::${rtdst}" >/dev/null 2>&1
+}
+
+check_and_log_rt_connectivity()
+{
+	local rtsrc="$1"
+	local rtdst="$2"
+
+	check_rt_connectivity "${rtsrc}" "${rtdst}"
+	log_test $? 0 "Routers connectivity: rt-${rtsrc} -> rt-${rtdst}"
+}
+
+check_hs_ipv6_connectivity()
+{
+	local hssrc="$1"
+	local hsdst="$2"
+	local hssrc_nsname
+
+	hssrc_nsname="$(get_hsname "${hssrc}")"
+
+	ip netns exec "${hssrc_nsname}" ping -c 1 -W "${PING_TIMEOUT_SEC}" \
+		"${IPv6_HS_NETWORK}::${hsdst}" >/dev/null 2>&1
+}
+
+check_and_log_hs2gw_connectivity()
+{
+	local hssrc="$1"
+
+	check_hs_ipv6_connectivity "${hssrc}" 254
+	log_test $? 0 "IPv6 Hosts connectivity: hs-${hssrc} -> gw"
+}
+
+check_and_log_hs_ipv6_connectivity()
+{
+	local hssrc="$1"
+	local hsdst="$2"
+
+	check_hs_ipv6_connectivity "${hssrc}" "${hsdst}"
+	log_test $? 0 "IPv6 Hosts connectivity: hs-${hssrc} -> hs-${hsdst}"
+}
+
+check_and_log_hs_connectivity()
+{
+	local hssrc="$1"
+	local hsdst="$2"
+
+	check_and_log_hs_ipv6_connectivity "${hssrc}" "${hsdst}"
+}
+
+router_tests()
+{
+	local i
+	local j
+
+	log_section "IPv6 routers connectivity test"
+
+	for i in ${ROUTERS}; do
+		for j in ${ROUTERS}; do
+			if [ "${i}" -eq "${j}" ]; then
+				continue
+			fi
+
+			check_and_log_rt_connectivity "${i}" "${j}"
+		done
+	done
+}
+
+host2gateway_tests()
+{
+	local hs
+
+	log_section "IPv6 connectivity test among hosts and gateways"
+
+	for hs in ${HOSTS}; do
+		check_and_log_hs2gw_connectivity "${hs}"
+	done
+}
+
+host_srv6_end_flv_psp_tests()
+{
+	log_section "SRv6 connectivity test hosts (h1 <-> h2, PSP flavor)"
+
+	check_and_log_hs_connectivity 1 2
+	check_and_log_hs_connectivity 2 1
+}
+
+test_iproute2_supp_or_ksft_skip()
+{
+	local flavor="$1"
+
+	if ! ip route help 2>&1 | grep -qo "${flavor}"; then
+		echo "SKIP: Missing SRv6 ${flavor} flavor support in iproute2"
+		exit "${ksft_skip}"
+	fi
+}
+
+test_kernel_supp_or_ksft_skip()
+{
+	local flavor="$1"
+	local test_netns
+
+	test_netns="kflv-$(mktemp -u XXXXXXXX)"
+
+	if ! ip netns add "${test_netns}"; then
+		echo "SKIP: Cannot set up netns to test kernel support for flavors"
+		exit "${ksft_skip}"
+	fi
+
+	if ! ip -netns "${test_netns}" link \
+		add "${DUMMY_DEVNAME}" type dummy; then
+		echo "SKIP: Cannot set up dummy dev to test kernel support for flavors"
+
+		ip netns del "${test_netns}"
+		exit "${ksft_skip}"
+	fi
+
+	if ! ip -netns "${test_netns}" link \
+		set "${DUMMY_DEVNAME}" up; then
+		echo "SKIP: Cannot activate dummy dev to test kernel support for flavors"
+
+		ip netns del "${test_netns}"
+		exit "${ksft_skip}"
+	fi
+
+	if ! ip -netns "${test_netns}" -6 route \
+		add "${IPv6_TESTS_ADDR}" encap seg6local \
+		action End flavors "${flavor}" dev "${DUMMY_DEVNAME}"; then
+		echo "SKIP: ${flavor} flavor not supported in kernel"
+
+		ip netns del "${test_netns}"
+		exit "${ksft_skip}"
+	fi
+
+	ip netns del "${test_netns}"
+}
+
+test_dummy_dev_or_ksft_skip()
+{
+	local test_netns
+
+	test_netns="dummy-$(mktemp -u XXXXXXXX)"
+
+	if ! ip netns add "${test_netns}"; then
+		echo "SKIP: Cannot set up netns for testing dummy dev support"
+		exit "${ksft_skip}"
+	fi
+
+	modprobe dummy &>/dev/null || true
+	if ! ip -netns "${test_netns}" link \
+		add "${DUMMY_DEVNAME}" type dummy; then
+		echo "SKIP: dummy dev not supported"
+
+		ip netns del "${test_netns}"
+		exit "${ksft_skip}"
+	fi
+
+	ip netns del "${test_netns}"
+}
+
+if [ "$(id -u)" -ne 0 ]; then
+	echo "SKIP: Need root privileges"
+	exit "${ksft_skip}"
+fi
+
+# required programs to carry out this selftest
+test_command_or_ksft_skip ip
+test_command_or_ksft_skip ping
+test_command_or_ksft_skip sysctl
+test_command_or_ksft_skip grep
+test_command_or_ksft_skip cut
+test_command_or_ksft_skip sed
+test_command_or_ksft_skip sort
+test_command_or_ksft_skip xargs
+
+test_dummy_dev_or_ksft_skip
+test_iproute2_supp_or_ksft_skip psp
+test_kernel_supp_or_ksft_skip psp
+
+set -e
+trap cleanup EXIT
+
+setup
+set +e
+
+router_tests
+host2gateway_tests
+host_srv6_end_flv_psp_tests
+
+print_log_test_results
-- 
cgit 


From 55a9ed0e16baf4d025c160d46bc1e3fac0d4cdc4 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Wed, 15 Feb 2023 00:12:14 +0100
Subject: libbpf: Introduce bpf_{btf,link,map,prog}_get_info_by_fd()

These are type-safe wrappers around bpf_obj_get_info_by_fd(). They
found one problem in selftests, and are also useful for adding
Memory Sanitizer annotations.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230214231221.249277-2-iii@linux.ibm.com
---
 tools/lib/bpf/bpf.c      | 20 ++++++++++++++++++++
 tools/lib/bpf/bpf.h      |  9 +++++++++
 tools/lib/bpf/libbpf.map |  5 +++++
 3 files changed, 34 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 9aff98f42a3d..e750b6f5fcc3 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -1044,6 +1044,26 @@ int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len)
 	return libbpf_err_errno(err);
 }
 
+int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len)
+{
+	return bpf_obj_get_info_by_fd(prog_fd, info, info_len);
+}
+
+int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len)
+{
+	return bpf_obj_get_info_by_fd(map_fd, info, info_len);
+}
+
+int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len)
+{
+	return bpf_obj_get_info_by_fd(btf_fd, info, info_len);
+}
+
+int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info_len)
+{
+	return bpf_obj_get_info_by_fd(link_fd, info, info_len);
+}
+
 int bpf_raw_tracepoint_open(const char *name, int prog_fd)
 {
 	const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 7468978d3c27..9ed9bceb4111 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -386,6 +386,15 @@ LIBBPF_API int bpf_link_get_fd_by_id(__u32 id);
 LIBBPF_API int bpf_link_get_fd_by_id_opts(__u32 id,
 				const struct bpf_get_fd_by_id_opts *opts);
 LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len);
+/* Type-safe variants of bpf_obj_get_info_by_fd(). The callers still needs to
+ * pass info_len, which should normally be
+ * sizeof(struct bpf_{prog,map,btf,link}_info), in order to be compatible with
+ * different libbpf and kernel versions.
+ */
+LIBBPF_API int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len);
+LIBBPF_API int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len);
+LIBBPF_API int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len);
+LIBBPF_API int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info_len);
 
 struct bpf_prog_query_opts {
 	size_t sz; /* size of this struct for forward/backward compatibility */
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 11c36a3c1a9f..50dde1f6521e 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -384,4 +384,9 @@ LIBBPF_1.1.0 {
 } LIBBPF_1.0.0;
 
 LIBBPF_1.2.0 {
+	global:
+		bpf_btf_get_info_by_fd;
+		bpf_link_get_info_by_fd;
+		bpf_map_get_info_by_fd;
+		bpf_prog_get_info_by_fd;
 } LIBBPF_1.1.0;
-- 
cgit 


From 629dfc660cae86a6a48d19f5295226d03caae673 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Wed, 15 Feb 2023 00:12:15 +0100
Subject: libbpf: Use bpf_{btf,link,map,prog}_get_info_by_fd()

Use the new type-safe wrappers around bpf_obj_get_info_by_fd().

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230214231221.249277-3-iii@linux.ibm.com
---
 tools/lib/bpf/btf.c     |  8 ++++----
 tools/lib/bpf/libbpf.c  | 14 +++++++-------
 tools/lib/bpf/netlink.c |  2 +-
 tools/lib/bpf/ringbuf.c |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 64841117fbb2..9181d36118d2 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1350,9 +1350,9 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf)
 	void *ptr;
 	int err;
 
-	/* we won't know btf_size until we call bpf_obj_get_info_by_fd(). so
+	/* we won't know btf_size until we call bpf_btf_get_info_by_fd(). so
 	 * let's start with a sane default - 4KiB here - and resize it only if
-	 * bpf_obj_get_info_by_fd() needs a bigger buffer.
+	 * bpf_btf_get_info_by_fd() needs a bigger buffer.
 	 */
 	last_size = 4096;
 	ptr = malloc(last_size);
@@ -1362,7 +1362,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf)
 	memset(&btf_info, 0, sizeof(btf_info));
 	btf_info.btf = ptr_to_u64(ptr);
 	btf_info.btf_size = last_size;
-	err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
+	err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len);
 
 	if (!err && btf_info.btf_size > last_size) {
 		void *temp_ptr;
@@ -1380,7 +1380,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf)
 		btf_info.btf = ptr_to_u64(ptr);
 		btf_info.btf_size = last_size;
 
-		err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
+		err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len);
 	}
 
 	if (err || btf_info.btf_size > last_size) {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 35a698eb825d..05c4db355f28 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -4345,7 +4345,7 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd)
 	char *new_name;
 
 	memset(&info, 0, len);
-	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	err = bpf_map_get_info_by_fd(fd, &info, &len);
 	if (err && errno == EINVAL)
 		err = bpf_get_map_info_from_fdinfo(fd, &info);
 	if (err)
@@ -4729,7 +4729,7 @@ static int probe_module_btf(void)
 	 * kernel's module BTF support coincides with support for
 	 * name/name_len fields in struct bpf_btf_info.
 	 */
-	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	err = bpf_btf_get_info_by_fd(fd, &info, &len);
 	close(fd);
 	return !err;
 }
@@ -4892,7 +4892,7 @@ static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd)
 	int err;
 
 	memset(&map_info, 0, map_info_len);
-	err = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len);
+	err = bpf_map_get_info_by_fd(map_fd, &map_info, &map_info_len);
 	if (err && errno == EINVAL)
 		err = bpf_get_map_info_from_fdinfo(map_fd, &map_info);
 	if (err) {
@@ -5437,7 +5437,7 @@ static int load_module_btfs(struct bpf_object *obj)
 		info.name = ptr_to_u64(name);
 		info.name_len = sizeof(name);
 
-		err = bpf_obj_get_info_by_fd(fd, &info, &len);
+		err = bpf_btf_get_info_by_fd(fd, &info, &len);
 		if (err) {
 			err = -errno;
 			pr_warn("failed to get BTF object #%d info: %d\n", id, err);
@@ -9030,9 +9030,9 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd)
 	int err;
 
 	memset(&info, 0, info_len);
-	err = bpf_obj_get_info_by_fd(attach_prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(attach_prog_fd, &info, &info_len);
 	if (err) {
-		pr_warn("failed bpf_obj_get_info_by_fd for FD %d: %d\n",
+		pr_warn("failed bpf_prog_get_info_by_fd for FD %d: %d\n",
 			attach_prog_fd, err);
 		return err;
 	}
@@ -11741,7 +11741,7 @@ static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt,
 	/* best-effort sanity checks */
 	memset(&map, 0, sizeof(map));
 	map_info_len = sizeof(map);
-	err = bpf_obj_get_info_by_fd(map_fd, &map, &map_info_len);
+	err = bpf_map_get_info_by_fd(map_fd, &map, &map_info_len);
 	if (err) {
 		err = -errno;
 		/* if BPF_OBJ_GET_INFO_BY_FD is supported, will return
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index cb082a04ffa8..1653e7a8b0a1 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -689,7 +689,7 @@ static int tc_add_fd_and_name(struct libbpf_nla_req *req, int fd)
 	int len, ret;
 
 	memset(&info, 0, info_len);
-	ret = bpf_obj_get_info_by_fd(fd, &info, &info_len);
+	ret = bpf_prog_get_info_by_fd(fd, &info, &info_len);
 	if (ret < 0)
 		return ret;
 
diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
index 47855af25f3b..02199364db13 100644
--- a/tools/lib/bpf/ringbuf.c
+++ b/tools/lib/bpf/ringbuf.c
@@ -83,7 +83,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
 
 	memset(&info, 0, sizeof(info));
 
-	err = bpf_obj_get_info_by_fd(map_fd, &info, &len);
+	err = bpf_map_get_info_by_fd(map_fd, &info, &len);
 	if (err) {
 		err = -errno;
 		pr_warn("ringbuf: failed to get map info for fd=%d: %d\n",
@@ -359,7 +359,7 @@ static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd)
 
 	memset(&info, 0, sizeof(info));
 
-	err = bpf_obj_get_info_by_fd(map_fd, &info, &len);
+	err = bpf_map_get_info_by_fd(map_fd, &info, &len);
 	if (err) {
 		err = -errno;
 		pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err);
-- 
cgit 


From 38f0408ef756e738387f7d8f62b8d58ca5938da4 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Wed, 15 Feb 2023 00:12:16 +0100
Subject: bpftool: Use bpf_{btf,link,map,prog}_get_info_by_fd()

Use the new type-safe wrappers around bpf_obj_get_info_by_fd().

Split the bpf_obj_get_info_by_fd() call in build_btf_type_table() in
two, since knowing the type helps with the Memory Sanitizer.

Improve map_parse_fd_and_info() type safety by using
struct bpf_map_info * instead of void * for info.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20230214231221.249277-4-iii@linux.ibm.com
---
 tools/bpf/bpftool/btf.c        | 13 ++++++++-----
 tools/bpf/bpftool/btf_dumper.c |  4 ++--
 tools/bpf/bpftool/cgroup.c     |  4 ++--
 tools/bpf/bpftool/common.c     | 13 +++++++------
 tools/bpf/bpftool/link.c       |  4 ++--
 tools/bpf/bpftool/main.h       |  3 ++-
 tools/bpf/bpftool/map.c        |  8 ++++----
 tools/bpf/bpftool/prog.c       | 22 +++++++++++-----------
 tools/bpf/bpftool/struct_ops.c |  6 +++---
 9 files changed, 41 insertions(+), 36 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 352290ba7b29..91fcb75babe3 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -537,7 +537,7 @@ static bool btf_is_kernel_module(__u32 btf_id)
 	len = sizeof(btf_info);
 	btf_info.name = ptr_to_u64(btf_name);
 	btf_info.name_len = sizeof(btf_name);
-	err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
+	err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len);
 	close(btf_fd);
 	if (err) {
 		p_err("can't get BTF (ID %u) object info: %s", btf_id, strerror(errno));
@@ -606,7 +606,7 @@ static int do_dump(int argc, char **argv)
 		if (fd < 0)
 			return -1;
 
-		err = bpf_obj_get_info_by_fd(fd, &info, &len);
+		err = bpf_prog_get_info_by_fd(fd, &info, &len);
 		if (err) {
 			p_err("can't get prog info: %s", strerror(errno));
 			goto done;
@@ -789,7 +789,10 @@ build_btf_type_table(struct hashmap *tab, enum bpf_obj_type type,
 		}
 
 		memset(info, 0, *len);
-		err = bpf_obj_get_info_by_fd(fd, info, len);
+		if (type == BPF_OBJ_PROG)
+			err = bpf_prog_get_info_by_fd(fd, info, len);
+		else
+			err = bpf_map_get_info_by_fd(fd, info, len);
 		close(fd);
 		if (err) {
 			p_err("can't get %s info: %s", names[type],
@@ -931,7 +934,7 @@ show_btf(int fd, struct hashmap *btf_prog_table,
 	int err;
 
 	memset(&info, 0, sizeof(info));
-	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	err = bpf_btf_get_info_by_fd(fd, &info, &len);
 	if (err) {
 		p_err("can't get BTF object info: %s", strerror(errno));
 		return -1;
@@ -943,7 +946,7 @@ show_btf(int fd, struct hashmap *btf_prog_table,
 		info.name = ptr_to_u64(name);
 		len = sizeof(info);
 
-		err = bpf_obj_get_info_by_fd(fd, &info, &len);
+		err = bpf_btf_get_info_by_fd(fd, &info, &len);
 		if (err) {
 			p_err("can't get BTF object info: %s", strerror(errno));
 			return -1;
diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c
index eda71fdfe95a..e7f6ec3a8f35 100644
--- a/tools/bpf/bpftool/btf_dumper.c
+++ b/tools/bpf/bpftool/btf_dumper.c
@@ -57,7 +57,7 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d,
 	if (prog_fd < 0)
 		goto print;
 
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	if (err)
 		goto print;
 
@@ -70,7 +70,7 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d,
 	info.func_info_rec_size = finfo_rec_size;
 	info.func_info = ptr_to_u64(&finfo);
 
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	if (err)
 		goto print;
 
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index b46a998d8f8d..ac846b0805b4 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -82,7 +82,7 @@ static void guess_vmlinux_btf_id(__u32 attach_btf_obj_id)
 	if (fd < 0)
 		return;
 
-	err = bpf_obj_get_info_by_fd(fd, &btf_info, &btf_len);
+	err = bpf_btf_get_info_by_fd(fd, &btf_info, &btf_len);
 	if (err)
 		goto out;
 
@@ -108,7 +108,7 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
 	if (prog_fd < 0)
 		return -1;
 
-	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) {
+	if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len)) {
 		close(prog_fd);
 		return -1;
 	}
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 620032042576..5a73ccf14332 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -353,7 +353,7 @@ void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd,
 		info.func_info_rec_size = sizeof(finfo);
 	info.func_info = ptr_to_u64(&finfo);
 
-	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
+	if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len))
 		goto copy_name;
 
 	prog_btf = btf__load_from_kernel_by_id(info.btf_id);
@@ -488,7 +488,7 @@ static int do_build_table_cb(const char *fpath, const struct stat *sb,
 		goto out_close;
 
 	memset(&pinned_info, 0, sizeof(pinned_info));
-	if (bpf_obj_get_info_by_fd(fd, &pinned_info, &len))
+	if (bpf_prog_get_info_by_fd(fd, &pinned_info, &len))
 		goto out_close;
 
 	path = strdup(fpath);
@@ -756,7 +756,7 @@ static int prog_fd_by_nametag(void *nametag, int **fds, bool tag)
 			goto err_close_fds;
 		}
 
-		err = bpf_obj_get_info_by_fd(fd, &info, &len);
+		err = bpf_prog_get_info_by_fd(fd, &info, &len);
 		if (err) {
 			p_err("can't get prog info (%u): %s",
 			      id, strerror(errno));
@@ -916,7 +916,7 @@ static int map_fd_by_name(char *name, int **fds)
 			goto err_close_fds;
 		}
 
-		err = bpf_obj_get_info_by_fd(fd, &info, &len);
+		err = bpf_map_get_info_by_fd(fd, &info, &len);
 		if (err) {
 			p_err("can't get map info (%u): %s",
 			      id, strerror(errno));
@@ -1026,7 +1026,8 @@ exit_free:
 	return fd;
 }
 
-int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
+int map_parse_fd_and_info(int *argc, char ***argv, struct bpf_map_info *info,
+			  __u32 *info_len)
 {
 	int err;
 	int fd;
@@ -1035,7 +1036,7 @@ int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
 	if (fd < 0)
 		return -1;
 
-	err = bpf_obj_get_info_by_fd(fd, info, info_len);
+	err = bpf_map_get_info_by_fd(fd, info, info_len);
 	if (err) {
 		p_err("can't get map info: %s", strerror(errno));
 		close(fd);
diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c
index 6f4cfe01cad4..f985b79cca27 100644
--- a/tools/bpf/bpftool/link.c
+++ b/tools/bpf/bpftool/link.c
@@ -145,7 +145,7 @@ static int get_prog_info(int prog_id, struct bpf_prog_info *info)
 		return prog_fd;
 
 	memset(info, 0, sizeof(*info));
-	err = bpf_obj_get_info_by_fd(prog_fd, info, &len);
+	err = bpf_prog_get_info_by_fd(prog_fd, info, &len);
 	if (err)
 		p_err("can't get prog info: %s", strerror(errno));
 	close(prog_fd);
@@ -327,7 +327,7 @@ static int do_show_link(int fd)
 
 	memset(&info, 0, sizeof(info));
 again:
-	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	err = bpf_link_get_info_by_fd(fd, &info, &len);
 	if (err) {
 		p_err("can't get link info: %s",
 		      strerror(errno));
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index a84224b6a604..0ef373cef4c7 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -168,7 +168,8 @@ int prog_parse_fd(int *argc, char ***argv);
 int prog_parse_fds(int *argc, char ***argv, int **fds);
 int map_parse_fd(int *argc, char ***argv);
 int map_parse_fds(int *argc, char ***argv, int **fds);
-int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
+int map_parse_fd_and_info(int *argc, char ***argv, struct bpf_map_info *info,
+			  __u32 *info_len);
 
 struct bpf_prog_linfo;
 #if defined(HAVE_LLVM_SUPPORT) || defined(HAVE_LIBBFD_SUPPORT)
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 88911d3aa2d9..aaeb8939e137 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -638,7 +638,7 @@ static int do_show_subset(int argc, char **argv)
 	if (json_output && nb_fds > 1)
 		jsonw_start_array(json_wtr);	/* root array */
 	for (i = 0; i < nb_fds; i++) {
-		err = bpf_obj_get_info_by_fd(fds[i], &info, &len);
+		err = bpf_map_get_info_by_fd(fds[i], &info, &len);
 		if (err) {
 			p_err("can't get map info: %s",
 			      strerror(errno));
@@ -708,7 +708,7 @@ static int do_show(int argc, char **argv)
 			break;
 		}
 
-		err = bpf_obj_get_info_by_fd(fd, &info, &len);
+		err = bpf_map_get_info_by_fd(fd, &info, &len);
 		if (err) {
 			p_err("can't get map info: %s", strerror(errno));
 			close(fd);
@@ -764,7 +764,7 @@ static int maps_have_btf(int *fds, int nb_fds)
 	int err, i;
 
 	for (i = 0; i < nb_fds; i++) {
-		err = bpf_obj_get_info_by_fd(fds[i], &info, &len);
+		err = bpf_map_get_info_by_fd(fds[i], &info, &len);
 		if (err) {
 			p_err("can't get map info: %s", strerror(errno));
 			return -1;
@@ -925,7 +925,7 @@ static int do_dump(int argc, char **argv)
 	if (wtr && nb_fds > 1)
 		jsonw_start_array(wtr);	/* root array */
 	for (i = 0; i < nb_fds; i++) {
-		if (bpf_obj_get_info_by_fd(fds[i], &info, &len)) {
+		if (bpf_map_get_info_by_fd(fds[i], &info, &len)) {
 			p_err("can't get map info: %s", strerror(errno));
 			break;
 		}
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index e87738dbffc1..afbe3ec342c8 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -198,7 +198,7 @@ static void show_prog_maps(int fd, __u32 num_maps)
 	info.nr_map_ids = num_maps;
 	info.map_ids = ptr_to_u64(map_ids);
 
-	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	err = bpf_prog_get_info_by_fd(fd, &info, &len);
 	if (err || !info.nr_map_ids)
 		return;
 
@@ -231,7 +231,7 @@ static void *find_metadata(int prog_fd, struct bpf_map_info *map_info)
 
 	memset(&prog_info, 0, sizeof(prog_info));
 	prog_info_len = sizeof(prog_info);
-	ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
+	ret = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
 	if (ret)
 		return NULL;
 
@@ -248,7 +248,7 @@ static void *find_metadata(int prog_fd, struct bpf_map_info *map_info)
 	prog_info.map_ids = ptr_to_u64(map_ids);
 	prog_info_len = sizeof(prog_info);
 
-	ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
+	ret = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
 	if (ret)
 		goto free_map_ids;
 
@@ -259,7 +259,7 @@ static void *find_metadata(int prog_fd, struct bpf_map_info *map_info)
 
 		memset(map_info, 0, sizeof(*map_info));
 		map_info_len = sizeof(*map_info);
-		ret = bpf_obj_get_info_by_fd(map_fd, map_info, &map_info_len);
+		ret = bpf_map_get_info_by_fd(map_fd, map_info, &map_info_len);
 		if (ret < 0) {
 			close(map_fd);
 			goto free_map_ids;
@@ -580,7 +580,7 @@ static int show_prog(int fd)
 	__u32 len = sizeof(info);
 	int err;
 
-	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	err = bpf_prog_get_info_by_fd(fd, &info, &len);
 	if (err) {
 		p_err("can't get prog info: %s", strerror(errno));
 		return -1;
@@ -949,7 +949,7 @@ static int do_dump(int argc, char **argv)
 	for (i = 0; i < nb_fds; i++) {
 		memset(&info, 0, sizeof(info));
 
-		err = bpf_obj_get_info_by_fd(fds[i], &info, &info_len);
+		err = bpf_prog_get_info_by_fd(fds[i], &info, &info_len);
 		if (err) {
 			p_err("can't get prog info: %s", strerror(errno));
 			break;
@@ -961,7 +961,7 @@ static int do_dump(int argc, char **argv)
 			break;
 		}
 
-		err = bpf_obj_get_info_by_fd(fds[i], &info, &info_len);
+		err = bpf_prog_get_info_by_fd(fds[i], &info, &info_len);
 		if (err) {
 			p_err("can't get prog info: %s", strerror(errno));
 			break;
@@ -2170,9 +2170,9 @@ static char *profile_target_name(int tgt_fd)
 	char *name = NULL;
 	int err;
 
-	err = bpf_obj_get_info_by_fd(tgt_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(tgt_fd, &info, &info_len);
 	if (err) {
-		p_err("failed to bpf_obj_get_info_by_fd for prog FD %d", tgt_fd);
+		p_err("failed to get info for prog FD %d", tgt_fd);
 		goto out;
 	}
 
@@ -2183,7 +2183,7 @@ static char *profile_target_name(int tgt_fd)
 
 	func_info_rec_size = info.func_info_rec_size;
 	if (info.nr_func_info == 0) {
-		p_err("bpf_obj_get_info_by_fd for prog FD %d found 0 func_info", tgt_fd);
+		p_err("found 0 func_info for prog FD %d", tgt_fd);
 		goto out;
 	}
 
@@ -2192,7 +2192,7 @@ static char *profile_target_name(int tgt_fd)
 	info.func_info_rec_size = func_info_rec_size;
 	info.func_info = ptr_to_u64(&func_info);
 
-	err = bpf_obj_get_info_by_fd(tgt_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(tgt_fd, &info, &info_len);
 	if (err) {
 		p_err("failed to get func_info for prog FD %d", tgt_fd);
 		goto out;
diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c
index 903b80ff4e9a..b389f4830e11 100644
--- a/tools/bpf/bpftool/struct_ops.c
+++ b/tools/bpf/bpftool/struct_ops.c
@@ -151,7 +151,7 @@ static int get_next_struct_ops_map(const char *name, int *res_fd,
 			return -1;
 		}
 
-		err = bpf_obj_get_info_by_fd(fd, info, &info_len);
+		err = bpf_map_get_info_by_fd(fd, info, &info_len);
 		if (err) {
 			p_err("can't get map info: %s", strerror(errno));
 			close(fd);
@@ -262,7 +262,7 @@ static struct res do_one_id(const char *id_str, work_func func, void *data,
 		goto done;
 	}
 
-	if (bpf_obj_get_info_by_fd(fd, info, &info_len)) {
+	if (bpf_map_get_info_by_fd(fd, info, &info_len)) {
 		p_err("can't get map info: %s", strerror(errno));
 		res.nr_errs++;
 		goto done;
@@ -522,7 +522,7 @@ static int do_register(int argc, char **argv)
 		bpf_link__disconnect(link);
 		bpf_link__destroy(link);
 
-		if (!bpf_obj_get_info_by_fd(bpf_map__fd(map), &info,
+		if (!bpf_map_get_info_by_fd(bpf_map__fd(map), &info,
 					    &info_len))
 			p_info("Registered %s %s id %u",
 			       get_kern_struct_ops_name(&info),
-- 
cgit 


From c5a237a4db21ca7a28518c994def39d7bd62a0d1 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Wed, 15 Feb 2023 00:12:18 +0100
Subject: selftests/bpf: Use bpf_{btf,link,map,prog}_get_info_by_fd()

Use the new type-safe wrappers around bpf_obj_get_info_by_fd().
Fix a prog/map mixup in prog_holds_map().

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230214231221.249277-6-iii@linux.ibm.com
---
 .../selftests/bpf/map_tests/map_in_map_batch_ops.c |  2 +-
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c  |  8 ++++----
 .../testing/selftests/bpf/prog_tests/bpf_obj_id.c  | 20 +++++++++---------
 tools/testing/selftests/bpf/prog_tests/btf.c       | 24 +++++++++++-----------
 .../selftests/bpf/prog_tests/btf_map_in_map.c      |  2 +-
 tools/testing/selftests/bpf/prog_tests/check_mtu.c |  2 +-
 .../selftests/bpf/prog_tests/enable_stats.c        |  2 +-
 .../selftests/bpf/prog_tests/fexit_bpf2bpf.c       | 14 ++++++-------
 .../bpf/prog_tests/flow_dissector_reattach.c       | 10 ++++-----
 .../bpf/prog_tests/libbpf_get_fd_by_id_opts.c      |  4 ++--
 .../testing/selftests/bpf/prog_tests/lsm_cgroup.c  |  3 ++-
 tools/testing/selftests/bpf/prog_tests/metadata.c  |  8 ++++----
 tools/testing/selftests/bpf/prog_tests/mmap.c      |  2 +-
 tools/testing/selftests/bpf/prog_tests/perf_link.c |  2 +-
 tools/testing/selftests/bpf/prog_tests/pinning.c   |  2 +-
 .../selftests/bpf/prog_tests/prog_run_opts.c       |  2 +-
 tools/testing/selftests/bpf/prog_tests/recursion.c |  4 ++--
 .../selftests/bpf/prog_tests/sockmap_basic.c       |  6 +++---
 .../selftests/bpf/prog_tests/task_local_storage.c  |  8 ++++----
 tools/testing/selftests/bpf/prog_tests/tc_bpf.c    |  4 ++--
 .../selftests/bpf/prog_tests/tp_attach_query.c     |  5 +++--
 .../selftests/bpf/prog_tests/unpriv_bpf_disabled.c |  8 ++++----
 .../testing/selftests/bpf/prog_tests/verif_stats.c |  5 +++--
 .../testing/selftests/bpf/prog_tests/xdp_attach.c  |  4 ++--
 .../selftests/bpf/prog_tests/xdp_cpumap_attach.c   |  8 ++++----
 .../selftests/bpf/prog_tests/xdp_devmap_attach.c   |  8 ++++----
 tools/testing/selftests/bpf/prog_tests/xdp_info.c  |  2 +-
 tools/testing/selftests/bpf/prog_tests/xdp_link.c  | 10 +++++----
 tools/testing/selftests/bpf/test_maps.c            |  2 +-
 .../selftests/bpf/test_skb_cgroup_id_user.c        |  2 +-
 .../selftests/bpf/test_tcp_check_syncookie_user.c  |  2 +-
 tools/testing/selftests/bpf/test_verifier.c        |  8 ++++----
 tools/testing/selftests/bpf/testing_helpers.c      |  2 +-
 tools/testing/selftests/bpf/xdp_synproxy.c         | 15 ++++++++------
 34 files changed, 109 insertions(+), 101 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c
index f472d28ad11a..16f1671e4bde 100644
--- a/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c
+++ b/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c
@@ -18,7 +18,7 @@ static __u32 get_map_id_from_fd(int map_fd)
 	uint32_t info_len = sizeof(map_info);
 	int ret;
 
-	ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
+	ret = bpf_map_get_info_by_fd(map_fd, &map_info, &info_len);
 	CHECK(ret < 0, "Finding map info failed", "error:%s\n",
 	      strerror(errno));
 
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 3af6450763e9..1f02168103dd 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -195,8 +195,8 @@ static void check_bpf_link_info(const struct bpf_program *prog)
 		return;
 
 	info_len = sizeof(info);
-	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
-	ASSERT_OK(err, "bpf_obj_get_info_by_fd");
+	err = bpf_link_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
+	ASSERT_OK(err, "bpf_link_get_info_by_fd");
 	ASSERT_EQ(info.iter.task.tid, getpid(), "check_task_tid");
 
 	bpf_link__destroy(link);
@@ -684,13 +684,13 @@ static void test_overflow(bool test_e2big_overflow, bool ret1)
 
 	/* setup filtering map_id in bpf program */
 	map_info_len = sizeof(map_info);
-	err = bpf_obj_get_info_by_fd(map1_fd, &map_info, &map_info_len);
+	err = bpf_map_get_info_by_fd(map1_fd, &map_info, &map_info_len);
 	if (CHECK(err, "get_map_info", "get map info failed: %s\n",
 		  strerror(errno)))
 		goto free_map2;
 	skel->bss->map1_id = map_info.id;
 
-	err = bpf_obj_get_info_by_fd(map2_fd, &map_info, &map_info_len);
+	err = bpf_map_get_info_by_fd(map2_fd, &map_info, &map_info_len);
 	if (CHECK(err, "get_map_info", "get map info failed: %s\n",
 		  strerror(errno)))
 		goto free_map2;
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
index e1c1e521cca2..675b90b15280 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
@@ -44,7 +44,7 @@ void serial_test_bpf_obj_id(void)
 	CHECK(err >= 0 || errno != ENOENT,
 	      "get-fd-by-notexist-link-id", "err %d errno %d\n", err, errno);
 
-	/* Check bpf_obj_get_info_by_fd() */
+	/* Check bpf_map_get_info_by_fd() */
 	bzero(zeros, sizeof(zeros));
 	for (i = 0; i < nr_iters; i++) {
 		now = time(NULL);
@@ -79,7 +79,7 @@ void serial_test_bpf_obj_id(void)
 		/* Check getting map info */
 		info_len = sizeof(struct bpf_map_info) * 2;
 		bzero(&map_infos[i], info_len);
-		err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i],
+		err = bpf_map_get_info_by_fd(map_fds[i], &map_infos[i],
 					     &info_len);
 		if (CHECK(err ||
 			  map_infos[i].type != BPF_MAP_TYPE_ARRAY ||
@@ -118,8 +118,8 @@ void serial_test_bpf_obj_id(void)
 		err = clock_gettime(CLOCK_BOOTTIME, &boot_time_ts);
 		if (CHECK_FAIL(err))
 			goto done;
-		err = bpf_obj_get_info_by_fd(prog_fds[i], &prog_infos[i],
-					     &info_len);
+		err = bpf_prog_get_info_by_fd(prog_fds[i], &prog_infos[i],
+					      &info_len);
 		load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec)
 			+ (prog_infos[i].load_time / nsec_per_sec);
 		if (CHECK(err ||
@@ -161,8 +161,8 @@ void serial_test_bpf_obj_id(void)
 		bzero(&link_infos[i], info_len);
 		link_infos[i].raw_tracepoint.tp_name = ptr_to_u64(&tp_name);
 		link_infos[i].raw_tracepoint.tp_name_len = sizeof(tp_name);
-		err = bpf_obj_get_info_by_fd(bpf_link__fd(links[i]),
-					     &link_infos[i], &info_len);
+		err = bpf_link_get_info_by_fd(bpf_link__fd(links[i]),
+					      &link_infos[i], &info_len);
 		if (CHECK(err ||
 			  link_infos[i].type != BPF_LINK_TYPE_RAW_TRACEPOINT ||
 			  link_infos[i].prog_id != prog_infos[i].id ||
@@ -217,7 +217,7 @@ void serial_test_bpf_obj_id(void)
 		 * prog_info.map_ids = NULL
 		 */
 		prog_info.nr_map_ids = 1;
-		err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);
+		err = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &info_len);
 		if (CHECK(!err || errno != EFAULT,
 			  "get-prog-fd-bad-nr-map-ids", "err %d errno %d(%d)",
 			  err, errno, EFAULT))
@@ -228,7 +228,7 @@ void serial_test_bpf_obj_id(void)
 		saved_map_id = *(int *)((long)prog_infos[i].map_ids);
 		prog_info.map_ids = prog_infos[i].map_ids;
 		prog_info.nr_map_ids = 2;
-		err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);
+		err = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &info_len);
 		prog_infos[i].jited_prog_insns = 0;
 		prog_infos[i].xlated_prog_insns = 0;
 		CHECK(err || info_len != sizeof(struct bpf_prog_info) ||
@@ -277,7 +277,7 @@ void serial_test_bpf_obj_id(void)
 		if (CHECK_FAIL(err))
 			goto done;
 
-		err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
+		err = bpf_map_get_info_by_fd(map_fd, &map_info, &info_len);
 		CHECK(err || info_len != sizeof(struct bpf_map_info) ||
 		      memcmp(&map_info, &map_infos[i], info_len) ||
 		      array_value != array_magic_value,
@@ -322,7 +322,7 @@ void serial_test_bpf_obj_id(void)
 
 		nr_id_found++;
 
-		err = bpf_obj_get_info_by_fd(link_fd, &link_info, &info_len);
+		err = bpf_link_get_info_by_fd(link_fd, &link_info, &info_len);
 		cmp_res = memcmp(&link_info, &link_infos[i],
 				offsetof(struct bpf_link_info, raw_tracepoint));
 		CHECK(err || info_len != sizeof(link_info) || cmp_res,
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index de1b5b9eb93a..cbb600be943d 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -4422,7 +4422,7 @@ static int test_big_btf_info(unsigned int test_num)
 	info->btf = ptr_to_u64(user_btf);
 	info->btf_size = raw_btf_size;
 
-	err = bpf_obj_get_info_by_fd(btf_fd, info, &info_len);
+	err = bpf_btf_get_info_by_fd(btf_fd, info, &info_len);
 	if (CHECK(!err, "!err")) {
 		err = -1;
 		goto done;
@@ -4435,7 +4435,7 @@ static int test_big_btf_info(unsigned int test_num)
 	 * to userspace.
 	 */
 	info_garbage.garbage = 0;
-	err = bpf_obj_get_info_by_fd(btf_fd, info, &info_len);
+	err = bpf_btf_get_info_by_fd(btf_fd, info, &info_len);
 	if (CHECK(err || info_len != sizeof(*info),
 		  "err:%d errno:%d info_len:%u sizeof(*info):%zu",
 		  err, errno, info_len, sizeof(*info))) {
@@ -4499,7 +4499,7 @@ static int test_btf_id(unsigned int test_num)
 
 	/* Test BPF_OBJ_GET_INFO_BY_ID on btf_id */
 	info_len = sizeof(info[0]);
-	err = bpf_obj_get_info_by_fd(btf_fd[0], &info[0], &info_len);
+	err = bpf_btf_get_info_by_fd(btf_fd[0], &info[0], &info_len);
 	if (CHECK(err, "errno:%d", errno)) {
 		err = -1;
 		goto done;
@@ -4512,7 +4512,7 @@ static int test_btf_id(unsigned int test_num)
 	}
 
 	ret = 0;
-	err = bpf_obj_get_info_by_fd(btf_fd[1], &info[1], &info_len);
+	err = bpf_btf_get_info_by_fd(btf_fd[1], &info[1], &info_len);
 	if (CHECK(err || info[0].id != info[1].id ||
 		  info[0].btf_size != info[1].btf_size ||
 		  (ret = memcmp(user_btf[0], user_btf[1], info[0].btf_size)),
@@ -4535,7 +4535,7 @@ static int test_btf_id(unsigned int test_num)
 	}
 
 	info_len = sizeof(map_info);
-	err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
+	err = bpf_map_get_info_by_fd(map_fd, &map_info, &info_len);
 	if (CHECK(err || map_info.btf_id != info[0].id ||
 		  map_info.btf_key_type_id != 1 || map_info.btf_value_type_id != 2,
 		  "err:%d errno:%d info.id:%u btf_id:%u btf_key_type_id:%u btf_value_type_id:%u",
@@ -4638,7 +4638,7 @@ static void do_test_get_info(unsigned int test_num)
 	info.btf_size = user_btf_size;
 
 	ret = 0;
-	err = bpf_obj_get_info_by_fd(btf_fd, &info, &info_len);
+	err = bpf_btf_get_info_by_fd(btf_fd, &info, &info_len);
 	if (CHECK(err || !info.id || info_len != sizeof(info) ||
 		  info.btf_size != raw_btf_size ||
 		  (ret = memcmp(raw_btf, user_btf, expected_nbytes)),
@@ -4755,7 +4755,7 @@ static void do_test_file(unsigned int test_num)
 
 	/* get necessary program info */
 	info_len = sizeof(struct bpf_prog_info);
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 
 	if (CHECK(err < 0, "invalid get info (1st) errno:%d", errno)) {
 		fprintf(stderr, "%s\n", btf_log_buf);
@@ -4787,7 +4787,7 @@ static void do_test_file(unsigned int test_num)
 	info.func_info_rec_size = rec_size;
 	info.func_info = ptr_to_u64(func_info);
 
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 
 	if (CHECK(err < 0, "invalid get info (2nd) errno:%d", errno)) {
 		fprintf(stderr, "%s\n", btf_log_buf);
@@ -6405,7 +6405,7 @@ static int test_get_finfo(const struct prog_info_raw_test *test,
 
 	/* get necessary lens */
 	info_len = sizeof(struct bpf_prog_info);
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	if (CHECK(err < 0, "invalid get info (1st) errno:%d", errno)) {
 		fprintf(stderr, "%s\n", btf_log_buf);
 		return -1;
@@ -6435,7 +6435,7 @@ static int test_get_finfo(const struct prog_info_raw_test *test,
 	info.nr_func_info = nr_func_info;
 	info.func_info_rec_size = rec_size;
 	info.func_info = ptr_to_u64(func_info);
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	if (CHECK(err < 0, "invalid get info (2nd) errno:%d", errno)) {
 		fprintf(stderr, "%s\n", btf_log_buf);
 		err = -1;
@@ -6499,7 +6499,7 @@ static int test_get_linfo(const struct prog_info_raw_test *test,
 	nr_jited_func_lens = nr_jited_ksyms;
 
 	info_len = sizeof(struct bpf_prog_info);
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	if (CHECK(err < 0, "err:%d errno:%d", err, errno)) {
 		err = -1;
 		goto done;
@@ -6573,7 +6573,7 @@ static int test_get_linfo(const struct prog_info_raw_test *test,
 		info.jited_func_lens = ptr_to_u64(jited_func_lens);
 	}
 
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 
 	/*
 	 * Only recheck the info.*line_info* fields.
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
index eb90a6b8850d..a8b53b8736f0 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
@@ -14,7 +14,7 @@ static __u32 bpf_map_id(struct bpf_map *map)
 	int err;
 
 	memset(&info, 0, info_len);
-	err = bpf_obj_get_info_by_fd(bpf_map__fd(map), &info, &info_len);
+	err = bpf_map_get_info_by_fd(bpf_map__fd(map), &info, &info_len);
 	if (err)
 		return 0;
 	return info.id;
diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c
index 12f4395f18b3..5338d2ea0460 100644
--- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c
+++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c
@@ -59,7 +59,7 @@ static void test_check_mtu_xdp_attach(void)
 
 	memset(&link_info, 0, sizeof(link_info));
 	fd = bpf_link__fd(link);
-	err = bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
+	err = bpf_link_get_info_by_fd(fd, &link_info, &link_info_len);
 	if (CHECK(err, "link_info", "failed: %d\n", err))
 		goto out;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/enable_stats.c b/tools/testing/selftests/bpf/prog_tests/enable_stats.c
index 2cb2085917e7..75f85d0fe74a 100644
--- a/tools/testing/selftests/bpf/prog_tests/enable_stats.c
+++ b/tools/testing/selftests/bpf/prog_tests/enable_stats.c
@@ -28,7 +28,7 @@ void test_enable_stats(void)
 
 	prog_fd = bpf_program__fd(skel->progs.test_enable_stats);
 	memset(&info, 0, info_len);
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	if (CHECK(err, "get_prog_info",
 		  "failed to get bpf_prog_info for fd %d\n", prog_fd))
 		goto cleanup;
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
index 20f5fa0fcec9..8ec73fdfcdab 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
@@ -79,7 +79,7 @@ static void test_fexit_bpf2bpf_common(const char *obj_file,
 		return;
 
 	info_len = sizeof(prog_info);
-	err = bpf_obj_get_info_by_fd(tgt_fd, &prog_info, &info_len);
+	err = bpf_prog_get_info_by_fd(tgt_fd, &prog_info, &info_len);
 	if (!ASSERT_OK(err, "tgt_fd_get_info"))
 		goto close_prog;
 
@@ -136,8 +136,8 @@ static void test_fexit_bpf2bpf_common(const char *obj_file,
 
 		info_len = sizeof(link_info);
 		memset(&link_info, 0, sizeof(link_info));
-		err = bpf_obj_get_info_by_fd(bpf_link__fd(link[i]),
-					     &link_info, &info_len);
+		err = bpf_link_get_info_by_fd(bpf_link__fd(link[i]),
+					      &link_info, &info_len);
 		ASSERT_OK(err, "link_fd_get_info");
 		ASSERT_EQ(link_info.tracing.attach_type,
 			  bpf_program__expected_attach_type(prog[i]),
@@ -417,7 +417,7 @@ static int find_prog_btf_id(const char *name, __u32 attach_prog_fd)
 	struct btf *btf;
 	int ret;
 
-	ret = bpf_obj_get_info_by_fd(attach_prog_fd, &info, &info_len);
+	ret = bpf_prog_get_info_by_fd(attach_prog_fd, &info, &info_len);
 	if (ret)
 		return ret;
 
@@ -483,12 +483,12 @@ static void test_fentry_to_cgroup_bpf(void)
 	if (!ASSERT_GE(fentry_fd, 0, "load_fentry"))
 		goto cleanup;
 
-	/* Make sure bpf_obj_get_info_by_fd works correctly when attaching
+	/* Make sure bpf_prog_get_info_by_fd works correctly when attaching
 	 * to another BPF program.
 	 */
 
-	ASSERT_OK(bpf_obj_get_info_by_fd(fentry_fd, &info, &info_len),
-		  "bpf_obj_get_info_by_fd");
+	ASSERT_OK(bpf_prog_get_info_by_fd(fentry_fd, &info, &info_len),
+		  "bpf_prog_get_info_by_fd");
 
 	ASSERT_EQ(info.btf_id, 0, "info.btf_id");
 	ASSERT_EQ(info.attach_btf_id, btf_id, "info.attach_btf_id");
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
index 7c79462d2702..9333f7346d15 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
@@ -60,9 +60,9 @@ static __u32 query_prog_id(int prog)
 	__u32 info_len = sizeof(info);
 	int err;
 
-	err = bpf_obj_get_info_by_fd(prog, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog, &info, &info_len);
 	if (CHECK_FAIL(err || info_len != sizeof(info))) {
-		perror("bpf_obj_get_info_by_fd");
+		perror("bpf_prog_get_info_by_fd");
 		return 0;
 	}
 
@@ -497,7 +497,7 @@ static void test_link_get_info(int netns, int prog1, int prog2)
 	}
 
 	info_len = sizeof(info);
-	err = bpf_obj_get_info_by_fd(link, &info, &info_len);
+	err = bpf_link_get_info_by_fd(link, &info, &info_len);
 	if (CHECK_FAIL(err)) {
 		perror("bpf_obj_get_info");
 		goto out_unlink;
@@ -521,7 +521,7 @@ static void test_link_get_info(int netns, int prog1, int prog2)
 
 	link_id = info.id;
 	info_len = sizeof(info);
-	err = bpf_obj_get_info_by_fd(link, &info, &info_len);
+	err = bpf_link_get_info_by_fd(link, &info, &info_len);
 	if (CHECK_FAIL(err)) {
 		perror("bpf_obj_get_info");
 		goto out_unlink;
@@ -546,7 +546,7 @@ static void test_link_get_info(int netns, int prog1, int prog2)
 	netns = -1;
 
 	info_len = sizeof(info);
-	err = bpf_obj_get_info_by_fd(link, &info, &info_len);
+	err = bpf_link_get_info_by_fd(link, &info, &info_len);
 	if (CHECK_FAIL(err)) {
 		perror("bpf_obj_get_info");
 		goto out_unlink;
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_get_fd_by_id_opts.c b/tools/testing/selftests/bpf/prog_tests/libbpf_get_fd_by_id_opts.c
index 25e5dfa9c315..a3f238f51d05 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_get_fd_by_id_opts.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_get_fd_by_id_opts.c
@@ -29,9 +29,9 @@ void test_libbpf_get_fd_by_id_opts(void)
 	if (!ASSERT_OK(ret, "test_libbpf_get_fd_by_id_opts__attach"))
 		goto close_prog;
 
-	ret = bpf_obj_get_info_by_fd(bpf_map__fd(skel->maps.data_input),
+	ret = bpf_map_get_info_by_fd(bpf_map__fd(skel->maps.data_input),
 				     &info_m, &len);
-	if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd"))
+	if (!ASSERT_OK(ret, "bpf_map_get_info_by_fd"))
 		goto close_prog;
 
 	fd = bpf_map_get_fd_by_id(info_m.id);
diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c
index f117bfef68a1..130a3b21e467 100644
--- a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c
+++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c
@@ -47,7 +47,8 @@ static __u32 query_prog_cnt(int cgroup_fd, const char *attach_func)
 
 		fd = bpf_prog_get_fd_by_id(p.prog_ids[i]);
 		ASSERT_GE(fd, 0, "prog_get_fd_by_id");
-		ASSERT_OK(bpf_obj_get_info_by_fd(fd, &info, &info_len), "prog_info_by_fd");
+		ASSERT_OK(bpf_prog_get_info_by_fd(fd, &info, &info_len),
+			  "prog_info_by_fd");
 		close(fd);
 
 		if (info.attach_btf_id ==
diff --git a/tools/testing/selftests/bpf/prog_tests/metadata.c b/tools/testing/selftests/bpf/prog_tests/metadata.c
index 2c53eade88e3..8b67dfc10f5c 100644
--- a/tools/testing/selftests/bpf/prog_tests/metadata.c
+++ b/tools/testing/selftests/bpf/prog_tests/metadata.c
@@ -16,7 +16,7 @@ static int duration;
 static int prog_holds_map(int prog_fd, int map_fd)
 {
 	struct bpf_prog_info prog_info = {};
-	struct bpf_prog_info map_info = {};
+	struct bpf_map_info map_info = {};
 	__u32 prog_info_len;
 	__u32 map_info_len;
 	__u32 *map_ids;
@@ -25,12 +25,12 @@ static int prog_holds_map(int prog_fd, int map_fd)
 	int i;
 
 	map_info_len = sizeof(map_info);
-	ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len);
+	ret = bpf_map_get_info_by_fd(map_fd, &map_info, &map_info_len);
 	if (ret)
 		return -errno;
 
 	prog_info_len = sizeof(prog_info);
-	ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
+	ret = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
 	if (ret)
 		return -errno;
 
@@ -44,7 +44,7 @@ static int prog_holds_map(int prog_fd, int map_fd)
 	prog_info.map_ids = ptr_to_u64(map_ids);
 	prog_info_len = sizeof(prog_info);
 
-	ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
+	ret = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
 	if (ret) {
 		ret = -errno;
 		goto free_map_ids;
diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c
index 37b002ca1167..a271d5a0f7ab 100644
--- a/tools/testing/selftests/bpf/prog_tests/mmap.c
+++ b/tools/testing/selftests/bpf/prog_tests/mmap.c
@@ -64,7 +64,7 @@ void test_mmap(void)
 
 	/* get map's ID */
 	memset(&map_info, 0, map_info_sz);
-	err = bpf_obj_get_info_by_fd(data_map_fd, &map_info, &map_info_sz);
+	err = bpf_map_get_info_by_fd(data_map_fd, &map_info, &map_info_sz);
 	if (CHECK(err, "map_get_info", "failed %d\n", errno))
 		goto cleanup;
 	data_map_id = map_info.id;
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_link.c b/tools/testing/selftests/bpf/prog_tests/perf_link.c
index 224eba6fef2e..3a25f1c743a1 100644
--- a/tools/testing/selftests/bpf/prog_tests/perf_link.c
+++ b/tools/testing/selftests/bpf/prog_tests/perf_link.c
@@ -54,7 +54,7 @@ void serial_test_perf_link(void)
 		goto cleanup;
 
 	memset(&info, 0, sizeof(info));
-	err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len);
+	err = bpf_link_get_info_by_fd(link_fd, &info, &info_len);
 	if (!ASSERT_OK(err, "link_get_info"))
 		goto cleanup;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c
index d95cee5867b7..c799a3c5ad1f 100644
--- a/tools/testing/selftests/bpf/prog_tests/pinning.c
+++ b/tools/testing/selftests/bpf/prog_tests/pinning.c
@@ -18,7 +18,7 @@ __u32 get_map_id(struct bpf_object *obj, const char *name)
 	if (CHECK(!map, "find map", "NULL map"))
 		return 0;
 
-	err = bpf_obj_get_info_by_fd(bpf_map__fd(map),
+	err = bpf_map_get_info_by_fd(bpf_map__fd(map),
 				     &map_info, &map_info_len);
 	CHECK(err, "get map info", "err %d errno %d", err, errno);
 	return map_info.id;
diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_opts.c b/tools/testing/selftests/bpf/prog_tests/prog_run_opts.c
index 1ccd2bdf8fa8..01f1d1b6715a 100644
--- a/tools/testing/selftests/bpf/prog_tests/prog_run_opts.c
+++ b/tools/testing/selftests/bpf/prog_tests/prog_run_opts.c
@@ -12,7 +12,7 @@ static void check_run_cnt(int prog_fd, __u64 run_cnt)
 	__u32 info_len = sizeof(info);
 	int err;
 
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	if (CHECK(err, "get_prog_info", "failed to get bpf_prog_info for fd %d\n", prog_fd))
 		return;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/recursion.c b/tools/testing/selftests/bpf/prog_tests/recursion.c
index f3af2627b599..23552d3e3365 100644
--- a/tools/testing/selftests/bpf/prog_tests/recursion.c
+++ b/tools/testing/selftests/bpf/prog_tests/recursion.c
@@ -31,8 +31,8 @@ void test_recursion(void)
 	bpf_map_delete_elem(bpf_map__fd(skel->maps.hash2), &key);
 	ASSERT_EQ(skel->bss->pass2, 2, "pass2 == 2");
 
-	err = bpf_obj_get_info_by_fd(bpf_program__fd(skel->progs.on_delete),
-				     &prog_info, &prog_info_len);
+	err = bpf_prog_get_info_by_fd(bpf_program__fd(skel->progs.on_delete),
+				      &prog_info, &prog_info_len);
 	if (!ASSERT_OK(err, "get_prog_info"))
 		goto out;
 	ASSERT_EQ(prog_info.recursion_misses, 2, "recursion_misses");
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index 0aa088900699..0ce25a967481 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -299,9 +299,9 @@ static __u32 query_prog_id(int prog_fd)
 	__u32 info_len = sizeof(info);
 	int err;
 
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-	if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd") ||
-	    !ASSERT_EQ(info_len, sizeof(info), "bpf_obj_get_info_by_fd"))
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
+	if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd") ||
+	    !ASSERT_EQ(info_len, sizeof(info), "bpf_prog_get_info_by_fd"))
 		return 0;
 
 	return info.id;
diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index a176bd75a748..ea8537c54413 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -119,19 +119,19 @@ static void test_recursion(void)
 
 	prog_fd = bpf_program__fd(skel->progs.on_lookup);
 	memset(&info, 0, sizeof(info));
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	ASSERT_OK(err, "get prog info");
 	ASSERT_GT(info.recursion_misses, 0, "on_lookup prog recursion");
 
 	prog_fd = bpf_program__fd(skel->progs.on_update);
 	memset(&info, 0, sizeof(info));
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	ASSERT_OK(err, "get prog info");
 	ASSERT_EQ(info.recursion_misses, 0, "on_update prog recursion");
 
 	prog_fd = bpf_program__fd(skel->progs.on_enter);
 	memset(&info, 0, sizeof(info));
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	ASSERT_OK(err, "get prog info");
 	ASSERT_EQ(info.recursion_misses, 0, "on_enter prog recursion");
 
@@ -221,7 +221,7 @@ static void test_nodeadlock(void)
 
 	info_len = sizeof(info);
 	prog_fd = bpf_program__fd(skel->progs.socket_post_create);
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	ASSERT_OK(err, "get prog info");
 	ASSERT_EQ(info.recursion_misses, 0, "prog recursion");
 
diff --git a/tools/testing/selftests/bpf/prog_tests/tc_bpf.c b/tools/testing/selftests/bpf/prog_tests/tc_bpf.c
index 4a505a5adf4d..e873766276d1 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_bpf.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_bpf.c
@@ -29,8 +29,8 @@ static int test_tc_bpf_basic(const struct bpf_tc_hook *hook, int fd)
 	__u32 info_len = sizeof(info);
 	int ret;
 
-	ret = bpf_obj_get_info_by_fd(fd, &info, &info_len);
-	if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd"))
+	ret = bpf_prog_get_info_by_fd(fd, &info, &info_len);
+	if (!ASSERT_OK(ret, "bpf_prog_get_info_by_fd"))
 		return ret;
 
 	ret = bpf_tc_attach(hook, &opts);
diff --git a/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
index a479080533db..770fcc3bb1ba 100644
--- a/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
+++ b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
@@ -45,8 +45,9 @@ void serial_test_tp_attach_query(void)
 		prog_info.xlated_prog_len = 0;
 		prog_info.nr_map_ids = 0;
 		info_len = sizeof(prog_info);
-		err = bpf_obj_get_info_by_fd(prog_fd[i], &prog_info, &info_len);
-		if (CHECK(err, "bpf_obj_get_info_by_fd", "err %d errno %d\n",
+		err = bpf_prog_get_info_by_fd(prog_fd[i], &prog_info,
+					      &info_len);
+		if (CHECK(err, "bpf_prog_get_info_by_fd", "err %d errno %d\n",
 			  err, errno))
 			goto cleanup1;
 		saved_prog_ids[i] = prog_info.id;
diff --git a/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c b/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c
index 1ed3cc2092db..8383a99f610f 100644
--- a/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c
+++ b/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c
@@ -179,7 +179,7 @@ static void test_unpriv_bpf_disabled_negative(struct test_unpriv_bpf_disabled *s
 	ASSERT_EQ(bpf_prog_get_next_id(prog_id, &next), -EPERM, "prog_get_next_id_fails");
 	ASSERT_EQ(bpf_prog_get_next_id(0, &next), -EPERM, "prog_get_next_id_fails");
 
-	if (ASSERT_OK(bpf_obj_get_info_by_fd(map_fds[0], &map_info, &map_info_len),
+	if (ASSERT_OK(bpf_map_get_info_by_fd(map_fds[0], &map_info, &map_info_len),
 		      "obj_get_info_by_fd")) {
 		ASSERT_EQ(bpf_map_get_fd_by_id(map_info.id), -EPERM, "map_get_fd_by_id_fails");
 		ASSERT_EQ(bpf_map_get_next_id(map_info.id, &next), -EPERM,
@@ -187,8 +187,8 @@ static void test_unpriv_bpf_disabled_negative(struct test_unpriv_bpf_disabled *s
 	}
 	ASSERT_EQ(bpf_map_get_next_id(0, &next), -EPERM, "map_get_next_id_fails");
 
-	if (ASSERT_OK(bpf_obj_get_info_by_fd(bpf_link__fd(skel->links.sys_nanosleep_enter),
-					     &link_info, &link_info_len),
+	if (ASSERT_OK(bpf_link_get_info_by_fd(bpf_link__fd(skel->links.sys_nanosleep_enter),
+					      &link_info, &link_info_len),
 		      "obj_get_info_by_fd")) {
 		ASSERT_EQ(bpf_link_get_fd_by_id(link_info.id), -EPERM, "link_get_fd_by_id_fails");
 		ASSERT_EQ(bpf_link_get_next_id(link_info.id, &next), -EPERM,
@@ -269,7 +269,7 @@ void test_unpriv_bpf_disabled(void)
 	}
 
 	prog_fd = bpf_program__fd(skel->progs.sys_nanosleep_enter);
-	ASSERT_OK(bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len),
+	ASSERT_OK(bpf_prog_get_info_by_fd(prog_fd, &prog_info, &prog_info_len),
 		  "obj_get_info_by_fd");
 	prog_id = prog_info.id;
 	ASSERT_GT(prog_id, 0, "valid_prog_id");
diff --git a/tools/testing/selftests/bpf/prog_tests/verif_stats.c b/tools/testing/selftests/bpf/prog_tests/verif_stats.c
index a47e7c0e1ffd..af4b95f57ac1 100644
--- a/tools/testing/selftests/bpf/prog_tests/verif_stats.c
+++ b/tools/testing/selftests/bpf/prog_tests/verif_stats.c
@@ -16,8 +16,9 @@ void test_verif_stats(void)
 	if (!ASSERT_OK_PTR(skel, "trace_vprintk__open_and_load"))
 		goto cleanup;
 
-	err = bpf_obj_get_info_by_fd(skel->progs.sys_enter.prog_fd, &info, &len);
-	if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd"))
+	err = bpf_prog_get_info_by_fd(skel->progs.sys_enter.prog_fd,
+				      &info, &len);
+	if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd"))
 		goto cleanup;
 
 	if (!ASSERT_GT(info.verified_insns, 0, "verified_insns"))
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
index 062fbc8c8e5e..d4cd9f873c14 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
@@ -18,7 +18,7 @@ void serial_test_xdp_attach(void)
 	err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj1, &fd1);
 	if (CHECK_FAIL(err))
 		return;
-	err = bpf_obj_get_info_by_fd(fd1, &info, &len);
+	err = bpf_prog_get_info_by_fd(fd1, &info, &len);
 	if (CHECK_FAIL(err))
 		goto out_1;
 	id1 = info.id;
@@ -28,7 +28,7 @@ void serial_test_xdp_attach(void)
 		goto out_1;
 
 	memset(&info, 0, sizeof(info));
-	err = bpf_obj_get_info_by_fd(fd2, &info, &len);
+	err = bpf_prog_get_info_by_fd(fd2, &info, &len);
 	if (CHECK_FAIL(err))
 		goto out_2;
 	id2 = info.id;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
index f775a1613833..481626a875d1 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
@@ -33,8 +33,8 @@ static void test_xdp_with_cpumap_helpers(void)
 
 	prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm);
 	map_fd = bpf_map__fd(skel->maps.cpu_map);
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &len);
-	if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd"))
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &len);
+	if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd"))
 		goto out_close;
 
 	val.bpf_prog.fd = prog_fd;
@@ -85,8 +85,8 @@ static void test_xdp_with_cpumap_frags_helpers(void)
 
 	frags_prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm_frags);
 	map_fd = bpf_map__fd(skel->maps.cpu_map);
-	err = bpf_obj_get_info_by_fd(frags_prog_fd, &info, &len);
-	if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd"))
+	err = bpf_prog_get_info_by_fd(frags_prog_fd, &info, &len);
+	if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd"))
 		goto out_close;
 
 	val.bpf_prog.fd = frags_prog_fd;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c
index ead40016c324..ce6812558287 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c
@@ -35,8 +35,8 @@ static void test_xdp_with_devmap_helpers(void)
 
 	dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm);
 	map_fd = bpf_map__fd(skel->maps.dm_ports);
-	err = bpf_obj_get_info_by_fd(dm_fd, &info, &len);
-	if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd"))
+	err = bpf_prog_get_info_by_fd(dm_fd, &info, &len);
+	if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd"))
 		goto out_close;
 
 	val.bpf_prog.fd = dm_fd;
@@ -98,8 +98,8 @@ static void test_xdp_with_devmap_frags_helpers(void)
 
 	dm_fd_frags = bpf_program__fd(skel->progs.xdp_dummy_dm_frags);
 	map_fd = bpf_map__fd(skel->maps.dm_ports);
-	err = bpf_obj_get_info_by_fd(dm_fd_frags, &info, &len);
-	if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd"))
+	err = bpf_prog_get_info_by_fd(dm_fd_frags, &info, &len);
+	if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd"))
 		goto out_close;
 
 	val.bpf_prog.fd = dm_fd_frags;
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_info.c b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
index 286c21ecdc65..1dbddcab87a8 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_info.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
@@ -34,7 +34,7 @@ void serial_test_xdp_info(void)
 	if (CHECK_FAIL(err))
 		return;
 
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &len);
 	if (CHECK(err, "get_prog_info", "errno=%d\n", errno))
 		goto out_close;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
index 3e9d5c5521f0..e7e9f3c22edf 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_link.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
@@ -29,13 +29,13 @@ void serial_test_xdp_link(void)
 	prog_fd2 = bpf_program__fd(skel2->progs.xdp_handler);
 
 	memset(&prog_info, 0, sizeof(prog_info));
-	err = bpf_obj_get_info_by_fd(prog_fd1, &prog_info, &prog_info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd1, &prog_info, &prog_info_len);
 	if (!ASSERT_OK(err, "fd_info1"))
 		goto cleanup;
 	id1 = prog_info.id;
 
 	memset(&prog_info, 0, sizeof(prog_info));
-	err = bpf_obj_get_info_by_fd(prog_fd2, &prog_info, &prog_info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd2, &prog_info, &prog_info_len);
 	if (!ASSERT_OK(err, "fd_info2"))
 		goto cleanup;
 	id2 = prog_info.id;
@@ -119,7 +119,8 @@ void serial_test_xdp_link(void)
 		goto cleanup;
 
 	memset(&link_info, 0, sizeof(link_info));
-	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &link_info, &link_info_len);
+	err = bpf_link_get_info_by_fd(bpf_link__fd(link),
+				      &link_info, &link_info_len);
 	if (!ASSERT_OK(err, "link_info"))
 		goto cleanup;
 
@@ -137,7 +138,8 @@ void serial_test_xdp_link(void)
 		goto cleanup;
 
 	memset(&link_info, 0, sizeof(link_info));
-	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &link_info, &link_info_len);
+	err = bpf_link_get_info_by_fd(bpf_link__fd(link),
+				      &link_info, &link_info_len);
 
 	ASSERT_OK(err, "link_info");
 	ASSERT_EQ(link_info.prog_id, id1, "link_prog_id");
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index b73152822aa2..7fc00e423e4d 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -1275,7 +1275,7 @@ static void test_map_in_map(void)
 			goto out_map_in_map;
 		}
 
-		err = bpf_obj_get_info_by_fd(fd, &info, &len);
+		err = bpf_map_get_info_by_fd(fd, &info, &len);
 		if (err) {
 			printf("Failed to get map info by fd %d: %d", fd,
 			       errno);
diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c
index 3256de30f563..ed518d075d1d 100644
--- a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c
+++ b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c
@@ -93,7 +93,7 @@ int get_map_fd_by_prog_id(int prog_id)
 	info.nr_map_ids = 1;
 	info.map_ids = (__u64) (unsigned long) map_ids;
 
-	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) {
+	if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len)) {
 		log_err("Failed to get info by prog fd %d", prog_fd);
 		goto err;
 	}
diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
index 5c8ef062f760..32df93747095 100644
--- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
+++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
@@ -96,7 +96,7 @@ static int get_map_fd_by_prog_id(int prog_id, bool *xdp)
 	info.nr_map_ids = 1;
 	info.map_ids = (__u64)(unsigned long)map_ids;
 
-	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) {
+	if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len)) {
 		log_err("Failed to get info by prog fd %d", prog_fd);
 		goto err;
 	}
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 887c49dc5abd..8b9949bb833d 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -1239,8 +1239,8 @@ static int get_xlated_program(int fd_prog, struct bpf_insn **buf, int *cnt)
 	__u32 xlated_prog_len;
 	__u32 buf_element_size = sizeof(struct bpf_insn);
 
-	if (bpf_obj_get_info_by_fd(fd_prog, &info, &info_len)) {
-		perror("bpf_obj_get_info_by_fd failed");
+	if (bpf_prog_get_info_by_fd(fd_prog, &info, &info_len)) {
+		perror("bpf_prog_get_info_by_fd failed");
 		return -1;
 	}
 
@@ -1261,8 +1261,8 @@ static int get_xlated_program(int fd_prog, struct bpf_insn **buf, int *cnt)
 	bzero(&info, sizeof(info));
 	info.xlated_prog_len = xlated_prog_len;
 	info.xlated_prog_insns = (__u64)(unsigned long)*buf;
-	if (bpf_obj_get_info_by_fd(fd_prog, &info, &info_len)) {
-		perror("second bpf_obj_get_info_by_fd failed");
+	if (bpf_prog_get_info_by_fd(fd_prog, &info, &info_len)) {
+		perror("second bpf_prog_get_info_by_fd failed");
 		goto out_free_buf;
 	}
 
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index 9695318e8132..6c44153755e6 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -164,7 +164,7 @@ __u32 link_info_prog_id(const struct bpf_link *link, struct bpf_link_info *info)
 	int err;
 
 	memset(info, 0, sizeof(*info));
-	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), info, &info_len);
+	err = bpf_link_get_info_by_fd(bpf_link__fd(link), info, &info_len);
 	if (err) {
 		printf("failed to get link info: %d\n", -errno);
 		return 0;
diff --git a/tools/testing/selftests/bpf/xdp_synproxy.c b/tools/testing/selftests/bpf/xdp_synproxy.c
index 6dbe0b745198..ce68c342b56f 100644
--- a/tools/testing/selftests/bpf/xdp_synproxy.c
+++ b/tools/testing/selftests/bpf/xdp_synproxy.c
@@ -217,9 +217,10 @@ static int syncookie_attach(const char *argv0, unsigned int ifindex, bool tc)
 
 	prog_fd = bpf_program__fd(prog);
 
-	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	if (err < 0) {
-		fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err));
+		fprintf(stderr, "Error: bpf_prog_get_info_by_fd: %s\n",
+			strerror(-err));
 		goto out;
 	}
 	attached_tc = tc;
@@ -292,9 +293,10 @@ static int syncookie_open_bpf_maps(__u32 prog_id, int *values_map_fd, int *ports
 	};
 	info_len = sizeof(prog_info);
 
-	err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);
+	err = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &info_len);
 	if (err != 0) {
-		fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err));
+		fprintf(stderr, "Error: bpf_prog_get_info_by_fd: %s\n",
+			strerror(-err));
 		goto out;
 	}
 
@@ -317,9 +319,10 @@ static int syncookie_open_bpf_maps(__u32 prog_id, int *values_map_fd, int *ports
 		map_fd = err;
 
 		info_len = sizeof(map_info);
-		err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
+		err = bpf_map_get_info_by_fd(map_fd, &map_info, &info_len);
 		if (err != 0) {
-			fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err));
+			fprintf(stderr, "Error: bpf_map_get_info_by_fd: %s\n",
+				strerror(-err));
 			close(map_fd);
 			goto err_close_map_fds;
 		}
-- 
cgit 


From df71a42cc37a44cdc7682f57aecf14ff44391eed Mon Sep 17 00:00:00 2001
From: Taichi Nishimura <awkrail01@gmail.com>
Date: Thu, 16 Feb 2023 17:55:37 +0900
Subject: Fix typos in selftest/bpf files

Run spell checker on files in selftest/bpf and fixed typos.

Signed-off-by: Taichi Nishimura <awkrail01@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/bpf/20230216085537.519062-1-awkrail01@gmail.com
---
 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c    | 2 +-
 tools/testing/selftests/bpf/prog_tests/trampoline_count.c     | 2 +-
 tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c | 2 +-
 tools/testing/selftests/bpf/progs/dynptr_fail.c               | 2 +-
 tools/testing/selftests/bpf/progs/strobemeta.h                | 2 +-
 tools/testing/selftests/bpf/progs/test_cls_redirect.c         | 6 +++---
 tools/testing/selftests/bpf/progs/test_subprogs.c             | 2 +-
 tools/testing/selftests/bpf/progs/test_xdp_vlan.c             | 2 +-
 tools/testing/selftests/bpf/test_cpp.cpp                      | 2 +-
 tools/testing/selftests/bpf/veristat.c                        | 4 ++--
 10 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
index eb2feaac81fe..653b0a20fab9 100644
--- a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
+++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
@@ -488,7 +488,7 @@ static void run_test(struct migrate_reuseport_test_case *test_case,
 			goto close_servers;
 	}
 
-	/* Tie requests to the first four listners */
+	/* Tie requests to the first four listeners */
 	err = start_clients(test_case);
 	if (!ASSERT_OK(err, "start_clients"))
 		goto close_clients;
diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
index 8fd4c0d78089..e91d0d1769f1 100644
--- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
+++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
@@ -79,7 +79,7 @@ void serial_test_trampoline_count(void)
 	if (!ASSERT_EQ(link, NULL, "ptr_is_null"))
 		goto cleanup;
 
-	/* and finaly execute the probe */
+	/* and finally execute the probe */
 	prog_fd = bpf_program__fd(prog);
 	if (!ASSERT_GE(prog_fd, 0, "bpf_program__fd"))
 		goto cleanup;
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
index 26fffb02ed10..ad21ee8c7e23 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
@@ -84,7 +84,7 @@ typedef void (*printf_fn_t)(const char *, ...);
  *	typedef int (*fn_t)(int);
  *	typedef char * const * (*fn_ptr2_t)(s_t, fn_t);
  *
- * - `fn_complext_t`: pointer to a function returning struct and accepting
+ * - `fn_complex_t`: pointer to a function returning struct and accepting
  *   union and struct. All structs and enum are anonymous and defined inline.
  *
  * - `signal_t: pointer to a function accepting a pointer to a function as an
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index 5950ad6ec2e6..aa5b69354b91 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -630,7 +630,7 @@ static int release_twice_callback_fn(__u32 index, void *data)
 }
 
 /* Test that releasing a dynptr twice, where one of the releases happens
- * within a calback function, fails
+ * within a callback function, fails
  */
 SEC("?raw_tp")
 __failure __msg("arg 1 is an unacquired reference")
diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h
index 753718595c26..e562be6356f3 100644
--- a/tools/testing/selftests/bpf/progs/strobemeta.h
+++ b/tools/testing/selftests/bpf/progs/strobemeta.h
@@ -135,7 +135,7 @@ struct strobe_value_loc {
 	 * tpidr_el0 for aarch64).
 	 * TLS_IMM_EXEC: absolute address of GOT entry containing offset
 	 * from thread pointer;
-	 * TLS_GENERAL_DYN: absolute addres of double GOT entry
+	 * TLS_GENERAL_DYN: absolute address of double GOT entry
 	 * containing tls_index_t struct;
 	 */
 	int64_t offset;
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
index 2833ad722cb7..66b304982245 100644
--- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
@@ -600,7 +600,7 @@ static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
 		return TC_ACT_SHOT;
 	}
 
-	/* Skip the remainig next hops (may be zero). */
+	/* Skip the remaining next hops (may be zero). */
 	return skip_next_hops(pkt, encap->unigue.hop_count -
 					   encap->unigue.next_hop - 1);
 }
@@ -610,8 +610,8 @@ static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
  *
  *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
  *
- * clang will substitue a costant for sizeof, which allows the verifier
- * to track it's value. Based on this, it can figure out the constant
+ * clang will substitute a constant for sizeof, which allows the verifier
+ * to track its value. Based on this, it can figure out the constant
  * return value, and calling code works while still being "generic" to
  * IPv4 and IPv6.
  */
diff --git a/tools/testing/selftests/bpf/progs/test_subprogs.c b/tools/testing/selftests/bpf/progs/test_subprogs.c
index f8e9256cf18d..a8d602d7c88a 100644
--- a/tools/testing/selftests/bpf/progs/test_subprogs.c
+++ b/tools/testing/selftests/bpf/progs/test_subprogs.c
@@ -47,7 +47,7 @@ static __noinline int sub5(int v)
 	return sub1(v) - 1; /* compensates sub1()'s + 1 */
 }
 
-/* unfortunately verifier rejects `struct task_struct *t` as an unkown pointer
+/* unfortunately verifier rejects `struct task_struct *t` as an unknown pointer
  * type, so we need to accept pointer as integer and then cast it inside the
  * function
  */
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
index cdf3c48d6cbb..4ddcb6dfe500 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
@@ -98,7 +98,7 @@ bool parse_eth_frame(struct ethhdr *eth, void *data_end, struct parse_pkt *pkt)
 	return true;
 }
 
-/* Hint, VLANs are choosen to hit network-byte-order issues */
+/* Hint, VLANs are chosen to hit network-byte-order issues */
 #define TESTVLAN 4011 /* 0xFAB */
 // #define TO_VLAN  4000 /* 0xFA0 (hint 0xOA0 = 160) */
 
diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp
index 0bd9990e83fa..f4936834f76f 100644
--- a/tools/testing/selftests/bpf/test_cpp.cpp
+++ b/tools/testing/selftests/bpf/test_cpp.cpp
@@ -91,7 +91,7 @@ static void try_skeleton_template()
 
 	skel.detach();
 
-	/* destructor will destory underlying skeleton */
+	/* destructor will destroy underlying skeleton */
 }
 
 int main(int argc, char *argv[])
diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index f961b49b8ef4..83231456d3c5 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -144,7 +144,7 @@ static struct env {
 	struct verif_stats *prog_stats;
 	int prog_stat_cnt;
 
-	/* baseline_stats is allocated and used only in comparsion mode */
+	/* baseline_stats is allocated and used only in comparison mode */
 	struct verif_stats *baseline_stats;
 	int baseline_stat_cnt;
 
@@ -882,7 +882,7 @@ static int process_obj(const char *filename)
 		 * that BPF object file is incomplete and has to be statically
 		 * linked into a final BPF object file; instead of bailing
 		 * out, report it into stderr, mark it as skipped, and
-		 * proceeed
+		 * proceed
 		 */
 		fprintf(stderr, "Failed to open '%s': %d\n", filename, -errno);
 		env.files_skipped++;
-- 
cgit 


From 95ebb376176c52382293e05e63f142114a5e40ef Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 15 Feb 2023 20:59:53 -0800
Subject: selftests/bpf: Convert test_global_funcs test to test_loader
 framework

Convert 17 test_global_funcs subtests into test_loader framework for
easier maintenance and more declarative way to define expected
failures/successes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/20230216045954.3002473-3-andrii@kernel.org
---
 .../selftests/bpf/prog_tests/test_global_funcs.c   | 131 ++++++---------------
 .../selftests/bpf/progs/test_global_func1.c        |   6 +-
 .../selftests/bpf/progs/test_global_func10.c       |   4 +-
 .../selftests/bpf/progs/test_global_func11.c       |   4 +-
 .../selftests/bpf/progs/test_global_func12.c       |   4 +-
 .../selftests/bpf/progs/test_global_func13.c       |   4 +-
 .../selftests/bpf/progs/test_global_func14.c       |   4 +-
 .../selftests/bpf/progs/test_global_func15.c       |   4 +-
 .../selftests/bpf/progs/test_global_func16.c       |   4 +-
 .../selftests/bpf/progs/test_global_func17.c       |   4 +-
 .../selftests/bpf/progs/test_global_func2.c        |  43 ++++++-
 .../selftests/bpf/progs/test_global_func3.c        |  10 +-
 .../selftests/bpf/progs/test_global_func4.c        |  55 ++++++++-
 .../selftests/bpf/progs/test_global_func5.c        |   4 +-
 .../selftests/bpf/progs/test_global_func6.c        |   4 +-
 .../selftests/bpf/progs/test_global_func7.c        |   4 +-
 .../selftests/bpf/progs/test_global_func8.c        |   4 +-
 .../selftests/bpf/progs/test_global_func9.c        |   4 +-
 18 files changed, 174 insertions(+), 123 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
index 7295cc60f724..2ff4d5c7abfc 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
@@ -1,104 +1,41 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
 #include <test_progs.h>
-
-const char *err_str;
-bool found;
-
-static int libbpf_debug_print(enum libbpf_print_level level,
-			      const char *format, va_list args)
-{
-	char *log_buf;
-
-	if (level != LIBBPF_WARN ||
-	    strcmp(format, "libbpf: \n%s\n")) {
-		vprintf(format, args);
-		return 0;
-	}
-
-	log_buf = va_arg(args, char *);
-	if (!log_buf)
-		goto out;
-	if (err_str && strstr(log_buf, err_str) == 0)
-		found = true;
-out:
-	printf(format, log_buf);
-	return 0;
-}
-
-extern int extra_prog_load_log_flags;
-
-static int check_load(const char *file)
-{
-	struct bpf_object *obj = NULL;
-	struct bpf_program *prog;
-	int err;
-
-	found = false;
-
-	obj = bpf_object__open_file(file, NULL);
-	err = libbpf_get_error(obj);
-	if (err)
-		return err;
-
-	prog = bpf_object__next_program(obj, NULL);
-	if (!prog) {
-		err = -ENOENT;
-		goto err_out;
-	}
-
-	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32);
-	bpf_program__set_log_level(prog, extra_prog_load_log_flags);
-
-	err = bpf_object__load(obj);
-
-err_out:
-	bpf_object__close(obj);
-	return err;
-}
-
-struct test_def {
-	const char *file;
-	const char *err_str;
-};
+#include "test_global_func1.skel.h"
+#include "test_global_func2.skel.h"
+#include "test_global_func3.skel.h"
+#include "test_global_func4.skel.h"
+#include "test_global_func5.skel.h"
+#include "test_global_func6.skel.h"
+#include "test_global_func7.skel.h"
+#include "test_global_func8.skel.h"
+#include "test_global_func9.skel.h"
+#include "test_global_func10.skel.h"
+#include "test_global_func11.skel.h"
+#include "test_global_func12.skel.h"
+#include "test_global_func13.skel.h"
+#include "test_global_func14.skel.h"
+#include "test_global_func15.skel.h"
+#include "test_global_func16.skel.h"
+#include "test_global_func17.skel.h"
 
 void test_test_global_funcs(void)
 {
-	struct test_def tests[] = {
-		{ "test_global_func1.bpf.o", "combined stack size of 4 calls is 544" },
-		{ "test_global_func2.bpf.o" },
-		{ "test_global_func3.bpf.o", "the call stack of 8 frames" },
-		{ "test_global_func4.bpf.o" },
-		{ "test_global_func5.bpf.o", "expected pointer to ctx, but got PTR" },
-		{ "test_global_func6.bpf.o", "modified ctx ptr R2" },
-		{ "test_global_func7.bpf.o", "foo() doesn't return scalar" },
-		{ "test_global_func8.bpf.o" },
-		{ "test_global_func9.bpf.o" },
-		{ "test_global_func10.bpf.o", "invalid indirect read from stack" },
-		{ "test_global_func11.bpf.o", "Caller passes invalid args into func#1" },
-		{ "test_global_func12.bpf.o", "invalid mem access 'mem_or_null'" },
-		{ "test_global_func13.bpf.o", "Caller passes invalid args into func#1" },
-		{ "test_global_func14.bpf.o", "reference type('FWD S') size cannot be determined" },
-		{ "test_global_func15.bpf.o", "At program exit the register R0 has value" },
-		{ "test_global_func16.bpf.o", "invalid indirect read from stack" },
-		{ "test_global_func17.bpf.o", "Caller passes invalid args into func#1" },
-	};
-	libbpf_print_fn_t old_print_fn = NULL;
-	int err, i, duration = 0;
-
-	old_print_fn = libbpf_set_print(libbpf_debug_print);
-
-	for (i = 0; i < ARRAY_SIZE(tests); i++) {
-		const struct test_def *test = &tests[i];
-
-		if (!test__start_subtest(test->file))
-			continue;
-
-		err_str = test->err_str;
-		err = check_load(test->file);
-		CHECK_FAIL(!!err ^ !!err_str);
-		if (err_str)
-			CHECK(found, "", "expected string '%s'", err_str);
-	}
-	libbpf_set_print(old_print_fn);
+	RUN_TESTS(test_global_func1);
+	RUN_TESTS(test_global_func2);
+	RUN_TESTS(test_global_func3);
+	RUN_TESTS(test_global_func4);
+	RUN_TESTS(test_global_func5);
+	RUN_TESTS(test_global_func6);
+	RUN_TESTS(test_global_func7);
+	RUN_TESTS(test_global_func8);
+	RUN_TESTS(test_global_func9);
+	RUN_TESTS(test_global_func10);
+	RUN_TESTS(test_global_func11);
+	RUN_TESTS(test_global_func12);
+	RUN_TESTS(test_global_func13);
+	RUN_TESTS(test_global_func14);
+	RUN_TESTS(test_global_func15);
+	RUN_TESTS(test_global_func16);
+	RUN_TESTS(test_global_func17);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_global_func1.c b/tools/testing/selftests/bpf/progs/test_global_func1.c
index 7b42dad187b8..23970a20b324 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func1.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func1.c
@@ -3,10 +3,9 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
-#ifndef MAX_STACK
 #define MAX_STACK (512 - 3 * 32 + 8)
-#endif
 
 static __attribute__ ((noinline))
 int f0(int var, struct __sk_buff *skb)
@@ -39,7 +38,8 @@ int f3(int val, struct __sk_buff *skb, int var)
 }
 
 SEC("tc")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("combined stack size of 4 calls is 544")
+int global_func1(struct __sk_buff *skb)
 {
 	return f0(1, skb) + f1(skb) + f2(2, skb) + f3(3, skb, 4);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c
index 97b7031d0e22..98327bdbbfd2 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func10.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func10.c
@@ -2,6 +2,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 struct Small {
 	int x;
@@ -21,7 +22,8 @@ __noinline int foo(const struct Big *big)
 }
 
 SEC("cgroup_skb/ingress")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("invalid indirect read from stack")
+int global_func10(struct __sk_buff *skb)
 {
 	const struct Small small = {.x = skb->len };
 
diff --git a/tools/testing/selftests/bpf/progs/test_global_func11.c b/tools/testing/selftests/bpf/progs/test_global_func11.c
index ef5277d982d9..283e036dc401 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func11.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func11.c
@@ -2,6 +2,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 struct S {
 	int x;
@@ -13,7 +14,8 @@ __noinline int foo(const struct S *s)
 }
 
 SEC("cgroup_skb/ingress")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("Caller passes invalid args into func#1")
+int global_func11(struct __sk_buff *skb)
 {
 	return foo((const void *)skb);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_global_func12.c b/tools/testing/selftests/bpf/progs/test_global_func12.c
index 62343527cc59..7f159d83c6f6 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func12.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func12.c
@@ -2,6 +2,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 struct S {
 	int x;
@@ -13,7 +14,8 @@ __noinline int foo(const struct S *s)
 }
 
 SEC("cgroup_skb/ingress")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("invalid mem access 'mem_or_null'")
+int global_func12(struct __sk_buff *skb)
 {
 	const struct S s = {.x = skb->len };
 
diff --git a/tools/testing/selftests/bpf/progs/test_global_func13.c b/tools/testing/selftests/bpf/progs/test_global_func13.c
index ff8897c1ac22..02ea80da75b5 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func13.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func13.c
@@ -2,6 +2,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 struct S {
 	int x;
@@ -16,7 +17,8 @@ __noinline int foo(const struct S *s)
 }
 
 SEC("cgroup_skb/ingress")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("Caller passes invalid args into func#1")
+int global_func13(struct __sk_buff *skb)
 {
 	const struct S *s = (const struct S *)(0xbedabeda);
 
diff --git a/tools/testing/selftests/bpf/progs/test_global_func14.c b/tools/testing/selftests/bpf/progs/test_global_func14.c
index 698c77199ebf..33b7d5efd7b2 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func14.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func14.c
@@ -2,6 +2,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 struct S;
 
@@ -14,7 +15,8 @@ __noinline int foo(const struct S *s)
 }
 
 SEC("cgroup_skb/ingress")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("reference type('FWD S') size cannot be determined")
+int global_func14(struct __sk_buff *skb)
 {
 
 	return foo(NULL);
diff --git a/tools/testing/selftests/bpf/progs/test_global_func15.c b/tools/testing/selftests/bpf/progs/test_global_func15.c
index c19c435988d5..b512d6a6c75e 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func15.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func15.c
@@ -2,6 +2,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 __noinline int foo(unsigned int *v)
 {
@@ -12,7 +13,8 @@ __noinline int foo(unsigned int *v)
 }
 
 SEC("cgroup_skb/ingress")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("At program exit the register R0 has value")
+int global_func15(struct __sk_buff *skb)
 {
 	unsigned int v = 1;
 
diff --git a/tools/testing/selftests/bpf/progs/test_global_func16.c b/tools/testing/selftests/bpf/progs/test_global_func16.c
index 0312d1e8d8c0..e7206304632e 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func16.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func16.c
@@ -2,6 +2,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 __noinline int foo(int (*arr)[10])
 {
@@ -12,7 +13,8 @@ __noinline int foo(int (*arr)[10])
 }
 
 SEC("cgroup_skb/ingress")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("invalid indirect read from stack")
+int global_func16(struct __sk_buff *skb)
 {
 	int array[10];
 
diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c
index 2b8b9b8ba018..a32e11c7d933 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func17.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func17.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 __noinline int foo(int *p)
 {
@@ -10,7 +11,8 @@ __noinline int foo(int *p)
 const volatile int i;
 
 SEC("tc")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("Caller passes invalid args into func#1")
+int global_func17(struct __sk_buff *skb)
 {
 	return foo((int *)&i);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_global_func2.c b/tools/testing/selftests/bpf/progs/test_global_func2.c
index 2c18d82923a2..3dce97fb52a4 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func2.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func2.c
@@ -1,4 +1,45 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2020 Facebook */
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
 #define MAX_STACK (512 - 3 * 32)
-#include "test_global_func1.c"
+
+static __attribute__ ((noinline))
+int f0(int var, struct __sk_buff *skb)
+{
+	return skb->len;
+}
+
+__attribute__ ((noinline))
+int f1(struct __sk_buff *skb)
+{
+	volatile char buf[MAX_STACK] = {};
+
+	return f0(0, skb) + skb->len;
+}
+
+int f3(int, struct __sk_buff *skb, int);
+
+__attribute__ ((noinline))
+int f2(int val, struct __sk_buff *skb)
+{
+	return f1(skb) + f3(val, skb, 1);
+}
+
+__attribute__ ((noinline))
+int f3(int val, struct __sk_buff *skb, int var)
+{
+	volatile char buf[MAX_STACK] = {};
+
+	return skb->ifindex * val * var;
+}
+
+SEC("tc")
+__success
+int global_func2(struct __sk_buff *skb)
+{
+	return f0(1, skb) + f1(skb) + f2(2, skb) + f3(3, skb, 4);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_global_func3.c b/tools/testing/selftests/bpf/progs/test_global_func3.c
index 01bf8275dfd6..142b682d3c2f 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func3.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func3.c
@@ -3,6 +3,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 __attribute__ ((noinline))
 int f1(struct __sk_buff *skb)
@@ -46,20 +47,15 @@ int f7(struct __sk_buff *skb)
 	return f6(skb);
 }
 
-#ifndef NO_FN8
 __attribute__ ((noinline))
 int f8(struct __sk_buff *skb)
 {
 	return f7(skb);
 }
-#endif
 
 SEC("tc")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("the call stack of 8 frames")
+int global_func3(struct __sk_buff *skb)
 {
-#ifndef NO_FN8
 	return f8(skb);
-#else
-	return f7(skb);
-#endif
 }
diff --git a/tools/testing/selftests/bpf/progs/test_global_func4.c b/tools/testing/selftests/bpf/progs/test_global_func4.c
index 610f75edf276..1733d87ad3f3 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func4.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func4.c
@@ -1,4 +1,55 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2020 Facebook */
-#define NO_FN8
-#include "test_global_func3.c"
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+__attribute__ ((noinline))
+int f1(struct __sk_buff *skb)
+{
+	return skb->len;
+}
+
+__attribute__ ((noinline))
+int f2(int val, struct __sk_buff *skb)
+{
+	return f1(skb) + val;
+}
+
+__attribute__ ((noinline))
+int f3(int val, struct __sk_buff *skb, int var)
+{
+	return f2(var, skb) + val;
+}
+
+__attribute__ ((noinline))
+int f4(struct __sk_buff *skb)
+{
+	return f3(1, skb, 2);
+}
+
+__attribute__ ((noinline))
+int f5(struct __sk_buff *skb)
+{
+	return f4(skb);
+}
+
+__attribute__ ((noinline))
+int f6(struct __sk_buff *skb)
+{
+	return f5(skb);
+}
+
+__attribute__ ((noinline))
+int f7(struct __sk_buff *skb)
+{
+	return f6(skb);
+}
+
+SEC("tc")
+__success
+int global_func4(struct __sk_buff *skb)
+{
+	return f7(skb);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_global_func5.c b/tools/testing/selftests/bpf/progs/test_global_func5.c
index 9248d03e0d06..cc55aedaf82d 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func5.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func5.c
@@ -3,6 +3,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 __attribute__ ((noinline))
 int f1(struct __sk_buff *skb)
@@ -25,7 +26,8 @@ int f3(int val, struct __sk_buff *skb)
 }
 
 SEC("tc")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("expected pointer to ctx, but got PTR")
+int global_func5(struct __sk_buff *skb)
 {
 	return f1(skb) + f2(2, skb) + f3(3, skb);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_global_func6.c b/tools/testing/selftests/bpf/progs/test_global_func6.c
index af8c78bdfb25..46c38c8f2cf0 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func6.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func6.c
@@ -3,6 +3,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 __attribute__ ((noinline))
 int f1(struct __sk_buff *skb)
@@ -25,7 +26,8 @@ int f3(int val, struct __sk_buff *skb)
 }
 
 SEC("tc")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("modified ctx ptr R2")
+int global_func6(struct __sk_buff *skb)
 {
 	return f1(skb) + f2(2, skb) + f3(3, skb);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_global_func7.c b/tools/testing/selftests/bpf/progs/test_global_func7.c
index 6cb8e2f5254c..f182febfde3c 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func7.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func7.c
@@ -3,6 +3,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 __attribute__ ((noinline))
 void foo(struct __sk_buff *skb)
@@ -11,7 +12,8 @@ void foo(struct __sk_buff *skb)
 }
 
 SEC("tc")
-int test_cls(struct __sk_buff *skb)
+__failure __msg("foo() doesn't return scalar")
+int global_func7(struct __sk_buff *skb)
 {
 	foo(skb);
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/test_global_func8.c b/tools/testing/selftests/bpf/progs/test_global_func8.c
index d55a6544b1ab..9b9c57fa2dd3 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func8.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func8.c
@@ -3,6 +3,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 __noinline int foo(struct __sk_buff *skb)
 {
@@ -10,7 +11,8 @@ __noinline int foo(struct __sk_buff *skb)
 }
 
 SEC("cgroup_skb/ingress")
-int test_cls(struct __sk_buff *skb)
+__success
+int global_func8(struct __sk_buff *skb)
 {
 	if (!foo(skb))
 		return 0;
diff --git a/tools/testing/selftests/bpf/progs/test_global_func9.c b/tools/testing/selftests/bpf/progs/test_global_func9.c
index bd233ddede98..1f2cb0159b8d 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func9.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func9.c
@@ -2,6 +2,7 @@
 #include <stddef.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 struct S {
 	int x;
@@ -74,7 +75,8 @@ __noinline int quuz(int **p)
 }
 
 SEC("cgroup_skb/ingress")
-int test_cls(struct __sk_buff *skb)
+__success
+int global_func9(struct __sk_buff *skb)
 {
 	int result = 0;
 
-- 
cgit 


From e2b5cfc978f871996d1f8667515c0e06b33e620e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 15 Feb 2023 20:59:54 -0800
Subject: selftests/bpf: Add global subprog context passing tests

Add tests validating that it's possible to pass context arguments into
global subprogs for various types of programs, including a particularly
tricky KPROBE programs (which cover kprobes, uprobes, USDTs, a vast and
important class of programs).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/20230216045954.3002473-4-andrii@kernel.org
---
 .../selftests/bpf/prog_tests/test_global_funcs.c   |   2 +
 .../bpf/progs/test_global_func_ctx_args.c          | 104 +++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
index 2ff4d5c7abfc..e0879df38639 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
@@ -18,6 +18,7 @@
 #include "test_global_func15.skel.h"
 #include "test_global_func16.skel.h"
 #include "test_global_func17.skel.h"
+#include "test_global_func_ctx_args.skel.h"
 
 void test_test_global_funcs(void)
 {
@@ -38,4 +39,5 @@ void test_test_global_funcs(void)
 	RUN_TESTS(test_global_func15);
 	RUN_TESTS(test_global_func16);
 	RUN_TESTS(test_global_func17);
+	RUN_TESTS(test_global_func_ctx_args);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c b/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c
new file mode 100644
index 000000000000..7faa8eef0598
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_func_ctx_args.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+static long stack[256];
+
+/*
+ * KPROBE contexts
+ */
+
+__weak int kprobe_typedef_ctx_subprog(bpf_user_pt_regs_t *ctx)
+{
+	return bpf_get_stack(ctx, &stack, sizeof(stack), 0);
+}
+
+SEC("?kprobe")
+__success
+int kprobe_typedef_ctx(void *ctx)
+{
+	return kprobe_typedef_ctx_subprog(ctx);
+}
+
+#define pt_regs_struct_t typeof(*(__PT_REGS_CAST((struct pt_regs *)NULL)))
+
+__weak int kprobe_struct_ctx_subprog(pt_regs_struct_t *ctx)
+{
+	return bpf_get_stack((void *)ctx, &stack, sizeof(stack), 0);
+}
+
+SEC("?kprobe")
+__success
+int kprobe_resolved_ctx(void *ctx)
+{
+	return kprobe_struct_ctx_subprog(ctx);
+}
+
+/* this is current hack to make this work on old kernels */
+struct bpf_user_pt_regs_t {};
+
+__weak int kprobe_workaround_ctx_subprog(struct bpf_user_pt_regs_t *ctx)
+{
+	return bpf_get_stack(ctx, &stack, sizeof(stack), 0);
+}
+
+SEC("?kprobe")
+__success
+int kprobe_workaround_ctx(void *ctx)
+{
+	return kprobe_workaround_ctx_subprog(ctx);
+}
+
+/*
+ * RAW_TRACEPOINT contexts
+ */
+
+__weak int raw_tp_ctx_subprog(struct bpf_raw_tracepoint_args *ctx)
+{
+	return bpf_get_stack(ctx, &stack, sizeof(stack), 0);
+}
+
+SEC("?raw_tp")
+__success
+int raw_tp_ctx(void *ctx)
+{
+	return raw_tp_ctx_subprog(ctx);
+}
+
+/*
+ * RAW_TRACEPOINT_WRITABLE contexts
+ */
+
+__weak int raw_tp_writable_ctx_subprog(struct bpf_raw_tracepoint_args *ctx)
+{
+	return bpf_get_stack(ctx, &stack, sizeof(stack), 0);
+}
+
+SEC("?raw_tp")
+__success
+int raw_tp_writable_ctx(void *ctx)
+{
+	return raw_tp_writable_ctx_subprog(ctx);
+}
+
+/*
+ * PERF_EVENT contexts
+ */
+
+__weak int perf_event_ctx_subprog(struct bpf_perf_event_data *ctx)
+{
+	return bpf_get_stack(ctx, &stack, sizeof(stack), 0);
+}
+
+SEC("?perf_event")
+__success
+int perf_event_ctx(void *ctx)
+{
+	return perf_event_ctx_subprog(ctx);
+}
-- 
cgit 


From 181127fb76e62d06ab17a75fd610129688612343 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Fri, 17 Feb 2023 12:13:09 -0800
Subject: Revert "bpf, test_run: fix &xdp_frame misplacement for LIVE_FRAMES"

This reverts commit 6c20822fada1b8adb77fa450d03a0d449686a4a9.

build bot failed on arch with different cache line size:
https://lore.kernel.org/bpf/50c35055-afa9-d01e-9a05-ea5351280e4f@intel.com/

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 net/bpf/test_run.c                                 | 29 +++++-----------------
 .../selftests/bpf/prog_tests/xdp_do_redirect.c     |  7 +++---
 2 files changed, 9 insertions(+), 27 deletions(-)

(limited to 'tools')

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 982e81bba6cf..6f3d654b3339 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -97,11 +97,8 @@ reset:
 struct xdp_page_head {
 	struct xdp_buff orig_ctx;
 	struct xdp_buff ctx;
-	union {
-		/* ::data_hard_start starts here */
-		DECLARE_FLEX_ARRAY(struct xdp_frame, frame);
-		DECLARE_FLEX_ARRAY(u8, data);
-	};
+	struct xdp_frame frm;
+	u8 data[];
 };
 
 struct xdp_test_data {
@@ -119,20 +116,6 @@ struct xdp_test_data {
 #define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head))
 #define TEST_XDP_MAX_BATCH 256
 
-#if BITS_PER_LONG == 64 && PAGE_SIZE == SZ_4K
-/* tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c:%MAX_PKT_SIZE
- * must be updated accordingly when any of these changes, otherwise BPF
- * selftests will fail.
- */
-#ifdef __s390x__
-#define TEST_MAX_PKT_SIZE 3216
-#else
-#define TEST_MAX_PKT_SIZE 3408
-#endif
-static_assert(SKB_WITH_OVERHEAD(TEST_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM) ==
-	      TEST_MAX_PKT_SIZE);
-#endif
-
 static void xdp_test_run_init_page(struct page *page, void *arg)
 {
 	struct xdp_page_head *head = phys_to_virt(page_to_phys(page));
@@ -149,8 +132,8 @@ static void xdp_test_run_init_page(struct page *page, void *arg)
 	headroom -= meta_len;
 
 	new_ctx = &head->ctx;
-	frm = head->frame;
-	data = head->data;
+	frm = &head->frm;
+	data = &head->data;
 	memcpy(data + headroom, orig_ctx->data_meta, frm_len);
 
 	xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq);
@@ -240,7 +223,7 @@ static void reset_ctx(struct xdp_page_head *head)
 	head->ctx.data = head->orig_ctx.data;
 	head->ctx.data_meta = head->orig_ctx.data_meta;
 	head->ctx.data_end = head->orig_ctx.data_end;
-	xdp_update_frame_from_buff(&head->ctx, head->frame);
+	xdp_update_frame_from_buff(&head->ctx, &head->frm);
 }
 
 static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
@@ -302,7 +285,7 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
 		head = phys_to_virt(page_to_phys(page));
 		reset_ctx(head);
 		ctx = &head->ctx;
-		frm = head->frame;
+		frm = &head->frm;
 		xdp->frame_cnt++;
 
 		act = bpf_prog_run_xdp(prog, ctx);
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
index 7271a18ab3e2..2666c84dbd01 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
@@ -65,13 +65,12 @@ static int attach_tc_prog(struct bpf_tc_hook *hook, int fd)
 }
 
 /* The maximum permissible size is: PAGE_SIZE - sizeof(struct xdp_page_head) -
- * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) - XDP_PACKET_HEADROOM =
- * 3408 bytes for 64-byte cacheline and 3216 for 256-byte one.
+ * sizeof(struct skb_shared_info) - XDP_PACKET_HEADROOM = 3368 bytes
  */
 #if defined(__s390x__)
-#define MAX_PKT_SIZE 3216
+#define MAX_PKT_SIZE 3176
 #else
-#define MAX_PKT_SIZE 3408
+#define MAX_PKT_SIZE 3368
 #endif
 static void test_max_pkt_size(int fd)
 {
-- 
cgit 


From 31de4105f00d64570139bc5494a201b0bd57349f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Fri, 17 Feb 2023 12:55:14 -0800
Subject: bpf: Add BPF_FIB_LOOKUP_SKIP_NEIGH for bpf_fib_lookup

The bpf_fib_lookup() also looks up the neigh table.
This was done before bpf_redirect_neigh() was added.

In the use case that does not manage the neigh table
and requires bpf_fib_lookup() to lookup a fib to
decide if it needs to redirect or not, the bpf prog can
depend only on using bpf_redirect_neigh() to lookup the
neigh. It also keeps the neigh entries fresh and connected.

This patch adds a bpf_fib_lookup flag, SKIP_NEIGH, to avoid
the double neigh lookup when the bpf prog always call
bpf_redirect_neigh() to do the neigh lookup. The params->smac
output is skipped together when SKIP_NEIGH is set because
bpf_redirect_neigh() will figure out the smac also.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230217205515.3583372-1-martin.lau@linux.dev
---
 include/uapi/linux/bpf.h       |  6 ++++++
 net/core/filter.c              | 39 ++++++++++++++++++++++++++-------------
 tools/include/uapi/linux/bpf.h |  6 ++++++
 3 files changed, 38 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1503f61336b6..62ce1f5d1b1d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3134,6 +3134,11 @@ union bpf_attr {
  *		**BPF_FIB_LOOKUP_OUTPUT**
  *			Perform lookup from an egress perspective (default is
  *			ingress).
+ *		**BPF_FIB_LOOKUP_SKIP_NEIGH**
+ *			Skip the neighbour table lookup. *params*->dmac
+ *			and *params*->smac will not be set as output. A common
+ *			use case is to call **bpf_redirect_neigh**\ () after
+ *			doing **bpf_fib_lookup**\ ().
  *
  *		*ctx* is either **struct xdp_md** for XDP programs or
  *		**struct sk_buff** tc cls_act programs.
@@ -6750,6 +6755,7 @@ struct bpf_raw_tracepoint_args {
 enum {
 	BPF_FIB_LOOKUP_DIRECT  = (1U << 0),
 	BPF_FIB_LOOKUP_OUTPUT  = (1U << 1),
+	BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
 };
 
 enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index 8daaaf76ab15..1d6f165923bf 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5722,12 +5722,8 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 #endif
 
 #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
-static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
-				  const struct neighbour *neigh,
-				  const struct net_device *dev, u32 mtu)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
 {
-	memcpy(params->dmac, neigh->ha, ETH_ALEN);
-	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
 	params->h_vlan_TCI = 0;
 	params->h_vlan_proto = 0;
 	if (mtu)
@@ -5838,21 +5834,29 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (likely(nhc->nhc_gw_family != AF_INET6)) {
 		if (nhc->nhc_gw_family)
 			params->ipv4_dst = nhc->nhc_gw.ipv4;
-
-		neigh = __ipv4_neigh_lookup_noref(dev,
-						 (__force u32)params->ipv4_dst);
 	} else {
 		struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;
 
 		params->family = AF_INET6;
 		*dst = nhc->nhc_gw.ipv6;
-		neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
 	}
 
+	if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
+		goto set_fwd_params;
+
+	if (likely(nhc->nhc_gw_family != AF_INET6))
+		neigh = __ipv4_neigh_lookup_noref(dev,
+						  (__force u32)params->ipv4_dst);
+	else
+		neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);
+
 	if (!neigh || !(neigh->nud_state & NUD_VALID))
 		return BPF_FIB_LKUP_RET_NO_NEIGH;
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
 
-	return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
+set_fwd_params:
+	return bpf_fib_set_fwd_params(params, mtu);
 }
 #endif
 
@@ -5960,24 +5964,33 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	params->rt_metric = res.f6i->fib6_metric;
 	params->ifindex = dev->ifindex;
 
+	if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
+		goto set_fwd_params;
+
 	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
 	 * not needed here.
 	 */
 	neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
 	if (!neigh || !(neigh->nud_state & NUD_VALID))
 		return BPF_FIB_LKUP_RET_NO_NEIGH;
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
 
-	return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
+set_fwd_params:
+	return bpf_fib_set_fwd_params(params, mtu);
 }
 #endif
 
+#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
+			     BPF_FIB_LOOKUP_SKIP_NEIGH)
+
 BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
 	if (plen < sizeof(*params))
 		return -EINVAL;
 
-	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+	if (flags & ~BPF_FIB_LOOKUP_MASK)
 		return -EINVAL;
 
 	switch (params->family) {
@@ -6015,7 +6028,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	if (plen < sizeof(*params))
 		return -EINVAL;
 
-	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+	if (flags & ~BPF_FIB_LOOKUP_MASK)
 		return -EINVAL;
 
 	if (params->tot_len)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1503f61336b6..62ce1f5d1b1d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3134,6 +3134,11 @@ union bpf_attr {
  *		**BPF_FIB_LOOKUP_OUTPUT**
  *			Perform lookup from an egress perspective (default is
  *			ingress).
+ *		**BPF_FIB_LOOKUP_SKIP_NEIGH**
+ *			Skip the neighbour table lookup. *params*->dmac
+ *			and *params*->smac will not be set as output. A common
+ *			use case is to call **bpf_redirect_neigh**\ () after
+ *			doing **bpf_fib_lookup**\ ().
  *
  *		*ctx* is either **struct xdp_md** for XDP programs or
  *		**struct sk_buff** tc cls_act programs.
@@ -6750,6 +6755,7 @@ struct bpf_raw_tracepoint_args {
 enum {
 	BPF_FIB_LOOKUP_DIRECT  = (1U << 0),
 	BPF_FIB_LOOKUP_OUTPUT  = (1U << 1),
+	BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
 };
 
 enum {
-- 
cgit 


From 168de0233586fb06c5c5c56304aa9a928a09b0ba Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Fri, 17 Feb 2023 12:55:15 -0800
Subject: selftests/bpf: Add bpf_fib_lookup test

This patch tests the bpf_fib_lookup helper when looking up
a neigh in NUD_FAILED and NUD_STALE state. It also adds test
for the new BPF_FIB_LOOKUP_SKIP_NEIGH flag.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230217205515.3583372-2-martin.lau@linux.dev
---
 .../testing/selftests/bpf/prog_tests/fib_lookup.c  | 187 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/fib_lookup.c     |  22 +++
 2 files changed, 209 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/fib_lookup.c
 create mode 100644 tools/testing/selftests/bpf/progs/fib_lookup.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c
new file mode 100644
index 000000000000..61ccddccf485
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <sys/types.h>
+#include <net/if.h>
+
+#include "test_progs.h"
+#include "network_helpers.h"
+#include "fib_lookup.skel.h"
+
+#define SYS(fmt, ...)						\
+	({							\
+		char cmd[1024];					\
+		snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__);	\
+		if (!ASSERT_OK(system(cmd), cmd))		\
+			goto fail;				\
+	})
+
+#define NS_TEST			"fib_lookup_ns"
+#define IPV6_IFACE_ADDR		"face::face"
+#define IPV6_NUD_FAILED_ADDR	"face::1"
+#define IPV6_NUD_STALE_ADDR	"face::2"
+#define IPV4_IFACE_ADDR		"10.0.0.254"
+#define IPV4_NUD_FAILED_ADDR	"10.0.0.1"
+#define IPV4_NUD_STALE_ADDR	"10.0.0.2"
+#define DMAC			"11:11:11:11:11:11"
+#define DMAC_INIT { 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, }
+
+struct fib_lookup_test {
+	const char *desc;
+	const char *daddr;
+	int expected_ret;
+	int lookup_flags;
+	__u8 dmac[6];
+};
+
+static const struct fib_lookup_test tests[] = {
+	{ .desc = "IPv6 failed neigh",
+	  .daddr = IPV6_NUD_FAILED_ADDR, .expected_ret = BPF_FIB_LKUP_RET_NO_NEIGH, },
+	{ .desc = "IPv6 stale neigh",
+	  .daddr = IPV6_NUD_STALE_ADDR, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .dmac = DMAC_INIT, },
+	{ .desc = "IPv6 skip neigh",
+	  .daddr = IPV6_NUD_FAILED_ADDR, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, },
+	{ .desc = "IPv4 failed neigh",
+	  .daddr = IPV4_NUD_FAILED_ADDR, .expected_ret = BPF_FIB_LKUP_RET_NO_NEIGH, },
+	{ .desc = "IPv4 stale neigh",
+	  .daddr = IPV4_NUD_STALE_ADDR, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .dmac = DMAC_INIT, },
+	{ .desc = "IPv4 skip neigh",
+	  .daddr = IPV4_NUD_FAILED_ADDR, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH, },
+};
+
+static int ifindex;
+
+static int setup_netns(void)
+{
+	int err;
+
+	SYS("ip link add veth1 type veth peer name veth2");
+	SYS("ip link set dev veth1 up");
+
+	SYS("ip addr add %s/64 dev veth1 nodad", IPV6_IFACE_ADDR);
+	SYS("ip neigh add %s dev veth1 nud failed", IPV6_NUD_FAILED_ADDR);
+	SYS("ip neigh add %s dev veth1 lladdr %s nud stale", IPV6_NUD_STALE_ADDR, DMAC);
+
+	SYS("ip addr add %s/24 dev veth1 nodad", IPV4_IFACE_ADDR);
+	SYS("ip neigh add %s dev veth1 nud failed", IPV4_NUD_FAILED_ADDR);
+	SYS("ip neigh add %s dev veth1 lladdr %s nud stale", IPV4_NUD_STALE_ADDR, DMAC);
+
+	err = write_sysctl("/proc/sys/net/ipv4/conf/veth1/forwarding", "1");
+	if (!ASSERT_OK(err, "write_sysctl(net.ipv4.conf.veth1.forwarding)"))
+		goto fail;
+
+	err = write_sysctl("/proc/sys/net/ipv6/conf/veth1/forwarding", "1");
+	if (!ASSERT_OK(err, "write_sysctl(net.ipv6.conf.veth1.forwarding)"))
+		goto fail;
+
+	return 0;
+fail:
+	return -1;
+}
+
+static int set_lookup_params(struct bpf_fib_lookup *params, const char *daddr)
+{
+	int ret;
+
+	memset(params, 0, sizeof(*params));
+
+	params->l4_protocol = IPPROTO_TCP;
+	params->ifindex = ifindex;
+
+	if (inet_pton(AF_INET6, daddr, params->ipv6_dst) == 1) {
+		params->family = AF_INET6;
+		ret = inet_pton(AF_INET6, IPV6_IFACE_ADDR, params->ipv6_src);
+		if (!ASSERT_EQ(ret, 1, "inet_pton(IPV6_IFACE_ADDR)"))
+			return -1;
+		return 0;
+	}
+
+	ret = inet_pton(AF_INET, daddr, &params->ipv4_dst);
+	if (!ASSERT_EQ(ret, 1, "convert IP[46] address"))
+		return -1;
+	params->family = AF_INET;
+	ret = inet_pton(AF_INET, IPV4_IFACE_ADDR, &params->ipv4_src);
+	if (!ASSERT_EQ(ret, 1, "inet_pton(IPV4_IFACE_ADDR)"))
+		return -1;
+
+	return 0;
+}
+
+static void mac_str(char *b, const __u8 *mac)
+{
+	sprintf(b, "%02X:%02X:%02X:%02X:%02X:%02X",
+		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+}
+
+void test_fib_lookup(void)
+{
+	struct bpf_fib_lookup *fib_params;
+	struct nstoken *nstoken = NULL;
+	struct __sk_buff skb = { };
+	struct fib_lookup *skel;
+	int prog_fd, err, ret, i;
+
+	/* The test does not use the skb->data, so
+	 * use pkt_v6 for both v6 and v4 test.
+	 */
+	LIBBPF_OPTS(bpf_test_run_opts, run_opts,
+		    .data_in = &pkt_v6,
+		    .data_size_in = sizeof(pkt_v6),
+		    .ctx_in = &skb,
+		    .ctx_size_in = sizeof(skb),
+	);
+
+	skel = fib_lookup__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
+		return;
+	prog_fd = bpf_program__fd(skel->progs.fib_lookup);
+
+	SYS("ip netns add %s", NS_TEST);
+
+	nstoken = open_netns(NS_TEST);
+	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
+		goto fail;
+
+	if (setup_netns())
+		goto fail;
+
+	ifindex = if_nametoindex("veth1");
+	skb.ifindex = ifindex;
+	fib_params = &skel->bss->fib_params;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		printf("Testing %s\n", tests[i].desc);
+
+		if (set_lookup_params(fib_params, tests[i].daddr))
+			continue;
+		skel->bss->fib_lookup_ret = -1;
+		skel->bss->lookup_flags = BPF_FIB_LOOKUP_OUTPUT |
+			tests[i].lookup_flags;
+
+		err = bpf_prog_test_run_opts(prog_fd, &run_opts);
+		if (!ASSERT_OK(err, "bpf_prog_test_run_opts"))
+			continue;
+
+		ASSERT_EQ(tests[i].expected_ret, skel->bss->fib_lookup_ret,
+			  "fib_lookup_ret");
+
+		ret = memcmp(tests[i].dmac, fib_params->dmac, sizeof(tests[i].dmac));
+		if (!ASSERT_EQ(ret, 0, "dmac not match")) {
+			char expected[18], actual[18];
+
+			mac_str(expected, tests[i].dmac);
+			mac_str(actual, fib_params->dmac);
+			printf("dmac expected %s actual %s\n", expected, actual);
+		}
+	}
+
+fail:
+	if (nstoken)
+		close_netns(nstoken);
+	system("ip netns del " NS_TEST " &> /dev/null");
+	fib_lookup__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/fib_lookup.c b/tools/testing/selftests/bpf/progs/fib_lookup.c
new file mode 100644
index 000000000000..c4514dd58c62
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/fib_lookup.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tracing_net.h"
+
+struct bpf_fib_lookup fib_params = {};
+int fib_lookup_ret = 0;
+int lookup_flags = 0;
+
+SEC("tc")
+int fib_lookup(struct __sk_buff *skb)
+{
+	fib_lookup_ret = bpf_fib_lookup(skb, &fib_params, sizeof(fib_params),
+					lookup_flags);
+
+	return TC_ACT_SHOT;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit 


From 2455f0e124d317dd08d337a7550a78a224d4ba41 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <zwisler@chromium.org>
Date: Wed, 15 Feb 2023 15:33:45 -0700
Subject: tracing: Always use canonical ftrace path

The canonical location for the tracefs filesystem is at /sys/kernel/tracing.

But, from Documentation/trace/ftrace.rst:

  Before 4.1, all ftrace tracing control files were within the debugfs
  file system, which is typically located at /sys/kernel/debug/tracing.
  For backward compatibility, when mounting the debugfs file system,
  the tracefs file system will be automatically mounted at:

  /sys/kernel/debug/tracing

Many comments and Kconfig help messages in the tracing code still refer
to this older debugfs path, so let's update them to avoid confusion.

Link: https://lore.kernel.org/linux-trace-kernel/20230215223350.2658616-2-zwisler@google.com

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com>
Signed-off-by: Ross Zwisler <zwisler@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/kernel.h                    |  2 +-
 include/linux/tracepoint.h                |  4 ++--
 kernel/trace/Kconfig                      | 20 ++++++++++----------
 kernel/trace/kprobe_event_gen_test.c      |  2 +-
 kernel/trace/ring_buffer.c                |  2 +-
 kernel/trace/synth_event_gen_test.c       |  2 +-
 kernel/trace/trace.c                      |  2 +-
 samples/user_events/example.c             |  4 ++--
 scripts/tracing/draw_functrace.py         |  6 +++---
 tools/lib/api/fs/tracing_path.c           |  4 ++--
 tools/tracing/latency/latency-collector.c |  2 +-
 11 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index fe6efb24d151..40bce7495af8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -297,7 +297,7 @@ bool mac_pton(const char *s, u8 *mac);
  *
  * Use tracing_on/tracing_off when you want to quickly turn on or off
  * tracing. It simply enables or disables the recording of the trace events.
- * This also corresponds to the user space /sys/kernel/debug/tracing/tracing_on
+ * This also corresponds to the user space /sys/kernel/tracing/tracing_on
  * file, which gives a means for the kernel and userspace to interact.
  * Place a tracing_off() in the kernel where you want tracing to end.
  * From user space, examine the trace, and then echo 1 > tracing_on
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 4b33b95eb8be..fa1004fcf810 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -471,7 +471,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  *	* This is how the trace record is structured and will
  *	* be saved into the ring buffer. These are the fields
  *	* that will be exposed to user-space in
- *	* /sys/kernel/debug/tracing/events/<*>/format.
+ *	* /sys/kernel/tracing/events/<*>/format.
  *	*
  *	* The declared 'local variable' is called '__entry'
  *	*
@@ -531,7 +531,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * tracepoint callback (this is used by programmatic plugins and
  * can also by used by generic instrumentation like SystemTap), and
  * it is also used to expose a structured trace record in
- * /sys/kernel/debug/tracing/events/.
+ * /sys/kernel/tracing/events/.
  *
  * A set of (un)registration functions can be passed to the variant
  * TRACE_EVENT_FN to perform any (un)registration work.
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d7043043f59c..5f5e64f9e715 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -239,7 +239,7 @@ config DYNAMIC_FTRACE
 	  enabled, and the functions not enabled will not affect
 	  performance of the system.
 
-	  See the files in /sys/kernel/debug/tracing:
+	  See the files in /sys/kernel/tracing:
 	    available_filter_functions
 	    set_ftrace_filter
 	    set_ftrace_notrace
@@ -299,7 +299,7 @@ config STACK_TRACER
 	select KALLSYMS
 	help
 	  This special tracer records the maximum stack footprint of the
-	  kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
+	  kernel and displays it in /sys/kernel/tracing/stack_trace.
 
 	  This tracer works by hooking into every function call that the
 	  kernel executes, and keeping a maximum stack depth value and
@@ -339,7 +339,7 @@ config IRQSOFF_TRACER
 	  disabled by default and can be runtime (re-)started
 	  via:
 
-	      echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+	      echo 0 > /sys/kernel/tracing/tracing_max_latency
 
 	  (Note that kernel size and overhead increase with this option
 	  enabled. This option and the preempt-off timing option can be
@@ -363,7 +363,7 @@ config PREEMPT_TRACER
 	  disabled by default and can be runtime (re-)started
 	  via:
 
-	      echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+	      echo 0 > /sys/kernel/tracing/tracing_max_latency
 
 	  (Note that kernel size and overhead increase with this option
 	  enabled. This option and the irqs-off timing option can be
@@ -515,7 +515,7 @@ config TRACER_SNAPSHOT
 	  Allow tracing users to take snapshot of the current buffer using the
 	  ftrace interface, e.g.:
 
-	      echo 1 > /sys/kernel/debug/tracing/snapshot
+	      echo 1 > /sys/kernel/tracing/snapshot
 	      cat snapshot
 
 config TRACER_SNAPSHOT_PER_CPU_SWAP
@@ -527,7 +527,7 @@ config TRACER_SNAPSHOT_PER_CPU_SWAP
 	  full swap (all buffers). If this is set, then the following is
 	  allowed:
 
-	      echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
+	      echo 1 > /sys/kernel/tracing/per_cpu/cpu2/snapshot
 
 	  After which, only the tracing buffer for CPU 2 was swapped with
 	  the main tracing buffer, and the other CPU buffers remain the same.
@@ -574,7 +574,7 @@ config PROFILE_ANNOTATED_BRANCHES
 	  This tracer profiles all likely and unlikely macros
 	  in the kernel. It will display the results in:
 
-	  /sys/kernel/debug/tracing/trace_stat/branch_annotated
+	  /sys/kernel/tracing/trace_stat/branch_annotated
 
 	  Note: this will add a significant overhead; only turn this
 	  on if you need to profile the system's use of these macros.
@@ -587,7 +587,7 @@ config PROFILE_ALL_BRANCHES
 	  taken in the kernel is recorded whether it hit or miss.
 	  The results will be displayed in:
 
-	  /sys/kernel/debug/tracing/trace_stat/branch_all
+	  /sys/kernel/tracing/trace_stat/branch_all
 
 	  This option also enables the likely/unlikely profiler.
 
@@ -638,8 +638,8 @@ config BLK_DEV_IO_TRACE
 	  Tracing also is possible using the ftrace interface, e.g.:
 
 	    echo 1 > /sys/block/sda/sda1/trace/enable
-	    echo blk > /sys/kernel/debug/tracing/current_tracer
-	    cat /sys/kernel/debug/tracing/trace_pipe
+	    echo blk > /sys/kernel/tracing/current_tracer
+	    cat /sys/kernel/tracing/trace_pipe
 
 	  If unsure, say N.
 
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index c736487fc0e4..4850fdfe27f1 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -21,7 +21,7 @@
  * Then:
  *
  * # insmod kernel/trace/kprobe_event_gen_test.ko
- * # cat /sys/kernel/debug/tracing/trace
+ * # cat /sys/kernel/tracing/trace
  *
  * You should see many instances of the "gen_kprobe_test" and
  * "gen_kretprobe_test" events in the trace buffer.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 45d4a23d6044..071184324d18 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2886,7 +2886,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  sched_clock_stable() ? "" :
 		  "If you just came from a suspend/resume,\n"
 		  "please switch to the trace global clock:\n"
-		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
+		  "  echo global > /sys/kernel/tracing/trace_clock\n"
 		  "or add trace_clock=global to the kernel command line\n");
 }
 
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 8d77526892f4..8dfe85499d4a 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -22,7 +22,7 @@
  * Then:
  *
  * # insmod kernel/trace/synth_event_gen_test.ko
- * # cat /sys/kernel/debug/tracing/trace
+ * # cat /sys/kernel/tracing/trace
  *
  * You should see several events in the trace buffer -
  * "create_synth_test", "empty_synth_test", and several instances of
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 97c88711f270..fbb602a8b64b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1187,7 +1187,7 @@ void tracing_snapshot_instance(struct trace_array *tr)
  *
  * Note, make sure to allocate the snapshot with either
  * a tracing_snapshot_alloc(), or by doing it manually
- * with: echo 1 > /sys/kernel/debug/tracing/snapshot
+ * with: echo 1 > /sys/kernel/tracing/snapshot
  *
  * If the snapshot buffer is not allocated, it will stop tracing.
  * Basically making a permanent snapshot.
diff --git a/samples/user_events/example.c b/samples/user_events/example.c
index d06dc24156ec..18e34c9d708e 100644
--- a/samples/user_events/example.c
+++ b/samples/user_events/example.c
@@ -23,8 +23,8 @@
 #endif
 
 /* Assumes debugfs is mounted */
-const char *data_file = "/sys/kernel/debug/tracing/user_events_data";
-const char *status_file = "/sys/kernel/debug/tracing/user_events_status";
+const char *data_file = "/sys/kernel/tracing/user_events_data";
+const char *status_file = "/sys/kernel/tracing/user_events_status";
 
 static int event_status(long **status)
 {
diff --git a/scripts/tracing/draw_functrace.py b/scripts/tracing/draw_functrace.py
index 438516bdfb3c..42fa87300941 100755
--- a/scripts/tracing/draw_functrace.py
+++ b/scripts/tracing/draw_functrace.py
@@ -12,9 +12,9 @@ calls. Only the functions's names and the call time are provided.
 
 Usage:
 	Be sure that you have CONFIG_FUNCTION_TRACER
-	# mount -t debugfs nodev /sys/kernel/debug
-	# echo function > /sys/kernel/debug/tracing/current_tracer
-	$ cat /sys/kernel/debug/tracing/trace_pipe > ~/raw_trace_func
+	# mount -t tracefs nodev /sys/kernel/tracing
+	# echo function > /sys/kernel/tracing/current_tracer
+	$ cat /sys/kernel/tracing/trace_pipe > ~/raw_trace_func
 	Wait some times but not too much, the script is a bit slow.
 	Break the pipe (Ctrl + Z)
 	$ scripts/tracing/draw_functrace.py < ~/raw_trace_func > draw_functrace
diff --git a/tools/lib/api/fs/tracing_path.c b/tools/lib/api/fs/tracing_path.c
index b8e457c841ab..7ba3e81274e8 100644
--- a/tools/lib/api/fs/tracing_path.c
+++ b/tools/lib/api/fs/tracing_path.c
@@ -14,8 +14,8 @@
 #include "tracing_path.h"
 
 static char tracing_mnt[PATH_MAX]  = "/sys/kernel/debug";
-static char tracing_path[PATH_MAX]        = "/sys/kernel/debug/tracing";
-static char tracing_events_path[PATH_MAX] = "/sys/kernel/debug/tracing/events";
+static char tracing_path[PATH_MAX]        = "/sys/kernel/tracing";
+static char tracing_events_path[PATH_MAX] = "/sys/kernel/tracing/events";
 
 static void __tracing_path_set(const char *tracing, const char *mountpoint)
 {
diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c
index 59a7f2346eab..0fd9c747d396 100644
--- a/tools/tracing/latency/latency-collector.c
+++ b/tools/tracing/latency/latency-collector.c
@@ -1584,7 +1584,7 @@ static void *do_printloop(void *arg)
 		/*
 		 * Toss a coin to decide if we want to sleep before printing
 		 * out the backtrace. The reason for this is that opening
-		 * /sys/kernel/debug/tracing/trace will cause a blackout of
+		 * /sys/kernel/tracing/trace will cause a blackout of
 		 * hundreds of ms, where no latencies will be noted by the
 		 * latency tracer. Thus by randomly sleeping we try to avoid
 		 * missing traces systematically due to this. With this option
-- 
cgit 


From 436864095a95fcc611c20c44a111985fa9848730 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Thu, 16 Feb 2023 13:43:40 +0100
Subject: selftests/net: Interpret UDP_GRO cmsg data as an int value

Data passed to user-space with a (SOL_UDP, UDP_GRO) cmsg carries an
int (see udp_cmsg_recv), not a u16 value, as strace confirms:

  recvmsg(8, {msg_name=...,
              msg_iov=[{iov_base="\0\0..."..., iov_len=96000}],
              msg_iovlen=1,
              msg_control=[{cmsg_len=20,         <-- sizeof(cmsghdr) + 4
                            cmsg_level=SOL_UDP,
                            cmsg_type=0x68}],    <-- UDP_GRO
                            msg_controllen=24,
                            msg_flags=0}, 0) = 11200

Interpreting the data as an u16 value won't work on big-endian platforms.
Since it is too late to back out of this API decision [1], fix the test.

[1]: https://lore.kernel.org/netdev/20230131174601.203127-1-jakub@cloudflare.com/

Fixes: 3327a9c46352 ("selftests: add functionals test for UDP GRO")
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/udpgso_bench_rx.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
index 4058c7451e70..f35a924d4a30 100644
--- a/tools/testing/selftests/net/udpgso_bench_rx.c
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -214,11 +214,10 @@ static void do_verify_udp(const char *data, int len)
 
 static int recv_msg(int fd, char *buf, int len, int *gso_size)
 {
-	char control[CMSG_SPACE(sizeof(uint16_t))] = {0};
+	char control[CMSG_SPACE(sizeof(int))] = {0};
 	struct msghdr msg = {0};
 	struct iovec iov = {0};
 	struct cmsghdr *cmsg;
-	uint16_t *gsosizeptr;
 	int ret;
 
 	iov.iov_base = buf;
@@ -237,8 +236,7 @@ static int recv_msg(int fd, char *buf, int len, int *gso_size)
 		     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 			if (cmsg->cmsg_level == SOL_UDP
 			    && cmsg->cmsg_type == UDP_GRO) {
-				gsosizeptr = (uint16_t *) CMSG_DATA(cmsg);
-				*gso_size = *gsosizeptr;
+				*gso_size = *(int *)CMSG_DATA(cmsg);
 				break;
 			}
 		}
-- 
cgit 


From 3a7d84eae03bef4c02c39822b2ea6be5ac73de7b Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 17 Feb 2023 13:28:50 +0100
Subject: self-tests: more rps self tests

Explicitly check for child netns and main ns independency

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/rps_default_mask.sh | 41 +++++++++++++++++--------
 1 file changed, 29 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh
index c81c0ac7ddfe..0fd0d2db3abc 100755
--- a/tools/testing/selftests/net/rps_default_mask.sh
+++ b/tools/testing/selftests/net/rps_default_mask.sh
@@ -8,7 +8,9 @@ ret=0
 [ $cpus -gt 2 ] || exit $ksft_skip
 
 readonly INITIAL_RPS_DEFAULT_MASK=$(cat /proc/sys/net/core/rps_default_mask)
-readonly NETNS="ns-$(mktemp -u XXXXXX)"
+readonly TAG="$(mktemp -u XXXXXX)"
+readonly VETH="veth${TAG}"
+readonly NETNS="ns-${TAG}"
 
 setup() {
 	ip netns add "${NETNS}"
@@ -21,11 +23,15 @@ cleanup() {
 }
 
 chk_rps() {
-	local rps_mask expected_rps_mask=$3
-	local dev_name=$2
+	local rps_mask expected_rps_mask=$4
+	local dev_name=$3
+	local netns=$2
+	local cmd="cat"
 	local msg=$1
 
-	rps_mask=$(ip netns exec $NETNS cat /sys/class/net/$dev_name/queues/rx-0/rps_cpus)
+	[ -n "$netns" ] && cmd="ip netns exec $netns $cmd"
+
+	rps_mask=$($cmd /sys/class/net/$dev_name/queues/rx-0/rps_cpus)
 	printf "%-60s" "$msg"
 	if [ $rps_mask -eq $expected_rps_mask ]; then
 		echo "[ ok ]"
@@ -39,19 +45,30 @@ trap cleanup EXIT
 
 echo 0 > /proc/sys/net/core/rps_default_mask
 setup
-chk_rps "empty rps_default_mask" lo 0
+chk_rps "empty rps_default_mask" $NETNS lo 0
 cleanup
 
 echo 1 > /proc/sys/net/core/rps_default_mask
 setup
-chk_rps "non zero rps_default_mask" lo 1
+chk_rps "changing rps_default_mask dont affect existing devices" "" lo $INITIAL_RPS_DEFAULT_MASK
 
 echo 3 > /proc/sys/net/core/rps_default_mask
-chk_rps "changing rps_default_mask dont affect existing netns" lo 1
+chk_rps "changing rps_default_mask dont affect existing netns" $NETNS lo 0
+
+ip link add name $VETH type veth peer netns $NETNS name $VETH
+ip link set dev $VETH up
+ip -n $NETNS link set dev $VETH up
+chk_rps "changing rps_default_mask affect newly created devices" "" $VETH 3
+chk_rps "changing rps_default_mask don't affect newly child netns[II]" $NETNS $VETH 0
+ip netns del $NETNS
+
+setup
+chk_rps "rps_default_mask is 0 by default in child netns" "$NETNS" lo 0
+
+ip netns exec $NETNS sysctl -qw net.core.rps_default_mask=1
+ip link add name $VETH type veth peer netns $NETNS name $VETH
+chk_rps "changing rps_default_mask in child ns don't affect the main one" "" lo $INITIAL_RPS_DEFAULT_MASK
+chk_rps "changing rps_default_mask in child ns affects new childns devices" $NETNS $VETH 1
+chk_rps "changing rps_default_mask in child ns don't affect existing devices" $NETNS lo 0
 
-ip -n $NETNS link add type veth
-ip -n $NETNS link set dev veth0 up
-ip -n $NETNS link set dev veth1 up
-chk_rps "changing rps_default_mask affect newly created devices" veth0 3
-chk_rps "changing rps_default_mask affect newly created devices[II]" veth1 3
 exit $ret
-- 
cgit 


From e8bf9b98d40dbdf4e39362e3b85a70c61da68cb7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 18 Jan 2023 11:31:25 -0500
Subject: ktest.pl: Fix missing "end_monitor" when machine check fails

In the "reboot" command, it does a check of the machine to see if it is
still alive with a simple "ssh echo" command. If it fails, it will assume
that a normal "ssh reboot" is not possible and force a power cycle.

In this case, the "start_monitor" is executed, but the "end_monitor" is
not, and this causes the screen will not be given back to the console. That
is, after the test, a "reset" command needs to be performed, as "echo" is
turned off.

Cc: stable@vger.kernel.org
Fixes: 6474ace999edd ("ktest.pl: Powercycle the box on reboot if no connection can be made")
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/testing/ktest/ktest.pl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index ac59999ed3de..62823a4232ab 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1495,7 +1495,8 @@ sub reboot {
 
 	# Still need to wait for the reboot to finish
 	wait_for_monitor($time, $reboot_success_line);
-
+    }
+    if ($powercycle || $time) {
 	end_monitor;
     }
 }
-- 
cgit 


From 83d29d439cd3ef23041570d55841f814af2ecac0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 18 Jan 2023 16:32:13 -0500
Subject: ktest.pl: Give back console on Ctrt^C on monitor

When monitoring the console output, the stdout is being redirected to do
so. If Ctrl^C is hit during this mode, the stdout is not back to the
console, the user does not see anything they type (no echo).

Add "end_monitor" to the SIGINT interrupt handler to give back the console
on Ctrl^C.

Cc: stable@vger.kernel.org
Fixes: 9f2cdcbbb90e7 ("ktest: Give console process a dedicated tty")
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/testing/ktest/ktest.pl | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 62823a4232ab..74801811372f 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -4201,6 +4201,9 @@ sub send_email {
 }
 
 sub cancel_test {
+    if ($monitor_cnt) {
+	end_monitor;
+    }
     if ($email_when_canceled) {
 	my $name = get_test_name;
 	send_email("KTEST: Your [$name] test was cancelled",
-- 
cgit 


From 4e7d2a8f0b52abf23b1dc13b3d88bc0923383cd5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 18 Jan 2023 16:37:25 -0500
Subject: ktest.pl: Add RUN_TIMEOUT option with default unlimited

There is a disconnect between the run_command function and the
wait_for_input. The wait_for_input has a default timeout of 2 minutes. But
if that happens, the run_command loop will exit out to the waitpid() of
the executing command. This fails in that it no longer monitors the
command, and also, the ssh to the test box can hang when its finished, as
it's waiting for the pipe it's writing to to flush, but the loop that
reads that pipe has already exited, leaving the command stuck, and the
test hangs.

Instead, make the default "wait_for_input" of the run_command infinite,
and allow the user to override it if they want with a default timeout
option "RUN_TIMEOUT".

But this fixes the hang that happens when the pipe is full and the ssh
session never exits.

Cc: stable@vger.kernel.org
Fixes: 6e98d1b4415fe ("ktest: Add timeout to ssh command")
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/testing/ktest/ktest.pl    | 20 ++++++++++++++++----
 tools/testing/ktest/sample.conf |  5 +++++
 2 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 74801811372f..822794ca4029 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -178,6 +178,7 @@ my $store_failures;
 my $store_successes;
 my $test_name;
 my $timeout;
+my $run_timeout;
 my $connect_timeout;
 my $config_bisect_exec;
 my $booted_timeout;
@@ -340,6 +341,7 @@ my %option_map = (
     "STORE_SUCCESSES"		=> \$store_successes,
     "TEST_NAME"			=> \$test_name,
     "TIMEOUT"			=> \$timeout,
+    "RUN_TIMEOUT"		=> \$run_timeout,
     "CONNECT_TIMEOUT"		=> \$connect_timeout,
     "CONFIG_BISECT_EXEC"	=> \$config_bisect_exec,
     "BOOTED_TIMEOUT"		=> \$booted_timeout,
@@ -1858,6 +1860,14 @@ sub run_command {
     $command =~ s/\$SSH_USER/$ssh_user/g;
     $command =~ s/\$MACHINE/$machine/g;
 
+    if (!defined($timeout)) {
+	$timeout = $run_timeout;
+    }
+
+    if (!defined($timeout)) {
+	$timeout = -1; # tell wait_for_input to wait indefinitely
+    }
+
     doprint("$command ... ");
     $start_time = time;
 
@@ -1884,13 +1894,10 @@ sub run_command {
 
     while (1) {
 	my $fp = \*CMD;
-	if (defined($timeout)) {
-	    doprint "timeout = $timeout\n";
-	}
 	my $line = wait_for_input($fp, $timeout);
 	if (!defined($line)) {
 	    my $now = time;
-	    if (defined($timeout) && (($now - $start_time) >= $timeout)) {
+	    if ($timeout >= 0 && (($now - $start_time) >= $timeout)) {
 		doprint "Hit timeout of $timeout, killing process\n";
 		$hit_timeout = 1;
 		kill 9, $pid;
@@ -2062,6 +2069,11 @@ sub wait_for_input {
 	$time = $timeout;
     }
 
+    if ($time < 0) {
+	# Negative number means wait indefinitely
+	undef $time;
+    }
+
     $rin = '';
     vec($rin, fileno($fp), 1) = 1;
     vec($rin, fileno(\*STDIN), 1) = 1;
diff --git a/tools/testing/ktest/sample.conf b/tools/testing/ktest/sample.conf
index 2d0fe15a096d..f43477a9b857 100644
--- a/tools/testing/ktest/sample.conf
+++ b/tools/testing/ktest/sample.conf
@@ -817,6 +817,11 @@
 # is issued instead of a reboot.
 # CONNECT_TIMEOUT = 25
 
+# The timeout in seconds for how long to wait for any running command
+# to timeout. If not defined, it will let it go indefinitely.
+# (default undefined)
+#RUN_TIMEOUT = 600
+
 # In between tests, a reboot of the box may occur, and this
 # is the time to wait for the console after it stops producing
 # output. Some machines may not produce a large lag on reboot
-- 
cgit 


From 7dc8e24f0e09834341f84d37433840b353d64bc8 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Fri, 20 Jan 2023 18:16:16 +0900
Subject: ktest: Restore stty setting at first in dodie

The do_send_email() will call die before restoring stty if sendmail
setting is not correct or sendmail is not installed. It is safer to
restore it in the beginning of dodie().

Link: https://lkml.kernel.org/r/167420617635.2988775.13045295332829029437.stgit@devnote3

Cc: John 'Warthog9' Hawley <warthog9@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/testing/ktest/ktest.pl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 822794ca4029..829f5bdfd2e4 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1538,6 +1538,11 @@ sub dodie {
     return if ($in_die);
     $in_die = 1;
 
+    if ($monitor_cnt) {
+	# restore terminal settings
+	system("stty $stty_orig");
+    }
+
     my $i = $iteration;
 
     doprint "CRITICAL FAILURE... [TEST $i] ", @_, "\n";
@@ -1584,11 +1589,6 @@ sub dodie {
 		"Your test started at $script_start_time has failed with:\n@_\n", $log_file);
     }
 
-    if ($monitor_cnt) {
-	# restore terminal settings
-	system("stty $stty_orig");
-    }
-
     if (defined($post_test)) {
 	run_command $post_test;
     }
-- 
cgit 


From d5d469247264e56960705dc5ae7e1d014861fe40 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 15 Feb 2023 14:00:58 +0100
Subject: objtool: add UACCESS exceptions for __tsan_volatile_read/write

A lot of the tsan helpers are already excempt from the UACCESS warnings,
but some more functions were added that need the same thing:

kernel/kcsan/core.o: warning: objtool: __tsan_volatile_read16+0x0: call to __tsan_unaligned_read16() with UACCESS enabled
kernel/kcsan/core.o: warning: objtool: __tsan_volatile_write16+0x0: call to __tsan_unaligned_write16() with UACCESS enabled
vmlinux.o: warning: objtool: __tsan_unaligned_volatile_read16+0x4: call to __tsan_unaligned_read16() with UACCESS enabled
vmlinux.o: warning: objtool: __tsan_unaligned_volatile_write16+0x4: call to __tsan_unaligned_write16() with UACCESS enabled

As Marco points out, these functions don't even call each other
explicitly but instead gcc (but not clang) notices the functions
being identical and turns one symbol into a direct branch to the
other.

Link: https://lkml.kernel.org/r/20230215130058.3836177-4-arnd@kernel.org
Fixes: 75d75b7a4d54 ("kcsan: Support distinguishing volatile accesses")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/objtool/check.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 4b7c8b33069e..b1a5f658673f 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1186,6 +1186,8 @@ static const char *uaccess_safe_builtin[] = {
 	"__tsan_atomic64_compare_exchange_val",
 	"__tsan_atomic_thread_fence",
 	"__tsan_atomic_signal_fence",
+	"__tsan_unaligned_read16",
+	"__tsan_unaligned_write16",
 	/* KCOV */
 	"write_comp_data",
 	"check_kcov_mode",
-- 
cgit 


From a457e944df92789ab31aaf35fae9db064e3c51c4 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Tue, 21 Feb 2023 08:49:16 +0900
Subject: selftests/ftrace: Fix eprobe syntax test case to check filter support

Fix eprobe syntax test case to check whether the kernel supports the filter
on eprobe for filter syntax test command. Without this fix, this test case
will fail if the kernel supports eprobe but doesn't support the filter on
eprobe.

Link: https://lore.kernel.org/all/167309834742.640500.379128668288448035.stgit@devnote3/

Fixes: 9e14bae7d049 ("selftests/ftrace: Add eprobe syntax error testcase")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
---
 .../testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
index fc1daac7f066..4f5e8c665156 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
@@ -22,6 +22,8 @@ check_error 'e:foo/^bar.1 syscalls/sys_enter_openat'	# BAD_EVENT_NAME
 check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd'	# BAD_FETCH_ARG
 check_error 'e:foo/bar syscalls/sys_enter_openat ^arg=$foo'	# BAD_ATTACH_ARG
 
-check_error 'e:foo/bar syscalls/sys_enter_openat if ^'	# NO_EP_FILTER
+if grep -q '<attached-group>\.<attached-event>.*\[if <filter>\]' README; then
+  check_error 'e:foo/bar syscalls/sys_enter_openat if ^'	# NO_EP_FILTER
+fi
 
 exit 0
-- 
cgit 


From 96cd93af794cf3ef83ae1ad7291160029d7b525e Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Tue, 21 Feb 2023 08:49:16 +0900
Subject: selftests/ftrace: Fix probepoint testcase to ignore __pfx_* symbols

Fix kprobe probepoint testcase to ignore __pfx_* prefix symbols. Those are
introduced by commit b341b20d648b ("x86: Add prefix symbols for function
padding") for identifying PADDING_BYTES of NOPs. Since kprobe events can
not probe these prefix symbols, this testcase has to skip those symbols.

Link: https://lore.kernel.org/all/167309835609.640500.9664678940260305746.stgit@devnote3/

Fixes: b341b20d648b ("x86: Add prefix symbols for function padding")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ftrace/test.d/kprobe/probepoint.tc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/probepoint.tc b/tools/testing/selftests/ftrace/test.d/kprobe/probepoint.tc
index 624269c8d534..68425987a5dd 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/probepoint.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/probepoint.tc
@@ -21,7 +21,7 @@ set_offs() { # prev target next
 
 # We have to decode symbol addresses to get correct offsets.
 # If the offset is not an instruction boundary, it cause -EILSEQ.
-set_offs `grep -A1 -B1 ${TARGET_FUNC} /proc/kallsyms | cut -f 1 -d " " | xargs`
+set_offs `grep -v __pfx_ /proc/kallsyms | grep -A1 -B1 ${TARGET_FUNC} | cut -f 1 -d " " | xargs`
 
 UINT_TEST=no
 # printf "%x" -1 returns (unsigned long)-1.
-- 
cgit 


From 8478cca1e3abd183f309cd9c2491f484acf5d377 Mon Sep 17 00:00:00 2001
From: Donglin Peng <dolinux.peng@gmail.com>
Date: Tue, 21 Feb 2023 08:52:42 +0900
Subject: tracing/probe: add a char type to show the character value of traced
 arguments

There are scenes that we want to show the character value of traced
arguments other than a decimal or hexadecimal or string value for debug
convinience. I add a new type named 'char' to do it and a new test case
file named 'kprobe_args_char.tc' to do selftest for char type.

For example:

The to be traced function is 'void demo_func(char type, char *name);', we
can add a kprobe event as follows to show argument values as we want:

echo  'p:myprobe demo_func $arg1:char +0($arg2):char[5]' > kprobe_events

we will get the following trace log:

... myprobe: (demo_func+0x0/0x29) arg1='A' arg2={'b','p','f','1',''}

Link: https://lore.kernel.org/all/20221219110613.367098-1-dolinux.peng@gmail.com/

Signed-off-by: Donglin Peng <dolinux.peng@gmail.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Documentation/trace/kprobetrace.rst                |  3 +-
 kernel/trace/trace.c                               |  2 +-
 kernel/trace/trace_probe.c                         |  2 +
 kernel/trace/trace_probe.h                         |  1 +
 .../ftrace/test.d/kprobe/kprobe_args_char.tc       | 47 ++++++++++++++++++++++
 5 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc

(limited to 'tools')

diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index 08a2a6a3782f..ef223b8ad6d5 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -58,7 +58,7 @@ Synopsis of kprobe_events
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
 		  (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
-		  (x8/x16/x32/x64), "string", "ustring", "symbol", "symstr"
+		  (x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr"
                   and bitfield are supported.
 
   (\*1) only for the probe on function entry (offs == 0).
@@ -80,6 +80,7 @@ E.g. 'x16[4]' means an array of x16 (2bytes hex) with 4 elements.
 Note that the array can be applied to memory type fetchargs, you can not
 apply it to registers/stack-entries etc. (for example, '$stack1:x8[8]' is
 wrong, but '+8($stack):x8[8]' is OK.)
+Char type can be used to show the character value of traced arguments.
 String type is a special type, which fetches a "null-terminated" string from
 kernel space. This means it will fail and store NULL if the string container
 has been paged out. "ustring" type is an alternative of string for user-space.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b677f8d61deb..712ba8d6f91f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5615,7 +5615,7 @@ static const char readme_msg[] =
 	"\t           $stack<index>, $stack, $retval, $comm,\n"
 #endif
 	"\t           +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
-	"\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
+	"\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
 	"\t           b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
 	"\t           symstr, <type>\\[<array-size>\\]\n"
 #ifdef CONFIG_HIST_TRIGGERS
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 01ebabbbe8c9..11008c098727 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -50,6 +50,7 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x8,  u8,  "0x%x")
 DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
 DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
 DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
+DEFINE_BASIC_PRINT_TYPE_FUNC(char, u8, "'%c'")
 
 int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent)
 {
@@ -95,6 +96,7 @@ static const struct fetch_type probe_fetch_types[] = {
 	ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
 	ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
 	ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
+	ASSIGN_FETCH_TYPE_ALIAS(char, u8, u8,  0),
 	ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0),
 
 	ASSIGN_FETCH_TYPE_END
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 23acfd1c3812..b4f99553411e 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -166,6 +166,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
 DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
 DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
 
+DECLARE_BASIC_PRINT_TYPE_FUNC(char);
 DECLARE_BASIC_PRINT_TYPE_FUNC(string);
 DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);
 
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc
new file mode 100644
index 000000000000..285b4770efad
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc
@@ -0,0 +1,47 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe event char type argument
+# requires: kprobe_events
+
+case `uname -m` in
+x86_64)
+  ARG1=%di
+;;
+i[3456]86)
+  ARG1=%ax
+;;
+aarch64)
+  ARG1=%x0
+;;
+arm*)
+  ARG1=%r0
+;;
+ppc64*)
+  ARG1=%r3
+;;
+ppc*)
+  ARG1=%r3
+;;
+s390*)
+  ARG1=%r2
+;;
+mips*)
+  ARG1=%r4
+;;
+*)
+  echo "Please implement other architecture here"
+  exit_untested
+esac
+
+: "Test get argument (1)"
+echo "p:testprobe tracefs_create_dir arg1=+0(${ARG1}):char" > kprobe_events
+echo 1 > events/kprobes/testprobe/enable
+echo "p:test $FUNCTION_FORK" >> kprobe_events
+grep -qe "testprobe.* arg1='t'" trace
+
+echo 0 > events/kprobes/testprobe/enable
+: "Test get argument (2)"
+echo "p:testprobe tracefs_create_dir arg1=+0(${ARG1}):char arg2=+0(${ARG1}):char[4]" > kprobe_events
+echo 1 > events/kprobes/testprobe/enable
+echo "p:test $FUNCTION_FORK" >> kprobe_events
+grep -qe "testprobe.* arg1='t' arg2={'t','e','s','t'}" trace
-- 
cgit 


From 0d0ed4006127714e318c9a9d7796f5c8532a21e7 Mon Sep 17 00:00:00 2001
From: Shunsuke Mie <mie@igel.co.jp>
Date: Thu, 2 Feb 2023 19:45:38 +0900
Subject: tools/virtio: enable to build with retpoline

Add build options to bring it close to a linux kernel. It allows for
testing that is close to reality.

Signed-off-by: Shunsuke Mie <mie@igel.co.jp>
Message-Id: <20230202104538.2041879-1-mie@igel.co.jp>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tools/virtio/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index 1b25cc7c64bb..7b7139d97d74 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -4,7 +4,7 @@ test: virtio_test vringh_test
 virtio_test: virtio_ring.o virtio_test.o
 vringh_test: vringh_test.o vringh.o virtio_ring.o
 
-CFLAGS += -g -O2 -Werror -Wno-maybe-uninitialized -Wall -I. -I../include/ -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -include ../../include/linux/kconfig.h
+CFLAGS += -g -O2 -Werror -Wno-maybe-uninitialized -Wall -I. -I../include/ -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -include ../../include/linux/kconfig.h -mfunction-return=thunk -fcf-protection=none -mindirect-branch-register
 CFLAGS += -pthread
 LDFLAGS += -pthread
 vpath %.c ../../drivers/virtio ../../drivers/vhost
-- 
cgit 


From b60417a9f2b890a8094477b2204d4f73c535725e Mon Sep 17 00:00:00 2001
From: Roxana Nicolescu <roxana.nicolescu@canonical.com>
Date: Mon, 20 Feb 2023 12:04:00 +0100
Subject: selftest: fib_tests: Always cleanup before exit

Usage of `set -e` before executing a command causes immediate exit
on failure, without cleanup up the resources allocated at setup.
This can affect the next tests that use the same resources,
leading to a chain of failures.

A simple fix is to always call cleanup function when the script exists.
This approach is already used by other existing tests.

Fixes: 1056691b2680 ("selftests: fib_tests: Make test results more verbose")
Signed-off-by: Roxana Nicolescu <roxana.nicolescu@canonical.com>
Link: https://lore.kernel.org/r/20230220110400.26737-2-roxana.nicolescu@canonical.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/fib_tests.sh | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 5637b5dadabd..70ea8798b1f6 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -2065,6 +2065,8 @@ EOF
 ################################################################################
 # main
 
+trap cleanup EXIT
+
 while getopts :t:pPhv o
 do
 	case $o in
-- 
cgit 


From f922c7b1c1c45740d329bf248936fdb78c0cff6e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 20 Feb 2023 14:23:36 +0100
Subject: sefltests: netdevsim: wait for devlink instance after netns removal

When devlink instance is put into network namespace and that network
namespace gets deleted, devlink instance is moved back into init_ns.
This is done as a part of cleanup_net() routine. Since cleanup_net()
is called asynchronously from workqueue, there is no guarantee that
the devlink instance move is done after "ip netns del" returns.

So fix this race by making sure that the devlink instance is present
before any other operation.

Reported-by: Amir Tzin <amirtz@nvidia.com>
Fixes: b74c37fd35a2 ("selftests: netdevsim: add tests for devlink reload with resources")
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Link: https://lore.kernel.org/r/20230220132336.198597-1-jiri@resnulli.us
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../testing/selftests/drivers/net/netdevsim/devlink.sh | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
index a08c02abde12..7f7d20f22207 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
@@ -17,6 +17,18 @@ SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV_NAME/net/
 DEBUGFS_DIR=/sys/kernel/debug/netdevsim/$DEV_NAME/
 DL_HANDLE=netdevsim/$DEV_NAME
 
+wait_for_devlink()
+{
+	"$@" | grep -q $DL_HANDLE
+}
+
+devlink_wait()
+{
+	local timeout=$1
+
+	busywait "$timeout" wait_for_devlink devlink dev
+}
+
 fw_flash_test()
 {
 	RET=0
@@ -256,6 +268,9 @@ netns_reload_test()
 	ip netns del testns2
 	ip netns del testns1
 
+	# Wait until netns async cleanup is done.
+	devlink_wait 2000
+
 	log_test "netns reload test"
 }
 
@@ -348,6 +363,9 @@ resource_test()
 	ip netns del testns2
 	ip netns del testns1
 
+	# Wait until netns async cleanup is done.
+	devlink_wait 2000
+
 	log_test "resource test"
 }
 
-- 
cgit 


From efb056e5f1f0036179b2f92c1c15f5ea7a891d70 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 16 Feb 2023 17:05:36 +0100
Subject: netfilter: ip6t_rpfilter: Fix regression with VRF interfaces

When calling ip6_route_lookup() for the packet arriving on the VRF
interface, the result is always the real (slave) interface. Expect this
when validating the result.

Fixes: acc641ab95b66 ("netfilter: rpfilter/fib: Populate flowic_l3mdev field")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/ip6t_rpfilter.c         |  4 +++-
 tools/testing/selftests/netfilter/rpath.sh | 32 ++++++++++++++++++++++++------
 2 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index a01d9b842bd0..67c87a88cde4 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -72,7 +72,9 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
 		goto out;
 	}
 
-	if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE))
+	if (rt->rt6i_idev->dev == dev ||
+	    l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == dev->ifindex ||
+	    (flags & XT_RPFILTER_LOOSE))
 		ret = true;
  out:
 	ip6_rt_put(rt);
diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/netfilter/rpath.sh
index f7311e66d219..5289c8447a41 100755
--- a/tools/testing/selftests/netfilter/rpath.sh
+++ b/tools/testing/selftests/netfilter/rpath.sh
@@ -62,10 +62,16 @@ ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad
 ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad
 
 # firewall matches to test
-[ -n "$iptables" ] && ip netns exec "$ns2" \
-	"$iptables" -t raw -A PREROUTING -s 192.168.0.0/16 -m rpfilter
-[ -n "$ip6tables" ] && ip netns exec "$ns2" \
-	"$ip6tables" -t raw -A PREROUTING -s fec0::/16 -m rpfilter
+[ -n "$iptables" ] && {
+	common='-t raw -A PREROUTING -s 192.168.0.0/16'
+	ip netns exec "$ns2" "$iptables" $common -m rpfilter
+	ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert
+}
+[ -n "$ip6tables" ] && {
+	common='-t raw -A PREROUTING -s fec0::/16'
+	ip netns exec "$ns2" "$ip6tables" $common -m rpfilter
+	ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert
+}
 [ -n "$nft" ] && ip netns exec "$ns2" $nft -f - <<EOF
 table inet t {
 	chain c {
@@ -89,6 +95,11 @@ ipt_zero_rule() { # (command)
 	[ -n "$1" ] || return 0
 	ip netns exec "$ns2" "$1" -t raw -vS | grep -q -- "-m rpfilter -c 0 0"
 }
+ipt_zero_reverse_rule() { # (command)
+	[ -n "$1" ] || return 0
+	ip netns exec "$ns2" "$1" -t raw -vS | \
+		grep -q -- "-m rpfilter --invert -c 0 0"
+}
 nft_zero_rule() { # (family)
 	[ -n "$nft" ] || return 0
 	ip netns exec "$ns2" "$nft" list chain inet t c | \
@@ -101,8 +112,7 @@ netns_ping() { # (netns, args...)
 	ip netns exec "$netns" ping -q -c 1 -W 1 "$@" >/dev/null
 }
 
-testrun() {
-	# clear counters first
+clear_counters() {
 	[ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z
 	[ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z
 	if [ -n "$nft" ]; then
@@ -111,6 +121,10 @@ testrun() {
 			ip netns exec "$ns2" $nft -s list table inet t;
 		) | ip netns exec "$ns2" $nft -f -
 	fi
+}
+
+testrun() {
+	clear_counters
 
 	# test 1: martian traffic should fail rpfilter matches
 	netns_ping "$ns1" -I v0 192.168.42.1 && \
@@ -120,9 +134,13 @@ testrun() {
 
 	ipt_zero_rule "$iptables" || die "iptables matched martian"
 	ipt_zero_rule "$ip6tables" || die "ip6tables matched martian"
+	ipt_zero_reverse_rule "$iptables" && die "iptables not matched martian"
+	ipt_zero_reverse_rule "$ip6tables" && die "ip6tables not matched martian"
 	nft_zero_rule ip || die "nft IPv4 matched martian"
 	nft_zero_rule ip6 || die "nft IPv6 matched martian"
 
+	clear_counters
+
 	# test 2: rpfilter match should pass for regular traffic
 	netns_ping "$ns1" 192.168.23.1 || \
 		die "regular ping 192.168.23.1 failed"
@@ -131,6 +149,8 @@ testrun() {
 
 	ipt_zero_rule "$iptables" && die "iptables match not effective"
 	ipt_zero_rule "$ip6tables" && die "ip6tables match not effective"
+	ipt_zero_reverse_rule "$iptables" || die "iptables match over-effective"
+	ipt_zero_reverse_rule "$ip6tables" || die "ip6tables match over-effective"
 	nft_zero_rule ip && die "nft IPv4 match not effective"
 	nft_zero_rule ip6 && die "nft IPv6 match not effective"
 
-- 
cgit 


From cf8c59a3756b2735c409a9b3ac1e4ec556546a7a Mon Sep 17 00:00:00 2001
From: Antonio Alvarez Feijoo <antonio.feijoo@suse.com>
Date: Wed, 22 Feb 2023 08:27:35 +0900
Subject: tools/bootconfig: fix single & used for logical condition

A single & will create a background process and return true, so the grep
command will run even if the file checked in the first condition does not
exist.

Link: https://lore.kernel.org/all/20230112114215.17103-1-antonio.feijoo@suse.com/

Fixes: 1eaad3ac3f39 ("tools/bootconfig: Use per-group/all enable option in ftrace2bconf script")
Signed-off-by: Antonio Alvarez Feijoo <antonio.feijoo@suse.com>
Cc: stable@vger.kernel.org
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 tools/bootconfig/scripts/ftrace2bconf.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/bootconfig/scripts/ftrace2bconf.sh b/tools/bootconfig/scripts/ftrace2bconf.sh
index 6183b36c6846..1603801cf126 100755
--- a/tools/bootconfig/scripts/ftrace2bconf.sh
+++ b/tools/bootconfig/scripts/ftrace2bconf.sh
@@ -93,7 +93,7 @@ referred_vars() {
 }
 
 event_is_enabled() { # enable-file
-	test -f $1 & grep -q "1" $1
+	test -f $1 && grep -q "1" $1
 }
 
 per_event_options() { # event-dir
-- 
cgit 


From b48279af636de613e9dd03857d9cadd4beca8e7e Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Sat, 18 Feb 2023 08:27:24 -0800
Subject: perf test: Fix offcpu test prev_state check

On Fedora 36, the 'perf record' offcpu profiling tests are failing.  It

was because the BPF checks the prev task's state being S or D but
actually it has more bits set.  Let's check the LSB 8 bits for the
purpose of offcpu profiling.

Reported-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230218162724.1292657-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf_skel/off_cpu.bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
index 38e3b287dbb2..d877a0a9731f 100644
--- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -277,7 +277,7 @@ int on_switch(u64 *ctx)
 	else
 		prev_state = get_task_state(prev);
 
-	return off_cpu_stat(ctx, prev, next, prev_state);
+	return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
 }
 
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
-- 
cgit 


From 3ef9fec011d471a259935f2e2ff58ab12886d683 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 22 Feb 2023 16:17:36 -0300
Subject: tools arch x86: Sync the msr-index.h copy with the kernel sources

To pick up the changes from these csets:

  8c29f01654053258 ("x86/sev: Add SEV-SNP guest feature negotiation support")

That cause no changes to tooling:

  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before
  $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h
  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after
  $ diff -u before after
  $

Just silences this perf build warning:

  Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h'
  diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h

Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Nikunj A Dadhania <nikunj@amd.com>
Link: https://lore.kernel.org/lkml/Y%2FZrNvtcijPWagCp@kernel.org/
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/asm/msr-index.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'tools')

diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 37ff47552bcb..d3fe82c5d6b6 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -566,6 +566,26 @@
 #define MSR_AMD64_SEV_ES_ENABLED	BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
 #define MSR_AMD64_SEV_SNP_ENABLED	BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
 
+/* SNP feature bits enabled by the hypervisor */
+#define MSR_AMD64_SNP_VTOM			BIT_ULL(3)
+#define MSR_AMD64_SNP_REFLECT_VC		BIT_ULL(4)
+#define MSR_AMD64_SNP_RESTRICTED_INJ		BIT_ULL(5)
+#define MSR_AMD64_SNP_ALT_INJ			BIT_ULL(6)
+#define MSR_AMD64_SNP_DEBUG_SWAP		BIT_ULL(7)
+#define MSR_AMD64_SNP_PREVENT_HOST_IBS		BIT_ULL(8)
+#define MSR_AMD64_SNP_BTB_ISOLATION		BIT_ULL(9)
+#define MSR_AMD64_SNP_VMPL_SSS			BIT_ULL(10)
+#define MSR_AMD64_SNP_SECURE_TSC		BIT_ULL(11)
+#define MSR_AMD64_SNP_VMGEXIT_PARAM		BIT_ULL(12)
+#define MSR_AMD64_SNP_IBS_VIRT			BIT_ULL(14)
+#define MSR_AMD64_SNP_VMSA_REG_PROTECTION	BIT_ULL(16)
+#define MSR_AMD64_SNP_SMT_PROTECTION		BIT_ULL(17)
+
+/* SNP feature bits reserved for future use. */
+#define MSR_AMD64_SNP_RESERVED_BIT13		BIT_ULL(13)
+#define MSR_AMD64_SNP_RESERVED_BIT15		BIT_ULL(15)
+#define MSR_AMD64_SNP_RESERVED_MASK		GENMASK_ULL(63, 18)
+
 #define MSR_AMD64_VIRT_SPEC_CTRL	0xc001011f
 
 /* AMD Collaborative Processor Performance Control MSRs */
-- 
cgit 


From 20a554638dd2665a88d3d68a68f7981480a27f36 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Feb 2023 18:17:57 +0100
Subject: objtool: Change arch_decode_instruction() signature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation to changing struct instruction around a bit, avoid
passing it's members by pointer and instead pass the whole thing.

A cleanup in it's own right too.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build only
Tested-by: Thomas Weißschuh <linux@weissschuh.net> # compile and run
Link: https://lore.kernel.org/r/20230208172245.291087549@infradead.org
---
 tools/objtool/arch/powerpc/decode.c  |  22 ++++----
 tools/objtool/arch/x86/decode.c      | 105 +++++++++++++++++------------------
 tools/objtool/check.c                |   4 +-
 tools/objtool/include/objtool/arch.h |   4 +-
 4 files changed, 64 insertions(+), 71 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c
index 9c653805a08a..53b55690f320 100644
--- a/tools/objtool/arch/powerpc/decode.c
+++ b/tools/objtool/arch/powerpc/decode.c
@@ -41,38 +41,36 @@ const char *arch_ret_insn(int len)
 
 int arch_decode_instruction(struct objtool_file *file, const struct section *sec,
 			    unsigned long offset, unsigned int maxlen,
-			    unsigned int *len, enum insn_type *type,
-			    unsigned long *immediate,
-			    struct list_head *ops_list)
+			    struct instruction *insn)
 {
 	unsigned int opcode;
 	enum insn_type typ;
 	unsigned long imm;
-	u32 insn;
+	u32 ins;
 
-	insn = bswap_if_needed(file->elf, *(u32 *)(sec->data->d_buf + offset));
-	opcode = insn >> 26;
+	ins = bswap_if_needed(file->elf, *(u32 *)(sec->data->d_buf + offset));
+	opcode = ins >> 26;
 	typ = INSN_OTHER;
 	imm = 0;
 
 	switch (opcode) {
 	case 18: /* b[l][a] */
-		if ((insn & 3) == 1) /* bl */
+		if ((ins & 3) == 1) /* bl */
 			typ = INSN_CALL;
 
-		imm = insn & 0x3fffffc;
+		imm = ins & 0x3fffffc;
 		if (imm & 0x2000000)
 			imm -= 0x4000000;
 		break;
 	}
 
 	if (opcode == 1)
-		*len = 8;
+		insn->len = 8;
 	else
-		*len = 4;
+		insn->len = 4;
 
-	*type = typ;
-	*immediate = imm;
+	insn->type = typ;
+	insn->immediate = imm;
 
 	return 0;
 }
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index e7b030f7e2a5..c5c49277cf1a 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -146,12 +146,11 @@ static bool has_notrack_prefix(struct insn *insn)
 
 int arch_decode_instruction(struct objtool_file *file, const struct section *sec,
 			    unsigned long offset, unsigned int maxlen,
-			    unsigned int *len, enum insn_type *type,
-			    unsigned long *immediate,
-			    struct list_head *ops_list)
+			    struct instruction *insn)
 {
+	struct list_head *ops_list = &insn->stack_ops;
 	const struct elf *elf = file->elf;
-	struct insn insn;
+	struct insn ins;
 	int x86_64, ret;
 	unsigned char op1, op2, op3, prefix,
 		      rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0,
@@ -165,42 +164,42 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 	if (x86_64 == -1)
 		return -1;
 
-	ret = insn_decode(&insn, sec->data->d_buf + offset, maxlen,
+	ret = insn_decode(&ins, sec->data->d_buf + offset, maxlen,
 			  x86_64 ? INSN_MODE_64 : INSN_MODE_32);
 	if (ret < 0) {
 		WARN("can't decode instruction at %s:0x%lx", sec->name, offset);
 		return -1;
 	}
 
-	*len = insn.length;
-	*type = INSN_OTHER;
+	insn->len = ins.length;
+	insn->type = INSN_OTHER;
 
-	if (insn.vex_prefix.nbytes)
+	if (ins.vex_prefix.nbytes)
 		return 0;
 
-	prefix = insn.prefixes.bytes[0];
+	prefix = ins.prefixes.bytes[0];
 
-	op1 = insn.opcode.bytes[0];
-	op2 = insn.opcode.bytes[1];
-	op3 = insn.opcode.bytes[2];
+	op1 = ins.opcode.bytes[0];
+	op2 = ins.opcode.bytes[1];
+	op3 = ins.opcode.bytes[2];
 
-	if (insn.rex_prefix.nbytes) {
-		rex = insn.rex_prefix.bytes[0];
+	if (ins.rex_prefix.nbytes) {
+		rex = ins.rex_prefix.bytes[0];
 		rex_w = X86_REX_W(rex) >> 3;
 		rex_r = X86_REX_R(rex) >> 2;
 		rex_x = X86_REX_X(rex) >> 1;
 		rex_b = X86_REX_B(rex);
 	}
 
-	if (insn.modrm.nbytes) {
-		modrm = insn.modrm.bytes[0];
+	if (ins.modrm.nbytes) {
+		modrm = ins.modrm.bytes[0];
 		modrm_mod = X86_MODRM_MOD(modrm);
 		modrm_reg = X86_MODRM_REG(modrm) + 8*rex_r;
 		modrm_rm  = X86_MODRM_RM(modrm)  + 8*rex_b;
 	}
 
-	if (insn.sib.nbytes) {
-		sib = insn.sib.bytes[0];
+	if (ins.sib.nbytes) {
+		sib = ins.sib.bytes[0];
 		/* sib_scale = X86_SIB_SCALE(sib); */
 		sib_index = X86_SIB_INDEX(sib) + 8*rex_x;
 		sib_base  = X86_SIB_BASE(sib)  + 8*rex_b;
@@ -254,7 +253,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 		break;
 
 	case 0x70 ... 0x7f:
-		*type = INSN_JUMP_CONDITIONAL;
+		insn->type = INSN_JUMP_CONDITIONAL;
 		break;
 
 	case 0x80 ... 0x83:
@@ -278,7 +277,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 		if (!rm_is_reg(CFI_SP))
 			break;
 
-		imm = insn.immediate.value;
+		imm = ins.immediate.value;
 		if (op1 & 2) { /* sign extend */
 			if (op1 & 1) { /* imm32 */
 				imm <<= 32;
@@ -309,7 +308,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 			ADD_OP(op) {
 				op->src.type = OP_SRC_AND;
 				op->src.reg = CFI_SP;
-				op->src.offset = insn.immediate.value;
+				op->src.offset = ins.immediate.value;
 				op->dest.type = OP_DEST_REG;
 				op->dest.reg = CFI_SP;
 			}
@@ -356,7 +355,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 					op->src.reg = CFI_SP;
 					op->dest.type = OP_DEST_REG_INDIRECT;
 					op->dest.reg = modrm_rm;
-					op->dest.offset = insn.displacement.value;
+					op->dest.offset = ins.displacement.value;
 				}
 				break;
 			}
@@ -389,7 +388,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 				op->src.reg = modrm_reg;
 				op->dest.type = OP_DEST_REG_INDIRECT;
 				op->dest.reg = CFI_BP;
-				op->dest.offset = insn.displacement.value;
+				op->dest.offset = ins.displacement.value;
 			}
 			break;
 		}
@@ -402,7 +401,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 				op->src.reg = modrm_reg;
 				op->dest.type = OP_DEST_REG_INDIRECT;
 				op->dest.reg = CFI_SP;
-				op->dest.offset = insn.displacement.value;
+				op->dest.offset = ins.displacement.value;
 			}
 			break;
 		}
@@ -419,7 +418,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 			ADD_OP(op) {
 				op->src.type = OP_SRC_REG_INDIRECT;
 				op->src.reg = CFI_BP;
-				op->src.offset = insn.displacement.value;
+				op->src.offset = ins.displacement.value;
 				op->dest.type = OP_DEST_REG;
 				op->dest.reg = modrm_reg;
 			}
@@ -432,7 +431,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 			ADD_OP(op) {
 				op->src.type = OP_SRC_REG_INDIRECT;
 				op->src.reg = CFI_SP;
-				op->src.offset = insn.displacement.value;
+				op->src.offset = ins.displacement.value;
 				op->dest.type = OP_DEST_REG;
 				op->dest.reg = modrm_reg;
 			}
@@ -464,7 +463,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 
 		/* lea disp(%src), %dst */
 		ADD_OP(op) {
-			op->src.offset = insn.displacement.value;
+			op->src.offset = ins.displacement.value;
 			if (!op->src.offset) {
 				/* lea (%src), %dst */
 				op->src.type = OP_SRC_REG;
@@ -487,7 +486,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 		break;
 
 	case 0x90:
-		*type = INSN_NOP;
+		insn->type = INSN_NOP;
 		break;
 
 	case 0x9c:
@@ -511,39 +510,39 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 		if (op2 == 0x01) {
 
 			if (modrm == 0xca)
-				*type = INSN_CLAC;
+				insn->type = INSN_CLAC;
 			else if (modrm == 0xcb)
-				*type = INSN_STAC;
+				insn->type = INSN_STAC;
 
 		} else if (op2 >= 0x80 && op2 <= 0x8f) {
 
-			*type = INSN_JUMP_CONDITIONAL;
+			insn->type = INSN_JUMP_CONDITIONAL;
 
 		} else if (op2 == 0x05 || op2 == 0x07 || op2 == 0x34 ||
 			   op2 == 0x35) {
 
 			/* sysenter, sysret */
-			*type = INSN_CONTEXT_SWITCH;
+			insn->type = INSN_CONTEXT_SWITCH;
 
 		} else if (op2 == 0x0b || op2 == 0xb9) {
 
 			/* ud2 */
-			*type = INSN_BUG;
+			insn->type = INSN_BUG;
 
 		} else if (op2 == 0x0d || op2 == 0x1f) {
 
 			/* nopl/nopw */
-			*type = INSN_NOP;
+			insn->type = INSN_NOP;
 
 		} else if (op2 == 0x1e) {
 
 			if (prefix == 0xf3 && (modrm == 0xfa || modrm == 0xfb))
-				*type = INSN_ENDBR;
+				insn->type = INSN_ENDBR;
 
 
 		} else if (op2 == 0x38 && op3 == 0xf8) {
-			if (insn.prefixes.nbytes == 1 &&
-			    insn.prefixes.bytes[0] == 0xf2) {
+			if (ins.prefixes.nbytes == 1 &&
+			    ins.prefixes.bytes[0] == 0xf2) {
 				/* ENQCMD cannot be used in the kernel. */
 				WARN("ENQCMD instruction at %s:%lx", sec->name,
 				     offset);
@@ -591,29 +590,29 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 
 	case 0xcc:
 		/* int3 */
-		*type = INSN_TRAP;
+		insn->type = INSN_TRAP;
 		break;
 
 	case 0xe3:
 		/* jecxz/jrcxz */
-		*type = INSN_JUMP_CONDITIONAL;
+		insn->type = INSN_JUMP_CONDITIONAL;
 		break;
 
 	case 0xe9:
 	case 0xeb:
-		*type = INSN_JUMP_UNCONDITIONAL;
+		insn->type = INSN_JUMP_UNCONDITIONAL;
 		break;
 
 	case 0xc2:
 	case 0xc3:
-		*type = INSN_RETURN;
+		insn->type = INSN_RETURN;
 		break;
 
 	case 0xc7: /* mov imm, r/m */
 		if (!opts.noinstr)
 			break;
 
-		if (insn.length == 3+4+4 && !strncmp(sec->name, ".init.text", 10)) {
+		if (ins.length == 3+4+4 && !strncmp(sec->name, ".init.text", 10)) {
 			struct reloc *immr, *disp;
 			struct symbol *func;
 			int idx;
@@ -661,17 +660,17 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 
 	case 0xca: /* retf */
 	case 0xcb: /* retf */
-		*type = INSN_CONTEXT_SWITCH;
+		insn->type = INSN_CONTEXT_SWITCH;
 		break;
 
 	case 0xe0: /* loopne */
 	case 0xe1: /* loope */
 	case 0xe2: /* loop */
-		*type = INSN_JUMP_CONDITIONAL;
+		insn->type = INSN_JUMP_CONDITIONAL;
 		break;
 
 	case 0xe8:
-		*type = INSN_CALL;
+		insn->type = INSN_CALL;
 		/*
 		 * For the impact on the stack, a CALL behaves like
 		 * a PUSH of an immediate value (the return address).
@@ -683,30 +682,30 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 		break;
 
 	case 0xfc:
-		*type = INSN_CLD;
+		insn->type = INSN_CLD;
 		break;
 
 	case 0xfd:
-		*type = INSN_STD;
+		insn->type = INSN_STD;
 		break;
 
 	case 0xff:
 		if (modrm_reg == 2 || modrm_reg == 3) {
 
-			*type = INSN_CALL_DYNAMIC;
-			if (has_notrack_prefix(&insn))
+			insn->type = INSN_CALL_DYNAMIC;
+			if (has_notrack_prefix(&ins))
 				WARN("notrack prefix found at %s:0x%lx", sec->name, offset);
 
 		} else if (modrm_reg == 4) {
 
-			*type = INSN_JUMP_DYNAMIC;
-			if (has_notrack_prefix(&insn))
+			insn->type = INSN_JUMP_DYNAMIC;
+			if (has_notrack_prefix(&ins))
 				WARN("notrack prefix found at %s:0x%lx", sec->name, offset);
 
 		} else if (modrm_reg == 5) {
 
 			/* jmpf */
-			*type = INSN_CONTEXT_SWITCH;
+			insn->type = INSN_CONTEXT_SWITCH;
 
 		} else if (modrm_reg == 6) {
 
@@ -723,7 +722,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 		break;
 	}
 
-	*immediate = insn.immediate.nbytes ? insn.immediate.value : 0;
+	insn->immediate = ins.immediate.nbytes ? ins.immediate.value : 0;
 
 	return 0;
 }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index ba07a8ebaf73..b3b423d33cc2 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -406,9 +406,7 @@ static int decode_instructions(struct objtool_file *file)
 
 			ret = arch_decode_instruction(file, sec, offset,
 						      sec->sh.sh_size - offset,
-						      &insn->len, &insn->type,
-						      &insn->immediate,
-						      &insn->stack_ops);
+						      insn);
 			if (ret)
 				goto err;
 
diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h
index 4ecb480131c7..73149f8090fa 100644
--- a/tools/objtool/include/objtool/arch.h
+++ b/tools/objtool/include/objtool/arch.h
@@ -75,9 +75,7 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state);
 
 int arch_decode_instruction(struct objtool_file *file, const struct section *sec,
 			    unsigned long offset, unsigned int maxlen,
-			    unsigned int *len, enum insn_type *type,
-			    unsigned long *immediate,
-			    struct list_head *ops_list);
+			    struct instruction *insn);
 
 bool arch_callee_saved_reg(unsigned char reg);
 
-- 
cgit 


From 3ee88df1b063962e39d7798ccc3b18fd10cea813 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Feb 2023 18:17:58 +0100
Subject: objtool: Make instruction::stack_ops a single-linked list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 struct instruction {
 	struct list_head           list;                 /*     0    16 */
 	struct hlist_node          hash;                 /*    16    16 */
 	struct list_head           call_node;            /*    32    16 */
 	struct section *           sec;                  /*    48     8 */
 	long unsigned int          offset;               /*    56     8 */
 	/* --- cacheline 1 boundary (64 bytes) --- */
 	unsigned int               len;                  /*    64     4 */
 	enum insn_type             type;                 /*    68     4 */
 	long unsigned int          immediate;            /*    72     8 */
 	u16                        dead_end:1;           /*    80: 0  2 */
 	u16                        ignore:1;             /*    80: 1  2 */
 	u16                        ignore_alts:1;        /*    80: 2  2 */
 	u16                        hint:1;               /*    80: 3  2 */
 	u16                        save:1;               /*    80: 4  2 */
 	u16                        restore:1;            /*    80: 5  2 */
 	u16                        retpoline_safe:1;     /*    80: 6  2 */
 	u16                        noendbr:1;            /*    80: 7  2 */
 	u16                        entry:1;              /*    80: 8  2 */

 	/* XXX 7 bits hole, try to pack */

 	s8                         instr;                /*    82     1 */
 	u8                         visited;              /*    83     1 */

 	/* XXX 4 bytes hole, try to pack */

 	struct alt_group *         alt_group;            /*    88     8 */
 	struct symbol *            call_dest;            /*    96     8 */
 	struct instruction *       jump_dest;            /*   104     8 */
 	struct instruction *       first_jump_src;       /*   112     8 */
 	struct reloc *             jump_table;           /*   120     8 */
 	/* --- cacheline 2 boundary (128 bytes) --- */
 	struct reloc *             reloc;                /*   128     8 */
 	struct list_head           alts;                 /*   136    16 */
 	struct symbol *            sym;                  /*   152     8 */
-	struct list_head           stack_ops;            /*   160    16 */
-	struct cfi_state *         cfi;                  /*   176     8 */
+	struct stack_op *          stack_ops;            /*   160     8 */
+	struct cfi_state *         cfi;                  /*   168     8 */

-	/* size: 184, cachelines: 3, members: 29 */
-	/* sum members: 178, holes: 1, sum holes: 4 */
+	/* size: 176, cachelines: 3, members: 29 */
+	/* sum members: 170, holes: 1, sum holes: 4 */
 	/* sum bitfield members: 9 bits, bit holes: 1, sum bit holes: 7 bits */
-	/* last cacheline: 56 bytes */
+	/* last cacheline: 48 bytes */
 };

pre:	5:58.22 real,   226.69 user,    131.22 sys,     26221520 mem
post:	5:58.50 real,   229.64 user,    128.65 sys,     26221520 mem

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build only
Tested-by: Thomas Weißschuh <linux@weissschuh.net> # compile and run
Link: https://lore.kernel.org/r/20230208172245.362196959@infradead.org
---
 tools/objtool/arch/x86/decode.c       |  4 ++--
 tools/objtool/check.c                 | 11 +++++------
 tools/objtool/include/objtool/arch.h  |  2 +-
 tools/objtool/include/objtool/check.h |  2 +-
 4 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index c5c49277cf1a..9ef024fd648c 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -105,7 +105,7 @@ bool arch_pc_relative_reloc(struct reloc *reloc)
 #define ADD_OP(op) \
 	if (!(op = calloc(1, sizeof(*op)))) \
 		return -1; \
-	else for (list_add_tail(&op->list, ops_list); op; op = NULL)
+	else for (*ops_list = op, ops_list = &op->next; op; op = NULL)
 
 /*
  * Helpers to decode ModRM/SIB:
@@ -148,7 +148,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 			    unsigned long offset, unsigned int maxlen,
 			    struct instruction *insn)
 {
-	struct list_head *ops_list = &insn->stack_ops;
+	struct stack_op **ops_list = &insn->stack_ops;
 	const struct elf *elf = file->elf;
 	struct insn ins;
 	int x86_64, ret;
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index b3b423d33cc2..8109d7405297 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -398,7 +398,6 @@ static int decode_instructions(struct objtool_file *file)
 			}
 			memset(insn, 0, sizeof(*insn));
 			INIT_LIST_HEAD(&insn->alts);
-			INIT_LIST_HEAD(&insn->stack_ops);
 			INIT_LIST_HEAD(&insn->call_node);
 
 			insn->sec = sec;
@@ -1331,12 +1330,13 @@ static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *i
 
 static void remove_insn_ops(struct instruction *insn)
 {
-	struct stack_op *op, *tmp;
+	struct stack_op *op, *next;
 
-	list_for_each_entry_safe(op, tmp, &insn->stack_ops, list) {
-		list_del(&op->list);
+	for (op = insn->stack_ops; op; op = next) {
+		next = op->next;
 		free(op);
 	}
+	insn->stack_ops = NULL;
 }
 
 static void annotate_call_site(struct objtool_file *file,
@@ -1781,7 +1781,6 @@ static int handle_group_alt(struct objtool_file *file,
 		}
 		memset(nop, 0, sizeof(*nop));
 		INIT_LIST_HEAD(&nop->alts);
-		INIT_LIST_HEAD(&nop->stack_ops);
 
 		nop->sec = special_alt->new_sec;
 		nop->offset = special_alt->new_off + special_alt->new_len;
@@ -3226,7 +3225,7 @@ static int handle_insn_ops(struct instruction *insn,
 {
 	struct stack_op *op;
 
-	list_for_each_entry(op, &insn->stack_ops, list) {
+	for (op = insn->stack_ops; op; op = op->next) {
 
 		if (update_cfi_state(insn, next_insn, &state->cfi, op))
 			return 1;
diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h
index 73149f8090fa..2b6d2ce4f9a5 100644
--- a/tools/objtool/include/objtool/arch.h
+++ b/tools/objtool/include/objtool/arch.h
@@ -62,9 +62,9 @@ struct op_src {
 };
 
 struct stack_op {
+	struct stack_op *next;
 	struct op_dest dest;
 	struct op_src src;
-	struct list_head list;
 };
 
 struct instruction;
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index acd7fae59348..23e981999365 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -68,7 +68,7 @@ struct instruction {
 	struct reloc *reloc;
 	struct list_head alts;
 	struct symbol *sym;
-	struct list_head stack_ops;
+	struct stack_op *stack_ops;
 	struct cfi_state *cfi;
 };
 
-- 
cgit 


From d54066546121426ecd7ad01a53ae429c4e37a9d5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Feb 2023 18:17:59 +0100
Subject: objtool: Make instruction::alts a single-linked list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 struct instruction {
 	struct list_head           list;                 /*     0    16 */
 	struct hlist_node          hash;                 /*    16    16 */
 	struct list_head           call_node;            /*    32    16 */
 	struct section *           sec;                  /*    48     8 */
 	long unsigned int          offset;               /*    56     8 */
 	/* --- cacheline 1 boundary (64 bytes) --- */
 	unsigned int               len;                  /*    64     4 */
 	enum insn_type             type;                 /*    68     4 */
 	long unsigned int          immediate;            /*    72     8 */
 	u16                        dead_end:1;           /*    80: 0  2 */
 	u16                        ignore:1;             /*    80: 1  2 */
 	u16                        ignore_alts:1;        /*    80: 2  2 */
 	u16                        hint:1;               /*    80: 3  2 */
 	u16                        save:1;               /*    80: 4  2 */
 	u16                        restore:1;            /*    80: 5  2 */
 	u16                        retpoline_safe:1;     /*    80: 6  2 */
 	u16                        noendbr:1;            /*    80: 7  2 */
 	u16                        entry:1;              /*    80: 8  2 */

 	/* XXX 7 bits hole, try to pack */

 	s8                         instr;                /*    82     1 */
 	u8                         visited;              /*    83     1 */

 	/* XXX 4 bytes hole, try to pack */

 	struct alt_group *         alt_group;            /*    88     8 */
 	struct symbol *            call_dest;            /*    96     8 */
 	struct instruction *       jump_dest;            /*   104     8 */
 	struct instruction *       first_jump_src;       /*   112     8 */
 	struct reloc *             jump_table;           /*   120     8 */
 	/* --- cacheline 2 boundary (128 bytes) --- */
 	struct reloc *             reloc;                /*   128     8 */
-	struct list_head           alts;                 /*   136    16 */
-	struct symbol *            sym;                  /*   152     8 */
-	struct stack_op *          stack_ops;            /*   160     8 */
-	struct cfi_state *         cfi;                  /*   168     8 */
+	struct alternative *       alts;                 /*   136     8 */
+	struct symbol *            sym;                  /*   144     8 */
+	struct stack_op *          stack_ops;            /*   152     8 */
+	struct cfi_state *         cfi;                  /*   160     8 */

-	/* size: 176, cachelines: 3, members: 29 */
-	/* sum members: 170, holes: 1, sum holes: 4 */
+	/* size: 168, cachelines: 3, members: 29 */
+	/* sum members: 162, holes: 1, sum holes: 4 */
 	/* sum bitfield members: 9 bits, bit holes: 1, sum bit holes: 7 bits */
-	/* last cacheline: 48 bytes */
+	/* last cacheline: 40 bytes */
 };

pre:	5:58.50 real,   229.64 user,    128.65 sys,     26221520 mem
post:	5:48.86 real,   220.30 user,    128.34 sys,     24834672 mem

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build only
Tested-by: Thomas Weißschuh <linux@weissschuh.net> # compile and run
Link: https://lore.kernel.org/r/20230208172245.430556498@infradead.org
---
 tools/objtool/check.c                 | 18 +++++++++---------
 tools/objtool/include/objtool/check.h |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 8109d7405297..9f83e85e2093 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -23,7 +23,7 @@
 #include <linux/static_call_types.h>
 
 struct alternative {
-	struct list_head list;
+	struct alternative *next;
 	struct instruction *insn;
 	bool skip_orig;
 };
@@ -397,7 +397,6 @@ static int decode_instructions(struct objtool_file *file)
 				return -1;
 			}
 			memset(insn, 0, sizeof(*insn));
-			INIT_LIST_HEAD(&insn->alts);
 			INIT_LIST_HEAD(&insn->call_node);
 
 			insn->sec = sec;
@@ -1780,7 +1779,6 @@ static int handle_group_alt(struct objtool_file *file,
 			return -1;
 		}
 		memset(nop, 0, sizeof(*nop));
-		INIT_LIST_HEAD(&nop->alts);
 
 		nop->sec = special_alt->new_sec;
 		nop->offset = special_alt->new_off + special_alt->new_len;
@@ -1978,7 +1976,8 @@ static int add_special_section_alts(struct objtool_file *file)
 		alt->insn = new_insn;
 		alt->skip_orig = special_alt->skip_orig;
 		orig_insn->ignore_alts |= special_alt->skip_alt;
-		list_add_tail(&alt->list, &orig_insn->alts);
+		alt->next = orig_insn->alts;
+		orig_insn->alts = alt;
 
 		list_del(&special_alt->list);
 		free(special_alt);
@@ -2037,7 +2036,8 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn,
 		}
 
 		alt->insn = dest_insn;
-		list_add_tail(&alt->list, &insn->alts);
+		alt->next = insn->alts;
+		insn->alts = alt;
 		prev_offset = reloc->offset;
 	}
 
@@ -3594,10 +3594,10 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
 		if (propagate_alt_cfi(file, insn))
 			return 1;
 
-		if (!insn->ignore_alts && !list_empty(&insn->alts)) {
+		if (!insn->ignore_alts && insn->alts) {
 			bool skip_orig = false;
 
-			list_for_each_entry(alt, &insn->alts, list) {
+			for (alt = insn->alts; alt; alt = alt->next) {
 				if (alt->skip_orig)
 					skip_orig = true;
 
@@ -3796,11 +3796,11 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
 
 		insn->visited |= VISITED_ENTRY;
 
-		if (!insn->ignore_alts && !list_empty(&insn->alts)) {
+		if (!insn->ignore_alts && insn->alts) {
 			struct alternative *alt;
 			bool skip_orig = false;
 
-			list_for_each_entry(alt, &insn->alts, list) {
+			for (alt = insn->alts; alt; alt = alt->next) {
 				if (alt->skip_orig)
 					skip_orig = true;
 
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index 23e981999365..7966f60f858b 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -66,7 +66,7 @@ struct instruction {
 	struct instruction *first_jump_src;
 	struct reloc *jump_table;
 	struct reloc *reloc;
-	struct list_head alts;
+	struct alternative *alts;
 	struct symbol *sym;
 	struct stack_op *stack_ops;
 	struct cfi_state *cfi;
-- 
cgit 


From 8b2de412158ecdb312c707918432e6650df808cc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Feb 2023 18:18:00 +0100
Subject: objtool: Shrink instruction::{type,visited}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since we don't have that many types in enum insn_type, force it into a
u8 and re-arrange member to get rid of the holes, saves another 8
bytes.

 struct instruction {
 	struct list_head           list;                 /*     0    16 */
 	struct hlist_node          hash;                 /*    16    16 */
 	struct list_head           call_node;            /*    32    16 */
 	struct section *           sec;                  /*    48     8 */
 	long unsigned int          offset;               /*    56     8 */
 	/* --- cacheline 1 boundary (64 bytes) --- */
-	unsigned int               len;                  /*    64     4 */
-	enum insn_type             type;                 /*    68     4 */
-	long unsigned int          immediate;            /*    72     8 */
-	u16                        dead_end:1;           /*    80: 0  2 */
-	u16                        ignore:1;             /*    80: 1  2 */
-	u16                        ignore_alts:1;        /*    80: 2  2 */
-	u16                        hint:1;               /*    80: 3  2 */
-	u16                        save:1;               /*    80: 4  2 */
-	u16                        restore:1;            /*    80: 5  2 */
-	u16                        retpoline_safe:1;     /*    80: 6  2 */
-	u16                        noendbr:1;            /*    80: 7  2 */
-	u16                        entry:1;              /*    80: 8  2 */
+	long unsigned int          immediate;            /*    64     8 */
+	unsigned int               len;                  /*    72     4 */
+	u8                         type;                 /*    76     1 */

-	/* XXX 7 bits hole, try to pack */
+	/* Bitfield combined with previous fields */

-	s8                         instr;                /*    82     1 */
-	u8                         visited;              /*    83     1 */
+	u16                        dead_end:1;           /*    76: 8  2 */
+	u16                        ignore:1;             /*    76: 9  2 */
+	u16                        ignore_alts:1;        /*    76:10  2 */
+	u16                        hint:1;               /*    76:11  2 */
+	u16                        save:1;               /*    76:12  2 */
+	u16                        restore:1;            /*    76:13  2 */
+	u16                        retpoline_safe:1;     /*    76:14  2 */
+	u16                        noendbr:1;            /*    76:15  2 */
+	u16                        entry:1;              /*    78: 0  2 */
+	u16                        visited:4;            /*    78: 1  2 */

-	/* XXX 4 bytes hole, try to pack */
+	/* XXX 3 bits hole, try to pack */
+	/* Bitfield combined with next fields */

-	struct alt_group *         alt_group;            /*    88     8 */
-	struct symbol *            call_dest;            /*    96     8 */
-	struct instruction *       jump_dest;            /*   104     8 */
-	struct instruction *       first_jump_src;       /*   112     8 */
-	struct reloc *             jump_table;           /*   120     8 */
+	s8                         instr;                /*    79     1 */
+	struct alt_group *         alt_group;            /*    80     8 */
+	struct symbol *            call_dest;            /*    88     8 */
+	struct instruction *       jump_dest;            /*    96     8 */
+	struct instruction *       first_jump_src;       /*   104     8 */
+	struct reloc *             jump_table;           /*   112     8 */
+	struct reloc *             reloc;                /*   120     8 */
 	/* --- cacheline 2 boundary (128 bytes) --- */
-	struct reloc *             reloc;                /*   128     8 */
-	struct alternative *       alts;                 /*   136     8 */
-	struct symbol *            sym;                  /*   144     8 */
-	struct stack_op *          stack_ops;            /*   152     8 */
-	struct cfi_state *         cfi;                  /*   160     8 */
+	struct alternative *       alts;                 /*   128     8 */
+	struct symbol *            sym;                  /*   136     8 */
+	struct stack_op *          stack_ops;            /*   144     8 */
+	struct cfi_state *         cfi;                  /*   152     8 */

-	/* size: 168, cachelines: 3, members: 29 */
-	/* sum members: 162, holes: 1, sum holes: 4 */
-	/* sum bitfield members: 9 bits, bit holes: 1, sum bit holes: 7 bits */
-	/* last cacheline: 40 bytes */
+	/* size: 160, cachelines: 3, members: 29 */
+	/* sum members: 158 */
+	/* sum bitfield members: 13 bits, bit holes: 1, sum bit holes: 3 bits */
+	/* last cacheline: 32 bytes */
 };

pre:	5:48.86 real,   220.30 user,    128.34 sys,     24834672 mem
post:	5:48.89 real,   220.96 user,    127.55 sys,     24834672 mem

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build only
Tested-by: Thomas Weißschuh <linux@weissschuh.net> # compile and run
Link: https://lore.kernel.org/r/20230208172245.501847188@infradead.org
---
 tools/objtool/include/objtool/check.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index 7966f60f858b..a497ee7672fb 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -42,9 +42,9 @@ struct instruction {
 	struct list_head call_node;
 	struct section *sec;
 	unsigned long offset;
-	unsigned int len;
-	enum insn_type type;
 	unsigned long immediate;
+	unsigned int len;
+	u8 type;
 
 	u16 dead_end		: 1,
 	   ignore		: 1,
@@ -54,11 +54,11 @@ struct instruction {
 	   restore		: 1,
 	   retpoline_safe	: 1,
 	   noendbr		: 1,
-	   entry		: 1;
-		/* 7 bit hole */
+	   entry		: 1,
+	   visited		: 4;
+		/* 3 bit hole */
 
 	s8 instr;
-	u8 visited;
 
 	struct alt_group *alt_group;
 	struct symbol *call_dest;
-- 
cgit 


From 0932dbe1f5680481e612cafe0c7d0f1796f68612 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Feb 2023 18:18:01 +0100
Subject: objtool: Remove instruction::reloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of caching the reloc for each instruction, only keep a
negative cache of not having a reloc (by far the most common case).

 struct instruction {
 	struct list_head           list;                 /*     0    16 */
 	struct hlist_node          hash;                 /*    16    16 */
 	struct list_head           call_node;            /*    32    16 */
 	struct section *           sec;                  /*    48     8 */
 	long unsigned int          offset;               /*    56     8 */
 	/* --- cacheline 1 boundary (64 bytes) --- */
 	long unsigned int          immediate;            /*    64     8 */
 	unsigned int               len;                  /*    72     4 */
 	u8                         type;                 /*    76     1 */

 	/* Bitfield combined with previous fields */

 	u16                        dead_end:1;           /*    76: 8  2 */
 	u16                        ignore:1;             /*    76: 9  2 */
 	u16                        ignore_alts:1;        /*    76:10  2 */
 	u16                        hint:1;               /*    76:11  2 */
 	u16                        save:1;               /*    76:12  2 */
 	u16                        restore:1;            /*    76:13  2 */
 	u16                        retpoline_safe:1;     /*    76:14  2 */
 	u16                        noendbr:1;            /*    76:15  2 */
 	u16                        entry:1;              /*    78: 0  2 */
 	u16                        visited:4;            /*    78: 1  2 */
+	u16                        no_reloc:1;           /*    78: 5  2 */

-	/* XXX 3 bits hole, try to pack */
+	/* XXX 2 bits hole, try to pack */
 	/* Bitfield combined with next fields */

 	s8                         instr;                /*    79     1 */
 	struct alt_group *         alt_group;            /*    80     8 */
 	struct symbol *            call_dest;            /*    88     8 */
 	struct instruction *       jump_dest;            /*    96     8 */
 	struct instruction *       first_jump_src;       /*   104     8 */
 	struct reloc *             jump_table;           /*   112     8 */
-	struct reloc *             reloc;                /*   120     8 */
+	struct alternative *       alts;                 /*   120     8 */
 	/* --- cacheline 2 boundary (128 bytes) --- */
-	struct alternative *       alts;                 /*   128     8 */
-	struct symbol *            sym;                  /*   136     8 */
-	struct stack_op *          stack_ops;            /*   144     8 */
-	struct cfi_state *         cfi;                  /*   152     8 */
+	struct symbol *            sym;                  /*   128     8 */
+	struct stack_op *          stack_ops;            /*   136     8 */
+	struct cfi_state *         cfi;                  /*   144     8 */

-	/* size: 160, cachelines: 3, members: 29 */
-	/* sum members: 158 */
-	/* sum bitfield members: 13 bits, bit holes: 1, sum bit holes: 3 bits */
-	/* last cacheline: 32 bytes */
+	/* size: 152, cachelines: 3, members: 29 */
+	/* sum members: 150 */
+	/* sum bitfield members: 14 bits, bit holes: 1, sum bit holes: 2 bits */
+	/* last cacheline: 24 bytes */
 };

pre:	5:48.89 real,   220.96 user,    127.55 sys,     24834672 mem
post:	5:39.35 real,   215.58 user,    123.69 sys,     23448736 mem

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build only
Tested-by: Thomas Weißschuh <linux@weissschuh.net> # compile and run
Link: https://lore.kernel.org/r/20230208172245.572145269@infradead.org
---
 tools/objtool/check.c                 | 24 +++++++++++-------------
 tools/objtool/include/objtool/check.h |  6 +++---
 2 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 9f83e85e2093..6d0ce2395554 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1305,26 +1305,24 @@ __weak bool arch_is_rethunk(struct symbol *sym)
 	return false;
 }
 
-#define NEGATIVE_RELOC	((void *)-1L)
-
 static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *insn)
 {
-	if (insn->reloc == NEGATIVE_RELOC)
+	struct reloc *reloc;
+
+	if (insn->no_reloc)
 		return NULL;
 
-	if (!insn->reloc) {
-		if (!file)
-			return NULL;
+	if (!file)
+		return NULL;
 
-		insn->reloc = find_reloc_by_dest_range(file->elf, insn->sec,
-						       insn->offset, insn->len);
-		if (!insn->reloc) {
-			insn->reloc = NEGATIVE_RELOC;
-			return NULL;
-		}
+	reloc = find_reloc_by_dest_range(file->elf, insn->sec,
+					 insn->offset, insn->len);
+	if (!reloc) {
+		insn->no_reloc = 1;
+		return NULL;
 	}
 
-	return insn->reloc;
+	return reloc;
 }
 
 static void remove_insn_ops(struct instruction *insn)
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index a497ee7672fb..fffc8b86f9f3 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -55,8 +55,9 @@ struct instruction {
 	   retpoline_safe	: 1,
 	   noendbr		: 1,
 	   entry		: 1,
-	   visited		: 4;
-		/* 3 bit hole */
+	   visited		: 4,
+	   no_reloc		: 1;
+		/* 2 bit hole */
 
 	s8 instr;
 
@@ -65,7 +66,6 @@ struct instruction {
 	struct instruction *jump_dest;
 	struct instruction *first_jump_src;
 	struct reloc *jump_table;
-	struct reloc *reloc;
 	struct alternative *alts;
 	struct symbol *sym;
 	struct stack_op *stack_ops;
-- 
cgit 


From c6f5dc28fb3d736fa8d7f7d31e0664a9c772c299 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Feb 2023 18:18:02 +0100
Subject: objtool: Union instruction::{call_dest,jump_table}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The instruction call_dest and jump_table members can never be used at
the same time, their usage depends on type.

 struct instruction {
 	struct list_head           list;                 /*     0    16 */
 	struct hlist_node          hash;                 /*    16    16 */
 	struct list_head           call_node;            /*    32    16 */
 	struct section *           sec;                  /*    48     8 */
 	long unsigned int          offset;               /*    56     8 */
 	/* --- cacheline 1 boundary (64 bytes) --- */
 	long unsigned int          immediate;            /*    64     8 */
 	unsigned int               len;                  /*    72     4 */
 	u8                         type;                 /*    76     1 */

 	/* Bitfield combined with previous fields */

 	u16                        dead_end:1;           /*    76: 8  2 */
 	u16                        ignore:1;             /*    76: 9  2 */
 	u16                        ignore_alts:1;        /*    76:10  2 */
 	u16                        hint:1;               /*    76:11  2 */
 	u16                        save:1;               /*    76:12  2 */
 	u16                        restore:1;            /*    76:13  2 */
 	u16                        retpoline_safe:1;     /*    76:14  2 */
 	u16                        noendbr:1;            /*    76:15  2 */
 	u16                        entry:1;              /*    78: 0  2 */
 	u16                        visited:4;            /*    78: 1  2 */
 	u16                        no_reloc:1;           /*    78: 5  2 */

 	/* XXX 2 bits hole, try to pack */
 	/* Bitfield combined with next fields */

 	s8                         instr;                /*    79     1 */
 	struct alt_group *         alt_group;            /*    80     8 */
-	struct symbol *            call_dest;            /*    88     8 */
-	struct instruction *       jump_dest;            /*    96     8 */
-	struct instruction *       first_jump_src;       /*   104     8 */
-	struct reloc *             jump_table;           /*   112     8 */
-	struct alternative *       alts;                 /*   120     8 */
+	struct instruction *       jump_dest;            /*    88     8 */
+	struct instruction *       first_jump_src;       /*    96     8 */
+	union {
+		struct symbol *    _call_dest;           /*   104     8 */
+		struct reloc *     _jump_table;          /*   104     8 */
+	};                                               /*   104     8 */
+	struct alternative *       alts;                 /*   112     8 */
+	struct symbol *            sym;                  /*   120     8 */
 	/* --- cacheline 2 boundary (128 bytes) --- */
-	struct symbol *            sym;                  /*   128     8 */
-	struct stack_op *          stack_ops;            /*   136     8 */
-	struct cfi_state *         cfi;                  /*   144     8 */
+	struct stack_op *          stack_ops;            /*   128     8 */
+	struct cfi_state *         cfi;                  /*   136     8 */

-	/* size: 152, cachelines: 3, members: 29 */
-	/* sum members: 150 */
+	/* size: 144, cachelines: 3, members: 28 */
+	/* sum members: 142 */
 	/* sum bitfield members: 14 bits, bit holes: 1, sum bit holes: 2 bits */
-	/* last cacheline: 24 bytes */
+	/* last cacheline: 16 bytes */
 };

pre:	5:39.35 real,   215.58 user,    123.69 sys,     23448736 mem
post:	5:38.18 real,   213.25 user,    124.90 sys,     23449040 mem

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build only
Tested-by: Thomas Weißschuh <linux@weissschuh.net> # compile and run
Link: https://lore.kernel.org/r/20230208172245.640914454@infradead.org
---
 tools/objtool/check.c                 | 73 ++++++++++++++++++++++-------------
 tools/objtool/include/objtool/check.h |  6 ++-
 2 files changed, 50 insertions(+), 29 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 6d0ce2395554..6f0adb2f76c6 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -114,16 +114,34 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file,
 	for (insn = next_insn_same_sec(file, insn); insn;		\
 	     insn = next_insn_same_sec(file, insn))
 
+static inline struct symbol *insn_call_dest(struct instruction *insn)
+{
+	if (insn->type == INSN_JUMP_DYNAMIC ||
+	    insn->type == INSN_CALL_DYNAMIC)
+		return NULL;
+
+	return insn->_call_dest;
+}
+
+static inline struct reloc *insn_jump_table(struct instruction *insn)
+{
+	if (insn->type == INSN_JUMP_DYNAMIC ||
+	    insn->type == INSN_CALL_DYNAMIC)
+		return insn->_jump_table;
+
+	return NULL;
+}
+
 static bool is_jump_table_jump(struct instruction *insn)
 {
 	struct alt_group *alt_group = insn->alt_group;
 
-	if (insn->jump_table)
+	if (insn_jump_table(insn))
 		return true;
 
 	/* Retpoline alternative for a jump table? */
 	return alt_group && alt_group->orig_group &&
-	       alt_group->orig_group->first_insn->jump_table;
+	       insn_jump_table(alt_group->orig_group->first_insn);
 }
 
 static bool is_sibling_call(struct instruction *insn)
@@ -137,8 +155,8 @@ static bool is_sibling_call(struct instruction *insn)
 			return !is_jump_table_jump(insn);
 	}
 
-	/* add_jump_destinations() sets insn->call_dest for sibling calls. */
-	return (is_static_jump(insn) && insn->call_dest);
+	/* add_jump_destinations() sets insn_call_dest(insn) for sibling calls. */
+	return (is_static_jump(insn) && insn_call_dest(insn));
 }
 
 /*
@@ -274,8 +292,8 @@ static void init_insn_state(struct objtool_file *file, struct insn_state *state,
 
 	/*
 	 * We need the full vmlinux for noinstr validation, otherwise we can
-	 * not correctly determine insn->call_dest->sec (external symbols do
-	 * not have a section).
+	 * not correctly determine insn_call_dest(insn)->sec (external symbols
+	 * do not have a section).
 	 */
 	if (opts.link && opts.noinstr && sec)
 		state->noinstr = sec->noinstr;
@@ -678,7 +696,7 @@ static int create_static_call_sections(struct objtool_file *file)
 			return -1;
 
 		/* find key symbol */
-		key_name = strdup(insn->call_dest->name);
+		key_name = strdup(insn_call_dest(insn)->name);
 		if (!key_name) {
 			perror("strdup");
 			return -1;
@@ -709,7 +727,7 @@ static int create_static_call_sections(struct objtool_file *file)
 			 * trampoline address.  This is fixed up in
 			 * static_call_add_module().
 			 */
-			key_sym = insn->call_dest;
+			key_sym = insn_call_dest(insn);
 		}
 		free(key_name);
 
@@ -1340,7 +1358,7 @@ static void annotate_call_site(struct objtool_file *file,
 			       struct instruction *insn, bool sibling)
 {
 	struct reloc *reloc = insn_reloc(file, insn);
-	struct symbol *sym = insn->call_dest;
+	struct symbol *sym = insn_call_dest(insn);
 
 	if (!sym)
 		sym = reloc->sym;
@@ -1425,7 +1443,7 @@ static void annotate_call_site(struct objtool_file *file,
 static void add_call_dest(struct objtool_file *file, struct instruction *insn,
 			  struct symbol *dest, bool sibling)
 {
-	insn->call_dest = dest;
+	insn->_call_dest = dest;
 	if (!dest)
 		return;
 
@@ -1683,12 +1701,12 @@ static int add_call_destinations(struct objtool_file *file)
 			if (insn->ignore)
 				continue;
 
-			if (!insn->call_dest) {
+			if (!insn_call_dest(insn)) {
 				WARN_FUNC("unannotated intra-function call", insn->sec, insn->offset);
 				return -1;
 			}
 
-			if (insn_func(insn) && insn->call_dest->type != STT_FUNC) {
+			if (insn_func(insn) && insn_call_dest(insn)->type != STT_FUNC) {
 				WARN_FUNC("unsupported call to non-function",
 					  insn->sec, insn->offset);
 				return -1;
@@ -2125,7 +2143,7 @@ static void mark_func_jump_tables(struct objtool_file *file,
 		reloc = find_jump_table(file, func, insn);
 		if (reloc) {
 			reloc->jump_table_start = true;
-			insn->jump_table = reloc;
+			insn->_jump_table = reloc;
 		}
 	}
 }
@@ -2137,10 +2155,10 @@ static int add_func_jump_tables(struct objtool_file *file,
 	int ret;
 
 	func_for_each_insn(file, func, insn) {
-		if (!insn->jump_table)
+		if (!insn_jump_table(insn))
 			continue;
 
-		ret = add_jump_table(file, insn, insn->jump_table);
+		ret = add_jump_table(file, insn, insn_jump_table(insn));
 		if (ret)
 			return ret;
 	}
@@ -2612,8 +2630,8 @@ static int decode_sections(struct objtool_file *file)
 static bool is_fentry_call(struct instruction *insn)
 {
 	if (insn->type == INSN_CALL &&
-	    insn->call_dest &&
-	    insn->call_dest->fentry)
+	    insn_call_dest(insn) &&
+	    insn_call_dest(insn)->fentry)
 		return true;
 
 	return false;
@@ -3320,8 +3338,8 @@ static inline const char *call_dest_name(struct instruction *insn)
 	struct reloc *rel;
 	int idx;
 
-	if (insn->call_dest)
-		return insn->call_dest->name;
+	if (insn_call_dest(insn))
+		return insn_call_dest(insn)->name;
 
 	rel = insn_reloc(NULL, insn);
 	if (rel && !strcmp(rel->sym->name, "pv_ops")) {
@@ -3403,13 +3421,13 @@ static int validate_call(struct objtool_file *file,
 			 struct insn_state *state)
 {
 	if (state->noinstr && state->instr <= 0 &&
-	    !noinstr_call_dest(file, insn, insn->call_dest)) {
+	    !noinstr_call_dest(file, insn, insn_call_dest(insn))) {
 		WARN_FUNC("call to %s() leaves .noinstr.text section",
 				insn->sec, insn->offset, call_dest_name(insn));
 		return 1;
 	}
 
-	if (state->uaccess && !func_uaccess_safe(insn->call_dest)) {
+	if (state->uaccess && !func_uaccess_safe(insn_call_dest(insn))) {
 		WARN_FUNC("call to %s() with UACCESS enabled",
 				insn->sec, insn->offset, call_dest_name(insn));
 		return 1;
@@ -3847,11 +3865,11 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
 
 			/* fallthrough */
 		case INSN_CALL:
-			dest = find_insn(file, insn->call_dest->sec,
-					 insn->call_dest->offset);
+			dest = find_insn(file, insn_call_dest(insn)->sec,
+					 insn_call_dest(insn)->offset);
 			if (!dest) {
 				WARN("Unresolved function after linking!?: %s",
-				     insn->call_dest->name);
+				     insn_call_dest(insn)->name);
 				return -1;
 			}
 
@@ -3952,13 +3970,13 @@ static int validate_retpoline(struct objtool_file *file)
 static bool is_kasan_insn(struct instruction *insn)
 {
 	return (insn->type == INSN_CALL &&
-		!strcmp(insn->call_dest->name, "__asan_handle_no_return"));
+		!strcmp(insn_call_dest(insn)->name, "__asan_handle_no_return"));
 }
 
 static bool is_ubsan_insn(struct instruction *insn)
 {
 	return (insn->type == INSN_CALL &&
-		!strcmp(insn->call_dest->name,
+		!strcmp(insn_call_dest(insn)->name,
 			"__ubsan_handle_builtin_unreachable"));
 }
 
@@ -4036,7 +4054,8 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio
 	 * It may also insert a UD2 after calling a __noreturn function.
 	 */
 	prev_insn = list_prev_entry(insn, list);
-	if ((prev_insn->dead_end || dead_end_function(file, prev_insn->call_dest)) &&
+	if ((prev_insn->dead_end ||
+	     dead_end_function(file, insn_call_dest(prev_insn))) &&
 	    (insn->type == INSN_BUG ||
 	     (insn->type == INSN_JUMP_UNCONDITIONAL &&
 	      insn->jump_dest && insn->jump_dest->type == INSN_BUG)))
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index fffc8b86f9f3..ab6deaed9777 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -62,10 +62,12 @@ struct instruction {
 	s8 instr;
 
 	struct alt_group *alt_group;
-	struct symbol *call_dest;
 	struct instruction *jump_dest;
 	struct instruction *first_jump_src;
-	struct reloc *jump_table;
+	union {
+		struct symbol *_call_dest;
+		struct reloc *_jump_table;
+	};
 	struct alternative *alts;
 	struct symbol *sym;
 	struct stack_op *stack_ops;
-- 
cgit 


From a706bb08c81ac878982e41d4b6abcc42258bd39e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Feb 2023 18:18:03 +0100
Subject: objtool: Fix overlapping alternatives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Things like ALTERNATIVE_{2,3}() generate multiple alternatives on the
same place, objtool would override the first orig_alt_group with the
second (or third), failing to check the CFI among all the different
variants.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build only
Tested-by: Thomas Weißschuh <linux@weissschuh.net> # compile and run
Link: https://lore.kernel.org/r/20230208172245.711471461@infradead.org
---
 tools/objtool/check.c | 69 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 26 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 6f0adb2f76c6..7e9d3d3eed65 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1744,36 +1744,49 @@ static int handle_group_alt(struct objtool_file *file,
 			    struct instruction *orig_insn,
 			    struct instruction **new_insn)
 {
-	struct instruction *last_orig_insn, *last_new_insn = NULL, *insn, *nop = NULL;
+	struct instruction *last_new_insn = NULL, *insn, *nop = NULL;
 	struct alt_group *orig_alt_group, *new_alt_group;
 	unsigned long dest_off;
 
-
-	orig_alt_group = malloc(sizeof(*orig_alt_group));
+	orig_alt_group = orig_insn->alt_group;
 	if (!orig_alt_group) {
-		WARN("malloc failed");
-		return -1;
-	}
-	orig_alt_group->cfi = calloc(special_alt->orig_len,
-				     sizeof(struct cfi_state *));
-	if (!orig_alt_group->cfi) {
-		WARN("calloc failed");
-		return -1;
-	}
+		struct instruction *last_orig_insn = NULL;
 
-	last_orig_insn = NULL;
-	insn = orig_insn;
-	sec_for_each_insn_from(file, insn) {
-		if (insn->offset >= special_alt->orig_off + special_alt->orig_len)
-			break;
+		orig_alt_group = malloc(sizeof(*orig_alt_group));
+		if (!orig_alt_group) {
+			WARN("malloc failed");
+			return -1;
+		}
+		orig_alt_group->cfi = calloc(special_alt->orig_len,
+					     sizeof(struct cfi_state *));
+		if (!orig_alt_group->cfi) {
+			WARN("calloc failed");
+			return -1;
+		}
 
-		insn->alt_group = orig_alt_group;
-		last_orig_insn = insn;
-	}
-	orig_alt_group->orig_group = NULL;
-	orig_alt_group->first_insn = orig_insn;
-	orig_alt_group->last_insn = last_orig_insn;
+		insn = orig_insn;
+		sec_for_each_insn_from(file, insn) {
+			if (insn->offset >= special_alt->orig_off + special_alt->orig_len)
+				break;
 
+			insn->alt_group = orig_alt_group;
+			last_orig_insn = insn;
+		}
+		orig_alt_group->orig_group = NULL;
+		orig_alt_group->first_insn = orig_insn;
+		orig_alt_group->last_insn = last_orig_insn;
+	} else {
+		if (orig_alt_group->last_insn->offset + orig_alt_group->last_insn->len -
+		    orig_alt_group->first_insn->offset != special_alt->orig_len) {
+			WARN_FUNC("weirdly overlapping alternative! %ld != %d",
+				  orig_insn->sec, orig_insn->offset,
+				  orig_alt_group->last_insn->offset +
+				  orig_alt_group->last_insn->len -
+				  orig_alt_group->first_insn->offset,
+				  special_alt->orig_len);
+			return -1;
+		}
+	}
 
 	new_alt_group = malloc(sizeof(*new_alt_group));
 	if (!new_alt_group) {
@@ -1848,7 +1861,7 @@ static int handle_group_alt(struct objtool_file *file,
 
 		dest_off = arch_jump_destination(insn);
 		if (dest_off == special_alt->new_off + special_alt->new_len) {
-			insn->jump_dest = next_insn_same_sec(file, last_orig_insn);
+			insn->jump_dest = next_insn_same_sec(file, orig_alt_group->last_insn);
 			if (!insn->jump_dest) {
 				WARN_FUNC("can't find alternative jump destination",
 					  insn->sec, insn->offset);
@@ -3226,8 +3239,12 @@ static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn
 		alt_cfi[group_off] = insn->cfi;
 	} else {
 		if (cficmp(alt_cfi[group_off], insn->cfi)) {
-			WARN_FUNC("stack layout conflict in alternatives",
-				  insn->sec, insn->offset);
+			struct alt_group *orig_group = insn->alt_group->orig_group ?: insn->alt_group;
+			struct instruction *orig = orig_group->first_insn;
+			char *where = offstr(insn->sec, insn->offset);
+			WARN_FUNC("stack layout conflict in alternatives: %s",
+				  orig->sec, orig->offset, where);
+			free(where);
 			return -1;
 		}
 	}
-- 
cgit 


From 1c34496e5856886d565665fb64029ecdeb080ffb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Feb 2023 18:18:05 +0100
Subject: objtool: Remove instruction::list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the instruction::list by allocating instructions in arrays of
256 entries and stringing them together by (amortized) find_insn().
This shrinks instruction by 16 bytes and brings it down to 128.

 struct instruction {
-	struct list_head           list;                 /*     0    16 */
-	struct hlist_node          hash;                 /*    16    16 */
-	struct list_head           call_node;            /*    32    16 */
-	struct section *           sec;                  /*    48     8 */
-	long unsigned int          offset;               /*    56     8 */
-	/* --- cacheline 1 boundary (64 bytes) --- */
-	long unsigned int          immediate;            /*    64     8 */
-	unsigned int               len;                  /*    72     4 */
-	u8                         type;                 /*    76     1 */
-
-	/* Bitfield combined with previous fields */
+	struct hlist_node          hash;                 /*     0    16 */
+	struct list_head           call_node;            /*    16    16 */
+	struct section *           sec;                  /*    32     8 */
+	long unsigned int          offset;               /*    40     8 */
+	long unsigned int          immediate;            /*    48     8 */
+	u8                         len;                  /*    56     1 */
+	u8                         prev_len;             /*    57     1 */
+	u8                         type;                 /*    58     1 */
+	s8                         instr;                /*    59     1 */
+	u32                        idx:8;                /*    60: 0  4 */
+	u32                        dead_end:1;           /*    60: 8  4 */
+	u32                        ignore:1;             /*    60: 9  4 */
+	u32                        ignore_alts:1;        /*    60:10  4 */
+	u32                        hint:1;               /*    60:11  4 */
+	u32                        save:1;               /*    60:12  4 */
+	u32                        restore:1;            /*    60:13  4 */
+	u32                        retpoline_safe:1;     /*    60:14  4 */
+	u32                        noendbr:1;            /*    60:15  4 */
+	u32                        entry:1;              /*    60:16  4 */
+	u32                        visited:4;            /*    60:17  4 */
+	u32                        no_reloc:1;           /*    60:21  4 */

-	u16                        dead_end:1;           /*    76: 8  2 */
-	u16                        ignore:1;             /*    76: 9  2 */
-	u16                        ignore_alts:1;        /*    76:10  2 */
-	u16                        hint:1;               /*    76:11  2 */
-	u16                        save:1;               /*    76:12  2 */
-	u16                        restore:1;            /*    76:13  2 */
-	u16                        retpoline_safe:1;     /*    76:14  2 */
-	u16                        noendbr:1;            /*    76:15  2 */
-	u16                        entry:1;              /*    78: 0  2 */
-	u16                        visited:4;            /*    78: 1  2 */
-	u16                        no_reloc:1;           /*    78: 5  2 */
+	/* XXX 10 bits hole, try to pack */

-	/* XXX 2 bits hole, try to pack */
-	/* Bitfield combined with next fields */
-
-	s8                         instr;                /*    79     1 */
-	struct alt_group *         alt_group;            /*    80     8 */
-	struct instruction *       jump_dest;            /*    88     8 */
-	struct instruction *       first_jump_src;       /*    96     8 */
+	/* --- cacheline 1 boundary (64 bytes) --- */
+	struct alt_group *         alt_group;            /*    64     8 */
+	struct instruction *       jump_dest;            /*    72     8 */
+	struct instruction *       first_jump_src;       /*    80     8 */
 	union {
-		struct symbol *    _call_dest;           /*   104     8 */
-		struct reloc *     _jump_table;          /*   104     8 */
-	};                                               /*   104     8 */
-	struct alternative *       alts;                 /*   112     8 */
-	struct symbol *            sym;                  /*   120     8 */
-	/* --- cacheline 2 boundary (128 bytes) --- */
-	struct stack_op *          stack_ops;            /*   128     8 */
-	struct cfi_state *         cfi;                  /*   136     8 */
+		struct symbol *    _call_dest;           /*    88     8 */
+		struct reloc *     _jump_table;          /*    88     8 */
+	};                                               /*    88     8 */
+	struct alternative *       alts;                 /*    96     8 */
+	struct symbol *            sym;                  /*   104     8 */
+	struct stack_op *          stack_ops;            /*   112     8 */
+	struct cfi_state *         cfi;                  /*   120     8 */

-	/* size: 144, cachelines: 3, members: 28 */
-	/* sum members: 142 */
-	/* sum bitfield members: 14 bits, bit holes: 1, sum bit holes: 2 bits */
-	/* last cacheline: 16 bytes */
+	/* size: 128, cachelines: 2, members: 29 */
+	/* sum members: 124 */
+	/* sum bitfield members: 22 bits, bit holes: 1, sum bit holes: 10 bits */
 };

pre:	5:38.18 real,   213.25 user,    124.90 sys,     23449040 mem
post:	5:03.34 real,   210.75 user,    88.80 sys,      20241232 mem

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build only
Tested-by: Thomas Weißschuh <linux@weissschuh.net> # compile and run
Link: https://lore.kernel.org/r/20230208172245.851307606@infradead.org
---
 tools/objtool/check.c                   | 166 ++++++++++++++++++++------------
 tools/objtool/include/objtool/check.h   |  51 +++++-----
 tools/objtool/include/objtool/objtool.h |   1 -
 tools/objtool/objtool.c                 |   1 -
 4 files changed, 133 insertions(+), 86 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 7e9d3d3eed65..b0b467d9608a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -47,27 +47,29 @@ struct instruction *find_insn(struct objtool_file *file,
 	return NULL;
 }
 
-static struct instruction *next_insn_same_sec(struct objtool_file *file,
-					      struct instruction *insn)
+struct instruction *next_insn_same_sec(struct objtool_file *file,
+				       struct instruction *insn)
 {
-	struct instruction *next = list_next_entry(insn, list);
+	if (insn->idx == INSN_CHUNK_MAX)
+		return find_insn(file, insn->sec, insn->offset + insn->len);
 
-	if (!next || &next->list == &file->insn_list || next->sec != insn->sec)
+	insn++;
+	if (!insn->len)
 		return NULL;
 
-	return next;
+	return insn;
 }
 
 static struct instruction *next_insn_same_func(struct objtool_file *file,
 					       struct instruction *insn)
 {
-	struct instruction *next = list_next_entry(insn, list);
+	struct instruction *next = next_insn_same_sec(file, insn);
 	struct symbol *func = insn_func(insn);
 
 	if (!func)
 		return NULL;
 
-	if (&next->list != &file->insn_list && insn_func(next) == func)
+	if (next && insn_func(next) == func)
 		return next;
 
 	/* Check if we're already in the subfunction: */
@@ -78,17 +80,35 @@ static struct instruction *next_insn_same_func(struct objtool_file *file,
 	return find_insn(file, func->cfunc->sec, func->cfunc->offset);
 }
 
+static struct instruction *prev_insn_same_sec(struct objtool_file *file,
+					      struct instruction *insn)
+{
+	if (insn->idx == 0) {
+		if (insn->prev_len)
+			return find_insn(file, insn->sec, insn->offset - insn->prev_len);
+		return NULL;
+	}
+
+	return insn - 1;
+}
+
 static struct instruction *prev_insn_same_sym(struct objtool_file *file,
-					       struct instruction *insn)
+					      struct instruction *insn)
 {
-	struct instruction *prev = list_prev_entry(insn, list);
+	struct instruction *prev = prev_insn_same_sec(file, insn);
 
-	if (&prev->list != &file->insn_list && insn_func(prev) == insn_func(insn))
+	if (prev && insn_func(prev) == insn_func(insn))
 		return prev;
 
 	return NULL;
 }
 
+#define for_each_insn(file, insn)					\
+	for (struct section *__sec, *__fake = (struct section *)1;	\
+	     __fake; __fake = NULL)					\
+		for_each_sec(file, __sec)				\
+			sec_for_each_insn(file, __sec, insn)
+
 #define func_for_each_insn(file, func, insn)				\
 	for (insn = find_insn(file, func->sec, func->offset);		\
 	     insn;							\
@@ -96,16 +116,13 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file,
 
 #define sym_for_each_insn(file, sym, insn)				\
 	for (insn = find_insn(file, sym->sec, sym->offset);		\
-	     insn && &insn->list != &file->insn_list &&			\
-		insn->sec == sym->sec &&				\
-		insn->offset < sym->offset + sym->len;			\
-	     insn = list_next_entry(insn, list))
+	     insn && insn->offset < sym->offset + sym->len;		\
+	     insn = next_insn_same_sec(file, insn))
 
 #define sym_for_each_insn_continue_reverse(file, sym, insn)		\
-	for (insn = list_prev_entry(insn, list);			\
-	     &insn->list != &file->insn_list &&				\
-		insn->sec == sym->sec && insn->offset >= sym->offset;	\
-	     insn = list_prev_entry(insn, list))
+	for (insn = prev_insn_same_sec(file, insn);			\
+	     insn && insn->offset >= sym->offset;			\
+	     insn = prev_insn_same_sec(file, insn))
 
 #define sec_for_each_insn_from(file, insn)				\
 	for (; insn; insn = next_insn_same_sec(file, insn))
@@ -384,6 +401,9 @@ static int decode_instructions(struct objtool_file *file)
 	int ret;
 
 	for_each_sec(file, sec) {
+		struct instruction *insns = NULL;
+		u8 prev_len = 0;
+		u8 idx = 0;
 
 		if (!(sec->sh.sh_flags & SHF_EXECINSTR))
 			continue;
@@ -409,22 +429,31 @@ static int decode_instructions(struct objtool_file *file)
 			sec->init = true;
 
 		for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) {
-			insn = malloc(sizeof(*insn));
-			if (!insn) {
-				WARN("malloc failed");
-				return -1;
+			if (!insns || idx == INSN_CHUNK_MAX) {
+				insns = calloc(sizeof(*insn), INSN_CHUNK_SIZE);
+				if (!insns) {
+					WARN("malloc failed");
+					return -1;
+				}
+				idx = 0;
+			} else {
+				idx++;
 			}
-			memset(insn, 0, sizeof(*insn));
-			INIT_LIST_HEAD(&insn->call_node);
+			insn = &insns[idx];
+			insn->idx = idx;
 
+			INIT_LIST_HEAD(&insn->call_node);
 			insn->sec = sec;
 			insn->offset = offset;
+			insn->prev_len = prev_len;
 
 			ret = arch_decode_instruction(file, sec, offset,
 						      sec->sh.sh_size - offset,
 						      insn);
 			if (ret)
-				goto err;
+				return ret;
+
+			prev_len = insn->len;
 
 			/*
 			 * By default, "ud2" is a dead end unless otherwise
@@ -435,10 +464,11 @@ static int decode_instructions(struct objtool_file *file)
 				insn->dead_end = true;
 
 			hash_add(file->insn_hash, &insn->hash, sec_offset_hash(sec, insn->offset));
-			list_add_tail(&insn->list, &file->insn_list);
 			nr_insns++;
 		}
 
+//		printf("%s: last chunk used: %d\n", sec->name, (int)idx);
+
 		list_for_each_entry(func, &sec->symbol_list, list) {
 			if (func->type != STT_NOTYPE && func->type != STT_FUNC)
 				continue;
@@ -481,10 +511,6 @@ static int decode_instructions(struct objtool_file *file)
 		printf("nr_insns: %lu\n", nr_insns);
 
 	return 0;
-
-err:
-	free(insn);
-	return ret;
 }
 
 /*
@@ -599,7 +625,7 @@ static int add_dead_ends(struct objtool_file *file)
 		}
 		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (insn)
-			insn = list_prev_entry(insn, list);
+			insn = prev_insn_same_sec(file, insn);
 		else if (reloc->addend == reloc->sym->sec->sh.sh_size) {
 			insn = find_last_insn(file, reloc->sym->sec);
 			if (!insn) {
@@ -634,7 +660,7 @@ reachable:
 		}
 		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (insn)
-			insn = list_prev_entry(insn, list);
+			insn = prev_insn_same_sec(file, insn);
 		else if (reloc->addend == reloc->sym->sec->sh.sh_size) {
 			insn = find_last_insn(file, reloc->sym->sec);
 			if (!insn) {
@@ -1775,6 +1801,7 @@ static int handle_group_alt(struct objtool_file *file,
 		orig_alt_group->orig_group = NULL;
 		orig_alt_group->first_insn = orig_insn;
 		orig_alt_group->last_insn = last_orig_insn;
+		orig_alt_group->nop = NULL;
 	} else {
 		if (orig_alt_group->last_insn->offset + orig_alt_group->last_insn->len -
 		    orig_alt_group->first_insn->offset != special_alt->orig_len) {
@@ -1876,12 +1903,11 @@ static int handle_group_alt(struct objtool_file *file,
 		return -1;
 	}
 
-	if (nop)
-		list_add(&nop->list, &last_new_insn->list);
 end:
 	new_alt_group->orig_group = orig_alt_group;
 	new_alt_group->first_insn = *new_insn;
-	new_alt_group->last_insn = nop ? : last_new_insn;
+	new_alt_group->last_insn = last_new_insn;
+	new_alt_group->nop = nop;
 	new_alt_group->cfi = orig_alt_group->cfi;
 	return 0;
 }
@@ -1931,7 +1957,7 @@ static int handle_jump_alt(struct objtool_file *file,
 	else
 		file->jl_long++;
 
-	*new_insn = list_next_entry(orig_insn, list);
+	*new_insn = next_insn_same_sec(file, orig_insn);
 	return 0;
 }
 
@@ -3522,11 +3548,28 @@ static struct instruction *next_insn_to_validate(struct objtool_file *file,
 	 * Simulate the fact that alternatives are patched in-place.  When the
 	 * end of a replacement alt_group is reached, redirect objtool flow to
 	 * the end of the original alt_group.
+	 *
+	 * insn->alts->insn -> alt_group->first_insn
+	 *		       ...
+	 *		       alt_group->last_insn
+	 *		       [alt_group->nop]      -> next(orig_group->last_insn)
 	 */
-	if (alt_group && insn == alt_group->last_insn && alt_group->orig_group)
-		return next_insn_same_sec(file, alt_group->orig_group->last_insn);
+	if (alt_group) {
+		if (alt_group->nop) {
+			/* ->nop implies ->orig_group */
+			if (insn == alt_group->last_insn)
+				return alt_group->nop;
+			if (insn == alt_group->nop)
+				goto next_orig;
+		}
+		if (insn == alt_group->last_insn && alt_group->orig_group)
+			goto next_orig;
+	}
 
 	return next_insn_same_sec(file, insn);
+
+next_orig:
+	return next_insn_same_sec(file, alt_group->orig_group->last_insn);
 }
 
 /*
@@ -3777,11 +3820,25 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
 	return 0;
 }
 
+static int validate_unwind_hint(struct objtool_file *file,
+				  struct instruction *insn,
+				  struct insn_state *state)
+{
+	if (insn->hint && !insn->visited && !insn->ignore) {
+		int ret = validate_branch(file, insn_func(insn), insn, *state);
+		if (ret && opts.backtrace)
+			BT_FUNC("<=== (hint)", insn);
+		return ret;
+	}
+
+	return 0;
+}
+
 static int validate_unwind_hints(struct objtool_file *file, struct section *sec)
 {
 	struct instruction *insn;
 	struct insn_state state;
-	int ret, warnings = 0;
+	int warnings = 0;
 
 	if (!file->hints)
 		return 0;
@@ -3789,22 +3846,11 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec)
 	init_insn_state(file, &state, sec);
 
 	if (sec) {
-		insn = find_insn(file, sec, 0);
-		if (!insn)
-			return 0;
+		sec_for_each_insn(file, sec, insn)
+			warnings += validate_unwind_hint(file, insn, &state);
 	} else {
-		insn = list_first_entry(&file->insn_list, typeof(*insn), list);
-	}
-
-	while (&insn->list != &file->insn_list && (!sec || insn->sec == sec)) {
-		if (insn->hint && !insn->visited && !insn->ignore) {
-			ret = validate_branch(file, insn_func(insn), insn, state);
-			if (ret && opts.backtrace)
-				BT_FUNC("<=== (hint)", insn);
-			warnings += ret;
-		}
-
-		insn = list_next_entry(insn, list);
+		for_each_insn(file, insn)
+			warnings += validate_unwind_hint(file, insn, &state);
 	}
 
 	return warnings;
@@ -4070,7 +4116,7 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio
 	 *
 	 * It may also insert a UD2 after calling a __noreturn function.
 	 */
-	prev_insn = list_prev_entry(insn, list);
+	prev_insn = prev_insn_same_sec(file, insn);
 	if ((prev_insn->dead_end ||
 	     dead_end_function(file, insn_call_dest(prev_insn))) &&
 	    (insn->type == INSN_BUG ||
@@ -4102,7 +4148,7 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio
 		if (insn->offset + insn->len >= insn_func(insn)->offset + insn_func(insn)->len)
 			break;
 
-		insn = list_next_entry(insn, list);
+		insn = next_insn_same_sec(file, insn);
 	}
 
 	return false;
@@ -4115,10 +4161,10 @@ static int add_prefix_symbol(struct objtool_file *file, struct symbol *func,
 		return 0;
 
 	for (;;) {
-		struct instruction *prev = list_prev_entry(insn, list);
+		struct instruction *prev = prev_insn_same_sec(file, insn);
 		u64 offset;
 
-		if (&prev->list == &file->insn_list)
+		if (!prev)
 			break;
 
 		if (prev->type != INSN_NOP)
@@ -4517,7 +4563,7 @@ int check(struct objtool_file *file)
 
 	warnings += ret;
 
-	if (list_empty(&file->insn_list))
+	if (!nr_insns)
 		goto out;
 
 	if (opts.retpoline) {
@@ -4626,7 +4672,7 @@ int check(struct objtool_file *file)
 		warnings += ret;
 	}
 
-	if (opts.orc && !list_empty(&file->insn_list)) {
+	if (opts.orc && nr_insns) {
 		ret = orc_create(file);
 		if (ret < 0)
 			goto out;
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index ab6deaed9777..3e7c7004f7df 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -27,7 +27,7 @@ struct alt_group {
 	struct alt_group *orig_group;
 
 	/* First and last instructions in the group */
-	struct instruction *first_insn, *last_insn;
+	struct instruction *first_insn, *last_insn, *nop;
 
 	/*
 	 * Byte-offset-addressed len-sized array of pointers to CFI structs.
@@ -36,31 +36,36 @@ struct alt_group {
 	struct cfi_state **cfi;
 };
 
+#define INSN_CHUNK_BITS		8
+#define INSN_CHUNK_SIZE		(1 << INSN_CHUNK_BITS)
+#define INSN_CHUNK_MAX		(INSN_CHUNK_SIZE - 1)
+
 struct instruction {
-	struct list_head list;
 	struct hlist_node hash;
 	struct list_head call_node;
 	struct section *sec;
 	unsigned long offset;
 	unsigned long immediate;
-	unsigned int len;
-	u8 type;
-
-	u16 dead_end		: 1,
-	   ignore		: 1,
-	   ignore_alts		: 1,
-	   hint			: 1,
-	   save			: 1,
-	   restore		: 1,
-	   retpoline_safe	: 1,
-	   noendbr		: 1,
-	   entry		: 1,
-	   visited		: 4,
-	   no_reloc		: 1;
-		/* 2 bit hole */
 
+	u8 len;
+	u8 prev_len;
+	u8 type;
 	s8 instr;
 
+	u32 idx			: INSN_CHUNK_BITS,
+	    dead_end		: 1,
+	    ignore		: 1,
+	    ignore_alts		: 1,
+	    hint		: 1,
+	    save		: 1,
+	    restore		: 1,
+	    retpoline_safe	: 1,
+	    noendbr		: 1,
+	    entry		: 1,
+	    visited		: 4,
+	    no_reloc		: 1;
+		/* 10 bit hole */
+
 	struct alt_group *alt_group;
 	struct instruction *jump_dest;
 	struct instruction *first_jump_src;
@@ -109,13 +114,11 @@ static inline bool is_jump(struct instruction *insn)
 struct instruction *find_insn(struct objtool_file *file,
 			      struct section *sec, unsigned long offset);
 
-#define for_each_insn(file, insn)					\
-	list_for_each_entry(insn, &file->insn_list, list)
+struct instruction *next_insn_same_sec(struct objtool_file *file, struct instruction *insn);
 
-#define sec_for_each_insn(file, sec, insn)				\
-	for (insn = find_insn(file, sec, 0);				\
-	     insn && &insn->list != &file->insn_list &&			\
-			insn->sec == sec;				\
-	     insn = list_next_entry(insn, list))
+#define sec_for_each_insn(file, _sec, insn)				\
+	for (insn = find_insn(file, _sec, 0);				\
+	     insn && insn->sec == _sec;					\
+	     insn = next_insn_same_sec(file, insn))
 
 #endif /* _CHECK_H */
diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h
index 6b40977bcdb1..94a33ee7b363 100644
--- a/tools/objtool/include/objtool/objtool.h
+++ b/tools/objtool/include/objtool/objtool.h
@@ -21,7 +21,6 @@ struct pv_state {
 
 struct objtool_file {
 	struct elf *elf;
-	struct list_head insn_list;
 	DECLARE_HASHTABLE(insn_hash, 20);
 	struct list_head retpoline_call_list;
 	struct list_head return_thunk_list;
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index 6affd8067f83..c54f7235c5d9 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -99,7 +99,6 @@ struct objtool_file *objtool_open_read(const char *_objname)
 		return NULL;
 	}
 
-	INIT_LIST_HEAD(&file.insn_list);
 	hash_init(file.insn_hash);
 	INIT_LIST_HEAD(&file.retpoline_call_list);
 	INIT_LIST_HEAD(&file.return_thunk_list);
-- 
cgit 


From 00c8f01c4e84637c3db76f368b8687cb61f4dd9d Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Thu, 16 Feb 2023 12:34:41 -0800
Subject: objtool: Fix ORC 'signal' propagation

There have been some recently reported ORC unwinder warnings like:

  WARNING: can't access registers at entry_SYSCALL_64_after_hwframe+0x63/0xcd
  WARNING: stack going in the wrong direction? at __sys_setsockopt+0x2c6/0x5b0 net/socket.c:2271

And a KASAN warning:

  BUG: KASAN: stack-out-of-bounds in unwind_next_frame (arch/x86/include/asm/ptrace.h:136 arch/x86/kernel/unwind_orc.c:455)

It turns out the 'signal' bit isn't getting propagated from the unwind
hints to the ORC entries, making the unwinder confused at times.

Fixes: ffb1b4a41016 ("x86/unwind/orc: Add 'signal' field to ORC metadata")
Reported-by: kernel test robot <oliver.sang@intel.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/97eef9db60cd86d376a9a40d49d77bb67a8f6526.1676579666.git.jpoimboe@kernel.org
---
 tools/objtool/check.c               | 1 +
 tools/objtool/include/objtool/cfi.h | 1 +
 tools/objtool/orc_gen.c             | 1 +
 3 files changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index b0b467d9608a..5822de376d9a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2330,6 +2330,7 @@ static int read_unwind_hints(struct objtool_file *file)
 
 		cfi.cfa.offset = bswap_if_needed(file->elf, hint->sp_offset);
 		cfi.type = hint->type;
+		cfi.signal = hint->signal;
 		cfi.end = hint->end;
 
 		insn->cfi = cfi_hash_find_or_add(&cfi);
diff --git a/tools/objtool/include/objtool/cfi.h b/tools/objtool/include/objtool/cfi.h
index f11d1ac1dadf..b1258e79a1b7 100644
--- a/tools/objtool/include/objtool/cfi.h
+++ b/tools/objtool/include/objtool/cfi.h
@@ -34,6 +34,7 @@ struct cfi_state {
 	unsigned char type;
 	bool bp_scratch;
 	bool drap;
+	bool signal;
 	bool end;
 };
 
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index 1f22b7ebae58..57a4527d5988 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -27,6 +27,7 @@ static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi,
 	}
 
 	orc->end = cfi->end;
+	orc->signal = cfi->signal;
 
 	if (cfi->cfa.base == CFI_UNDEFINED) {
 		orc->sp_reg = ORC_REG_UNDEFINED;
-- 
cgit 


From 44bd0394fe10903757da0863e0cf62c2d9846ea6 Mon Sep 17 00:00:00 2001
From: Lu Wei <luwei32@huawei.com>
Date: Wed, 22 Feb 2023 16:36:29 +0800
Subject: selftests: fib_tests: Add test cases for IPv4/IPv6 in route notify

Add tests to check whether the total fib info length is calculated
corretly in route notify process.

Signed-off-by: Lu Wei <luwei32@huawei.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20230222083629.335683-3-luwei32@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/fib_tests.sh | 96 +++++++++++++++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 70ea8798b1f6..7da8ec838c63 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -9,7 +9,7 @@ ret=0
 ksft_skip=4
 
 # all tests in this script. Can be overridden with -t option
-TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh"
+TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh"
 
 VERBOSE=0
 PAUSE_ON_FAIL=no
@@ -655,6 +655,98 @@ fib_nexthop_test()
 	cleanup
 }
 
+fib6_notify_test()
+{
+	setup
+
+	echo
+	echo "Fib6 info length calculation in route notify test"
+	set -e
+
+	for i in 10 20 30 40 50 60 70;
+	do
+		$IP link add dummy_$i type dummy
+		$IP link set dev dummy_$i up
+		$IP -6 address add 2001:$i::1/64 dev dummy_$i
+	done
+
+	$NS_EXEC ip monitor route &> errors.txt &
+	sleep 2
+
+	$IP -6 route add 2001::/64 \
+                nexthop via 2001:10::2 dev dummy_10 \
+                nexthop encap ip6 dst 2002::20 via 2001:20::2 dev dummy_20 \
+                nexthop encap ip6 dst 2002::30 via 2001:30::2 dev dummy_30 \
+                nexthop encap ip6 dst 2002::40 via 2001:40::2 dev dummy_40 \
+                nexthop encap ip6 dst 2002::50 via 2001:50::2 dev dummy_50 \
+                nexthop encap ip6 dst 2002::60 via 2001:60::2 dev dummy_60 \
+                nexthop encap ip6 dst 2002::70 via 2001:70::2 dev dummy_70
+
+	set +e
+
+	err=`cat errors.txt |grep "Message too long"`
+	if [ -z "$err" ];then
+		ret=0
+	else
+		ret=1
+	fi
+
+	log_test $ret 0 "ipv6 route add notify"
+
+	{ kill %% && wait %%; } 2>/dev/null
+
+	#rm errors.txt
+
+	cleanup &> /dev/null
+}
+
+
+fib_notify_test()
+{
+	setup
+
+	echo
+	echo "Fib4 info length calculation in route notify test"
+
+	set -e
+
+	for i in 10 20 30 40 50 60 70;
+	do
+		$IP link add dummy_$i type dummy
+		$IP link set dev dummy_$i up
+		$IP address add 20.20.$i.2/24 dev dummy_$i
+	done
+
+	$NS_EXEC ip monitor route &> errors.txt &
+	sleep 2
+
+        $IP route add 10.0.0.0/24 \
+                nexthop via 20.20.10.1 dev dummy_10 \
+                nexthop encap ip dst 192.168.10.20 via 20.20.20.1 dev dummy_20 \
+                nexthop encap ip dst 192.168.10.30 via 20.20.30.1 dev dummy_30 \
+                nexthop encap ip dst 192.168.10.40 via 20.20.40.1 dev dummy_40 \
+                nexthop encap ip dst 192.168.10.50 via 20.20.50.1 dev dummy_50 \
+                nexthop encap ip dst 192.168.10.60 via 20.20.60.1 dev dummy_60 \
+                nexthop encap ip dst 192.168.10.70 via 20.20.70.1 dev dummy_70
+
+	set +e
+
+	err=`cat errors.txt |grep "Message too long"`
+	if [ -z "$err" ];then
+		ret=0
+	else
+		ret=1
+	fi
+
+	log_test $ret 0 "ipv4 route add notify"
+
+	{ kill %% && wait %%; } 2>/dev/null
+
+	rm  errors.txt
+
+	cleanup &> /dev/null
+}
+
 fib_suppress_test()
 {
 	echo
@@ -2111,6 +2203,8 @@ do
 	fib_carrier_test|carrier)	fib_carrier_test;;
 	fib_rp_filter_test|rp_filter)	fib_rp_filter_test;;
 	fib_nexthop_test|nexthop)	fib_nexthop_test;;
+	fib_notify_test|ipv4_notify)	fib_notify_test;;
+	fib6_notify_test|ipv6_notify)	fib6_notify_test;;
 	fib_suppress_test|suppress)	fib_suppress_test;;
 	ipv6_route_test|ipv6_rt)	ipv6_route_test;;
 	ipv4_route_test|ipv4_rt)	ipv4_route_test;;
-- 
cgit 


From ce9f1c05d2edfa6cdf2c1a510495d333e11810a8 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 22 Feb 2023 23:01:55 -0800
Subject: perf inject: Fix --buildid-all not to eat up MMAP2

When MMAP2 has the PERF_RECORD_MISC_MMAP_BUILD_ID flag, it means the
record already has the build-id info.  So it marks the DSO as hit, to
skip if the same DSO is not processed if it happens to miss the build-id
later.

But it missed to copy the MMAP2 record itself so it'd fail to symbolize
samples for those regions.

For example, the following generates 249 MMAP2 events.

  $ perf record --buildid-mmap -o- true | perf report --stat -i- | grep MMAP2
           MMAP2 events:        249  (86.8%)

Adding perf inject should not change the number of events like this

  $ perf record --buildid-mmap -o- true | perf inject -b | \
  > perf report --stat -i- | grep MMAP2
           MMAP2 events:        249  (86.5%)

But when --buildid-all is used, it eats most of the MMAP2 events.

  $ perf record --buildid-mmap -o- true | perf inject -b --buildid-all | \
  > perf report --stat -i- | grep MMAP2
           MMAP2 events:          1  ( 2.5%)

With this patch, it shows the original number now.

  $ perf record --buildid-mmap -o- true | perf inject -b --buildid-all | \
  > perf report --stat -i- | grep MMAP2
           MMAP2 events:        249  (86.5%)

Committer testing:

Before:

  $ perf record --buildid-mmap -o- perf stat --null sleep 1 2> /dev/null | perf inject -b | perf report --stat -i- | grep MMAP2
           MMAP2 events:         58  (36.2%)
  $ perf record --buildid-mmap -o- perf stat --null sleep 1 2> /dev/null | perf report --stat -i- | grep MMAP2
           MMAP2 events:         58  (36.2%)
  $ perf record --buildid-mmap -o- perf stat --null sleep 1 2> /dev/null | perf inject -b --buildid-all | perf report --stat -i- | grep MMAP2
           MMAP2 events:          2  ( 1.9%)
  $

After:

  $ perf record --buildid-mmap -o- perf stat --null sleep 1 2> /dev/null | perf inject -b | perf report --stat -i- | grep MMAP2
           MMAP2 events:         58  (29.3%)
  $ perf record --buildid-mmap -o- perf stat --null sleep 1 2> /dev/null | perf report --stat -i- | grep MMAP2
           MMAP2 events:         58  (34.3%)
  $ perf record --buildid-mmap -o- perf stat --null sleep 1 2> /dev/null | perf inject -b --buildid-all | perf report --stat -i- | grep MMAP2
           MMAP2 events:         58  (38.4%)
  $

Fixes: f7fc0d1c915a74ff ("perf inject: Do not inject BUILD_ID record if MMAP2 has it")
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230223070155.54251-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-inject.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index f8182417b734..10bb1d494258 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -538,6 +538,7 @@ static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool,
 			dso->hit = 1;
 		}
 		dso__put(dso);
+		perf_event__repipe(tool, event, sample, machine);
 		return 0;
 	}
 
-- 
cgit 


From d3e104bb024e01a7da6af3b270a7b46054b74ac9 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 22 Feb 2023 23:18:17 -0800
Subject: perf tests stat+csv_output: Switch CSV separator to @

Commas may appear in events like:

  cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/

which causes the commachecker to see more fields than expected. Use @ as
the CSV separator to avoid this.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Cc: Claire Jensen <cjense@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Link: https://lore.kernel.org/r/20230223071818.329671-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/stat+csv_output.sh | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/stat+csv_output.sh b/tools/perf/tests/shell/stat+csv_output.sh
index b7f050aa6210..324fc9e6edd7 100755
--- a/tools/perf/tests/shell/stat+csv_output.sh
+++ b/tools/perf/tests/shell/stat+csv_output.sh
@@ -7,6 +7,7 @@
 set -e
 
 skip_test=0
+csv_sep=@
 
 function commachecker()
 {
@@ -34,7 +35,7 @@ function commachecker()
 		[ "$x" = "Failed" ] && continue
 
 		# Count the number of commas
-		x=$(echo $line | tr -d -c ',')
+		x=$(echo $line | tr -d -c $csv_sep)
 		cnt="${#x}"
 		# echo $line $cnt
 		[[ ! "$cnt" =~ $exp ]] && {
@@ -54,7 +55,7 @@ function ParanoidAndNotRoot()
 check_no_args()
 {
 	echo -n "Checking CSV output: no args "
-	perf stat -x, true 2>&1 | commachecker --no-args
+	perf stat -x$csv_sep true 2>&1 | commachecker --no-args
 	echo "[Success]"
 }
 
@@ -66,7 +67,7 @@ check_system_wide()
 		echo "[Skip] paranoid and not root"
 		return
 	fi
-	perf stat -x, -a true 2>&1 | commachecker --system-wide
+	perf stat -x$csv_sep -a true 2>&1 | commachecker --system-wide
 	echo "[Success]"
 }
 
@@ -79,14 +80,14 @@ check_system_wide_no_aggr()
 		return
 	fi
 	echo -n "Checking CSV output: system wide no aggregation "
-	perf stat -x, -A -a --no-merge true 2>&1 | commachecker --system-wide-no-aggr
+	perf stat -x$csv_sep -A -a --no-merge true 2>&1 | commachecker --system-wide-no-aggr
 	echo "[Success]"
 }
 
 check_interval()
 {
 	echo -n "Checking CSV output: interval "
-	perf stat -x, -I 1000 true 2>&1 | commachecker --interval
+	perf stat -x$csv_sep -I 1000 true 2>&1 | commachecker --interval
 	echo "[Success]"
 }
 
@@ -94,7 +95,7 @@ check_interval()
 check_event()
 {
 	echo -n "Checking CSV output: event "
-	perf stat -x, -e cpu-clock true 2>&1 | commachecker --event
+	perf stat -x$csv_sep -e cpu-clock true 2>&1 | commachecker --event
 	echo "[Success]"
 }
 
@@ -106,7 +107,7 @@ check_per_core()
 		echo "[Skip] paranoid and not root"
 		return
 	fi
-	perf stat -x, --per-core -a true 2>&1 | commachecker --per-core
+	perf stat -x$csv_sep --per-core -a true 2>&1 | commachecker --per-core
 	echo "[Success]"
 }
 
@@ -118,7 +119,7 @@ check_per_thread()
 		echo "[Skip] paranoid and not root"
 		return
 	fi
-	perf stat -x, --per-thread -a true 2>&1 | commachecker --per-thread
+	perf stat -x$csv_sep --per-thread -a true 2>&1 | commachecker --per-thread
 	echo "[Success]"
 }
 
@@ -130,7 +131,7 @@ check_per_die()
 		echo "[Skip] paranoid and not root"
 		return
 	fi
-	perf stat -x, --per-die -a true 2>&1 | commachecker --per-die
+	perf stat -x$csv_sep --per-die -a true 2>&1 | commachecker --per-die
 	echo "[Success]"
 }
 
@@ -142,7 +143,7 @@ check_per_node()
 		echo "[Skip] paranoid and not root"
 		return
 	fi
-	perf stat -x, --per-node -a true 2>&1 | commachecker --per-node
+	perf stat -x$csv_sep --per-node -a true 2>&1 | commachecker --per-node
 	echo "[Success]"
 }
 
@@ -154,7 +155,7 @@ check_per_socket()
 		echo "[Skip] paranoid and not root"
 		return
 	fi
-	perf stat -x, --per-socket -a true 2>&1 | commachecker --per-socket
+	perf stat -x$csv_sep --per-socket -a true 2>&1 | commachecker --per-socket
 	echo "[Success]"
 }
 
-- 
cgit 


From 3de34f85bf437e47556044191f0bd7d0eb644e21 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 22 Feb 2023 23:18:18 -0800
Subject: perf test: Avoid counting commas in json linter

Commas may appear in events like:

  cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/

which causes the count of commas to see more items than expected. Switch
to counting the entries in the dictionary, which is 1 more than the
number of commas.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Cc: Claire Jensen <cjense@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Link: https://lore.kernel.org/r/20230223071818.329671-2-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../perf/tests/shell/lib/perf_json_output_lint.py  | 29 ++++++++++------------
 1 file changed, 13 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/lib/perf_json_output_lint.py b/tools/perf/tests/shell/lib/perf_json_output_lint.py
index d90f8d102eb9..97598d14e532 100644
--- a/tools/perf/tests/shell/lib/perf_json_output_lint.py
+++ b/tools/perf/tests/shell/lib/perf_json_output_lint.py
@@ -40,19 +40,6 @@ def is_counter_value(num):
   return isfloat(num) or num == '<not counted>' or num == '<not supported>'
 
 def check_json_output(expected_items):
-  if expected_items != -1:
-    for line in Lines:
-      if 'failed' not in line:
-        count = 0
-        count = line.count(',')
-        if count != expected_items and count >= 1 and count <= 3 and 'metric-value' in line:
-          # Events that generate >1 metric may have isolated metric
-          # values and possibly other prefixes like interval, core and
-          # aggregate-number.
-          continue
-        if count != expected_items:
-          raise RuntimeError(f'wrong number of fields. counted {count} expected {expected_items}'
-                             f' in \'{line}\'')
   checks = {
       'aggregate-number': lambda x: isfloat(x),
       'core': lambda x: True,
@@ -73,6 +60,16 @@ def check_json_output(expected_items):
   }
   input = '[\n' + ','.join(Lines) + '\n]'
   for item in json.loads(input):
+    if expected_items != -1:
+      count = len(item)
+      if count != expected_items and count >= 1 and count <= 4 and 'metric-value' in item:
+        # Events that generate >1 metric may have isolated metric
+        # values and possibly other prefixes like interval, core and
+        # aggregate-number.
+        pass
+      elif count != expected_items:
+        raise RuntimeError(f'wrong number of fields. counted {count} expected {expected_items}'
+                           f' in \'{item}\'')
     for key, value in item.items():
       if key not in checks:
         raise RuntimeError(f'Unexpected key: key={key} value={value}')
@@ -82,11 +79,11 @@ def check_json_output(expected_items):
 
 try:
   if args.no_args or args.system_wide or args.event:
-    expected_items = 6
-  elif args.interval or args.per_thread or args.system_wide_no_aggr:
     expected_items = 7
-  elif args.per_core or args.per_socket or args.per_node or args.per_die:
+  elif args.interval or args.per_thread or args.system_wide_no_aggr:
     expected_items = 8
+  elif args.per_core or args.per_socket or args.per_node or args.per_die:
+    expected_items = 9
   else:
     # If no option is specified, don't check the number of items.
     expected_items = -1
-- 
cgit 


From 22ef7d7be4dc071712ddf0ae09df6ee39b4901a0 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Thu, 23 Feb 2023 15:17:26 +0100
Subject: selftest: hid: fix hid_bpf not set in config

Now that CONFIG_HID_BPF is not automatically implied by HID, we need
to set it properly in the selftests config.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 tools/testing/selftests/hid/config | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/config b/tools/testing/selftests/hid/config
index 9c5a55abca6b..5b5cef445b54 100644
--- a/tools/testing/selftests/hid/config
+++ b/tools/testing/selftests/hid/config
@@ -17,5 +17,6 @@ CONFIG_FTRACE_SYSCALLS=y
 CONFIG_FUNCTION_TRACER=y
 CONFIG_HIDRAW=y
 CONFIG_HID=y
+CONFIG_HID_BPF=y
 CONFIG_INPUT_EVDEV=y
 CONFIG_UHID=y
-- 
cgit 


From 1862de92c81c2a74ff05819aca20b0b83192c83b Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Thu, 23 Feb 2023 09:26:56 +0200
Subject: netdev-genl: fix repeated typo oflloading -> offloading

Fix a repeated copy/paste typo.

Fixes: d3d854fd6a1d ("netdev-genl: create a simple family for netdev stuff")
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/netlink/specs/netdev.yaml | 2 +-
 include/uapi/linux/netdev.h             | 2 +-
 tools/include/uapi/linux/netdev.h       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index b4dcdae54ffd..cffef09729f1 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -28,7 +28,7 @@ definitions:
       -
         name: hw-offload
         doc:
-         This feature informs if netdev supports XDP hw oflloading.
+         This feature informs if netdev supports XDP hw offloading.
       -
         name: rx-sg
         doc:
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 9ee459872600..588391447bfb 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -19,7 +19,7 @@
  * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP
  *   in zero copy mode.
  * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw
- *   oflloading.
+ *   offloading.
  * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear
  *   XDP buffer support in the driver napi callback.
  * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 9ee459872600..588391447bfb 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -19,7 +19,7 @@
  * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP
  *   in zero copy mode.
  * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw
- *   oflloading.
+ *   offloading.
  * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear
  *   XDP buffer support in the driver napi callback.
  * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements
-- 
cgit 


From f7cf644796fcdb6a0e30e4a7f218be74ac8cb31d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 23 Feb 2023 10:31:39 -0800
Subject: tools: ynl-gen: fix single attribute structs with attr 0 only

Chuck run into an issue with a single-element attr-set which
only has an attr with value of 0. The search for max attr in
a struct records attrs with value larger than 0 only (max_val
is set to 0 at the start). Adjust the comparison, alternatively
max_val could be init'ed to -1. Somehow picking the last attr
of a value seems like a good idea in general.

Reported-by: Chuck Lever III <chuck.lever@oracle.com>
Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/ynl-gen-c.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index 3942f24b9163..274e9c566f61 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -546,7 +546,7 @@ class Struct:
         max_val = 0
         self.attr_max_val = None
         for name, attr in self.attr_list:
-            if attr.value > max_val:
+            if attr.value >= max_val:
                 max_val = attr.value
                 self.attr_max_val = attr
             self.attrs[name] = attr
-- 
cgit 


From b9d3a3e4ae0cb7c443d46ffe413e17749baab3ba Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 23 Feb 2023 10:31:40 -0800
Subject: tools: ynl-gen: re-raise the exception instead of printing

traceback.print_exception() seems tricky to call, we're missing
some argument, so re-raise instead.

Reported-by: Chuck Lever III <chuck.lever@oracle.com>
Fixes: 3aacf8281336 ("tools: ynl: add an object hierarchy to represent parsed spec")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/nlspec.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
index e204679ad8b7..71da568e2c28 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/lib/nlspec.py
@@ -3,7 +3,6 @@
 import collections
 import importlib
 import os
-import traceback
 import yaml
 
 
@@ -234,8 +233,7 @@ class SpecFamily(SpecElement):
                 resolved.append(elem)
 
             if len(resolved) == 0:
-                traceback.print_exception(last_exception)
-                raise Exception("Could not resolve any spec element, infinite loop?")
+                raise last_exception
 
     def new_attr_set(self, elem):
         return SpecAttrSet(self, elem)
-- 
cgit 


From d77e7eceeac9f3ab40b45649531993b456eeacd0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 23 Feb 2023 10:31:41 -0800
Subject: tools: net: add __pycache__ to gitignore

Python will generate its customary cache when running ynl scripts:

?? tools/net/ynl/lib/__pycache__/

Reported-by: Chuck Lever III <chuck.lever@oracle.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/.gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 tools/net/ynl/lib/.gitignore

(limited to 'tools')

diff --git a/tools/net/ynl/lib/.gitignore b/tools/net/ynl/lib/.gitignore
new file mode 100644
index 000000000000..c18dd8d83cee
--- /dev/null
+++ b/tools/net/ynl/lib/.gitignore
@@ -0,0 +1 @@
+__pycache__/
-- 
cgit 


From 121ff07bdee06555835e26499f5c125223a6beb3 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Sat, 25 Feb 2023 15:52:57 +0800
Subject: tools: Add LoongArch build infrastructure

We will add tools support for LoongArch (bpf, perf, objtool, etc.), add
build infrastructure and common headers for preparation.

Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 tools/arch/loongarch/include/uapi/asm/bitsperlong.h |  9 +++++++++
 tools/scripts/Makefile.arch                         | 11 ++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 tools/arch/loongarch/include/uapi/asm/bitsperlong.h

(limited to 'tools')

diff --git a/tools/arch/loongarch/include/uapi/asm/bitsperlong.h b/tools/arch/loongarch/include/uapi/asm/bitsperlong.h
new file mode 100644
index 000000000000..d4e32b3d4843
--- /dev/null
+++ b/tools/arch/loongarch/include/uapi/asm/bitsperlong.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_LOONGARCH_BITSPERLONG_H
+#define __ASM_LOONGARCH_BITSPERLONG_H
+
+#define __BITS_PER_LONG (__SIZEOF_POINTER__ * 8)
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* __ASM_LOONGARCH_BITSPERLONG_H */
diff --git a/tools/scripts/Makefile.arch b/tools/scripts/Makefile.arch
index 0c6c7f456887..1c72d07cb9fe 100644
--- a/tools/scripts/Makefile.arch
+++ b/tools/scripts/Makefile.arch
@@ -5,7 +5,7 @@ HOSTARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \
                                   -e s/s390x/s390/ -e s/parisc64/parisc/ \
                                   -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
                                   -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \
-                                  -e s/riscv.*/riscv/)
+                                  -e s/riscv.*/riscv/ -e s/loongarch.*/loongarch/)
 
 ifndef ARCH
 ARCH := $(HOSTARCH)
@@ -34,6 +34,15 @@ ifeq ($(ARCH),sh64)
        SRCARCH := sh
 endif
 
+# Additional ARCH settings for loongarch
+ifeq ($(ARCH),loongarch32)
+       SRCARCH := loongarch
+endif
+
+ifeq ($(ARCH),loongarch64)
+       SRCARCH := loongarch
+endif
+
 LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1)
 ifeq ($(LP64), 1)
   IS_64_BIT := 1
-- 
cgit 


From eb4071b988397144a35c4aa98061f587cbb3b467 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Sat, 25 Feb 2023 15:52:57 +0800
Subject: selftests/seccomp: Add LoongArch selftesting support

BPF for LoongArch is supported now, add the selftesting support in
seccomp_bpf.c.

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 9c2f448bb3a9..3933f34b6b05 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -128,6 +128,8 @@ struct seccomp_data {
 #  define __NR_seccomp 277
 # elif defined(__csky__)
 #  define __NR_seccomp 277
+# elif defined(__loongarch__)
+#  define __NR_seccomp 277
 # elif defined(__hppa__)
 #  define __NR_seccomp 338
 # elif defined(__powerpc__)
@@ -1753,6 +1755,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
 				    NT_ARM_SYSTEM_CALL, &__v));	\
 	} while (0)
 # define SYSCALL_RET(_regs)	(_regs).regs[0]
+#elif defined(__loongarch__)
+# define ARCH_REGS		struct user_pt_regs
+# define SYSCALL_NUM(_regs)	(_regs).regs[11]
+# define SYSCALL_RET(_regs)	(_regs).regs[4]
 #elif defined(__riscv) && __riscv_xlen == 64
 # define ARCH_REGS		struct user_regs_struct
 # define SYSCALL_NUM(_regs)	(_regs).a7
-- 
cgit 


From 8883bf83127d533abb415b204eabc064863ae6c9 Mon Sep 17 00:00:00 2001
From: Qing Zhang <zhangqing@loongson.cn>
Date: Sat, 25 Feb 2023 15:52:57 +0800
Subject: selftests/ftrace: Add LoongArch kprobe args string tests support

Before:
[5] Kprobe event string type argument	[UNTESTED]
[7] Kprobe event argument syntax	[UNTESTED]

After:
[5] Kprobe event string type argument	[PASS]
[7] Kprobe event argument syntax	[PASS]

Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc | 3 +++
 tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc | 4 ++++
 2 files changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc
index 459741565222..a4f8e7c53c1f 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc
@@ -28,6 +28,9 @@ s390*)
 mips*)
   ARG1=%r4
 ;;
+loongarch*)
+  ARG1=%r4
+;;
 *)
   echo "Please implement other architecture here"
   exit_untested
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc
index d4662c8cf407..1df61e13a812 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc
@@ -40,6 +40,10 @@ mips*)
   GOODREG=%r4
   BADREG=%r12
 ;;
+loongarch*)
+  GOODREG=%r4
+  BADREG=%r12
+;;
 *)
   echo "Please implement other architecture here"
   exit_untested
-- 
cgit 


From 2067e7a00aa604b94de31d64f29b8893b1696f26 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 27 Feb 2023 17:36:46 +0800
Subject: selftests: nft_nat: ensuring the listening side is up before starting
 the client

The test_local_dnat_portonly() function initiates the client-side as
soon as it sets the listening side to the background. This could lead to
a race condition where the server may not be ready to listen. To ensure
that the server-side is up and running before initiating the
client-side, a delay is introduced to the test_local_dnat_portonly()
function.

Before the fix:
  # ./nft_nat.sh
  PASS: netns routing/connectivity: ns0-rthlYrBU can reach ns1-rthlYrBU and ns2-rthlYrBU
  PASS: ping to ns1-rthlYrBU was ip NATted to ns2-rthlYrBU
  PASS: ping to ns1-rthlYrBU OK after ip nat output chain flush
  PASS: ipv6 ping to ns1-rthlYrBU was ip6 NATted to ns2-rthlYrBU
  2023/02/27 04:11:03 socat[6055] E connect(5, AF=2 10.0.1.99:2000, 16): Connection refused
  ERROR: inet port rewrite

After the fix:
  # ./nft_nat.sh
  PASS: netns routing/connectivity: ns0-9sPJV6JJ can reach ns1-9sPJV6JJ and ns2-9sPJV6JJ
  PASS: ping to ns1-9sPJV6JJ was ip NATted to ns2-9sPJV6JJ
  PASS: ping to ns1-9sPJV6JJ OK after ip nat output chain flush
  PASS: ipv6 ping to ns1-9sPJV6JJ was ip6 NATted to ns2-9sPJV6JJ
  PASS: inet port rewrite without l3 address

Fixes: 282e5f8fe907 ("netfilter: nat: really support inet nat without l3 address")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_nat.sh | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh
index 924ecb3f1f73..dd40d9f6f259 100755
--- a/tools/testing/selftests/netfilter/nft_nat.sh
+++ b/tools/testing/selftests/netfilter/nft_nat.sh
@@ -404,6 +404,8 @@ EOF
 	echo SERVER-$family | ip netns exec "$ns1" timeout 5 socat -u STDIN TCP-LISTEN:2000 &
 	sc_s=$!
 
+	sleep 1
+
 	result=$(ip netns exec "$ns0" timeout 1 socat TCP:$daddr:2000 STDOUT)
 
 	if [ "$result" = "SERVER-inet" ];then
-- 
cgit 


From f122a08b197d076ccf136c73fae0146875812a88 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 28 Feb 2023 11:39:09 -0800
Subject: capability: just use a 'u64' instead of a 'u32[2]' array

Back in 2008 we extended the capability bits from 32 to 64, and we did
it by extending the single 32-bit capability word from one word to an
array of two words.  It was then obfuscated by hiding the "2" behind two
macro expansions, with the reasoning being that maybe it gets extended
further some day.

That reasoning may have been valid at the time, but the last thing we
want to do is to extend the capability set any more.  And the array of
values not only causes source code oddities (with loops to deal with
it), but also results in worse code generation.  It's a lose-lose
situation.

So just change the 'u32[2]' into a 'u64' and be done with it.

We still have to deal with the fact that the user space interface is
designed around an array of these 32-bit values, but that was the case
before too, since the array layouts were different (ie user space
doesn't use an array of 32-bit values for individual capability masks,
but an array of 32-bit slices of multiple masks).

So that marshalling of data is actually simplified too, even if it does
remain somewhat obscure and odd.

This was all triggered by my reaction to the new "cap_isidentical()"
introduced recently.  By just using a saner data structure, it went from

	unsigned __capi;
	CAP_FOR_EACH_U32(__capi) {
		if (a.cap[__capi] != b.cap[__capi])
			return false;
	}
	return true;

to just being

	return a.val == b.val;

instead.  Which is rather more obvious both to humans and to compilers.

Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Paul Moore <paul@paul-moore.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c                                    |   7 +-
 include/linux/capability.h                         | 131 +++++----------------
 io_uring/fdinfo.c                                  |   4 +-
 kernel/auditsc.c                                   |   6 +-
 kernel/capability.c                                | 104 +++++++---------
 kernel/umh.c                                       |  41 +++----
 security/apparmor/policy_unpack.c                  |  40 +++++--
 security/commoncap.c                               |  49 ++++----
 .../selftests/bpf/progs/test_deny_namespace.c      |   7 +-
 9 files changed, 150 insertions(+), 239 deletions(-)

(limited to 'tools')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 49283b8103c7..9b0315d34c58 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -300,13 +300,8 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 static void render_cap_t(struct seq_file *m, const char *header,
 			kernel_cap_t *a)
 {
-	unsigned __capi;
-
 	seq_puts(m, header);
-	CAP_FOR_EACH_U32(__capi) {
-		seq_put_hex_ll(m, NULL,
-			   a->cap[CAP_LAST_U32 - __capi], 8);
-	}
+	seq_put_hex_ll(m, NULL, a->val, 16);
 	seq_putc(m, '\n');
 }
 
diff --git a/include/linux/capability.h b/include/linux/capability.h
index d3c6c2d1ff45..0c356a517991 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -15,28 +15,25 @@
 
 #include <uapi/linux/capability.h>
 #include <linux/uidgid.h>
+#include <linux/bits.h>
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
-#define _KERNEL_CAPABILITY_U32S    _LINUX_CAPABILITY_U32S_3
 
 extern int file_caps_enabled;
 
-typedef struct kernel_cap_struct {
-	__u32 cap[_KERNEL_CAPABILITY_U32S];
-} kernel_cap_t;
+typedef struct { u64 val; } kernel_cap_t;
 
 /* same as vfs_ns_cap_data but in cpu endian and always filled completely */
 struct cpu_vfs_cap_data {
 	__u32 magic_etc;
+	kuid_t rootid;
 	kernel_cap_t permitted;
 	kernel_cap_t inheritable;
-	kuid_t rootid;
 };
 
 #define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
-
 struct file;
 struct inode;
 struct dentry;
@@ -44,16 +41,6 @@ struct task_struct;
 struct user_namespace;
 struct mnt_idmap;
 
-extern const kernel_cap_t __cap_empty_set;
-extern const kernel_cap_t __cap_init_eff_set;
-
-/*
- * Internal kernel functions only
- */
-
-#define CAP_FOR_EACH_U32(__capi)  \
-	for (__capi = 0; __capi < _KERNEL_CAPABILITY_U32S; ++__capi)
-
 /*
  * CAP_FS_MASK and CAP_NFSD_MASKS:
  *
@@ -67,104 +54,52 @@ extern const kernel_cap_t __cap_init_eff_set;
  *   2. The security.* and trusted.* xattrs are fs-related MAC permissions
  */
 
-# define CAP_FS_MASK_B0     (CAP_TO_MASK(CAP_CHOWN)		\
-			    | CAP_TO_MASK(CAP_MKNOD)		\
-			    | CAP_TO_MASK(CAP_DAC_OVERRIDE)	\
-			    | CAP_TO_MASK(CAP_DAC_READ_SEARCH)	\
-			    | CAP_TO_MASK(CAP_FOWNER)		\
-			    | CAP_TO_MASK(CAP_FSETID))
-
-# define CAP_FS_MASK_B1     (CAP_TO_MASK(CAP_MAC_OVERRIDE))
-
-#if _KERNEL_CAPABILITY_U32S != 2
-# error Fix up hand-coded capability macro initializers
-#else /* HAND-CODED capability initializers */
+# define CAP_FS_MASK     (BIT_ULL(CAP_CHOWN)		\
+			| BIT_ULL(CAP_MKNOD)		\
+			| BIT_ULL(CAP_DAC_OVERRIDE)	\
+			| BIT_ULL(CAP_DAC_READ_SEARCH)	\
+			| BIT_ULL(CAP_FOWNER)		\
+			| BIT_ULL(CAP_FSETID)		\
+			| BIT_ULL(CAP_MAC_OVERRIDE))
+#define CAP_VALID_MASK	 (BIT_ULL(CAP_LAST_CAP+1)-1)
 
-#define CAP_LAST_U32			((_KERNEL_CAPABILITY_U32S) - 1)
-#define CAP_LAST_U32_VALID_MASK		(CAP_TO_MASK(CAP_LAST_CAP + 1) -1)
+# define CAP_EMPTY_SET    ((kernel_cap_t) { 0 })
+# define CAP_FULL_SET     ((kernel_cap_t) { CAP_VALID_MASK })
+# define CAP_FS_SET       ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) })
+# define CAP_NFSD_SET     ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) })
 
-# define CAP_EMPTY_SET    ((kernel_cap_t){{ 0, 0 }})
-# define CAP_FULL_SET     ((kernel_cap_t){{ ~0, CAP_LAST_U32_VALID_MASK }})
-# define CAP_FS_SET       ((kernel_cap_t){{ CAP_FS_MASK_B0 \
-				    | CAP_TO_MASK(CAP_LINUX_IMMUTABLE), \
-				    CAP_FS_MASK_B1 } })
-# define CAP_NFSD_SET     ((kernel_cap_t){{ CAP_FS_MASK_B0 \
-				    | CAP_TO_MASK(CAP_SYS_RESOURCE), \
-				    CAP_FS_MASK_B1 } })
+# define cap_clear(c)         do { (c).val = 0; } while (0)
 
-#endif /* _KERNEL_CAPABILITY_U32S != 2 */
-
-# define cap_clear(c)         do { (c) = __cap_empty_set; } while (0)
-
-#define cap_raise(c, flag)  ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag))
-#define cap_lower(c, flag)  ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag))
-#define cap_raised(c, flag) ((c).cap[CAP_TO_INDEX(flag)] & CAP_TO_MASK(flag))
-
-#define CAP_BOP_ALL(c, a, b, OP)                                    \
-do {                                                                \
-	unsigned __capi;                                            \
-	CAP_FOR_EACH_U32(__capi) {                                  \
-		c.cap[__capi] = a.cap[__capi] OP b.cap[__capi];     \
-	}                                                           \
-} while (0)
-
-#define CAP_UOP_ALL(c, a, OP)                                       \
-do {                                                                \
-	unsigned __capi;                                            \
-	CAP_FOR_EACH_U32(__capi) {                                  \
-		c.cap[__capi] = OP a.cap[__capi];                   \
-	}                                                           \
-} while (0)
+#define cap_raise(c, flag)  ((c).val |= BIT_ULL(flag))
+#define cap_lower(c, flag)  ((c).val &= ~BIT_ULL(flag))
+#define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0)
 
 static inline kernel_cap_t cap_combine(const kernel_cap_t a,
 				       const kernel_cap_t b)
 {
-	kernel_cap_t dest;
-	CAP_BOP_ALL(dest, a, b, |);
-	return dest;
+	return (kernel_cap_t) { a.val | b.val };
 }
 
 static inline kernel_cap_t cap_intersect(const kernel_cap_t a,
 					 const kernel_cap_t b)
 {
-	kernel_cap_t dest;
-	CAP_BOP_ALL(dest, a, b, &);
-	return dest;
+	return (kernel_cap_t) { a.val & b.val };
 }
 
 static inline kernel_cap_t cap_drop(const kernel_cap_t a,
 				    const kernel_cap_t drop)
 {
-	kernel_cap_t dest;
-	CAP_BOP_ALL(dest, a, drop, &~);
-	return dest;
-}
-
-static inline kernel_cap_t cap_invert(const kernel_cap_t c)
-{
-	kernel_cap_t dest;
-	CAP_UOP_ALL(dest, c, ~);
-	return dest;
+	return (kernel_cap_t) { a.val &~ drop.val };
 }
 
 static inline bool cap_isclear(const kernel_cap_t a)
 {
-	unsigned __capi;
-	CAP_FOR_EACH_U32(__capi) {
-		if (a.cap[__capi] != 0)
-			return false;
-	}
-	return true;
+	return !a.val;
 }
 
 static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b)
 {
-	unsigned __capi;
-	CAP_FOR_EACH_U32(__capi) {
-		if (a.cap[__capi] != b.cap[__capi])
-			return false;
-	}
-	return true;
+	return a.val == b.val;
 }
 
 /*
@@ -176,39 +111,31 @@ static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b)
  */
 static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set)
 {
-	kernel_cap_t dest;
-	dest = cap_drop(a, set);
-	return cap_isclear(dest);
+	return !(a.val & ~set.val);
 }
 
 /* Used to decide between falling back on the old suser() or fsuser(). */
 
 static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a)
 {
-	const kernel_cap_t __cap_fs_set = CAP_FS_SET;
-	return cap_drop(a, __cap_fs_set);
+	return cap_drop(a, CAP_FS_SET);
 }
 
 static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a,
 					    const kernel_cap_t permitted)
 {
-	const kernel_cap_t __cap_fs_set = CAP_FS_SET;
-	return cap_combine(a,
-			   cap_intersect(permitted, __cap_fs_set));
+	return cap_combine(a, cap_intersect(permitted, CAP_FS_SET));
 }
 
 static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a)
 {
-	const kernel_cap_t __cap_fs_set = CAP_NFSD_SET;
-	return cap_drop(a, __cap_fs_set);
+	return cap_drop(a, CAP_NFSD_SET);
 }
 
 static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a,
 					      const kernel_cap_t permitted)
 {
-	const kernel_cap_t __cap_nfsd_set = CAP_NFSD_SET;
-	return cap_combine(a,
-			   cap_intersect(permitted, __cap_nfsd_set));
+	return cap_combine(a, cap_intersect(permitted, CAP_NFSD_SET));
 }
 
 #ifdef CONFIG_MULTIUSER
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 882bd56b01ed..76c279b13aee 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -22,7 +22,6 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
 	struct user_namespace *uns = seq_user_ns(m);
 	struct group_info *gi;
 	kernel_cap_t cap;
-	unsigned __capi;
 	int g;
 
 	seq_printf(m, "%5d\n", id);
@@ -42,8 +41,7 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
 	}
 	seq_puts(m, "\n\tCapEff:\t");
 	cap = cred->cap_effective;
-	CAP_FOR_EACH_U32(__capi)
-		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
+	seq_put_hex_ll(m, NULL, cap.val, 16);
 	seq_putc(m, '\n');
 	return 0;
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 93d0b87f3283..addeed3df15d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1295,15 +1295,11 @@ out:
 static void audit_log_cap(struct audit_buffer *ab, char *prefix,
 			  kernel_cap_t *cap)
 {
-	int i;
-
 	if (cap_isclear(*cap)) {
 		audit_log_format(ab, " %s=0", prefix);
 		return;
 	}
-	audit_log_format(ab, " %s=", prefix);
-	CAP_FOR_EACH_U32(i)
-		audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+	audit_log_format(ab, " %s=%016llx", prefix, cap->val);
 }
 
 static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
diff --git a/kernel/capability.c b/kernel/capability.c
index 339a44dfe2f4..3e058f41df32 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -20,13 +20,6 @@
 #include <linux/user_namespace.h>
 #include <linux/uaccess.h>
 
-/*
- * Leveraged for setting/resetting capabilities
- */
-
-const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-EXPORT_SYMBOL(__cap_empty_set);
-
 int file_caps_enabled = 1;
 
 static int __init file_caps_disable(char *str)
@@ -151,6 +144,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 	pid_t pid;
 	unsigned tocopy;
 	kernel_cap_t pE, pI, pP;
+	struct __user_cap_data_struct kdata[2];
 
 	ret = cap_validate_magic(header, &tocopy);
 	if ((dataptr == NULL) || (ret != 0))
@@ -163,42 +157,46 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 		return -EINVAL;
 
 	ret = cap_get_target_pid(pid, &pE, &pI, &pP);
-	if (!ret) {
-		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
-		unsigned i;
-
-		for (i = 0; i < tocopy; i++) {
-			kdata[i].effective = pE.cap[i];
-			kdata[i].permitted = pP.cap[i];
-			kdata[i].inheritable = pI.cap[i];
-		}
-
-		/*
-		 * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
-		 * we silently drop the upper capabilities here. This
-		 * has the effect of making older libcap
-		 * implementations implicitly drop upper capability
-		 * bits when they perform a: capget/modify/capset
-		 * sequence.
-		 *
-		 * This behavior is considered fail-safe
-		 * behavior. Upgrading the application to a newer
-		 * version of libcap will enable access to the newer
-		 * capabilities.
-		 *
-		 * An alternative would be to return an error here
-		 * (-ERANGE), but that causes legacy applications to
-		 * unexpectedly fail; the capget/modify/capset aborts
-		 * before modification is attempted and the application
-		 * fails.
-		 */
-		if (copy_to_user(dataptr, kdata, tocopy
-				 * sizeof(struct __user_cap_data_struct))) {
-			return -EFAULT;
-		}
-	}
+	if (ret)
+		return ret;
 
-	return ret;
+	/*
+	 * Annoying legacy format with 64-bit capabilities exposed
+	 * as two sets of 32-bit fields, so we need to split the
+	 * capability values up.
+	 */
+	kdata[0].effective   = pE.val; kdata[1].effective   = pE.val >> 32;
+	kdata[0].permitted   = pP.val; kdata[1].permitted   = pP.val >> 32;
+	kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32;
+
+	/*
+	 * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+	 * we silently drop the upper capabilities here. This
+	 * has the effect of making older libcap
+	 * implementations implicitly drop upper capability
+	 * bits when they perform a: capget/modify/capset
+	 * sequence.
+	 *
+	 * This behavior is considered fail-safe
+	 * behavior. Upgrading the application to a newer
+	 * version of libcap will enable access to the newer
+	 * capabilities.
+	 *
+	 * An alternative would be to return an error here
+	 * (-ERANGE), but that causes legacy applications to
+	 * unexpectedly fail; the capget/modify/capset aborts
+	 * before modification is attempted and the application
+	 * fails.
+	 */
+	if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
+		return -EFAULT;
+
+	return 0;
+}
+
+static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
+{
+	return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK };
 }
 
 /**
@@ -221,8 +219,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
  */
 SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 {
-	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
-	unsigned i, tocopy, copybytes;
+	struct __user_cap_data_struct kdata[2] = { { 0, }, };
+	unsigned tocopy, copybytes;
 	kernel_cap_t inheritable, permitted, effective;
 	struct cred *new;
 	int ret;
@@ -246,21 +244,9 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 	if (copy_from_user(&kdata, data, copybytes))
 		return -EFAULT;
 
-	for (i = 0; i < tocopy; i++) {
-		effective.cap[i] = kdata[i].effective;
-		permitted.cap[i] = kdata[i].permitted;
-		inheritable.cap[i] = kdata[i].inheritable;
-	}
-	while (i < _KERNEL_CAPABILITY_U32S) {
-		effective.cap[i] = 0;
-		permitted.cap[i] = 0;
-		inheritable.cap[i] = 0;
-		i++;
-	}
-
-	effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
-	permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
-	inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	effective   = mk_kernel_cap(kdata[0].effective,   kdata[1].effective);
+	permitted   = mk_kernel_cap(kdata[0].permitted,   kdata[1].permitted);
+	inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);
 
 	new = prepare_creds();
 	if (!new)
diff --git a/kernel/umh.c b/kernel/umh.c
index fbf872c624cb..2a4708277335 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -501,9 +501,9 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 			 void *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table t;
-	unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
-	kernel_cap_t new_cap;
-	int err, i;
+	unsigned long cap_array[2];
+	kernel_cap_t new_cap, *cap;
+	int err;
 
 	if (write && (!capable(CAP_SETPCAP) ||
 		      !capable(CAP_SYS_MODULE)))
@@ -514,14 +514,16 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	 * userspace if this is a read.
 	 */
 	spin_lock(&umh_sysctl_lock);
-	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)  {
-		if (table->data == CAP_BSET)
-			cap_array[i] = usermodehelper_bset.cap[i];
-		else if (table->data == CAP_PI)
-			cap_array[i] = usermodehelper_inheritable.cap[i];
-		else
-			BUG();
-	}
+	if (table->data == CAP_BSET)
+		cap = &usermodehelper_bset;
+	else if (table->data == CAP_PI)
+		cap = &usermodehelper_inheritable;
+	else
+		BUG();
+
+	/* Legacy format: capabilities are exposed as two 32-bit values */
+	cap_array[0] = (u32) cap->val;
+	cap_array[1] = cap->val >> 32;
 	spin_unlock(&umh_sysctl_lock);
 
 	t = *table;
@@ -535,22 +537,15 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	if (err < 0)
 		return err;
 
-	/*
-	 * convert from the sysctl array of ulongs to the kernel_cap_t
-	 * internal representation
-	 */
-	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
-		new_cap.cap[i] = cap_array[i];
+	new_cap.val = (u32)cap_array[0];
+	new_cap.val += (u64)cap_array[1] << 32;
 
 	/*
 	 * Drop everything not in the new_cap (but don't add things)
 	 */
 	if (write) {
 		spin_lock(&umh_sysctl_lock);
-		if (table->data == CAP_BSET)
-			usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
-		if (table->data == CAP_PI)
-			usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+		*cap = cap_intersect(*cap, new_cap);
 		spin_unlock(&umh_sysctl_lock);
 	}
 
@@ -561,14 +556,14 @@ struct ctl_table usermodehelper_table[] = {
 	{
 		.procname	= "bset",
 		.data		= CAP_BSET,
-		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+		.maxlen		= 2 * sizeof(unsigned long),
 		.mode		= 0600,
 		.proc_handler	= proc_cap_handler,
 	},
 	{
 		.procname	= "inheritable",
 		.data		= CAP_PI,
-		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+		.maxlen		= 2 * sizeof(unsigned long),
 		.mode		= 0600,
 		.proc_handler	= proc_cap_handler,
 	},
diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c
index 5e9949832af6..cf2ceec40b28 100644
--- a/security/apparmor/policy_unpack.c
+++ b/security/apparmor/policy_unpack.c
@@ -304,6 +304,26 @@ fail:
 }
 EXPORT_SYMBOL_IF_KUNIT(aa_unpack_u64);
 
+static bool aa_unpack_cap_low(struct aa_ext *e, kernel_cap_t *data, const char *name)
+{
+	u32 val;
+
+	if (!aa_unpack_u32(e, &val, name))
+		return false;
+	data->val = val;
+	return true;
+}
+
+static bool aa_unpack_cap_high(struct aa_ext *e, kernel_cap_t *data, const char *name)
+{
+	u32 val;
+
+	if (!aa_unpack_u32(e, &val, name))
+		return false;
+	data->val = (u32)data->val | ((u64)val << 32);
+	return true;
+}
+
 VISIBLE_IF_KUNIT bool aa_unpack_array(struct aa_ext *e, const char *name, u16 *size)
 {
 	void *pos = e->pos;
@@ -897,25 +917,25 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name)
 		profile->path_flags = PATH_MEDIATE_DELETED;
 
 	info = "failed to unpack profile capabilities";
-	if (!aa_unpack_u32(e, &(rules->caps.allow.cap[0]), NULL))
+	if (!aa_unpack_cap_low(e, &rules->caps.allow, NULL))
 		goto fail;
-	if (!aa_unpack_u32(e, &(rules->caps.audit.cap[0]), NULL))
+	if (!aa_unpack_cap_low(e, &rules->caps.audit, NULL))
 		goto fail;
-	if (!aa_unpack_u32(e, &(rules->caps.quiet.cap[0]), NULL))
+	if (!aa_unpack_cap_low(e, &rules->caps.quiet, NULL))
 		goto fail;
-	if (!aa_unpack_u32(e, &tmpcap.cap[0], NULL))
+	if (!aa_unpack_cap_low(e, &tmpcap, NULL))
 		goto fail;
 
 	info = "failed to unpack upper profile capabilities";
 	if (aa_unpack_nameX(e, AA_STRUCT, "caps64")) {
 		/* optional upper half of 64 bit caps */
-		if (!aa_unpack_u32(e, &(rules->caps.allow.cap[1]), NULL))
+		if (!aa_unpack_cap_high(e, &rules->caps.allow, NULL))
 			goto fail;
-		if (!aa_unpack_u32(e, &(rules->caps.audit.cap[1]), NULL))
+		if (!aa_unpack_cap_high(e, &rules->caps.audit, NULL))
 			goto fail;
-		if (!aa_unpack_u32(e, &(rules->caps.quiet.cap[1]), NULL))
+		if (!aa_unpack_cap_high(e, &rules->caps.quiet, NULL))
 			goto fail;
-		if (!aa_unpack_u32(e, &(tmpcap.cap[1]), NULL))
+		if (!aa_unpack_cap_high(e, &tmpcap, NULL))
 			goto fail;
 		if (!aa_unpack_nameX(e, AA_STRUCTEND, NULL))
 			goto fail;
@@ -924,9 +944,9 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name)
 	info = "failed to unpack extended profile capabilities";
 	if (aa_unpack_nameX(e, AA_STRUCT, "capsx")) {
 		/* optional extended caps mediation mask */
-		if (!aa_unpack_u32(e, &(rules->caps.extended.cap[0]), NULL))
+		if (!aa_unpack_cap_low(e, &rules->caps.extended, NULL))
 			goto fail;
-		if (!aa_unpack_u32(e, &(rules->caps.extended.cap[1]), NULL))
+		if (!aa_unpack_cap_high(e, &rules->caps.extended, NULL))
 			goto fail;
 		if (!aa_unpack_nameX(e, AA_STRUCTEND, NULL))
 			goto fail;
diff --git a/security/commoncap.c b/security/commoncap.c
index aec62db55271..5bb7d1e96277 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -589,7 +589,6 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 					  bool *has_fcap)
 {
 	struct cred *new = bprm->cred;
-	unsigned i;
 	int ret = 0;
 
 	if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
@@ -598,22 +597,17 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 	if (caps->magic_etc & VFS_CAP_REVISION_MASK)
 		*has_fcap = true;
 
-	CAP_FOR_EACH_U32(i) {
-		__u32 permitted = caps->permitted.cap[i];
-		__u32 inheritable = caps->inheritable.cap[i];
-
-		/*
-		 * pP' = (X & fP) | (pI & fI)
-		 * The addition of pA' is handled later.
-		 */
-		new->cap_permitted.cap[i] =
-			(new->cap_bset.cap[i] & permitted) |
-			(new->cap_inheritable.cap[i] & inheritable);
+	/*
+	 * pP' = (X & fP) | (pI & fI)
+	 * The addition of pA' is handled later.
+	 */
+	new->cap_permitted.val =
+		(new->cap_bset.val & caps->permitted.val) |
+		(new->cap_inheritable.val & caps->inheritable.val);
 
-		if (permitted & ~new->cap_permitted.cap[i])
-			/* insufficient to execute correctly */
-			ret = -EPERM;
-	}
+	if (caps->permitted.val & ~new->cap_permitted.val)
+		/* insufficient to execute correctly */
+		ret = -EPERM;
 
 	/*
 	 * For legacy apps, with no internal support for recognizing they
@@ -644,7 +638,6 @@ int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
 {
 	struct inode *inode = d_backing_inode(dentry);
 	__u32 magic_etc;
-	unsigned tocopy, i;
 	int size;
 	struct vfs_ns_cap_data data, *nscaps = &data;
 	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
@@ -677,17 +670,14 @@ int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
 	case VFS_CAP_REVISION_1:
 		if (size != XATTR_CAPS_SZ_1)
 			return -EINVAL;
-		tocopy = VFS_CAP_U32_1;
 		break;
 	case VFS_CAP_REVISION_2:
 		if (size != XATTR_CAPS_SZ_2)
 			return -EINVAL;
-		tocopy = VFS_CAP_U32_2;
 		break;
 	case VFS_CAP_REVISION_3:
 		if (size != XATTR_CAPS_SZ_3)
 			return -EINVAL;
-		tocopy = VFS_CAP_U32_3;
 		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
 		break;
 
@@ -705,15 +695,20 @@ int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
 	if (!rootid_owns_currentns(rootvfsuid))
 		return -ENODATA;
 
-	CAP_FOR_EACH_U32(i) {
-		if (i >= tocopy)
-			break;
-		cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
-		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
+	cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
+	cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
+
+	/*
+	 * Rev1 had just a single 32-bit word, later expanded
+	 * to a second one for the high bits
+	 */
+	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
+		cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
+		cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
 	}
 
-	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
-	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	cpu_caps->permitted.val &= CAP_VALID_MASK;
+	cpu_caps->inheritable.val &= CAP_VALID_MASK;
 
 	cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid);
 
diff --git a/tools/testing/selftests/bpf/progs/test_deny_namespace.c b/tools/testing/selftests/bpf/progs/test_deny_namespace.c
index 09ad5a4ebd1f..591104e79812 100644
--- a/tools/testing/selftests/bpf/progs/test_deny_namespace.c
+++ b/tools/testing/selftests/bpf/progs/test_deny_namespace.c
@@ -6,7 +6,7 @@
 #include <linux/capability.h>
 
 struct kernel_cap_struct {
-	__u32 cap[_LINUX_CAPABILITY_U32S_3];
+	__u64 val;
 } __attribute__((preserve_access_index));
 
 struct cred {
@@ -19,14 +19,13 @@ SEC("lsm.s/userns_create")
 int BPF_PROG(test_userns_create, const struct cred *cred, int ret)
 {
 	struct kernel_cap_struct caps = cred->cap_effective;
-	int cap_index = CAP_TO_INDEX(CAP_SYS_ADMIN);
-	__u32 cap_mask = CAP_TO_MASK(CAP_SYS_ADMIN);
+	__u64 cap_mask = BIT_LL(CAP_SYS_ADMIN);
 
 	if (ret)
 		return 0;
 
 	ret = -EPERM;
-	if (caps.cap[cap_index] & cap_mask)
+	if (caps.val & cap_mask)
 		return 0;
 
 	return -EPERM;
-- 
cgit 


From a98c0710b47d7dd44a351d08542360ff23f78330 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 23 Feb 2023 10:54:41 -0300
Subject: tools headers svm: Sync svm headers with the kernel sources

To pick the changes in:

  8c29f01654053258 ("x86/sev: Add SEV-SNP guest feature negotiation support")

That triggers:

  CC      /tmp/build/perf-tools/arch/x86/util/kvm-stat.o
  CC      /tmp/build/perf-tools/util/header.o
  LD      /tmp/build/perf-tools/arch/x86/util/perf-in.o
  LD      /tmp/build/perf-tools/arch/x86/perf-in.o
  LD      /tmp/build/perf-tools/arch/perf-in.o
  LD      /tmp/build/perf-tools/util/perf-in.o
  LD      /tmp/build/perf-tools/perf-in.o
  LINK    /tmp/build/perf-tools/perf

But this time causes no changes in tooling results, as the introduced
SVM_VMGEXIT_TERM_REQUEST exit reason wasn't added to SVM_EXIT_REASONS,
that is used in kvm-stat.c.

And addresses this perf build warning:

  Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/svm.h' differs from latest version at 'arch/x86/include/uapi/asm/svm.h'
  diff -u tools/arch/x86/include/uapi/asm/svm.h arch/x86/include/uapi/asm/svm.h

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nikunj A Dadhania <nikunj@amd.com>
Link: http://lore.kernel.org/lkml/
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/uapi/asm/svm.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/arch/x86/include/uapi/asm/svm.h b/tools/arch/x86/include/uapi/asm/svm.h
index f69c168391aa..80e1df482337 100644
--- a/tools/arch/x86/include/uapi/asm/svm.h
+++ b/tools/arch/x86/include/uapi/asm/svm.h
@@ -116,6 +116,12 @@
 #define SVM_VMGEXIT_AP_CREATE			1
 #define SVM_VMGEXIT_AP_DESTROY			2
 #define SVM_VMGEXIT_HV_FEATURES			0x8000fffd
+#define SVM_VMGEXIT_TERM_REQUEST		0x8000fffe
+#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code)	\
+	/* SW_EXITINFO1[3:0] */					\
+	(((((u64)reason_set) & 0xf)) |				\
+	/* SW_EXITINFO1[11:4] */				\
+	((((u64)reason_code) & 0xff) << 4))
 #define SVM_VMGEXIT_UNSUPPORTED_EVENT		0x8000ffff
 
 /* Exit code reserved for hypervisor/software use */
-- 
cgit 


From 25f69c69bc3ca8c781a94473f28d443d745768e3 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@huawei.com>
Date: Thu, 2 Mar 2023 11:11:44 +0800
Subject: perf stat: Fix counting when initial delay configured

When creating counters with initial delay configured, the enable_on_exec
field is not set. So we need to enable the counters later. The problem
is, when a workload is specified the target__none() is true. So we also
need to check stat_config.initial_delay.

In this change, we add a new field 'initial_delay' for struct target
which could be shared by other subcommands. And define
target__enable_on_exec() which returns whether enable_on_exec should be
set on normal cases.

Before this fix the event is not counted:

  $ ./perf stat -e instructions -D 100 sleep 2
  Events disabled
  Events enabled

   Performance counter stats for 'sleep 2':

       <not counted>      instructions

         1.901661124 seconds time elapsed

         0.001602000 seconds user
         0.000000000 seconds sys

After fix it works:

  $ ./perf stat -e instructions -D 100 sleep 2
  Events disabled
  Events enabled

   Performance counter stats for 'sleep 2':

             404,214      instructions

         1.901743475 seconds time elapsed

         0.001617000 seconds user
         0.000000000 seconds sys

Fixes: c587e77e100fa40e ("perf stat: Do not delay the workload with --delay")
Signed-off-by: Changbin Du <changbin.du@huawei.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Hui Wang <hw.huiwang@huawei.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230302031146.2801588-2-changbin.du@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 15 +++++----------
 tools/perf/util/stat.c    |  6 +-----
 tools/perf/util/stat.h    |  1 -
 tools/perf/util/target.h  | 12 ++++++++++++
 4 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 5d18a5a6f662..fa7c40956d0f 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -539,12 +539,7 @@ static int enable_counters(void)
 			return err;
 	}
 
-	/*
-	 * We need to enable counters only if:
-	 * - we don't have tracee (attaching to task or cpu)
-	 * - we have initial delay configured
-	 */
-	if (!target__none(&target)) {
+	if (!target__enable_on_exec(&target)) {
 		if (!all_counters_use_bpf)
 			evlist__enable(evsel_list);
 	}
@@ -914,7 +909,7 @@ try_again_reset:
 			return err;
 	}
 
-	if (stat_config.initial_delay) {
+	if (target.initial_delay) {
 		pr_info(EVLIST_DISABLED_MSG);
 	} else {
 		err = enable_counters();
@@ -926,8 +921,8 @@ try_again_reset:
 	if (forks)
 		evlist__start_workload(evsel_list);
 
-	if (stat_config.initial_delay > 0) {
-		usleep(stat_config.initial_delay * USEC_PER_MSEC);
+	if (target.initial_delay > 0) {
+		usleep(target.initial_delay * USEC_PER_MSEC);
 		err = enable_counters();
 		if (err)
 			return -1;
@@ -1248,7 +1243,7 @@ static struct option stat_options[] = {
 		     "aggregate counts per thread", AGGR_THREAD),
 	OPT_SET_UINT(0, "per-node", &stat_config.aggr_mode,
 		     "aggregate counts per numa node", AGGR_NODE),
-	OPT_INTEGER('D', "delay", &stat_config.initial_delay,
+	OPT_INTEGER('D', "delay", &target.initial_delay,
 		    "ms to wait before starting measurement after program start (-1: start with events disabled)"),
 	OPT_CALLBACK_NOOPT(0, "metric-only", &stat_config.metric_only, NULL,
 			"Only print computed metrics. No raw values", enable_metric_only),
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 534d36d26fc3..a07473703c6d 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -842,11 +842,7 @@ int create_perf_stat_counter(struct evsel *evsel,
 	if (evsel__is_group_leader(evsel)) {
 		attr->disabled = 1;
 
-		/*
-		 * In case of initial_delay we enable tracee
-		 * events manually.
-		 */
-		if (target__none(target) && !config->initial_delay)
+		if (target__enable_on_exec(target))
 			attr->enable_on_exec = 1;
 	}
 
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index b1c29156c560..bf1794ebc916 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -166,7 +166,6 @@ struct perf_stat_config {
 	FILE			*output;
 	unsigned int		 interval;
 	unsigned int		 timeout;
-	int			 initial_delay;
 	unsigned int		 unit_width;
 	unsigned int		 metric_only_len;
 	int			 times;
diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h
index daec6cba500d..880f1af7f6ad 100644
--- a/tools/perf/util/target.h
+++ b/tools/perf/util/target.h
@@ -18,6 +18,7 @@ struct target {
 	bool	     per_thread;
 	bool	     use_bpf;
 	bool	     hybrid;
+	int	     initial_delay;
 	const char   *attr_map;
 };
 
@@ -72,6 +73,17 @@ static inline bool target__none(struct target *target)
 	return !target__has_task(target) && !target__has_cpu(target);
 }
 
+static inline bool target__enable_on_exec(struct target *target)
+{
+	/*
+	 * Normally enable_on_exec should be set if:
+	 *  1) The tracee process is forked (not attaching to existed task or cpu).
+	 *  2) And initial_delay is not configured.
+	 * Otherwise, we enable tracee events manually.
+	 */
+	return target__none(target) && !target->initial_delay;
+}
+
 static inline bool target__has_per_thread(struct target *target)
 {
 	return target->system_wide && target->per_thread;
-- 
cgit 


From 7cf93538e087a792cb476008a757ab7c1c23b68c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 1 Mar 2023 10:36:40 -0800
Subject: tools: ynl: fully inherit attrs in subsets

To avoid having to repeat the entire definition of an attribute
(including the value) use the Attr object from the original set.
In fact this is already the documented expectation.

Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/userspace-api/netlink/specs.rst |  3 ++-
 tools/net/ynl/lib/nlspec.py                   | 23 +++++++++++++++--------
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/Documentation/userspace-api/netlink/specs.rst b/Documentation/userspace-api/netlink/specs.rst
index 6ffe8137cd90..1424ab1b9b33 100644
--- a/Documentation/userspace-api/netlink/specs.rst
+++ b/Documentation/userspace-api/netlink/specs.rst
@@ -199,7 +199,8 @@ The ``value`` property can be skipped, in which case the attribute ID
 will be the value of the previous attribute plus one (recursively)
 and ``0`` for the first attribute in the attribute set.
 
-Note that the ``value`` of an attribute is defined only in its main set.
+Note that the ``value`` of an attribute is defined only in its main set
+(not in subsets).
 
 enum
 ~~~~
diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
index 71da568e2c28..dff31dad36c5 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/lib/nlspec.py
@@ -95,15 +95,22 @@ class SpecAttrSet(SpecElement):
         self.attrs = collections.OrderedDict()
         self.attrs_by_val = collections.OrderedDict()
 
-        val = 0
-        for elem in self.yaml['attributes']:
-            if 'value' in elem:
-                val = elem['value']
+        if self.subset_of is None:
+            val = 0
+            for elem in self.yaml['attributes']:
+                if 'value' in elem:
+                    val = elem['value']
 
-            attr = self.new_attr(elem, val)
-            self.attrs[attr.name] = attr
-            self.attrs_by_val[attr.value] = attr
-            val += 1
+                attr = self.new_attr(elem, val)
+                self.attrs[attr.name] = attr
+                self.attrs_by_val[attr.value] = attr
+                val += 1
+        else:
+            real_set = family.attr_sets[self.subset_of]
+            for elem in self.yaml['attributes']:
+                attr = real_set[elem['name']]
+                self.attrs[attr.name] = attr
+                self.attrs_by_val[attr.value] = attr
 
     def new_attr(self, elem, value):
         return SpecAttr(self.family, self, elem, value)
-- 
cgit 


From ad4fafcde5bc1badb8fc2c7f260a5d6b83a038e4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 1 Mar 2023 10:36:41 -0800
Subject: tools: ynl: use 1 as the default for first entry in attrs/ops

Pretty much all families use value: 1 or reserve as unspec
the first entry in attribute set and the first operation.
Make this the default. Update documentation (the doc for
values of operations just refers back to doc for attrs
so updating only attrs).

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/userspace-api/netlink/specs.rst | 7 ++++++-
 tools/net/ynl/lib/nlspec.py                   | 6 +++---
 tools/net/ynl/ynl-gen-c.py                    | 7 +++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/Documentation/userspace-api/netlink/specs.rst b/Documentation/userspace-api/netlink/specs.rst
index 1424ab1b9b33..32e53328d113 100644
--- a/Documentation/userspace-api/netlink/specs.rst
+++ b/Documentation/userspace-api/netlink/specs.rst
@@ -197,7 +197,12 @@ value
 Numerical attribute ID, used in serialized Netlink messages.
 The ``value`` property can be skipped, in which case the attribute ID
 will be the value of the previous attribute plus one (recursively)
-and ``0`` for the first attribute in the attribute set.
+and ``1`` for the first attribute in the attribute set.
+
+Attributes (and operations) use ``1`` as the default value for the first
+entry (unlike enums in definitions which start from ``0``) because
+entry ``0`` is almost always reserved as undefined. Spec can explicitly
+set value to ``0`` if needed.
 
 Note that the ``value`` of an attribute is defined only in its main set
 (not in subsets).
diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
index dff31dad36c5..9d394e50de23 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/lib/nlspec.py
@@ -96,7 +96,7 @@ class SpecAttrSet(SpecElement):
         self.attrs_by_val = collections.OrderedDict()
 
         if self.subset_of is None:
-            val = 0
+            val = 1
             for elem in self.yaml['attributes']:
                 if 'value' in elem:
                     val = elem['value']
@@ -252,7 +252,7 @@ class SpecFamily(SpecElement):
         self._resolution_list.append(elem)
 
     def _dictify_ops_unified(self):
-        val = 0
+        val = 1
         for elem in self.yaml['operations']['list']:
             if 'value' in elem:
                 val = elem['value']
@@ -263,7 +263,7 @@ class SpecFamily(SpecElement):
             self.msgs[op.name] = op
 
     def _dictify_ops_directional(self):
-        req_val = rsp_val = 0
+        req_val = rsp_val = 1
         for elem in self.yaml['operations']['list']:
             if 'notify' in elem:
                 if 'value' in elem:
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index 274e9c566f61..62f8f2c3c56c 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -2044,14 +2044,17 @@ def render_uapi(family, cw):
     max_value = f"({cnt_name} - 1)"
 
     uapi_enum_start(family, cw, family['operations'], 'enum-name')
+    val = 0
     for op in family.msgs.values():
         if separate_ntf and ('notify' in op or 'event' in op):
             continue
 
         suffix = ','
-        if 'value' in op:
-            suffix = f" = {op['value']},"
+        if op.value != val:
+            suffix = f" = {op.value},"
+            val = op.value
         cw.p(op.enum_name + suffix)
+        val += 1
     cw.nl()
     cw.p(cnt_name + ('' if max_by_define else ','))
     if not max_by_define:
-- 
cgit 


From 31d2e6b5d44e436969c15e4613e6b16c4c032d4d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 3 Mar 2023 11:05:19 -0300
Subject: tools headers: Update the copy of x86's mem{cpy,set}_64.S used in
 'perf bench'

We also continue with SYM_TYPED_FUNC_START() in util/include/linux/linkage.h
and with an exception in tools/perf/check_headers.sh's diff check to ignore
the include cfi_types.h line when checking if the kernel original files drifted
from the copies we carry.

This is to get the changes from:

  69d4c0d3218692ff ("entry, kasan, x86: Disallow overriding mem*() functions")

That addresses these perf tools build warning:

  Warning: Kernel ABI header at 'tools/arch/x86/lib/memcpy_64.S' differs from latest version at 'arch/x86/lib/memcpy_64.S'
  diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S
  Warning: Kernel ABI header at 'tools/arch/x86/lib/memset_64.S' differs from latest version at 'arch/x86/lib/memset_64.S'
  diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/lkml/ZAH%2FjsioJXGIOrkf@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/lib/memcpy_64.S | 5 ++---
 tools/arch/x86/lib/memset_64.S | 4 +++-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S
index 5418e2f99834..a91ac666f758 100644
--- a/tools/arch/x86/lib/memcpy_64.S
+++ b/tools/arch/x86/lib/memcpy_64.S
@@ -7,7 +7,7 @@
 #include <asm/alternative.h>
 #include <asm/export.h>
 
-.pushsection .noinstr.text, "ax"
+.section .noinstr.text, "ax"
 
 /*
  * We build a jump to memcpy_orig by default which gets NOPped out on
@@ -42,7 +42,7 @@ SYM_TYPED_FUNC_START(__memcpy)
 SYM_FUNC_END(__memcpy)
 EXPORT_SYMBOL(__memcpy)
 
-SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+SYM_FUNC_ALIAS(memcpy, __memcpy)
 EXPORT_SYMBOL(memcpy)
 
 /*
@@ -183,4 +183,3 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
 	RET
 SYM_FUNC_END(memcpy_orig)
 
-.popsection
diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S
index fc9ffd3ff3b2..6143b1a6fa2c 100644
--- a/tools/arch/x86/lib/memset_64.S
+++ b/tools/arch/x86/lib/memset_64.S
@@ -6,6 +6,8 @@
 #include <asm/alternative.h>
 #include <asm/export.h>
 
+.section .noinstr.text, "ax"
+
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
@@ -43,7 +45,7 @@ SYM_FUNC_START(__memset)
 SYM_FUNC_END(__memset)
 EXPORT_SYMBOL(__memset)
 
-SYM_FUNC_ALIAS_WEAK(memset, __memset)
+SYM_FUNC_ALIAS(memset, __memset)
 EXPORT_SYMBOL(memset)
 
 /*
-- 
cgit 


From df4b933e0e51e43bd27a495dc747db0851e6dd34 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 3 Mar 2023 11:48:09 -0300
Subject: tools headers UAPI: Sync linux/prctl.h with the kernel sources

To pick new prctl options introduced in:

  b507808ebce23561 ("mm: implement memory-deny-write-execute as a prctl")

That results in:

  $ diff -u tools/include/uapi/linux/prctl.h include/uapi/linux/prctl.h
  --- tools/include/uapi/linux/prctl.h	2022-06-20 17:54:43.884515663 -0300
  +++ include/uapi/linux/prctl.h	2023-03-03 11:18:51.090923569 -0300
  @@ -281,6 +281,12 @@
   # define PR_SME_VL_LEN_MASK		0xffff
   # define PR_SME_VL_INHERIT		(1 << 17) /* inherit across exec */

  +/* Memory deny write / execute */
  +#define PR_SET_MDWE			65
  +# define PR_MDWE_REFUSE_EXEC_GAIN	1
  +
  +#define PR_GET_MDWE			66
  +
   #define PR_SET_VMA		0x53564d41
   # define PR_SET_VMA_ANON_NAME		0

  $ tools/perf/trace/beauty/prctl_option.sh > before
  $ cp include/uapi/linux/prctl.h tools/include/uapi/linux/prctl.h
  $ tools/perf/trace/beauty/prctl_option.sh > after
  $ diff -u before after
  --- before	2023-03-03 11:47:43.320013146 -0300
  +++ after	2023-03-03 11:47:50.937216229 -0300
  @@ -59,6 +59,8 @@
   	[62] = "SCHED_CORE",
   	[63] = "SME_SET_VL",
   	[64] = "SME_GET_VL",
  +	[65] = "SET_MDWE",
  +	[66] = "GET_MDWE",
   };
   static const char *prctl_set_mm_options[] = {
   	[1] = "START_CODE",
  $

Now users can do:

  # perf trace -e syscalls:sys_enter_prctl --filter "option==SET_MDWE||option==GET_MDWE"
^C#
  # trace -v -e syscalls:sys_enter_prctl --filter "option==SET_MDWE||option==GET_MDWE"
  New filter for syscalls:sys_enter_prctl: (option==65||option==66) && (common_pid != 5519 && common_pid != 3404)
^C#

And when these prctl options appears in a session, they will be
translated to the corresponding string.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Joey Gouly <joey.gouly@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/ZAI%2FAoPXb%2Fsxz1%2Fm@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/uapi/linux/prctl.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index a5e06dcbba13..1312a137f7fb 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -281,6 +281,12 @@ struct prctl_mm_map {
 # define PR_SME_VL_LEN_MASK		0xffff
 # define PR_SME_VL_INHERIT		(1 << 17) /* inherit across exec */
 
+/* Memory deny write / execute */
+#define PR_SET_MDWE			65
+# define PR_MDWE_REFUSE_EXEC_GAIN	1
+
+#define PR_GET_MDWE			66
+
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
-- 
cgit 


From 811f35ff59b6f99ae272d6f5b96bc9e974f88196 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 3 Mar 2023 16:49:20 -0300
Subject: tools headers: Synchronize {linux,vdso}/bits.h with the kernel
 sources

To pick up the changes in this cset:

  cbdb1f163af2bb90 ("vdso/bits.h: Add BIT_ULL() for the sake of consistency")

That just causes perf to rebuild, the macro included doesn't clash with
anything in tools/{perf,objtool,bpf}.

This addresses this perf build warning:

  Warning: Kernel ABI header at 'tools/include/linux/bits.h' differs from latest version at 'include/linux/bits.h'
  diff -u tools/include/linux/bits.h include/linux/bits.h
  Warning: Kernel ABI header at 'tools/include/vdso/bits.h' differs from latest version at 'include/vdso/bits.h'
  diff -u tools/include/vdso/bits.h include/vdso/bits.h

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/linux/bits.h | 1 -
 tools/include/vdso/bits.h  | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h
index 87d112650dfb..7c0cf5031abe 100644
--- a/tools/include/linux/bits.h
+++ b/tools/include/linux/bits.h
@@ -6,7 +6,6 @@
 #include <vdso/bits.h>
 #include <asm/bitsperlong.h>
 
-#define BIT_ULL(nr)		(ULL(1) << (nr))
 #define BIT_MASK(nr)		(UL(1) << ((nr) % BITS_PER_LONG))
 #define BIT_WORD(nr)		((nr) / BITS_PER_LONG)
 #define BIT_ULL_MASK(nr)	(ULL(1) << ((nr) % BITS_PER_LONG_LONG))
diff --git a/tools/include/vdso/bits.h b/tools/include/vdso/bits.h
index 6d005a1f5d94..388b212088ea 100644
--- a/tools/include/vdso/bits.h
+++ b/tools/include/vdso/bits.h
@@ -5,5 +5,6 @@
 #include <vdso/const.h>
 
 #define BIT(nr)			(UL(1) << (nr))
+#define BIT_ULL(nr)		(ULL(1) << (nr))
 
 #endif	/* __VDSO_BITS_H */
-- 
cgit 


From 5f800380af3881f1a0528d332838d4619a64593b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 3 Mar 2023 16:57:50 -0300
Subject: tools include UAPI: Synchronize linux/fcntl.h with the kernel sources

To pick up the changes in:

  6fd7353829cafc40 ("mm/memfd: add F_SEAL_EXEC")

That doesn't add or change any perf tools functionality, only addresses
these build warnings:

  Warning: Kernel ABI header at 'tools/include/uapi/linux/fcntl.h' differs from latest version at 'include/uapi/linux/fcntl.h'
  diff -u tools/include/uapi/linux/fcntl.h include/uapi/linux/fcntl.h

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/uapi/linux/fcntl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..e8c07da58c9f 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -43,6 +43,7 @@
 #define F_SEAL_GROW	0x0004	/* prevent file from growing */
 #define F_SEAL_WRITE	0x0008	/* prevent writes */
 #define F_SEAL_FUTURE_WRITE	0x0010  /* prevent future writes while mapped */
+#define F_SEAL_EXEC	0x0020  /* prevent chmod modifying exec bits */
 /* (1U << 31) is reserved for signed error codes */
 
 /*
-- 
cgit 


From 33c53f9b5ac60c7d5c6acc91a6b6d933e0a417e3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 3 Mar 2023 17:26:24 -0300
Subject: tools headers kvm: Sync uapi/{asm/linux} kvm.h headers with the
 kernel sources

To pick up the changes in:

  89b0e7de3451a17f ("KVM: arm64: nv: Introduce nested virtualization VCPU feature")
  14329b825ffb7f27 ("KVM: x86/pmu: Introduce masked events to the pmu event filter")
  6213b701a9df0472 ("KVM: x86: Replace 0-length arrays with flexible arrays")
  3fd49805d19d1c56 ("KVM: s390: Extend MEM_OP ioctl by storage key checked cmpxchg")
  14329b825ffb7f27 ("KVM: x86/pmu: Introduce masked events to the pmu event filter")

That don't change functionality in tools/perf, as no new ioctl is added
for the 'perf trace' scripts to harvest.

This addresses these perf build warnings:

  Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h'
  diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h
  Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/kvm.h' differs from latest version at 'arch/x86/include/uapi/asm/kvm.h'
  diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h
  Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/kvm.h' differs from latest version at 'arch/arm64/include/uapi/asm/kvm.h'
  diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h

Cc: Aaron Lewis <aaronlewis@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Christoffer Dall <christoffer.dall@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kees Kook <keescook@chromium.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/lkml/ZAJlg7%2FfWDVGX0F3@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/arm64/include/uapi/asm/kvm.h |  1 +
 tools/arch/x86/include/uapi/asm/kvm.h   | 34 +++++++++++++++++++++++++++++++--
 tools/include/uapi/linux/kvm.h          |  9 +++++++++
 3 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h
index a7a857f1784d..f8129c624b07 100644
--- a/tools/arch/arm64/include/uapi/asm/kvm.h
+++ b/tools/arch/arm64/include/uapi/asm/kvm.h
@@ -109,6 +109,7 @@ struct kvm_regs {
 #define KVM_ARM_VCPU_SVE		4 /* enable SVE for this CPU */
 #define KVM_ARM_VCPU_PTRAUTH_ADDRESS	5 /* VCPU uses address authentication */
 #define KVM_ARM_VCPU_PTRAUTH_GENERIC	6 /* VCPU uses generic authentication */
+#define KVM_ARM_VCPU_HAS_EL2		7 /* Support nested virtualization */
 
 struct kvm_vcpu_init {
 	__u32 target;
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index e48deab8901d..7f467fe05d42 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -9,6 +9,7 @@
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <linux/stddef.h>
 
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
@@ -507,8 +508,8 @@ struct kvm_nested_state {
 	 * KVM_{GET,PUT}_NESTED_STATE ioctl values.
 	 */
 	union {
-		struct kvm_vmx_nested_state_data vmx[0];
-		struct kvm_svm_nested_state_data svm[0];
+		__DECLARE_FLEX_ARRAY(struct kvm_vmx_nested_state_data, vmx);
+		__DECLARE_FLEX_ARRAY(struct kvm_svm_nested_state_data, svm);
 	} data;
 };
 
@@ -525,6 +526,35 @@ struct kvm_pmu_event_filter {
 #define KVM_PMU_EVENT_ALLOW 0
 #define KVM_PMU_EVENT_DENY 1
 
+#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0)
+#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+
+/*
+ * Masked event layout.
+ * Bits   Description
+ * ----   -----------
+ * 7:0    event select (low bits)
+ * 15:8   umask match
+ * 31:16  unused
+ * 35:32  event select (high bits)
+ * 36:54  unused
+ * 55     exclude bit
+ * 63:56  umask mask
+ */
+
+#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \
+	(((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \
+	(((mask) & 0xFFULL) << 56) | \
+	(((match) & 0xFFULL) << 8) | \
+	((__u64)(!!(exclude)) << 55))
+
+#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \
+	(GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK		(GENMASK_ULL(63, 56))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH	(GENMASK_ULL(15, 8))
+#define KVM_PMU_MASKED_ENTRY_EXCLUDE		(BIT_ULL(55))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT	(56)
+
 /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
 #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
 #define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 55155e262646..d77aef872a0a 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -583,6 +583,8 @@ struct kvm_s390_mem_op {
 		struct {
 			__u8 ar;	/* the access register number */
 			__u8 key;	/* access key, ignored if flag unset */
+			__u8 pad1[6];	/* ignored */
+			__u64 old_addr;	/* ignored if cmpxchg flag unset */
 		};
 		__u32 sida_offset; /* offset into the sida */
 		__u8 reserved[32]; /* ignored */
@@ -595,11 +597,17 @@ struct kvm_s390_mem_op {
 #define KVM_S390_MEMOP_SIDA_WRITE	3
 #define KVM_S390_MEMOP_ABSOLUTE_READ	4
 #define KVM_S390_MEMOP_ABSOLUTE_WRITE	5
+#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG	6
+
 /* flags for kvm_s390_mem_op->flags */
 #define KVM_S390_MEMOP_F_CHECK_ONLY		(1ULL << 0)
 #define KVM_S390_MEMOP_F_INJECT_EXCEPTION	(1ULL << 1)
 #define KVM_S390_MEMOP_F_SKEY_PROTECTION	(1ULL << 2)
 
+/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */
+#define KVM_S390_MEMOP_EXTENSION_CAP_BASE	(1 << 0)
+#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG	(1 << 1)
+
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
 	/* in */
@@ -1175,6 +1183,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
 #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
+#define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit 


From 3ee7cb4fdf37e3763b3b10da2a7715b7a7a2a783 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 3 Mar 2023 22:36:04 -0300
Subject: tools arch x86: Sync the msr-index.h copy with the kernel sources

To pick up the changes in:

  e7862eda309ecfcc ("x86/cpu: Support AMD Automatic IBRS")
  0125acda7d76b943 ("x86/bugs: Reset speculation control settings on init")
  38aaf921e92dc5cf ("perf/x86: Add Meteor Lake support")
  5b6fac3fa44bafee ("x86/resctrl: Detect and configure Slow Memory Bandwidth Allocation")
  dc2a3e857981f859 ("x86/resctrl: Add interface to read mbm_total_bytes_config")

Addressing these tools/perf build warnings:

    diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h
    Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h'

That makes the beautification scripts to pick some new entries:

  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before
  $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h
  $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after
  $ diff -u before after
  --- before    2023-03-03 18:26:51.766923522 -0300
  +++ after     2023-03-03 18:27:09.987415481 -0300
  @@ -267,9 +267,11 @@
        [0xc000010e - x86_64_specific_MSRs_offset] = "AMD64_LBR_SELECT",
        [0xc000010f - x86_64_specific_MSRs_offset] = "AMD_DBG_EXTN_CFG",
        [0xc0000200 - x86_64_specific_MSRs_offset] = "IA32_MBA_BW_BASE",
  +     [0xc0000280 - x86_64_specific_MSRs_offset] = "IA32_SMBA_BW_BASE",
        [0xc0000300 - x86_64_specific_MSRs_offset] = "AMD64_PERF_CNTR_GLOBAL_STATUS",
        [0xc0000301 - x86_64_specific_MSRs_offset] = "AMD64_PERF_CNTR_GLOBAL_CTL",
        [0xc0000302 - x86_64_specific_MSRs_offset] = "AMD64_PERF_CNTR_GLOBAL_STATUS_CLR",
  +     [0xc0000400 - x86_64_specific_MSRs_offset] = "IA32_EVT_CFG_BASE",
   };

   #define x86_AMD_V_KVM_MSRs_offset 0xc0010000
  $

Now one can trace systemwide asking to see backtraces to where that MSR
is being read/written, see this example with a previous update:

  # perf trace -e msr:*_msr/max-stack=32/ --filter="msr>=IA32_U_CET && msr<=IA32_INT_SSP_TAB"
  ^C#

If we use -v (verbose mode) we can see what it does behind the scenes:

  # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr>=IA32_U_CET && msr<=IA32_INT_SSP_TAB"
  Using CPUID AuthenticAMD-25-21-0
  0x6a0
  0x6a8
  New filter for msr:read_msr: (msr>=0x6a0 && msr<=0x6a8) && (common_pid != 597499 && common_pid != 3313)
  0x6a0
  0x6a8
  New filter for msr:write_msr: (msr>=0x6a0 && msr<=0x6a8) && (common_pid != 597499 && common_pid != 3313)
  mmap size 528384B
  ^C#

Example with a frequent msr:

  # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr==IA32_SPEC_CTRL" --max-events 2
  Using CPUID AuthenticAMD-25-21-0
  0x48
  New filter for msr:read_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 3841)
  0x48
  New filter for msr:write_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 3841)
  mmap size 528384B
  Looking at the vmlinux_path (8 entries long)
  symsrc__init: build id mismatch for vmlinux.
  Using /proc/kcore for kernel data
  Using /proc/kallsyms for symbols
     0.000 Timer/2525383 msr:write_msr(msr: IA32_SPEC_CTRL, val: 6)
                                       do_trace_write_msr ([kernel.kallsyms])
                                       do_trace_write_msr ([kernel.kallsyms])
                                       __switch_to_xtra ([kernel.kallsyms])
                                       __switch_to ([kernel.kallsyms])
                                       __schedule ([kernel.kallsyms])
                                       schedule ([kernel.kallsyms])
                                       futex_wait_queue_me ([kernel.kallsyms])
                                       futex_wait ([kernel.kallsyms])
                                       do_futex ([kernel.kallsyms])
                                       __x64_sys_futex ([kernel.kallsyms])
                                       do_syscall_64 ([kernel.kallsyms])
                                       entry_SYSCALL_64_after_hwframe ([kernel.kallsyms])
                                       __futex_abstimed_wait_common64 (/usr/lib64/libpthread-2.33.so)
     0.030 :0/0 msr:write_msr(msr: IA32_SPEC_CTRL, val: 2)
                                       do_trace_write_msr ([kernel.kallsyms])
                                       do_trace_write_msr ([kernel.kallsyms])
                                       __switch_to_xtra ([kernel.kallsyms])
                                       __switch_to ([kernel.kallsyms])
                                       __schedule ([kernel.kallsyms])
                                       schedule_idle ([kernel.kallsyms])
                                       do_idle ([kernel.kallsyms])
                                       cpu_startup_entry ([kernel.kallsyms])
                                       secondary_startup_64_no_verify ([kernel.kallsyms])
    #

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Babu Moger <babu.moger@amd.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Breno Leitao <leitao@debian.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Kim Phillips <kim.phillips@amd.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nikunj A Dadhania <nikunj@amd.com>
Link: https://lore.kernel.org/lkml/ZAJoaZ41+rU5H0vL@kernel.org
[ I had published the perf-tools branch before with the sync with ]
[ 8c29f01654053258 ("x86/sev: Add SEV-SNP guest feature negotiation support") ]
[ I removed it from this new sync ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/asm/msr-index.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'tools')

diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index d3fe82c5d6b6..ad35355ee43e 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
 #define _EFER_SVME		12 /* Enable virtualization */
 #define _EFER_LMSLE		13 /* Long Mode Segment Limit Enable */
 #define _EFER_FFXSR		14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_AUTOIBRS		21 /* Enable Automatic IBRS */
 
 #define EFER_SCE		(1<<_EFER_SCE)
 #define EFER_LME		(1<<_EFER_LME)
@@ -33,6 +34,7 @@
 #define EFER_SVME		(1<<_EFER_SVME)
 #define EFER_LMSLE		(1<<_EFER_LMSLE)
 #define EFER_FFXSR		(1<<_EFER_FFXSR)
+#define EFER_AUTOIBRS		(1<<_EFER_AUTOIBRS)
 
 /* Intel MSRs. Some also available on other CPUs */
 
@@ -49,6 +51,10 @@
 #define SPEC_CTRL_RRSBA_DIS_S_SHIFT	6	   /* Disable RRSBA behavior */
 #define SPEC_CTRL_RRSBA_DIS_S		BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT)
 
+/* A mask for bits which the kernel toggles when controlling mitigations */
+#define SPEC_CTRL_MITIGATIONS_MASK	(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \
+							| SPEC_CTRL_RRSBA_DIS_S)
+
 #define MSR_IA32_PRED_CMD		0x00000049 /* Prediction Command */
 #define PRED_CMD_IBPB			BIT(0)	   /* Indirect Branch Prediction Barrier */
 
@@ -189,6 +195,9 @@
 #define MSR_TURBO_RATIO_LIMIT1		0x000001ae
 #define MSR_TURBO_RATIO_LIMIT2		0x000001af
 
+#define MSR_SNOOP_RSP_0			0x00001328
+#define MSR_SNOOP_RSP_1			0x00001329
+
 #define MSR_LBR_SELECT			0x000001c8
 #define MSR_LBR_TOS			0x000001c9
 
@@ -1081,6 +1090,8 @@
 
 /* - AMD: */
 #define MSR_IA32_MBA_BW_BASE		0xc0000200
+#define MSR_IA32_SMBA_BW_BASE		0xc0000280
+#define MSR_IA32_EVT_CFG_BASE		0xc0000400
 
 /* MSR_IA32_VMX_MISC bits */
 #define MSR_IA32_VMX_MISC_INTEL_PT                 (1ULL << 14)
-- 
cgit 


From 14e998ed42208f60d5f848f5e025fa2c2e9667b0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 6 Mar 2023 09:26:35 -0300
Subject: tools include UAPI: Sync linux/vhost.h with the kernel sources

To get the changes in:

  3b688d7a086d0438 ("vhost-vdpa: uAPI to resume the device")

To pick up these changes and support them:

  $ tools/perf/trace/beauty/vhost_virtio_ioctl.sh > before
  $ cp ../linux/include/uapi/linux/vhost.h tools/include/uapi/linux/vhost.h
  $ tools/perf/trace/beauty/vhost_virtio_ioctl.sh > after
  $ diff -u before after
  --- before	2023-03-06 09:26:14.889251817 -0300
  +++ after	2023-03-06 09:26:20.594406270 -0300
  @@ -30,6 +30,7 @@
   	[0x77] = "VDPA_SET_CONFIG_CALL",
   	[0x7C] = "VDPA_SET_GROUP_ASID",
   	[0x7D] = "VDPA_SUSPEND",
  +	[0x7E] = "VDPA_RESUME",
   };
   static const char *vhost_virtio_ioctl_read_cmds[] = {
   	[0x00] = "GET_FEATURES",
  $

For instance, see how those 'cmd' ioctl arguments get translated, now
VDPA_RESUME will be as well:

  # perf trace -a -e ioctl --max-events=10
       0.000 ( 0.011 ms): pipewire/2261 ioctl(fd: 60, cmd: SNDRV_PCM_HWSYNC, arg: 0x1)                        = 0
      21.353 ( 0.014 ms): pipewire/2261 ioctl(fd: 60, cmd: SNDRV_PCM_HWSYNC, arg: 0x1)                        = 0
      25.766 ( 0.014 ms): gnome-shell/2196 ioctl(fd: 14, cmd: DRM_I915_IRQ_WAIT, arg: 0x7ffe4a22c740)            = 0
      25.845 ( 0.034 ms): gnome-shel:cs0/2212 ioctl(fd: 14, cmd: DRM_I915_IRQ_EMIT, arg: 0x7fd43915dc70)            = 0
      25.916 ( 0.011 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ADDFB2, arg: 0x7ffe4a22c8a0)               = 0
      25.941 ( 0.025 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ATOMIC, arg: 0x7ffe4a22c840)               = 0
      32.915 ( 0.009 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_RMFB, arg: 0x7ffe4a22cf9c)                 = 0
      42.522 ( 0.013 ms): gnome-shell/2196 ioctl(fd: 14, cmd: DRM_I915_IRQ_WAIT, arg: 0x7ffe4a22c740)            = 0
      42.579 ( 0.031 ms): gnome-shel:cs0/2212 ioctl(fd: 14, cmd: DRM_I915_IRQ_EMIT, arg: 0x7fd43915dc70)            = 0
      42.644 ( 0.010 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ADDFB2, arg: 0x7ffe4a22c8a0)               = 0
  #

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sebastien Boeuf <sebastien.boeuf@intel.com>
Link: https://lore.kernel.org/lkml/ZAXdCTecxSNwAoeK@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/uapi/linux/vhost.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/include/uapi/linux/vhost.h b/tools/include/uapi/linux/vhost.h
index f9f115a7c75b..92e1b700b51c 100644
--- a/tools/include/uapi/linux/vhost.h
+++ b/tools/include/uapi/linux/vhost.h
@@ -180,4 +180,12 @@
  */
 #define VHOST_VDPA_SUSPEND		_IO(VHOST_VIRTIO, 0x7D)
 
+/* Resume a device so it can resume processing virtqueue requests
+ *
+ * After the return of this ioctl the device will have restored all the
+ * necessary states and it is fully operational to continue processing the
+ * virtqueue descriptors.
+ */
+#define VHOST_VDPA_RESUME		_IO(VHOST_VIRTIO, 0x7E)
+
 #endif
-- 
cgit 


From 7d0930647c020f75592adde8712d2cb002b29387 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 6 Mar 2023 14:25:21 -0300
Subject: tools headers x86 cpufeatures: Sync with the kernel sources

To pick the changes from:

  8415a74852d7c247 ("x86/cpu, kvm: Add support for CPUID_80000021_EAX")

This only causes these perf files to be rebuilt:

  CC       /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o
  CC       /tmp/build/perf/bench/mem-memset-x86-64-asm.o

And addresses these perf build warnings:

  Warning: Kernel ABI header at 'tools/arch/x86/include/asm/disabled-features.h' differs from latest version at 'arch/x86/include/asm/disabled-features.h'
  diff -u tools/arch/x86/include/asm/disabled-features.h arch/x86/include/asm/disabled-features.h
  Warning: Kernel ABI header at 'tools/arch/x86/include/asm/required-features.h' differs from latest version at 'arch/x86/include/asm/required-features.h'
  diff -u tools/arch/x86/include/asm/required-features.h arch/x86/include/asm/required-features.h

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kim Phillips <kim.phillips@amd.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/ZAYlS2XTJ5hRtss7@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/asm/cpufeatures.h       | 2 +-
 tools/arch/x86/include/asm/disabled-features.h | 3 ++-
 tools/arch/x86/include/asm/required-features.h | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 61012476d66e..d53e13048d2e 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
 /*
  * Defines x86 CPU feature bits
  */
-#define NCAPINTS			20	   /* N 32-bit words worth of info */
+#define NCAPINTS			21	   /* N 32-bit words worth of info */
 #define NBUGINTS			1	   /* N 32-bit bug flags */
 
 /*
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index c44b56f7ffba..5dfa4fb76f4b 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -124,6 +124,7 @@
 #define DISABLED_MASK17	0
 #define DISABLED_MASK18	0
 #define DISABLED_MASK19	0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
+#define DISABLED_MASK20	0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
 
 #endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h
index aff774775c67..7ba1726b71c7 100644
--- a/tools/arch/x86/include/asm/required-features.h
+++ b/tools/arch/x86/include/asm/required-features.h
@@ -98,6 +98,7 @@
 #define REQUIRED_MASK17	0
 #define REQUIRED_MASK18	0
 #define REQUIRED_MASK19	0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
+#define REQUIRED_MASK20	0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
 
 #endif /* _ASM_X86_REQUIRED_FEATURES_H */
-- 
cgit 


From 294635a8165a31408a8b3a24f9c74849ca3d8701 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Fri, 24 Feb 2023 17:36:07 +0100
Subject: bpf, test_run: fix &xdp_frame misplacement for LIVE_FRAMES
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

&xdp_buff and &xdp_frame are bound in a way that

xdp_buff->data_hard_start == xdp_frame

It's always the case and e.g. xdp_convert_buff_to_frame() relies on
this.
IOW, the following:

	for (u32 i = 0; i < 0xdead; i++) {
		xdpf = xdp_convert_buff_to_frame(&xdp);
		xdp_convert_frame_to_buff(xdpf, &xdp);
	}

shouldn't ever modify @xdpf's contents or the pointer itself.
However, "live packet" code wrongly treats &xdp_frame as part of its
context placed *before* the data_hard_start. With such flow,
data_hard_start is sizeof(*xdpf) off to the right and no longer points
to the XDP frame.

Instead of replacing `sizeof(ctx)` with `offsetof(ctx, xdpf)` in several
places and praying that there are no more miscalcs left somewhere in the
code, unionize ::frm with ::data in a flex array, so that both starts
pointing to the actual data_hard_start and the XDP frame actually starts
being a part of it, i.e. a part of the headroom, not the context.
A nice side effect is that the maximum frame size for this mode gets
increased by 40 bytes, as xdp_buff::frame_sz includes everything from
data_hard_start (-> includes xdpf already) to the end of XDP/skb shared
info.
Also update %MAX_PKT_SIZE accordingly in the selftests code. Leave it
hardcoded for 64 bit && 4k pages, it can be made more flexible later on.

Minor: align `&head->data` with how `head->frm` is assigned for
consistency.
Minor #2: rename 'frm' to 'frame' in &xdp_page_head while at it for
clarity.

(was found while testing XDP traffic generator on ice, which calls
 xdp_convert_frame_to_buff() for each XDP frame)

Fixes: b530e9e1063e ("bpf: Add "live packet" mode for XDP in BPF_PROG_RUN")
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20230224163607.2994755-1-aleksander.lobakin@intel.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 net/bpf/test_run.c                                    | 19 +++++++++++++------
 .../selftests/bpf/prog_tests/xdp_do_redirect.c        |  7 ++++---
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 6f3d654b3339..f81b24320a36 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -97,8 +97,11 @@ reset:
 struct xdp_page_head {
 	struct xdp_buff orig_ctx;
 	struct xdp_buff ctx;
-	struct xdp_frame frm;
-	u8 data[];
+	union {
+		/* ::data_hard_start starts here */
+		DECLARE_FLEX_ARRAY(struct xdp_frame, frame);
+		DECLARE_FLEX_ARRAY(u8, data);
+	};
 };
 
 struct xdp_test_data {
@@ -113,6 +116,10 @@ struct xdp_test_data {
 	u32 frame_cnt;
 };
 
+/* tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c:%MAX_PKT_SIZE
+ * must be updated accordingly this gets changed, otherwise BPF selftests
+ * will fail.
+ */
 #define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head))
 #define TEST_XDP_MAX_BATCH 256
 
@@ -132,8 +139,8 @@ static void xdp_test_run_init_page(struct page *page, void *arg)
 	headroom -= meta_len;
 
 	new_ctx = &head->ctx;
-	frm = &head->frm;
-	data = &head->data;
+	frm = head->frame;
+	data = head->data;
 	memcpy(data + headroom, orig_ctx->data_meta, frm_len);
 
 	xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq);
@@ -223,7 +230,7 @@ static void reset_ctx(struct xdp_page_head *head)
 	head->ctx.data = head->orig_ctx.data;
 	head->ctx.data_meta = head->orig_ctx.data_meta;
 	head->ctx.data_end = head->orig_ctx.data_end;
-	xdp_update_frame_from_buff(&head->ctx, &head->frm);
+	xdp_update_frame_from_buff(&head->ctx, head->frame);
 }
 
 static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
@@ -285,7 +292,7 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
 		head = phys_to_virt(page_to_phys(page));
 		reset_ctx(head);
 		ctx = &head->ctx;
-		frm = &head->frm;
+		frm = head->frame;
 		xdp->frame_cnt++;
 
 		act = bpf_prog_run_xdp(prog, ctx);
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
index 2666c84dbd01..7271a18ab3e2 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
@@ -65,12 +65,13 @@ static int attach_tc_prog(struct bpf_tc_hook *hook, int fd)
 }
 
 /* The maximum permissible size is: PAGE_SIZE - sizeof(struct xdp_page_head) -
- * sizeof(struct skb_shared_info) - XDP_PACKET_HEADROOM = 3368 bytes
+ * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) - XDP_PACKET_HEADROOM =
+ * 3408 bytes for 64-byte cacheline and 3216 for 256-byte one.
  */
 #if defined(__s390x__)
-#define MAX_PKT_SIZE 3176
+#define MAX_PKT_SIZE 3216
 #else
-#define MAX_PKT_SIZE 3368
+#define MAX_PKT_SIZE 3408
 #endif
 static void test_max_pkt_size(int fd)
 {
-- 
cgit 


From dfdd608c3b365f0fd49d7e13911ebcde06b9865b Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lorenz.bauer@isovalent.com>
Date: Mon, 6 Mar 2023 11:21:38 +0000
Subject: selftests/bpf: check that modifier resolves after pointer

Add a regression test that ensures that a VAR pointing at a
modifier which follows a PTR (or STRUCT or ARRAY) is resolved
correctly by the datasec validator.

Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230306112138.155352-3-lmb@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/btf.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index cbb600be943d..210d643fda6c 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -879,6 +879,34 @@ static struct btf_raw_test raw_tests[] = {
 	.btf_load_err = true,
 	.err_str = "Invalid elem",
 },
+{
+	.descr = "var after datasec, ptr followed by modifier",
+	.raw_types = {
+		/* .bss section */				/* [1] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 2),
+			sizeof(void*)+4),
+		BTF_VAR_SECINFO_ENC(4, 0, sizeof(void*)),
+		BTF_VAR_SECINFO_ENC(6, sizeof(void*), 4),
+		/* int */					/* [2] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int* */					/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
+		BTF_VAR_ENC(NAME_TBD, 3, 0),			/* [4] */
+		/* const int */					/* [5] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2),
+		BTF_VAR_ENC(NAME_TBD, 5, 0),			/* [6] */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0a\0b\0c\0",
+	.str_sec_size = sizeof("\0a\0b\0c\0"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = ".bss",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void*)+4,
+	.key_type_id = 0,
+	.value_type_id = 1,
+	.max_entries = 1,
+},
 /* Test member exceeds the size of struct.
  *
  * struct A {
-- 
cgit 


From 06a1574b94ee5272789eaea05a5bdddedea7cc0d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 6 Mar 2023 14:44:11 -0300
Subject: tools headers UAPI: Sync linux/perf_event.h with the kernel sources

To pick up the changes in:

  09519ec3b19e4144 ("perf: Add perf_event_attr::config3")

The patches for the tooling side will come later.

This addresses this perf build warning:

  Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h'
  diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/lkml/ZAZLYmDjWjSItWOq@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/uapi/linux/perf_event.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index ccb7f5dad59b..37675437b768 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -374,6 +374,7 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
 #define PERF_ATTR_SIZE_VER6	120	/* add: aux_sample_size */
 #define PERF_ATTR_SIZE_VER7	128	/* add: sig_data */
+#define PERF_ATTR_SIZE_VER8	136	/* add: config3 */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -515,6 +516,8 @@ struct perf_event_attr {
 	 * truncated accordingly on 32 bit architectures.
 	 */
 	__u64	sig_data;
+
+	__u64	config3; /* extension of config2 */
 };
 
 /*
-- 
cgit 


From 37d9df224d1eec1b434fe9ffa40104c756478c29 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 6 Mar 2023 12:04:57 -0800
Subject: ynl: re-license uniformly under GPL-2.0 OR BSD-3-Clause

I was intending to make all the Netlink Spec code BSD-3-Clause
to ease the adoption but it appears that:
 - I fumbled the uAPI and used "GPL WITH uAPI note" there
 - it gives people pause as they expect GPL in the kernel
As suggested by Chuck re-license under dual. This gives us benefit
of full BSD freedom while fulfilling the broad "kernel is under GPL"
expectations.

Link: https://lore.kernel.org/all/20230304120108.05dd44c5@kernel.org/
Link: https://lore.kernel.org/r/20230306200457.3903854-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/genetlink-c.yaml        | 2 +-
 Documentation/netlink/genetlink-legacy.yaml   | 2 +-
 Documentation/netlink/genetlink.yaml          | 2 +-
 Documentation/netlink/specs/ethtool.yaml      | 2 ++
 Documentation/netlink/specs/fou.yaml          | 2 ++
 Documentation/netlink/specs/netdev.yaml       | 2 ++
 Documentation/userspace-api/netlink/specs.rst | 3 +++
 include/uapi/linux/fou.h                      | 2 +-
 include/uapi/linux/netdev.h                   | 2 +-
 net/core/netdev-genl-gen.c                    | 2 +-
 net/core/netdev-genl-gen.h                    | 2 +-
 net/ipv4/fou_nl.c                             | 2 +-
 net/ipv4/fou_nl.h                             | 2 +-
 tools/include/uapi/linux/netdev.h             | 2 +-
 tools/net/ynl/cli.py                          | 2 +-
 tools/net/ynl/lib/__init__.py                 | 2 +-
 tools/net/ynl/lib/nlspec.py                   | 2 +-
 tools/net/ynl/lib/ynl.py                      | 2 +-
 tools/net/ynl/ynl-gen-c.py                    | 7 ++++---
 tools/net/ynl/ynl-regen.sh                    | 2 +-
 20 files changed, 28 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml
index bbcfa2472b04..f082a5ad7cf1 100644
--- a/Documentation/netlink/genetlink-c.yaml
+++ b/Documentation/netlink/genetlink-c.yaml
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 %YAML 1.2
 ---
 $id: http://kernel.org/schemas/netlink/genetlink-c.yaml#
diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml
index 5642925c4ceb..c6b8c77f7d12 100644
--- a/Documentation/netlink/genetlink-legacy.yaml
+++ b/Documentation/netlink/genetlink-legacy.yaml
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 %YAML 1.2
 ---
 $id: http://kernel.org/schemas/netlink/genetlink-legacy.yaml#
diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml
index 62a922755ce2..b2d56ab9e615 100644
--- a/Documentation/netlink/genetlink.yaml
+++ b/Documentation/netlink/genetlink.yaml
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 %YAML 1.2
 ---
 $id: http://kernel.org/schemas/netlink/genetlink-legacy.yaml#
diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 35c462bce56f..18ecb7d90cbe 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
 name: ethtool
 
 protocol: genetlink-legacy
diff --git a/Documentation/netlink/specs/fou.yaml b/Documentation/netlink/specs/fou.yaml
index cca4cf98f03a..cff104288723 100644
--- a/Documentation/netlink/specs/fou.yaml
+++ b/Documentation/netlink/specs/fou.yaml
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
 name: fou
 
 protocol: genetlink-legacy
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index ba9ee13cf729..24de747b5344 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
 name: netdev
 
 doc:
diff --git a/Documentation/userspace-api/netlink/specs.rst b/Documentation/userspace-api/netlink/specs.rst
index 32e53328d113..2122e0c4a399 100644
--- a/Documentation/userspace-api/netlink/specs.rst
+++ b/Documentation/userspace-api/netlink/specs.rst
@@ -24,6 +24,9 @@ YAML specifications can be found under ``Documentation/netlink/specs/``
 This document describes details of the schema.
 See :doc:`intro-specs` for a practical starting guide.
 
+All specs must be licensed under ``GPL-2.0-only OR BSD-3-Clause``
+to allow for easy adoption in user space code.
+
 Compatibility levels
 ====================
 
diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h
index 19ebbef41a63..5041c3598493 100644
--- a/include/uapi/linux/fou.h
+++ b/include/uapi/linux/fou.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/fou.yaml */
 /* YNL-GEN uapi header */
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 588391447bfb..8c4e3e536c04 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/netdev.yaml */
 /* YNL-GEN uapi header */
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 48812ec843f5..9e10802587fc 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: BSD-3-Clause
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/netdev.yaml */
 /* YNL-GEN kernel source */
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index b16dc7e026bb..2c5fc7d1e8a7 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: BSD-3-Clause */
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/netdev.yaml */
 /* YNL-GEN kernel header */
diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c
index 6c3820f41dd5..5c14fe030eda 100644
--- a/net/ipv4/fou_nl.c
+++ b/net/ipv4/fou_nl.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: BSD-3-Clause
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/fou.yaml */
 /* YNL-GEN kernel source */
diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h
index b7a68121ce6f..58b1e1ed4b3b 100644
--- a/net/ipv4/fou_nl.h
+++ b/net/ipv4/fou_nl.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: BSD-3-Clause */
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/fou.yaml */
 /* YNL-GEN kernel header */
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 588391447bfb..8c4e3e536c04 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */
 /* Do not edit directly, auto-generated from: */
 /*	Documentation/netlink/specs/netdev.yaml */
 /* YNL-GEN uapi header */
diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py
index db410b74d539..ffaa8038aa8c 100755
--- a/tools/net/ynl/cli.py
+++ b/tools/net/ynl/cli.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# SPDX-License-Identifier: BSD-3-Clause
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
 import argparse
 import json
diff --git a/tools/net/ynl/lib/__init__.py b/tools/net/ynl/lib/__init__.py
index 3c73f59eabab..a2cb8b16d6f1 100644
--- a/tools/net/ynl/lib/__init__.py
+++ b/tools/net/ynl/lib/__init__.py
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: BSD-3-Clause
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
 from .nlspec import SpecAttr, SpecAttrSet, SpecFamily, SpecOperation
 from .ynl import YnlFamily
diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
index 9d394e50de23..0a2cfb5862aa 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/lib/nlspec.py
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: BSD-3-Clause
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
 import collections
 import importlib
diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
index 1c7411ee04dc..a842adc8e87e 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/lib/ynl.py
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: BSD-3-Clause
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
 import functools
 import os
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index 62f8f2c3c56c..c940ca834d3f 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
 import argparse
 import collections
@@ -2127,12 +2128,12 @@ def main():
 
     _, spec_kernel = find_kernel_root(args.spec)
     if args.mode == 'uapi':
-        cw.p('/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */')
+        cw.p('/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */')
     else:
         if args.header:
-            cw.p('/* SPDX-License-Identifier: BSD-3-Clause */')
+            cw.p('/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */')
         else:
-            cw.p('// SPDX-License-Identifier: BSD-3-Clause')
+            cw.p('// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause')
     cw.p("/* Do not edit directly, auto-generated from: */")
     cw.p(f"/*\t{spec_kernel} */")
     cw.p(f"/* YNL-GEN {args.mode} {'header' if args.header else 'source'} */")
diff --git a/tools/net/ynl/ynl-regen.sh b/tools/net/ynl/ynl-regen.sh
index 43989ae48ed0..74f5de1c2399 100755
--- a/tools/net/ynl/ynl-regen.sh
+++ b/tools/net/ynl/ynl-regen.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# SPDX-License-Identifier: BSD-3-Clause
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
 TOOL=$(dirname $(realpath $0))/ynl-gen-c.py
 
-- 
cgit 


From 6517a60b0307abdcca80133c237551cc7cc8e6ae Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 7 Mar 2023 16:39:22 -0800
Subject: tools: ynl: move the enum classes to shared code

Move bulk of the EnumSet and EnumEntry code to shared
code for reuse by cli.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/__init__.py |   7 +--
 tools/net/ynl/lib/nlspec.py   |  96 +++++++++++++++++++++++++++++++++++++
 tools/net/ynl/ynl-gen-c.py    | 107 +++++++++---------------------------------
 3 files changed, 121 insertions(+), 89 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/lib/__init__.py b/tools/net/ynl/lib/__init__.py
index a2cb8b16d6f1..4b3797fe784b 100644
--- a/tools/net/ynl/lib/__init__.py
+++ b/tools/net/ynl/lib/__init__.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
-from .nlspec import SpecAttr, SpecAttrSet, SpecFamily, SpecOperation
+from .nlspec import SpecAttr, SpecAttrSet, SpecEnumEntry, SpecEnumSet, \
+    SpecFamily, SpecOperation
 from .ynl import YnlFamily
 
-__all__ = ["SpecAttr", "SpecAttrSet", "SpecFamily", "SpecOperation",
-           "YnlFamily"]
+__all__ = ["SpecAttr", "SpecAttrSet", "SpecEnumEntry", "SpecEnumSet",
+           "SpecFamily", "SpecOperation", "YnlFamily"]
diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
index 0a2cfb5862aa..7c1cf6c1499e 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/lib/nlspec.py
@@ -57,6 +57,91 @@ class SpecElement:
         pass
 
 
+class SpecEnumEntry(SpecElement):
+    """ Entry within an enum declared in the Netlink spec.
+
+    Attributes:
+        doc         documentation string
+        enum_set    back reference to the enum
+        value       numerical value of this enum (use accessors in most situations!)
+
+    Methods:
+        raw_value   raw value, i.e. the id in the enum, unlike user value which is a mask for flags
+        user_value   user value, same as raw value for enums, for flags it's the mask
+    """
+    def __init__(self, enum_set, yaml, prev, value_start):
+        if isinstance(yaml, str):
+            yaml = {'name': yaml}
+        super().__init__(enum_set.family, yaml)
+
+        self.doc = yaml.get('doc', '')
+        self.enum_set = enum_set
+
+        if 'value' in yaml:
+            self.value = yaml['value']
+        elif prev:
+            self.value = prev.value + 1
+        else:
+            self.value = value_start
+
+    def has_doc(self):
+        return bool(self.doc)
+
+    def raw_value(self):
+        return self.value
+
+    def user_value(self):
+        if self.enum_set['type'] == 'flags':
+            return 1 << self.value
+        else:
+            return self.value
+
+
+class SpecEnumSet(SpecElement):
+    """ Enum type
+
+    Represents an enumeration (list of numerical constants)
+    as declared in the "definitions" section of the spec.
+
+    Attributes:
+        type          enum or flags
+        entries       entries by name
+    Methods:
+        get_mask      for flags compute the mask of all defined values
+    """
+    def __init__(self, family, yaml):
+        super().__init__(family, yaml)
+
+        self.type = yaml['type']
+
+        prev_entry = None
+        value_start = self.yaml.get('value-start', 0)
+        self.entries = dict()
+        for entry in self.yaml['entries']:
+            e = self.new_entry(entry, prev_entry, value_start)
+            self.entries[e.name] = e
+            prev_entry = e
+
+    def new_entry(self, entry, prev_entry, value_start):
+        return SpecEnumEntry(self, entry, prev_entry, value_start)
+
+    def has_doc(self):
+        if 'doc' in self.yaml:
+            return True
+        for entry in self.entries.values():
+            if entry.has_doc():
+                return True
+        return False
+
+    def get_mask(self):
+        mask = 0
+        idx = self.yaml.get('value-start', 0)
+        for _ in self.entries.values():
+            mask |= 1 << idx
+            idx += 1
+        return mask
+
+
 class SpecAttr(SpecElement):
     """ Single Netlink atttribute type
 
@@ -193,6 +278,7 @@ class SpecFamily(SpecElement):
         msgs       dict of all messages (index by name)
         msgs_by_value  dict of all messages (indexed by name)
         ops        dict of all valid requests / responses
+        consts     dict of all constants/enums
     """
     def __init__(self, spec_path, schema_path=None):
         with open(spec_path, "r") as stream:
@@ -222,6 +308,7 @@ class SpecFamily(SpecElement):
         self.req_by_value = collections.OrderedDict()
         self.rsp_by_value = collections.OrderedDict()
         self.ops = collections.OrderedDict()
+        self.consts = collections.OrderedDict()
 
         last_exception = None
         while len(self._resolution_list) > 0:
@@ -242,6 +329,9 @@ class SpecFamily(SpecElement):
             if len(resolved) == 0:
                 raise last_exception
 
+    def new_enum(self, elem):
+        return SpecEnumSet(self, elem)
+
     def new_attr_set(self, elem):
         return SpecAttrSet(self, elem)
 
@@ -296,6 +386,12 @@ class SpecFamily(SpecElement):
     def resolve(self):
         self.resolve_up(super())
 
+        for elem in self.yaml['definitions']:
+            if elem['type'] == 'enum' or elem['type'] == 'flags':
+                self.consts[elem['name']] = self.new_enum(elem)
+            else:
+                self.consts[elem['name']] = elem
+
         for elem in self.yaml['attribute-sets']:
             attr_set = self.new_attr_set(elem)
             self.attr_sets[elem['name']] = attr_set
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index c940ca834d3f..1bcc5354d800 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -6,7 +6,7 @@ import collections
 import os
 import yaml
 
-from lib import SpecFamily, SpecAttrSet, SpecAttr, SpecOperation
+from lib import SpecFamily, SpecAttrSet, SpecAttr, SpecOperation, SpecEnumSet, SpecEnumEntry
 
 
 def c_upper(name):
@@ -567,97 +567,37 @@ class Struct:
         self.inherited = [c_lower(x) for x in sorted(self._inherited)]
 
 
-class EnumEntry:
+class EnumEntry(SpecEnumEntry):
     def __init__(self, enum_set, yaml, prev, value_start):
-        if isinstance(yaml, str):
-            self.name = yaml
-            yaml = {}
-            self.doc = ''
-        else:
-            self.name = yaml['name']
-            self.doc = yaml.get('doc', '')
-
-        self.yaml = yaml
-        self.enum_set = enum_set
-        self.c_name = c_upper(enum_set.value_pfx + self.name)
-
-        if 'value' in yaml:
-            self.value = yaml['value']
-            if prev:
-                self.value_change = (self.value != prev.value + 1)
-        elif prev:
-            self.value_change = False
-            self.value = prev.value + 1
+        super().__init__(enum_set, yaml, prev, value_start)
+
+        if prev:
+            self.value_change = (self.value != prev.value + 1)
         else:
-            self.value = value_start
             self.value_change = (self.value != 0)
-
         self.value_change = self.value_change or self.enum_set['type'] == 'flags'
 
-    def __getitem__(self, key):
-        return self.yaml[key]
-
-    def __contains__(self, key):
-        return key in self.yaml
-
-    def has_doc(self):
-        return bool(self.doc)
+        # Added by resolve:
+        self.c_name = None
+        delattr(self, "c_name")
 
-    # raw value, i.e. the id in the enum, unlike user value which is a mask for flags
-    def raw_value(self):
-        return self.value
+    def resolve(self):
+        self.resolve_up(super())
 
-    # user value, same as raw value for enums, for flags it's the mask
-    def user_value(self):
-        if self.enum_set['type'] == 'flags':
-            return 1 << self.value
-        else:
-            return self.value
+        self.c_name = c_upper(self.enum_set.value_pfx + self.name)
 
 
-class EnumSet:
+class EnumSet(SpecEnumSet):
     def __init__(self, family, yaml):
-        self.yaml = yaml
-        self.family = family
-
         self.render_name = c_lower(family.name + '-' + yaml['name'])
         self.enum_name = 'enum ' + self.render_name
 
         self.value_pfx = yaml.get('name-prefix', f"{family.name}-{yaml['name']}-")
 
-        self.type = yaml['type']
-
-        prev_entry = None
-        value_start = self.yaml.get('value-start', 0)
-        self.entries = {}
-        self.entry_list = []
-        for entry in self.yaml['entries']:
-            e = EnumEntry(self, entry, prev_entry, value_start)
-            self.entries[e.name] = e
-            self.entry_list.append(e)
-            prev_entry = e
-
-    def __getitem__(self, key):
-        return self.yaml[key]
-
-    def __contains__(self, key):
-        return key in self.yaml
-
-    def has_doc(self):
-        if 'doc' in self.yaml:
-            return True
-        for entry in self.entry_list:
-            if entry.has_doc():
-                return True
-        return False
+        super().__init__(family, yaml)
 
-    def get_mask(self):
-        mask = 0
-        idx = self.yaml.get('value-start', 0)
-        for _ in self.entry_list:
-            mask |= 1 << idx
-            idx += 1
-        return mask
+    def new_entry(self, entry, prev_entry, value_start):
+        return EnumEntry(self, entry, prev_entry, value_start)
 
 
 class AttrSet(SpecAttrSet):
@@ -792,8 +732,6 @@ class Family(SpecFamily):
 
         self.mcgrps = self.yaml.get('mcast-groups', {'list': []})
 
-        self.consts = dict()
-
         self.hooks = dict()
         for when in ['pre', 'post']:
             self.hooks[when] = dict()
@@ -820,6 +758,9 @@ class Family(SpecFamily):
         if self.kernel_policy == 'global':
             self._load_global_policy()
 
+    def new_enum(self, elem):
+        return EnumSet(self, elem)
+
     def new_attr_set(self, elem):
         return AttrSet(self, elem)
 
@@ -837,12 +778,6 @@ class Family(SpecFamily):
                 }
 
     def _dictify(self):
-        for elem in self.yaml['definitions']:
-            if elem['type'] == 'enum' or elem['type'] == 'flags':
-                self.consts[elem['name']] = EnumSet(self, elem)
-            else:
-                self.consts[elem['name']] = elem
-
         ntf = []
         for msg in self.msgs.values():
             if 'notify' in msg:
@@ -1980,7 +1915,7 @@ def render_uapi(family, cw):
                 if 'doc' in enum:
                     doc = ' - ' + enum['doc']
                 cw.write_doc_line(enum.enum_name + doc)
-                for entry in enum.entry_list:
+                for entry in enum.entries.values():
                     if entry.has_doc():
                         doc = '@' + entry.c_name + ': ' + entry['doc']
                         cw.write_doc_line(doc)
@@ -1988,7 +1923,7 @@ def render_uapi(family, cw):
 
             uapi_enum_start(family, cw, const, 'name')
             name_pfx = const.get('name-prefix', f"{family.name}-{const['name']}-")
-            for entry in enum.entry_list:
+            for entry in enum.entries.values():
                 suffix = ','
                 if entry.value_change:
                     suffix = f" = {entry.user_value()}" + suffix
-- 
cgit 


From c311aaa74ca18bd0781d1d3bebd08484799f840c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 7 Mar 2023 16:39:23 -0800
Subject: tools: ynl: fix enum-as-flags in the generic CLI

Lorenzo points out that the generic CLI is broken for the netdev
family. When I added the support for documentation of enums
(and sparse enums) the client script was not updated.
It expects the values in enum to be a list of names,
now it can also be a dict (YAML object).

Reported-by: Lorenzo Bianconi <lorenzo@kernel.org>
Fixes: e4b48ed460d3 ("tools: ynl: add a completely generic client")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/lib/nlspec.py | 7 +++++--
 tools/net/ynl/lib/ynl.py    | 9 ++-------
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
index 7c1cf6c1499e..a34d088f6743 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/lib/nlspec.py
@@ -104,8 +104,9 @@ class SpecEnumSet(SpecElement):
     as declared in the "definitions" section of the spec.
 
     Attributes:
-        type          enum or flags
-        entries       entries by name
+        type            enum or flags
+        entries         entries by name
+        entries_by_val  entries by value
     Methods:
         get_mask      for flags compute the mask of all defined values
     """
@@ -117,9 +118,11 @@ class SpecEnumSet(SpecElement):
         prev_entry = None
         value_start = self.yaml.get('value-start', 0)
         self.entries = dict()
+        self.entries_by_val = dict()
         for entry in self.yaml['entries']:
             e = self.new_entry(entry, prev_entry, value_start)
             self.entries[e.name] = e
+            self.entries_by_val[e.raw_value()] = e
             prev_entry = e
 
     def new_entry(self, entry, prev_entry, value_start):
diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
index a842adc8e87e..90764a83c646 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/lib/ynl.py
@@ -303,11 +303,6 @@ class YnlFamily(SpecFamily):
         self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
         self.sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_EXT_ACK, 1)
 
-        self._types = dict()
-
-        for elem in self.yaml.get('definitions', []):
-            self._types[elem['name']] = elem
-
         self.async_msg_ids = set()
         self.async_msg_queue = []
 
@@ -353,13 +348,13 @@ class YnlFamily(SpecFamily):
 
     def _decode_enum(self, rsp, attr_spec):
         raw = rsp[attr_spec['name']]
-        enum = self._types[attr_spec['enum']]
+        enum = self.consts[attr_spec['enum']]
         i = attr_spec.get('value-start', 0)
         if 'enum-as-flags' in attr_spec and attr_spec['enum-as-flags']:
             value = set()
             while raw:
                 if raw & 1:
-                    value.add(enum['entries'][i])
+                    value.add(enum.entries_by_val[i].name)
                 raw >>= 1
                 i += 1
         else:
-- 
cgit