From 19d8f1ad12fd746e60707a58d954980013c7a35a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Sep 2018 21:53:52 +0200 Subject: if_link: add IFLA_TARGET_NETNSID alias This adds IFLA_TARGET_NETNSID as an alias for IFLA_IF_NETNSID for RTM_*LINK requests. The new name is clearer and also aligns with the newly introduced IFA_TARGET_NETNSID propert for RTM_*ADDR requests. Signed-off-by: Christian Brauner Suggested-by: Nicolas Dichtel Cc: Jiri Benc Signed-off-by: David S. Miller --- tools/include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cf01b6824244..1c73d63068b1 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -161,6 +161,7 @@ enum { IFLA_EVENT, IFLA_NEW_NETNSID, IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, -- cgit From 52b7b7843d9523ebc3c60c51c7afc4a45cc10aad Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 5 Sep 2018 16:58:03 -0700 Subject: tools/bpf: sync kernel uapi header if_link.h to tools Among others, this header will be used later for bpftool net support. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/if_link.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cf01b6824244..43391e2d1153 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -164,6 +164,8 @@ enum { IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, __IFLA_MAX }; @@ -334,6 +336,7 @@ enum { IFLA_BRPORT_GROUP_FWD_MASK, IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, + IFLA_BRPORT_BACKUP_PORT, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) @@ -459,6 +462,16 @@ enum { #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + enum macsec_validation_type { MACSEC_VALIDATE_DISABLED = 0, MACSEC_VALIDATE_CHECK = 1, @@ -920,6 +933,7 @@ enum { XDP_ATTACHED_DRV, XDP_ATTACHED_SKB, XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, }; enum { @@ -928,6 +942,9 @@ enum { IFLA_XDP_ATTACHED, IFLA_XDP_FLAGS, IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, __IFLA_XDP_MAX, }; -- cgit From 1064ea494bb00519c6e34f791dcf17436f70592d Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 13 Aug 2018 19:05:38 +0000 Subject: tools/lib/lockdep: Add empty nmi.h Required since: 88f1c87de11a8 ("locking/lockdep: Avoid triggering hardlockup from debug_show_all_locks()") Signed-off-by: Sasha Levin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180813190527.16853-3-alexander.levin@microsoft.com Signed-off-by: Ingo Molnar --- tools/include/linux/nmi.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tools/include/linux/nmi.h (limited to 'tools/include') diff --git a/tools/include/linux/nmi.h b/tools/include/linux/nmi.h new file mode 100644 index 000000000000..e69de29bb2d1 -- cgit From 16214312df6d5aaa5324864d032ce565e97f8890 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 13 Aug 2018 19:05:39 +0000 Subject: tools/lib/lockdep: Add dummy task_struct state member Commit: 8cc05c71ba5f ("locking/lockdep: Move sanity check to inside lockdep_print_held_locks()") added accesses to the task_struct's state member. Add dummy userspace declaration. Signed-off-by: Sasha Levin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180813190527.16853-4-alexander.levin@microsoft.com Signed-off-by: Ingo Molnar --- tools/include/linux/lockdep.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools/include') diff --git a/tools/include/linux/lockdep.h b/tools/include/linux/lockdep.h index 6b0c36a58fcb..e56997288f2b 100644 --- a/tools/include/linux/lockdep.h +++ b/tools/include/linux/lockdep.h @@ -30,9 +30,12 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; gfp_t lockdep_reclaim_gfp; int pid; + int state; char comm[17]; }; +#define TASK_RUNNING 0 + extern struct task_struct *__curr(void); #define current (__curr()) -- cgit From 0ee03d936cbb300309ed6154ac1cc12b63e9785f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Sep 2018 10:57:13 -0300 Subject: tools headers uapi: Update tools's copy of linux/perf_event.h To get the changes in: 09121255c784 ("perf/UAPI: Clearly mark __PERF_SAMPLE_CALLCHAIN_EARLY as internal use") This cures the following warning during perf's build: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Cc: Peter Zijlstra Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-2vvwh2o19orn56di0ksrtgzr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index eeb787b1c53c..f35eb72739c0 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -144,7 +144,7 @@ enum perf_event_sample_format { PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ - __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, + __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; /* -- cgit From f9e6e4351e0bb0811a8b3696679cc6050e4f5947 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Sep 2018 11:00:54 -0300 Subject: tools headers uapi: Update tools's copy of asm-generic/unistd.h To get the changes in: db7a2d1809a5 ("asm-generic: unistd.h: Wire up sys_rseq") That wires up the new 'rsec' system call, which will automagically support that syscall in the syscall table used by 'perf trace' on arm/arm64. This cures the following warning during perf's build: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnd Bergmann Cc: David Ahern Cc: Hendrik Brueckner Cc: Jiri Olsa Cc: Kim Phillips Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Thomas Richter Cc: Wang Nan Cc: Will Deacon Link: https://lkml.kernel.org/n/tip-vt7k2itnitp1t9p3dp7qeb08@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 42990676a55e..df4bedb9b01c 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -734,9 +734,11 @@ __SYSCALL(__NR_pkey_free, sys_pkey_free) __SYSCALL(__NR_statx, sys_statx) #define __NR_io_pgetevents 292 __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) +#define __NR_rseq 293 +__SYSCALL(__NR_rseq, sys_rseq) #undef __NR_syscalls -#define __NR_syscalls 293 +#define __NR_syscalls 294 /* * 32 bit systems traditionally used different -- cgit From 434ea1bfbfc707f5fed9292df6a9b91dfb8e41f2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Sep 2018 11:07:56 -0300 Subject: tools headers uapi: Update tools's copy of drm/drm.h To get the changes in: d67b6a206507 ("drm: writeback: Add client capability for exposing writeback connectors") This is for an argument to a DRM ioctl, which is not being prettyfied in the 'perf trace' DRM ioctl beautifier, but will now that syscalls are starting to have pointer arguments augmented via BPF. This time around this just cures the following warning during perf's build: Warning: Kernel ABI header at 'tools/include/uapi/drm/drm.h' differs from latest version at 'include/uapi/drm/drm.h' diff -u tools/include/uapi/drm/drm.h include/uapi/drm/drm.h Cc: Adrian Hunter Cc: Brian Starkey Cc: David Ahern Cc: Eric Anholt Cc: Jiri Olsa Cc: Liviu Dudau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sean Paul Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-n7qib1bac6mc6w9oke7r4qdc@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/drm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 9c660e1688ab..300f336633f2 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -687,6 +687,15 @@ struct drm_get_cap { */ #define DRM_CLIENT_CAP_ASPECT_RATIO 4 +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. Depends on client + * also supporting DRM_CLIENT_CAP_ATOMIC + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + /** DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ struct drm_set_client_cap { __u64 capability; -- cgit From 0210c156d7fd330bce1c2c842bee9d27f1c5dfeb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Sep 2018 11:18:58 -0300 Subject: tools headers uapi: Update tools's copies of kvm headers To get the changes in: a449938297e5 ("KVM: s390: Add huge page enablement control") 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE") be26b3a73413 ("arm64: KVM: export the capability to set guest SError syndrome") b7b27facc7b5 ("arm/arm64: KVM: Add KVM_GET/SET_VCPU_EVENTS") b0960b9569db ("KVM: arm: Add 32bit get/set events support") a3da7b4a3be5 ("KVM: s390: add etoken support for guests") This makes 'perf trace' automagically get aware of these new ioctls: $ cp include/uapi/linux/kvm.h tools/include/uapi/linux/kvm.h $ tools/perf/trace/beauty/kvm_ioctl.sh > /tmp/after $ diff -u /tmp/before /tmp/after --- /tmp/before 2018-09-11 11:18:29.173207586 -0300 +++ /tmp/after 2018-09-11 11:18:38.488200446 -0300 @@ -84,6 +84,8 @@ [0xbb] = "MEMORY_ENCRYPT_REG_REGION", [0xbc] = "MEMORY_ENCRYPT_UNREG_REGION", [0xbd] = "HYPERV_EVENTFD", + [0xbe] = "GET_NESTED_STATE", + [0xbf] = "SET_NESTED_STATE", [0xe0] = "CREATE_DEVICE", [0xe1] = "SET_DEVICE_ATTR", [0xe2] = "G And cures the following warning during perf's build: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Adrian Hunter Cc: Christian Borntraeger Cc: Cornelia Huck Cc: David Ahern Cc: David Hildenbrand Cc: Dongjiu Geng Cc: Eduardo Habkost Cc: James Morse Cc: Janosch Frank Cc: Jim Mattson Cc: Jiri Olsa Cc: Marc Zyngier Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-2vvwh2o19orn56di0ksrtgzr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm/include/uapi/asm/kvm.h | 13 ++++++++++++ tools/arch/arm64/include/uapi/asm/kvm.h | 13 ++++++++++++ tools/arch/s390/include/uapi/asm/kvm.h | 5 ++++- tools/arch/x86/include/uapi/asm/kvm.h | 37 +++++++++++++++++++++++++++++++++ tools/include/uapi/linux/kvm.h | 6 ++++++ 5 files changed, 73 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h index 16e006f708ca..4602464ebdfb 100644 --- a/tools/arch/arm/include/uapi/asm/kvm.h +++ b/tools/arch/arm/include/uapi/asm/kvm.h @@ -27,6 +27,7 @@ #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -125,6 +126,18 @@ struct kvm_sync_regs { struct kvm_arch_memory_slot { }; +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 serror_pending; + __u8 serror_has_esr; + /* Align it to 8 bytes */ + __u8 pad[6]; + __u64 serror_esr; + } exception; + __u32 reserved[12]; +}; + /* If you need to interpret the index values, here is the key: */ #define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 #define KVM_REG_ARM_COPROC_SHIFT 16 diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 4e76630dd655..97c3478ee6e7 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -39,6 +39,7 @@ #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -154,6 +155,18 @@ struct kvm_sync_regs { struct kvm_arch_memory_slot { }; +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 serror_pending; + __u8 serror_has_esr; + /* Align it to 8 bytes */ + __u8 pad[6]; + __u64 serror_esr; + } exception; + __u32 reserved[12]; +}; + /* If you need to interpret the index values, here is the key: */ #define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 #define KVM_REG_ARM_COPROC_SHIFT 16 diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 4cdaa55fabfe..9a50f02b9894 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -4,7 +4,7 @@ /* * KVM s390 specific structures and definitions * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008, 2018 * * Author(s): Carsten Otte * Christian Borntraeger @@ -225,6 +225,7 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_FPRS (1UL << 8) #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) +#define KVM_SYNC_ETOKEN (1UL << 11) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 #define SDNXL (1UL << SDNXC) @@ -258,6 +259,8 @@ struct kvm_sync_regs { struct { __u64 reserved1[2]; __u64 gscb[4]; + __u64 etoken; + __u64 etoken_extension; }; }; }; diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index c535c2fdea13..86299efa804a 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -378,4 +378,41 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + +#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state { + __u64 vmxon_pa; + __u64 vmcs_pa; + + struct { + __u16 flags; + } smm; +}; + +/* for KVM_CAP_NESTED_STATE */ +struct kvm_nested_state { + /* KVM_STATE_* flags */ + __u16 flags; + + /* 0 for VMX, 1 for SVM. */ + __u16 format; + + /* 128 for SVM, 128 + VMCS size for VMX. */ + __u32 size; + + union { + /* VMXON, VMCS */ + struct kvm_vmx_nested_state vmx; + + /* Pad the header to 128 bytes. */ + __u8 pad[120]; + }; + + __u8 data[0]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index b6270a3b38e9..07548de5c988 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -949,6 +949,9 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_GET_MSR_FEATURES 153 #define KVM_CAP_HYPERV_EVENTFD 154 #define KVM_CAP_HYPERV_TLBFLUSH 155 +#define KVM_CAP_S390_HPAGE_1M 156 +#define KVM_CAP_NESTED_STATE 157 +#define KVM_CAP_ARM_INJECT_SERROR_ESR 158 #ifdef KVM_CAP_IRQ_ROUTING @@ -1391,6 +1394,9 @@ struct kvm_enc_region { /* Available with KVM_CAP_HYPERV_EVENTFD */ #define KVM_HYPERV_EVENTFD _IOW(KVMIO, 0xbd, struct kvm_hyperv_eventfd) +/* Available with KVM_CAP_NESTED_STATE */ +#define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) +#define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) /* Secure Encrypted Virtualization command */ enum sev_cmd_id { -- cgit From 7f28785c41f4d5635e69c183b3de8ea19093ccef Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Sep 2018 13:12:40 -0300 Subject: tools headers uapi: Update tools's copy of linux/vhost.h To get the changes in: c48300c92ad9 ("vhost: fix VHOST_GET_BACKEND_FEATURES ioctl request definition") This makes 'perf trace' and other tools in the future using its beautifiers in a libbeauty.so library be able to translate these new ioctl to strings: $ tools/perf/trace/beauty/vhost_virtio_ioctl.sh > /tmp/after $ diff -u /tmp/before /tmp/after --- /tmp/before 2018-09-11 13:10:57.923038244 -0300 +++ /tmp/after 2018-09-11 13:11:20.329012685 -0300 @@ -15,6 +15,7 @@ [0x22] = "SET_VRING_ERR", [0x23] = "SET_VRING_BUSYLOOP_TIMEOUT", [0x24] = "GET_VRING_BUSYLOOP_TIMEOUT", + [0x25] = "SET_BACKEND_FEATURES", [0x30] = "NET_SET_BACKEND", [0x40] = "SCSI_SET_ENDPOINT", [0x41] = "SCSI_CLEAR_ENDPOINT", @@ -27,4 +28,5 @@ static const char *vhost_virtio_ioctl_read_cmds[] = { [0x00] = "GET_FEATURES", [0x12] = "GET_VRING_BASE", + [0x26] = "GET_BACKEND_FEATURES", }; $ We'll also use this to be able to express syscall filters using symbolic these symbolic names, something like: # perf trace --all-cpus -e ioctl(cmd=*GET_FEATURES) This silences the following warning during perf's build: Warning: Kernel ABI header at 'tools/include/uapi/linux/vhost.h' differs from latest version at 'include/uapi/linux/vhost.h' diff -u tools/include/uapi/linux/vhost.h include/uapi/linux/vhost.h Cc: Adrian Hunter Cc: David Ahern Cc: David S. Miller Cc: Gleb Fotengauer-Malinovskiy Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-35x71oei2hdui9u0tarpimbq@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/vhost.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/vhost.h b/tools/include/uapi/linux/vhost.h index c51f8e5cc608..84c3de89696a 100644 --- a/tools/include/uapi/linux/vhost.h +++ b/tools/include/uapi/linux/vhost.h @@ -65,6 +65,7 @@ struct vhost_iotlb_msg { }; #define VHOST_IOTLB_MSG 0x1 +#define VHOST_IOTLB_MSG_V2 0x2 struct vhost_msg { int type; @@ -74,6 +75,15 @@ struct vhost_msg { }; }; +struct vhost_msg_v2 { + __u32 type; + __u32 reserved; + union { + struct vhost_iotlb_msg iotlb; + __u8 padding[64]; + }; +}; + struct vhost_memory_region { __u64 guest_phys_addr; __u64 memory_size; /* bytes */ @@ -160,6 +170,14 @@ struct vhost_memory { #define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \ struct vhost_vring_state) +/* Set or get vhost backend capability */ + +/* Use message type V2 */ +#define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 + +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) +#define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) + /* VHOST_NET specific defines */ /* Attach virtio net ring to a raw socket, or tap device. -- cgit From 5db48a8d01319620d390bf6d9da5410be14f98e3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Sep 2018 14:10:52 -0300 Subject: tools headers uapi: Update tools's copy of linux/if_link.h To get the changes in: 3e7a50ceb11e ("net: report min and max mtu network device settings") 2756f68c3149 ("net: bridge: add support for backup port") a25717d2b604 ("xdp: support simultaneous driver and hw XDP attachment") 4f91da26c811 ("xdp: add per mode attributes for attached programs") f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Silencing this libbpf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/if_link.h' differs from latest version at 'include/uapi/linux/if_link.h' Cc: Adrian Hunter Cc: Daniel Borkmann Cc: David Ahern Cc: David S. Miller Cc: Jakub Kicinski Cc: Jiri Olsa Cc: Namhyung Kim Cc: Nikolay Aleksandrov Cc: Steffen Klassert Cc: Stephen Hemminger Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-xd9ztioa894zemv8ag8kg64u@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/if_link.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cf01b6824244..43391e2d1153 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -164,6 +164,8 @@ enum { IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, __IFLA_MAX }; @@ -334,6 +336,7 @@ enum { IFLA_BRPORT_GROUP_FWD_MASK, IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, + IFLA_BRPORT_BACKUP_PORT, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) @@ -459,6 +462,16 @@ enum { #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + enum macsec_validation_type { MACSEC_VALIDATE_DISABLED = 0, MACSEC_VALIDATE_CHECK = 1, @@ -920,6 +933,7 @@ enum { XDP_ATTACHED_DRV, XDP_ATTACHED_SKB, XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, }; enum { @@ -928,6 +942,9 @@ enum { IFLA_XDP_ATTACHED, IFLA_XDP_FLAGS, IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, __IFLA_XDP_MAX, }; -- cgit From 52d0d404d39dd9eac71a181615d6ca15e23d8e38 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 12 Sep 2018 10:04:21 +0800 Subject: geneve: add ttl inherit support Similar with commit 72f6d71e491e6 ("vxlan: add ttl inherit support"), currently ttl == 0 means "use whatever default value" on geneve instead of inherit inner ttl. To respect compatibility with old behavior, let's add a new IFLA_GENEVE_TTL_INHERIT for geneve ttl inherit support. Reported-by: Jianlin Shi Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Reviewed-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/geneve.c | 41 ++++++++++++++++++++++++++++++-------- include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 35 insertions(+), 8 deletions(-) (limited to 'tools/include') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 6acb6b5718b9..6625fabe2c88 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -69,6 +69,7 @@ struct geneve_dev { struct gro_cells gro_cells; bool collect_md; bool use_udp6_rx_checksums; + bool ttl_inherit; }; struct geneve_sock { @@ -843,7 +844,11 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, ttl = key->ttl; } else { tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb); - ttl = key->ttl ? : ip4_dst_hoplimit(&rt->dst); + if (geneve->ttl_inherit) + ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); + else + ttl = key->ttl; + ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); } df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; @@ -889,7 +894,11 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, } else { prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel), ip_hdr(skb), skb); - ttl = key->ttl ? : ip6_dst_hoplimit(dst); + if (geneve->ttl_inherit) + ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); + else + ttl = key->ttl; + ttl = ttl ? : ip6_dst_hoplimit(dst); } err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr)); if (unlikely(err)) @@ -1091,6 +1100,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, + [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1170,7 +1180,8 @@ static bool geneve_dst_addr_equal(struct ip_tunnel_info *a, static int geneve_configure(struct net *net, struct net_device *dev, struct netlink_ext_ack *extack, const struct ip_tunnel_info *info, - bool metadata, bool ipv6_rx_csum) + bool metadata, bool ipv6_rx_csum, + bool ttl_inherit) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); @@ -1219,6 +1230,7 @@ static int geneve_configure(struct net *net, struct net_device *dev, geneve->info = *info; geneve->collect_md = metadata; geneve->use_udp6_rx_checksums = ipv6_rx_csum; + geneve->ttl_inherit = ttl_inherit; err = register_netdevice(dev); if (err) @@ -1237,7 +1249,8 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack, struct ip_tunnel_info *info, bool *metadata, - bool *use_udp6_rx_checksums, bool changelink) + bool *use_udp6_rx_checksums, bool *ttl_inherit, + bool changelink) { int attrtype; @@ -1315,6 +1328,9 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_GENEVE_TTL]) info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]); + if (data[IFLA_GENEVE_TTL_INHERIT]) + *ttl_inherit = true; + if (data[IFLA_GENEVE_TOS]) info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]); @@ -1438,17 +1454,18 @@ static int geneve_newlink(struct net *net, struct net_device *dev, { bool use_udp6_rx_checksums = false; struct ip_tunnel_info info; + bool ttl_inherit = false; bool metadata = false; int err; init_tnl_info(&info, GENEVE_UDP_PORT); err = geneve_nl2info(tb, data, extack, &info, &metadata, - &use_udp6_rx_checksums, false); + &use_udp6_rx_checksums, &ttl_inherit, false); if (err) return err; err = geneve_configure(net, dev, extack, &info, metadata, - use_udp6_rx_checksums); + use_udp6_rx_checksums, ttl_inherit); if (err) return err; @@ -1511,6 +1528,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_info info; bool metadata; bool use_udp6_rx_checksums; + bool ttl_inherit; int err; /* If the geneve device is configured for metadata (or externally @@ -1523,8 +1541,9 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], memcpy(&info, &geneve->info, sizeof(info)); metadata = geneve->collect_md; use_udp6_rx_checksums = geneve->use_udp6_rx_checksums; + ttl_inherit = geneve->ttl_inherit; err = geneve_nl2info(tb, data, extack, &info, &metadata, - &use_udp6_rx_checksums, true); + &use_udp6_rx_checksums, &ttl_inherit, true); if (err) return err; @@ -1537,6 +1556,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], geneve->info = info; geneve->collect_md = metadata; geneve->use_udp6_rx_checksums = use_udp6_rx_checksums; + geneve->ttl_inherit = ttl_inherit; geneve_unquiesce(geneve, gs4, gs6); return 0; @@ -1562,6 +1582,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ 0; } @@ -1569,6 +1590,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct ip_tunnel_info *info = &geneve->info; + bool ttl_inherit = geneve->ttl_inherit; bool metadata = geneve->collect_md; __u8 tmp_vni[3]; __u32 vni; @@ -1614,6 +1636,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; #endif + if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1650,7 +1675,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, return dev; init_tnl_info(&info, dst_port); - err = geneve_configure(net, dev, NULL, &info, true, true); + err = geneve_configure(net, dev, NULL, &info, true, true, false); if (err) { free_netdev(dev); return ERR_PTR(err); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 29d49b989acd..58faab897201 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -555,6 +555,7 @@ enum { IFLA_GENEVE_UDP_ZERO_CSUM6_TX, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 1c73d63068b1..141cbfdc5865 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -542,6 +542,7 @@ enum { IFLA_GENEVE_UDP_ZERO_CSUM6_TX, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit From 2f965e3fcd4b4159a5ea832d4b5e9ff2550fa571 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:19 -0700 Subject: bpf: sync bpf.h uapi with tools/ This patch syncs tools/include/uapi/linux/bpf.h with the flow dissector definitions from include/uapi/linux/bpf.h Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 66917a4eba27..aa5ccd2385ed 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -152,6 +152,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, }; enum bpf_attach_type { @@ -172,6 +173,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, __MAX_BPF_ATTACH_TYPE }; @@ -2333,6 +2335,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; + struct bpf_flow_keys *flow_keys; }; struct bpf_tunnel_key { @@ -2778,4 +2781,27 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit From 01ab2e91103b8c23dfedfeb799bc8b810d585bd0 Mon Sep 17 00:00:00 2001 From: Ding Xiang Date: Fri, 7 Sep 2018 09:34:41 +0800 Subject: tools include: Adopt PTR_ERR_OR_ZERO from the kernel err.h header Add PTR_ERR_OR_ZERO, so that tools can use it, just like the kernel. Signed-off-by: Ding Xiang Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1536284082-23466-1-git-send-email-dingxiang@cmss.chinamobile.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/err.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools/include') diff --git a/tools/include/linux/err.h b/tools/include/linux/err.h index 7a8b61ad44cb..094649667bae 100644 --- a/tools/include/linux/err.h +++ b/tools/include/linux/err.h @@ -52,4 +52,11 @@ static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } +static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} #endif /* _LINUX_ERR_H */ -- cgit From 080220b687147fd9376878534aba7194f17f6ef5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 18 Sep 2018 10:13:59 -0700 Subject: tools: bpf: fix license for a compat header file libc_compat.h is used by libbpf so make sure it's licensed under LGPL or BSD license. The license change should be OK, I'm the only author of the file. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/include/tools/libc_compat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/tools/libc_compat.h b/tools/include/tools/libc_compat.h index 664ced8cb1b0..e907ba6f15e5 100644 --- a/tools/include/tools/libc_compat.h +++ b/tools/include/tools/libc_compat.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: (LGPL-2.0+ OR BSD-2-Clause) /* Copyright (C) 2018 Netronome Systems, Inc. */ #ifndef __TOOLS_LIBC_COMPAT_H -- cgit From 25025e0aab2f91a93a1557b299a79814559b4dc2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:48 +0000 Subject: bpf: sync include/uapi/linux/bpf.h to tools/include/uapi/linux/bpf.h The sync is required due to the appearance of a new map type: BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, which implements per-cpu cgroup local storage. Signed-off-by: Roman Gushchin Acked-by: Song Liu Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index aa5ccd2385ed..e2070d819e04 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -127,6 +127,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, }; enum bpf_prog_type { -- cgit From 6acc9b432e6714d72d7d77ec7c27f6f8358d0c71 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:36 -0700 Subject: bpf: Add helper to retrieve socket in BPF This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and bpf_sk_lookup_udp() which allows BPF programs to find out if there is a socket listening on this host, and returns a socket pointer which the BPF program can then access to determine, for instance, whether to forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the socket, so when a BPF program makes use of this function, it must subsequently pass the returned pointer into the newly added sk_release() to return the reference. By way of example, the following pseudocode would filter inbound connections at XDP if there is no corresponding service listening for the traffic: struct bpf_sock_tuple tuple; struct bpf_sock_ops *sk; populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0); if (!sk) { // Couldn't find a socket listening for this traffic. Drop. return TC_ACT_SHOT; } bpf_sk_release(sk, 0); return TC_ACT_OK; Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 93 +++++++++++++++++- kernel/bpf/verifier.c | 8 +- net/core/filter.c | 151 ++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 93 +++++++++++++++++- tools/testing/selftests/bpf/bpf_helpers.h | 12 +++ 5 files changed, 354 insertions(+), 3 deletions(-) (limited to 'tools/include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2070d819e04..f9187b41dff6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2144,6 +2144,77 @@ union bpf_attr { * request in the skb. * Return * 0 on success, or a negative error in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * int bpf_sk_release(struct bpf_sock *sk) + * Description + * Release the reference held by *sock*. *sock* must be a non-NULL + * pointer that was returned from bpf_sk_lookup_xxx\ (). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2229,7 +2300,10 @@ union bpf_attr { FN(get_current_cgroup_id), \ FN(get_local_storage), \ FN(sk_select_reuseport), \ - FN(skb_ancestor_cgroup_id), + FN(skb_ancestor_cgroup_id), \ + FN(sk_lookup_tcp), \ + FN(sk_lookup_udp), \ + FN(sk_release), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2399,6 +2473,23 @@ struct bpf_sock { */ }; +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd0d8bc00bd1..73c81bef6ae8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type * passes through a NULL-check conditional. For the branch wherein the state is * changed to CONST_IMM, the verifier releases the reference. + * + * For each helper function that allocates a reference, such as + * bpf_sk_lookup_tcp(), there is a corresponding release function, such as + * bpf_sk_release(). When a reference type passes into the release function, + * the verifier also releases the reference. If any unchecked or unreleased + * reference remains at the end of the program, the verifier rejects it. */ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return false; + return func_id == BPF_FUNC_sk_release; } /* string representation of 'enum bpf_reg_type' */ diff --git a/net/core/filter.c b/net/core/filter.c index b2cb186252e4..591c698bc517 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -58,13 +58,17 @@ #include #include #include +#include #include #include #include +#include +#include #include #include #include #include +#include #include #include #include @@ -4813,6 +4817,141 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ +struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, + struct sk_buff *skb, u8 family, u8 proto) +{ + int dif = skb->dev->ifindex; + bool refcounted = false; + struct sock *sk = NULL; + + if (family == AF_INET) { + __be32 src4 = tuple->ipv4.saddr; + __be32 dst4 = tuple->ipv4.daddr; + int sdif = inet_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &refcounted); + else + sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &udp_table, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; + struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; + int sdif = inet6_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, &refcounted); + else + sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, &udp_table, skb); +#endif + } + + if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + sk = NULL; + } + return sk; +} + +/* bpf_sk_lookup performs the core lookup for different types of sockets, + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. + * Returns the socket as an 'unsigned long' to simplify the casting in the + * callers to satisfy BPF_CALL declarations. + */ +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + struct sock *sk = NULL; + u8 family = AF_UNSPEC; + struct net *net; + + family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; + if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + goto out; + + if (skb->dev) + caller_net = dev_net(skb->dev); + else + caller_net = sock_net(skb->sk); + if (netns_id) { + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + sk = sk_lookup(net, tuple, skb, family, proto); + put_net(net); + } else { + net = caller_net; + sk = sk_lookup(net, tuple, skb, family, proto); + } + + if (sk) + sk = sk_to_full_sk(sk); +out: + return (unsigned long) sk; +} + +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { + .func = bpf_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { + .func = bpf_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_release, struct sock *, sk) +{ + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return 0; +} + +static const struct bpf_func_proto bpf_sk_release_proto = { + .func = bpf_sk_release, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || @@ -5019,6 +5158,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; default: return bpf_base_func_proto(func_id); } @@ -5119,6 +5264,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_hash_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e2070d819e04..f9187b41dff6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2144,6 +2144,77 @@ union bpf_attr { * request in the skb. * Return * 0 on success, or a negative error in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * int bpf_sk_release(struct bpf_sock *sk) + * Description + * Release the reference held by *sock*. *sock* must be a non-NULL + * pointer that was returned from bpf_sk_lookup_xxx\ (). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2229,7 +2300,10 @@ union bpf_attr { FN(get_current_cgroup_id), \ FN(get_local_storage), \ FN(sk_select_reuseport), \ - FN(skb_ancestor_cgroup_id), + FN(skb_ancestor_cgroup_id), \ + FN(sk_lookup_tcp), \ + FN(sk_lookup_udp), \ + FN(sk_release), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2399,6 +2473,23 @@ struct bpf_sock { */ }; +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index e4be7730222d..1d407b3494f9 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -143,6 +143,18 @@ static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = (void *) BPF_FUNC_skb_cgroup_id; static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = (void *) BPF_FUNC_skb_ancestor_cgroup_id; +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned int netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_sk_lookup_tcp; +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned int netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_sk_lookup_udp; +static int (*bpf_sk_release)(struct bpf_sock *sk) = + (void *) BPF_FUNC_sk_release; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit From 25fe15e54fe5e15b4963fe101f7cd8bad4f11393 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 8 Oct 2018 12:09:14 -0300 Subject: tools headers uapi: Sync kvm.h copy To pick up the changes introduced in: 6fbbde9a1969 ("KVM: x86: Control guest reads of MSR_PLATFORM_INFO") That is not yet used in tools such as 'perf trace'. The type of the change in this file, a simple integer parameter to the KVM_CHECK_EXTENSION ioctl should be easier to implement tho, adding to the libbeauty TODO list. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Adrian Hunter Cc: David Ahern Cc: Drew Schmitt Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-67h1bio5bihi1q6dy7hgwwx8@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 07548de5c988..251be353f950 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -952,6 +952,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_HPAGE_1M 156 #define KVM_CAP_NESTED_STATE 157 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 +#define KVM_CAP_MSR_PLATFORM_INFO 159 #ifdef KVM_CAP_IRQ_ROUTING -- cgit From ba4aa02b417f08a0bee5e7b8ed70cac788a7c854 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 25 Sep 2018 10:55:59 -0300 Subject: tools include: Adopt linux/bits.h So that we reduce the difference of tools/include/linux/bitops.h to the original kernel file, include/linux/bitops.h, trying to remove the need to define BITS_PER_LONG, to avoid clashes with asm/bitsperlong.h. And the things removed from tools/include/linux/bitops.h are really in linux/bits.h, so that we can have a copy and then tools/perf/check_headers.sh will tell us when new stuff gets added to linux/bits.h so that we can check if it is useful and if any adjustment needs to be done to the tools/{include,arch}/ copies. Cc: Adrian Hunter Cc: Alexander Sverdlin Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-y1sqyydvfzo0bjjoj4zsl562@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/bitops.h | 7 ++----- tools/include/linux/bits.h | 26 ++++++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + 3 files changed, 29 insertions(+), 5 deletions(-) create mode 100644 tools/include/linux/bits.h (limited to 'tools/include') diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index acc704bd3998..0b0ef3abc966 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -3,8 +3,6 @@ #define _TOOLS_LINUX_BITOPS_H_ #include -#include - #ifndef __WORDSIZE #define __WORDSIZE (__SIZEOF_LONG__ * 8) #endif @@ -12,10 +10,9 @@ #ifndef BITS_PER_LONG # define BITS_PER_LONG __WORDSIZE #endif +#include +#include -#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) -#define BITS_PER_BYTE 8 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64)) #define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u32)) diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h new file mode 100644 index 000000000000..2b7b532c1d51 --- /dev/null +++ b/tools/include/linux/bits.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_BITS_H +#define __LINUX_BITS_H +#include + +#define BIT(nr) (1UL << (nr)) +#define BIT_ULL(nr) (1ULL << (nr)) +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) +#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) +#define BITS_PER_BYTE 8 + +/* + * Create a contiguous bitmask starting at bit position @l and ending at + * position @h. For example + * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. + */ +#define GENMASK(h, l) \ + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) + +#define GENMASK_ULL(h, l) \ + (((~0ULL) - (1ULL << (l)) + 1) & \ + (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) + +#endif /* __LINUX_BITS_H */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 466540ee8ea7..c72cc73a6b09 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -14,6 +14,7 @@ include/uapi/linux/sched.h include/uapi/linux/stat.h include/uapi/linux/vhost.h include/uapi/sound/asound.h +include/linux/bits.h include/linux/hash.h include/uapi/linux/hw_breakpoint.h arch/x86/include/asm/disabled-features.h -- cgit From 421f4292f46e871cdcf8bdc3e6fdfcefe2e81a9d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 16 Oct 2018 15:59:36 +0200 Subject: bpf, tls: add tls header to tools infrastructure Andrey reported a build error for the BPF kselftest suite when compiled on a machine which does not have tls related header bits installed natively: test_sockmap.c:120:23: fatal error: linux/tls.h: No such file or directory #include ^ compilation terminated. Fix it by adding the header to the tools include infrastructure and add definitions such as SOL_TLS that could potentially be missing. Fixes: e9dd904708c4 ("bpf: add tls support for testing in test_sockmap") Reported-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/tls.h | 78 ++++++++++++++++++++++++++++++ tools/testing/selftests/bpf/test_sockmap.c | 13 +++-- 2 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 tools/include/uapi/linux/tls.h (limited to 'tools/include') diff --git a/tools/include/uapi/linux/tls.h b/tools/include/uapi/linux/tls.h new file mode 100644 index 000000000000..ff02287495ac --- /dev/null +++ b/tools/include/uapi/linux/tls.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UAPI_LINUX_TLS_H +#define _UAPI_LINUX_TLS_H + +#include + +/* TLS socket options */ +#define TLS_TX 1 /* Set transmit parameters */ +#define TLS_RX 2 /* Set receive parameters */ + +/* Supported versions */ +#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) +#define TLS_VERSION_MAJOR(ver) (((ver) >> 8) & 0xFF) + +#define TLS_VERSION_NUMBER(id) ((((id##_VERSION_MAJOR) & 0xFF) << 8) | \ + ((id##_VERSION_MINOR) & 0xFF)) + +#define TLS_1_2_VERSION_MAJOR 0x3 +#define TLS_1_2_VERSION_MINOR 0x3 +#define TLS_1_2_VERSION TLS_VERSION_NUMBER(TLS_1_2) + +/* Supported ciphers */ +#define TLS_CIPHER_AES_GCM_128 51 +#define TLS_CIPHER_AES_GCM_128_IV_SIZE 8 +#define TLS_CIPHER_AES_GCM_128_KEY_SIZE 16 +#define TLS_CIPHER_AES_GCM_128_SALT_SIZE 4 +#define TLS_CIPHER_AES_GCM_128_TAG_SIZE 16 +#define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE 8 + +#define TLS_SET_RECORD_TYPE 1 +#define TLS_GET_RECORD_TYPE 2 + +struct tls_crypto_info { + __u16 version; + __u16 cipher_type; +}; + +struct tls12_crypto_info_aes_gcm_128 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE]; + unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE]; +}; + +#endif /* _UAPI_LINUX_TLS_H */ diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 10a5fa83c75a..7cb69ce6dfa2 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,13 @@ int running; static void running_handler(int a); +#ifndef TCP_ULP +# define TCP_ULP 31 +#endif +#ifndef SOL_TLS +# define SOL_TLS 282 +#endif + /* randomly selected ports for testing on lo */ #define S1_PORT 10000 #define S2_PORT 10001 @@ -114,11 +122,6 @@ static void usage(char *argv[]) printf("\n"); } -#define TCP_ULP 31 -#define TLS_TX 1 -#define TLS_RX 2 -#include - char *sock_to_string(int s) { if (s == c1) -- cgit From c939989d74e27f44229a51d8fe96b5c297965bfa Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 16 Oct 2018 18:50:10 +0200 Subject: tools/headers: update kvm.h Pick up the latest kvm.h definitions. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- tools/include/uapi/linux/kvm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 07548de5c988..2875ce85b322 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -719,6 +719,7 @@ struct kvm_ppc_one_seg_page_size { #define KVM_PPC_PAGE_SIZES_REAL 0x00000001 #define KVM_PPC_1T_SEGMENTS 0x00000002 +#define KVM_PPC_NO_HASH 0x00000004 struct kvm_ppc_smmu_info { __u64 flags; @@ -952,6 +953,11 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_HPAGE_1M 156 #define KVM_CAP_NESTED_STATE 157 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 +#define KVM_CAP_MSR_PLATFORM_INFO 159 +#define KVM_CAP_PPC_NESTED_HV 160 +#define KVM_CAP_HYPERV_SEND_IPI 161 +#define KVM_CAP_COALESCED_PIO 162 +#define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 #ifdef KVM_CAP_IRQ_ROUTING -- cgit From b55cbc8d9b44aaee94f19e995a5f241d453763ee Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 17 Oct 2018 16:24:48 +0200 Subject: bpf: fix doc of bpf_skb_adjust_room() in uapi len_diff is signed. Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)") CC: Quentin Monnet Signed-off-by: Nicolas Dichtel Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f9187b41dff6..5e46f6732781 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1433,7 +1433,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f9187b41dff6..5e46f6732781 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1433,7 +1433,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. -- cgit From da4e1b15f67613eca7ae998abd996cfee7cff1ba Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:36 +0200 Subject: Sync uapi/bpf.h to tools/include Sync both files. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5e46f6732781..a2fb333290dc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -103,6 +103,7 @@ enum bpf_cmd { BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, }; enum bpf_map_type { @@ -128,6 +129,8 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, }; enum bpf_prog_type { @@ -462,6 +465,28 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is removed to + * make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -2303,7 +2328,10 @@ union bpf_attr { FN(skb_ancestor_cgroup_id), \ FN(sk_lookup_tcp), \ FN(sk_lookup_udp), \ - FN(sk_release), + FN(sk_release), \ + FN(map_push_elem), \ + FN(map_pop_elem), \ + FN(map_peek_elem), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit From 09d62154f61316f7e97eae3f31ef8770c7e4b386 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Oct 2018 15:51:02 +0200 Subject: tools, perf: add and use optimized ring_buffer_{read_head, write_tail} helpers Currently, on x86-64, perf uses LFENCE and MFENCE (rmb() and mb(), respectively) when processing events from the perf ring buffer which is unnecessarily expensive as we can do more lightweight in particular given this is critical fast-path in perf. According to Peter rmb()/mb() were added back then via a94d342b9cb0 ("tools/perf: Add required memory barriers") at a time where kernel still supported chips that needed it, but nowadays support for these has been ditched completely, therefore we can fix them up as well. While for x86-64, replacing rmb() and mb() with smp_*() variants would result in just a compiler barrier for the former and LOCK + ADD for the latter (__sync_synchronize() uses slower MFENCE by the way), Peter suggested we can use smp_{load_acquire,store_release}() instead for architectures where its implementation doesn't resolve in slower smp_mb(). Thus, e.g. in x86-64 we would be able to avoid CPU barrier entirely due to TSO. For architectures where the latter needs to use smp_mb() e.g. on arm, we stick to cheaper smp_rmb() variant for fetching the head. This work adds helpers ring_buffer_read_head() and ring_buffer_write_tail() for tools infrastructure that either switches to smp_load_acquire() for architectures where it is cheaper or uses READ_ONCE() + smp_rmb() barrier for those where it's not in order to fetch the data_head from the perf control page, and it uses smp_store_release() to write the data_tail. Latter is smp_mb() + WRITE_ONCE() combination or a cheaper variant if architecture allows for it. Those that rely on smp_rmb() and smp_mb() can further improve performance in a follow up step by implementing the two under tools/arch/*/include/asm/barrier.h such that they don't have to fallback to rmb() and mb() in tools/include/asm/barrier.h. Switch perf to use ring_buffer_read_head() and ring_buffer_write_tail() so it can make use of the optimizations. Later, we convert libbpf as well to use the same helpers. Side note [0]: the topic has been raised of whether one could simply use the C11 gcc builtins [1] for the smp_load_acquire() and smp_store_release() instead: __atomic_load_n(ptr, __ATOMIC_ACQUIRE); __atomic_store_n(ptr, val, __ATOMIC_RELEASE); Kernel and (presumably) tooling shipped along with the kernel has a minimum requirement of being able to build with gcc-4.6 and the latter does not have C11 builtins. While generally the C11 memory models don't align with the kernel's, the C11 load-acquire and store-release alone /could/ suffice, however. Issue is that this is implementation dependent on how the load-acquire and store-release is done by the compiler and the mapping of supported compilers must align to be compatible with the kernel's implementation, and thus needs to be verified/tracked on a case by case basis whether they match (unless an architecture uses them also from kernel side). The implementations for smp_load_acquire() and smp_store_release() in this patch have been adapted from the kernel side ones to have a concrete and compatible mapping in place. [0] http://patchwork.ozlabs.org/patch/985422/ [1] https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexei Starovoitov --- tools/arch/arm64/include/asm/barrier.h | 70 +++++++++++++++++++++++++++++ tools/arch/ia64/include/asm/barrier.h | 13 ++++++ tools/arch/powerpc/include/asm/barrier.h | 16 +++++++ tools/arch/s390/include/asm/barrier.h | 13 ++++++ tools/arch/sparc/include/asm/barrier_64.h | 13 ++++++ tools/arch/x86/include/asm/barrier.h | 14 ++++++ tools/include/asm/barrier.h | 35 +++++++++++++++ tools/include/linux/ring_buffer.h | 73 +++++++++++++++++++++++++++++++ tools/perf/util/mmap.h | 15 ++----- 9 files changed, 250 insertions(+), 12 deletions(-) create mode 100644 tools/include/linux/ring_buffer.h (limited to 'tools/include') diff --git a/tools/arch/arm64/include/asm/barrier.h b/tools/arch/arm64/include/asm/barrier.h index 40bde6b23501..12835ea0e417 100644 --- a/tools/arch/arm64/include/asm/barrier.h +++ b/tools/arch/arm64/include/asm/barrier.h @@ -14,4 +14,74 @@ #define wmb() asm volatile("dmb ishst" ::: "memory") #define rmb() asm volatile("dmb ishld" ::: "memory") +#define smp_store_release(p, v) \ +do { \ + union { typeof(*p) __val; char __c[1]; } __u = \ + { .__val = (__force typeof(*p)) (v) }; \ + \ + switch (sizeof(*p)) { \ + case 1: \ + asm volatile ("stlrb %w1, %0" \ + : "=Q" (*p) \ + : "r" (*(__u8 *)__u.__c) \ + : "memory"); \ + break; \ + case 2: \ + asm volatile ("stlrh %w1, %0" \ + : "=Q" (*p) \ + : "r" (*(__u16 *)__u.__c) \ + : "memory"); \ + break; \ + case 4: \ + asm volatile ("stlr %w1, %0" \ + : "=Q" (*p) \ + : "r" (*(__u32 *)__u.__c) \ + : "memory"); \ + break; \ + case 8: \ + asm volatile ("stlr %1, %0" \ + : "=Q" (*p) \ + : "r" (*(__u64 *)__u.__c) \ + : "memory"); \ + break; \ + default: \ + /* Only to shut up gcc ... */ \ + mb(); \ + break; \ + } \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + union { typeof(*p) __val; char __c[1]; } __u; \ + \ + switch (sizeof(*p)) { \ + case 1: \ + asm volatile ("ldarb %w0, %1" \ + : "=r" (*(__u8 *)__u.__c) \ + : "Q" (*p) : "memory"); \ + break; \ + case 2: \ + asm volatile ("ldarh %w0, %1" \ + : "=r" (*(__u16 *)__u.__c) \ + : "Q" (*p) : "memory"); \ + break; \ + case 4: \ + asm volatile ("ldar %w0, %1" \ + : "=r" (*(__u32 *)__u.__c) \ + : "Q" (*p) : "memory"); \ + break; \ + case 8: \ + asm volatile ("ldar %0, %1" \ + : "=r" (*(__u64 *)__u.__c) \ + : "Q" (*p) : "memory"); \ + break; \ + default: \ + /* Only to shut up gcc ... */ \ + mb(); \ + break; \ + } \ + __u.__val; \ +}) + #endif /* _TOOLS_LINUX_ASM_AARCH64_BARRIER_H */ diff --git a/tools/arch/ia64/include/asm/barrier.h b/tools/arch/ia64/include/asm/barrier.h index d808ee0e77b5..4d471d9511a5 100644 --- a/tools/arch/ia64/include/asm/barrier.h +++ b/tools/arch/ia64/include/asm/barrier.h @@ -46,4 +46,17 @@ #define rmb() mb() #define wmb() mb() +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* _TOOLS_LINUX_ASM_IA64_BARRIER_H */ diff --git a/tools/arch/powerpc/include/asm/barrier.h b/tools/arch/powerpc/include/asm/barrier.h index a634da05bc97..905a2c66d96d 100644 --- a/tools/arch/powerpc/include/asm/barrier.h +++ b/tools/arch/powerpc/include/asm/barrier.h @@ -27,4 +27,20 @@ #define rmb() __asm__ __volatile__ ("sync" : : : "memory") #define wmb() __asm__ __volatile__ ("sync" : : : "memory") +#if defined(__powerpc64__) +#define smp_lwsync() __asm__ __volatile__ ("lwsync" : : : "memory") + +#define smp_store_release(p, v) \ +do { \ + smp_lwsync(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + smp_lwsync(); \ + ___p1; \ +}) +#endif /* defined(__powerpc64__) */ #endif /* _TOOLS_LINUX_ASM_POWERPC_BARRIER_H */ diff --git a/tools/arch/s390/include/asm/barrier.h b/tools/arch/s390/include/asm/barrier.h index 5030c99f47d2..de362fa664d4 100644 --- a/tools/arch/s390/include/asm/barrier.h +++ b/tools/arch/s390/include/asm/barrier.h @@ -28,4 +28,17 @@ #define rmb() mb() #define wmb() mb() +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* __TOOLS_LIB_ASM_BARRIER_H */ diff --git a/tools/arch/sparc/include/asm/barrier_64.h b/tools/arch/sparc/include/asm/barrier_64.h index ba61344287d5..cfb0fdc8ccf0 100644 --- a/tools/arch/sparc/include/asm/barrier_64.h +++ b/tools/arch/sparc/include/asm/barrier_64.h @@ -40,4 +40,17 @@ do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ #define rmb() __asm__ __volatile__("":::"memory") #define wmb() __asm__ __volatile__("":::"memory") +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* !(__TOOLS_LINUX_SPARC64_BARRIER_H) */ diff --git a/tools/arch/x86/include/asm/barrier.h b/tools/arch/x86/include/asm/barrier.h index 8774dee27471..58919868473c 100644 --- a/tools/arch/x86/include/asm/barrier.h +++ b/tools/arch/x86/include/asm/barrier.h @@ -26,4 +26,18 @@ #define wmb() asm volatile("sfence" ::: "memory") #endif +#if defined(__x86_64__) +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + barrier(); \ + ___p1; \ +}) +#endif /* defined(__x86_64__) */ #endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */ diff --git a/tools/include/asm/barrier.h b/tools/include/asm/barrier.h index 391d942536e5..8d378c57cb01 100644 --- a/tools/include/asm/barrier.h +++ b/tools/include/asm/barrier.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include #if defined(__i386__) || defined(__x86_64__) #include "../../arch/x86/include/asm/barrier.h" #elif defined(__arm__) @@ -26,3 +27,37 @@ #else #include #endif + +/* + * Generic fallback smp_*() definitions for archs that haven't + * been updated yet. + */ + +#ifndef smp_rmb +# define smp_rmb() rmb() +#endif + +#ifndef smp_wmb +# define smp_wmb() wmb() +#endif + +#ifndef smp_mb +# define smp_mb() mb() +#endif + +#ifndef smp_store_release +# define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) +#endif + +#ifndef smp_load_acquire +# define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + smp_mb(); \ + ___p1; \ +}) +#endif diff --git a/tools/include/linux/ring_buffer.h b/tools/include/linux/ring_buffer.h new file mode 100644 index 000000000000..9a083ae60473 --- /dev/null +++ b/tools/include/linux/ring_buffer.h @@ -0,0 +1,73 @@ +#ifndef _TOOLS_LINUX_RING_BUFFER_H_ +#define _TOOLS_LINUX_RING_BUFFER_H_ + +#include + +/* + * Contract with kernel for walking the perf ring buffer from + * user space requires the following barrier pairing (quote + * from kernel/events/ring_buffer.c): + * + * Since the mmap() consumer (userspace) can run on a + * different CPU: + * + * kernel user + * + * if (LOAD ->data_tail) { LOAD ->data_head + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head STORE ->data_tail + * } + * + * Where A pairs with D, and B pairs with C. + * + * In our case A is a control dependency that separates the + * load of the ->data_tail and the stores of $data. In case + * ->data_tail indicates there is no room in the buffer to + * store $data we do not. + * + * D needs to be a full barrier since it separates the data + * READ from the tail WRITE. + * + * For B a WMB is sufficient since it separates two WRITEs, + * and for C an RMB is sufficient since it separates two READs. + * + * Note, instead of B, C, D we could also use smp_store_release() + * in B and D as well as smp_load_acquire() in C. + * + * However, this optimization does not make sense for all kernel + * supported architectures since for a fair number it would + * resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(), + * and smp_mb() + WRITE_ONCE() pair for smp_store_release(). + * + * Thus for those smp_wmb() in B and smp_rmb() in C would still + * be less expensive. For the case of D this has either the same + * cost or is less expensive, for example, due to TSO x86 can + * avoid the CPU barrier entirely. + */ + +static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base) +{ +/* + * Architectures where smp_load_acquire() does not fallback to + * READ_ONCE() + smp_mb() pair. + */ +#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \ + defined(__ia64__) || defined(__sparc__) && defined(__arch64__) + return smp_load_acquire(&base->data_head); +#else + u64 head = READ_ONCE(base->data_head); + + smp_rmb(); + return head; +#endif +} + +static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base, + u64 tail) +{ + smp_store_release(&base->data_tail, tail); +} + +#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */ diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 05a6d47c7956..8f6531fd4dad 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include "auxtrace.h" #include "event.h" @@ -71,21 +71,12 @@ void perf_mmap__consume(struct perf_mmap *map); static inline u64 perf_mmap__read_head(struct perf_mmap *mm) { - struct perf_event_mmap_page *pc = mm->base; - u64 head = READ_ONCE(pc->data_head); - rmb(); - return head; + return ring_buffer_read_head(mm->base); } static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail) { - struct perf_event_mmap_page *pc = md->base; - - /* - * ensure all reads are done before we write the tail out. - */ - mb(); - pc->data_tail = tail; + ring_buffer_write_tail(md->base, tail); } union perf_event *perf_mmap__read_forward(struct perf_mmap *map); -- cgit From f908d26b2c41d9a924371099c4979e4b5d385165 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Oct 2018 19:56:50 -0700 Subject: bpf: libbpf support for msg_push_data Add support for new bpf_msg_push_data in libbpf. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 20 +++++++++++++++++++- tools/testing/selftests/bpf/bpf_helpers.h | 2 ++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a2fb333290dc..852dc17ab47a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2240,6 +2240,23 @@ union bpf_attr { * pointer that was returned from bpf_sk_lookup_xxx\ (). * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into msg at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the msg. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2331,7 +2348,8 @@ union bpf_attr { FN(sk_release), \ FN(map_push_elem), \ FN(map_pop_elem), \ - FN(map_peek_elem), + FN(map_peek_elem), \ + FN(msg_push_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 6407a3df0f3b..686e57ce40f4 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -111,6 +111,8 @@ static int (*bpf_msg_cork_bytes)(void *ctx, int len) = (void *) BPF_FUNC_msg_cork_bytes; static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) = (void *) BPF_FUNC_msg_pull_data; +static int (*bpf_msg_push_data)(void *ctx, int start, int end, int flags) = + (void *) BPF_FUNC_msg_push_data; static int (*bpf_bind)(void *ctx, void *addr, int addr_len) = (void *) BPF_FUNC_bind; static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) = -- cgit From ad3d6c7263e368abdc151e1cc13dc78aa39cc7a7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 7 Nov 2017 14:57:46 -0500 Subject: xarray: Add XArray load operation The xa_load function brings with it a lot of infrastructure; xa_empty(), xa_is_err(), and large chunks of the XArray advanced API that are used to implement xa_load. As the test-suite demonstrates, it is possible to use the XArray functions on a radix tree. The radix tree functions depend on the GFP flags being stored in the root of the tree, so it's not possible to use the radix tree functions on an XArray. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 336 ++++++++++++++++++++++++++++++ lib/Kconfig.debug | 3 + lib/Makefile | 1 + lib/radix-tree.c | 43 ---- lib/test_xarray.c | 87 ++++++++ lib/xarray.c | 195 +++++++++++++++++ tools/include/linux/kernel.h | 1 + tools/testing/radix-tree/.gitignore | 1 + tools/testing/radix-tree/Makefile | 6 +- tools/testing/radix-tree/linux/kernel.h | 1 + tools/testing/radix-tree/linux/rcupdate.h | 2 + tools/testing/radix-tree/main.c | 1 + tools/testing/radix-tree/test.h | 1 + tools/testing/radix-tree/xarray.c | 28 +++ 14 files changed, 661 insertions(+), 45 deletions(-) create mode 100644 lib/test_xarray.c (limited to 'tools/include') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 52141dfc5a90..a0df8217068c 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -30,6 +32,10 @@ * * 0-62: Sibling entries * 256: Retry entry + * + * Errors are also represented as internal entries, but use the negative + * space (-4094 to -2). They're never stored in the slots array; only + * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) @@ -155,6 +161,42 @@ static inline bool xa_is_internal(const void *entry) return ((unsigned long)entry & 3) == 2; } +/** + * xa_is_err() - Report whether an XArray operation returned an error + * @entry: Result from calling an XArray function + * + * If an XArray operation cannot complete an operation, it will return + * a special value indicating an error. This function tells you + * whether an error occurred; xa_err() tells you which error occurred. + * + * Context: Any context. + * Return: %true if the entry indicates an error. + */ +static inline bool xa_is_err(const void *entry) +{ + return unlikely(xa_is_internal(entry)); +} + +/** + * xa_err() - Turn an XArray result into an errno. + * @entry: Result from calling an XArray function. + * + * If an XArray operation cannot complete an operation, it will return + * a special pointer value which encodes an errno. This function extracts + * the errno from the pointer value, or returns 0 if the pointer does not + * represent an errno. + * + * Context: Any context. + * Return: A negative errno or 0. + */ +static inline int xa_err(void *entry) +{ + /* xa_to_internal() would not do sign extension. */ + if (xa_is_err(entry)) + return (long)entry >> 2; + return 0; +} + /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. @@ -209,6 +251,7 @@ struct xarray { #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) void xa_init_flags(struct xarray *, gfp_t flags); +void *xa_load(struct xarray *, unsigned long index); /** * xa_init() - Initialise an empty XArray. @@ -223,6 +266,18 @@ static inline void xa_init(struct xarray *xa) xa_init_flags(xa, 0); } +/** + * xa_empty() - Determine if an array has any present entries. + * @xa: XArray. + * + * Context: Any context. + * Return: %true if the array contains only NULL pointers. + */ +static inline bool xa_empty(const struct xarray *xa) +{ + return xa->xa_head == NULL; +} + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) @@ -280,6 +335,65 @@ struct xa_node { }; }; +void xa_dump(const struct xarray *); +void xa_dump_node(const struct xa_node *); + +#ifdef XA_DEBUG +#define XA_BUG_ON(xa, x) do { \ + if (x) { \ + xa_dump(xa); \ + BUG(); \ + } \ + } while (0) +#define XA_NODE_BUG_ON(node, x) do { \ + if (x) { \ + if (node) xa_dump_node(node); \ + BUG(); \ + } \ + } while (0) +#else +#define XA_BUG_ON(xa, x) do { } while (0) +#define XA_NODE_BUG_ON(node, x) do { } while (0) +#endif + +/* Private */ +static inline void *xa_head(const struct xarray *xa) +{ + return rcu_dereference_check(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_head_locked(const struct xarray *xa) +{ + return rcu_dereference_protected(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_check(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry_locked(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_protected(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_to_node(const void *entry) +{ + return (struct xa_node *)((unsigned long)entry - 2); +} + /* Private */ static inline bool xa_is_node(const void *entry) { @@ -312,4 +426,226 @@ static inline bool xa_is_sibling(const void *entry) #define XA_RETRY_ENTRY xa_mk_internal(256) +/** + * xa_is_retry() - Is the entry a retry entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a retry entry. + */ +static inline bool xa_is_retry(const void *entry) +{ + return unlikely(entry == XA_RETRY_ENTRY); +} + +/** + * typedef xa_update_node_t - A callback function from the XArray. + * @node: The node which is being processed + * + * This function is called every time the XArray updates the count of + * present and value entries in a node. It allows advanced users to + * maintain the private_list in the node. + * + * Context: The xa_lock is held and interrupts may be disabled. + * Implementations should not drop the xa_lock, nor re-enable + * interrupts. + */ +typedef void (*xa_update_node_t)(struct xa_node *node); + +/* + * The xa_state is opaque to its users. It contains various different pieces + * of state involved in the current operation on the XArray. It should be + * declared on the stack and passed between the various internal routines. + * The various elements in it should not be accessed directly, but only + * through the provided accessor functions. The below documentation is for + * the benefit of those working on the code, not for users of the XArray. + * + * @xa_node usually points to the xa_node containing the slot we're operating + * on (and @xa_offset is the offset in the slots array). If there is a + * single entry in the array at index 0, there are no allocated xa_nodes to + * point to, and so we store %NULL in @xa_node. @xa_node is set to + * the value %XAS_RESTART if the xa_state is not walked to the correct + * position in the tree of nodes for this operation. If an error occurs + * during an operation, it is set to an %XAS_ERROR value. If we run off the + * end of the allocated nodes, it is set to %XAS_BOUNDS. + */ +struct xa_state { + struct xarray *xa; + unsigned long xa_index; + unsigned char xa_shift; + unsigned char xa_sibs; + unsigned char xa_offset; + unsigned char xa_pad; /* Helps gcc generate better code */ + struct xa_node *xa_node; + struct xa_node *xa_alloc; + xa_update_node_t xa_update; +}; + +/* + * We encode errnos in the xas->xa_node. If an error has happened, we need to + * drop the lock to fix it, and once we've done so the xa_state is invalid. + */ +#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) +#define XAS_BOUNDS ((struct xa_node *)1UL) +#define XAS_RESTART ((struct xa_node *)3UL) + +#define __XA_STATE(array, index, shift, sibs) { \ + .xa = array, \ + .xa_index = index, \ + .xa_shift = shift, \ + .xa_sibs = sibs, \ + .xa_offset = 0, \ + .xa_pad = 0, \ + .xa_node = XAS_RESTART, \ + .xa_alloc = NULL, \ + .xa_update = NULL \ +} + +/** + * XA_STATE() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * + * Declare and initialise an xa_state on the stack. + */ +#define XA_STATE(name, array, index) \ + struct xa_state name = __XA_STATE(array, index, 0, 0) + +/** + * XA_STATE_ORDER() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * @order: Order of entry. + * + * Declare and initialise an xa_state on the stack. This variant of + * XA_STATE() allows you to specify the 'order' of the element you + * want to operate on.` + */ +#define XA_STATE_ORDER(name, array, index, order) \ + struct xa_state name = __XA_STATE(array, \ + (index >> order) << order, \ + order - (order % XA_CHUNK_SHIFT), \ + (1U << (order % XA_CHUNK_SHIFT)) - 1) + +#define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) +#define xas_trylock(xas) xa_trylock((xas)->xa) +#define xas_lock(xas) xa_lock((xas)->xa) +#define xas_unlock(xas) xa_unlock((xas)->xa) +#define xas_lock_bh(xas) xa_lock_bh((xas)->xa) +#define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) +#define xas_lock_irq(xas) xa_lock_irq((xas)->xa) +#define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) +#define xas_lock_irqsave(xas, flags) \ + xa_lock_irqsave((xas)->xa, flags) +#define xas_unlock_irqrestore(xas, flags) \ + xa_unlock_irqrestore((xas)->xa, flags) + +/** + * xas_error() - Return an errno stored in the xa_state. + * @xas: XArray operation state. + * + * Return: 0 if no error has been noted. A negative errno if one has. + */ +static inline int xas_error(const struct xa_state *xas) +{ + return xa_err(xas->xa_node); +} + +/** + * xas_set_err() - Note an error in the xa_state. + * @xas: XArray operation state. + * @err: Negative error number. + * + * Only call this function with a negative @err; zero or positive errors + * will probably not behave the way you think they should. If you want + * to clear the error from an xa_state, use xas_reset(). + */ +static inline void xas_set_err(struct xa_state *xas, long err) +{ + xas->xa_node = XA_ERROR(err); +} + +/** + * xas_invalid() - Is the xas in a retry or error state? + * @xas: XArray operation state. + * + * Return: %true if the xas cannot be used for operations. + */ +static inline bool xas_invalid(const struct xa_state *xas) +{ + return (unsigned long)xas->xa_node & 3; +} + +/** + * xas_valid() - Is the xas a valid cursor into the array? + * @xas: XArray operation state. + * + * Return: %true if the xas can be used for operations. + */ +static inline bool xas_valid(const struct xa_state *xas) +{ + return !xas_invalid(xas); +} + +/** + * xas_reset() - Reset an XArray operation state. + * @xas: XArray operation state. + * + * Resets the error or walk state of the @xas so future walks of the + * array will start from the root. Use this if you have dropped the + * xarray lock and want to reuse the xa_state. + * + * Context: Any context. + */ +static inline void xas_reset(struct xa_state *xas) +{ + xas->xa_node = XAS_RESTART; +} + +/** + * xas_retry() - Retry the operation if appropriate. + * @xas: XArray operation state. + * @entry: Entry from xarray. + * + * The advanced functions may sometimes return an internal entry, such as + * a retry entry or a zero entry. This function sets up the @xas to restart + * the walk from the head of the array if needed. + * + * Context: Any context. + * Return: true if the operation needs to be retried. + */ +static inline bool xas_retry(struct xa_state *xas, const void *entry) +{ + if (!xa_is_retry(entry)) + return false; + xas_reset(xas); + return true; +} + +void *xas_load(struct xa_state *); + +/** + * xas_reload() - Refetch an entry from the xarray. + * @xas: XArray operation state. + * + * Use this function to check that a previously loaded entry still has + * the same value. This is useful for the lockless pagecache lookup where + * we walk the array with only the RCU lock to protect us, lock the page, + * then check that the page hasn't moved since we looked it up. + * + * The caller guarantees that @xas is still valid. If it may be in an + * error or restart state, call xas_load() instead. + * + * Return: The entry at this location in the xarray. + */ +static inline void *xas_reload(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (node) + return xa_entry(xas->xa, node, xas->xa_offset); + return xa_head(xas->xa); +} + #endif /* _LINUX_XARRAY_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4966c4fbe7f7..091155e12422 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1813,6 +1813,9 @@ config TEST_BITFIELD config TEST_UUID tristate "Test functions located in the uuid module at runtime" +config TEST_XARRAY + tristate "Test the XArray code at runtime" + config TEST_OVERFLOW tristate "Test check_*_overflow() functions at runtime" diff --git a/lib/Makefile b/lib/Makefile index 057385f1f145..e6f809556af5 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o obj-$(CONFIG_TEST_BITFIELD) += test_bitfield.o obj-$(CONFIG_TEST_UUID) += test_uuid.o +obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o obj-$(CONFIG_TEST_KMOD) += test_kmod.o obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8a568cea1237..b8e961428484 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -256,49 +256,6 @@ static unsigned long next_index(unsigned long index, } #ifndef __KERNEL__ -static void dump_node(struct radix_tree_node *node, unsigned long index) -{ - unsigned long i; - - pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d nr_values %d\n", - node, node->offset, index, index | node_maxindex(node), - node->parent, - node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->nr_values); - - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - unsigned long first = index | (i << node->shift); - unsigned long last = first | ((1UL << node->shift) - 1); - void *entry = node->slots[i]; - if (!entry) - continue; - if (entry == RADIX_TREE_RETRY) { - pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n", - i, first, last, node); - } else if (!radix_tree_is_internal_node(entry)) { - pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", - entry, i, first, last, node); - } else if (xa_is_sibling(entry)) { - pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", - entry, i, first, last, node, - node->slots[xa_to_sibling(entry)]); - } else { - dump_node(entry_to_node(entry), first); - } - } -} - -/* For debug */ -static void radix_tree_dump(struct radix_tree_root *root) -{ - pr_debug("radix root: %p xa_head %p tags %x\n", - root, root->xa_head, - root->xa_flags >> ROOT_TAG_SHIFT); - if (!radix_tree_is_internal_node(root->xa_head)) - return; - dump_node(entry_to_node(root->xa_head), 0); -} - static void dump_ida_node(void *entry, unsigned long index) { unsigned long i; diff --git a/lib/test_xarray.c b/lib/test_xarray.c new file mode 100644 index 000000000000..a7248b87617f --- /dev/null +++ b/lib/test_xarray.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * test_xarray.c: Test the XArray API + * Copyright (c) 2017-2018 Microsoft Corporation + * Author: Matthew Wilcox + */ + +#include +#include + +static unsigned int tests_run; +static unsigned int tests_passed; + +#ifndef XA_DEBUG +# ifdef __KERNEL__ +void xa_dump(const struct xarray *xa) { } +# endif +#undef XA_BUG_ON +#define XA_BUG_ON(xa, x) do { \ + tests_run++; \ + if (x) { \ + printk("BUG at %s:%d\n", __func__, __LINE__); \ + xa_dump(xa); \ + dump_stack(); \ + } else { \ + tests_passed++; \ + } \ +} while (0) +#endif + +static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + radix_tree_insert(xa, index, xa_mk_value(index)); + return NULL; +} + +static void xa_erase_index(struct xarray *xa, unsigned long index) +{ + radix_tree_delete(xa, index); +} + +static noinline void check_xa_load(struct xarray *xa) +{ + unsigned long i, j; + + for (i = 0; i < 1024; i++) { + for (j = 0; j < 1024; j++) { + void *entry = xa_load(xa, j); + if (j < i) + XA_BUG_ON(xa, xa_to_value(entry) != j); + else + XA_BUG_ON(xa, entry); + } + XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL); + } + + for (i = 0; i < 1024; i++) { + for (j = 0; j < 1024; j++) { + void *entry = xa_load(xa, j); + if (j >= i) + XA_BUG_ON(xa, xa_to_value(entry) != j); + else + XA_BUG_ON(xa, entry); + } + xa_erase_index(xa, i); + } + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static RADIX_TREE(array, GFP_KERNEL); + +static int xarray_checks(void) +{ + check_xa_load(&array); + + printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); + return (tests_run == tests_passed) ? 0 : -EINVAL; +} + +static void xarray_exit(void) +{ +} + +module_init(xarray_checks); +module_exit(xarray_exit); +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); diff --git a/lib/xarray.c b/lib/xarray.c index 862f4c64c754..19cfcbc69a68 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -24,6 +24,100 @@ * @entry refers to something stored in a slot in the xarray */ +/* extracts the offset within this node from the index */ +static unsigned int get_offset(unsigned long index, struct xa_node *node) +{ + return (index >> node->shift) & XA_CHUNK_MASK; +} + +/* move the index either forwards (find) or backwards (sibling slot) */ +static void xas_move_index(struct xa_state *xas, unsigned long offset) +{ + unsigned int shift = xas->xa_node->shift; + xas->xa_index &= ~XA_CHUNK_MASK << shift; + xas->xa_index += offset << shift; +} + +static void *set_bounds(struct xa_state *xas) +{ + xas->xa_node = XAS_BOUNDS; + return NULL; +} + +/* + * Starts a walk. If the @xas is already valid, we assume that it's on + * the right path and just return where we've got to. If we're in an + * error state, return NULL. If the index is outside the current scope + * of the xarray, return NULL without changing @xas->xa_node. Otherwise + * set @xas->xa_node to NULL and return the current head of the array. + */ +static void *xas_start(struct xa_state *xas) +{ + void *entry; + + if (xas_valid(xas)) + return xas_reload(xas); + if (xas_error(xas)) + return NULL; + + entry = xa_head(xas->xa); + if (!xa_is_node(entry)) { + if (xas->xa_index) + return set_bounds(xas); + } else { + if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) + return set_bounds(xas); + } + + xas->xa_node = NULL; + return entry; +} + +static void *xas_descend(struct xa_state *xas, struct xa_node *node) +{ + unsigned int offset = get_offset(xas->xa_index, node); + void *entry = xa_entry(xas->xa, node, offset); + + xas->xa_node = node; + if (xa_is_sibling(entry)) { + offset = xa_to_sibling(entry); + entry = xa_entry(xas->xa, node, offset); + } + + xas->xa_offset = offset; + return entry; +} + +/** + * xas_load() - Load an entry from the XArray (advanced). + * @xas: XArray operation state. + * + * Usually walks the @xas to the appropriate state to load the entry + * stored at xa_index. However, it will do nothing and return %NULL if + * @xas is in an error state. xas_load() will never expand the tree. + * + * If the xa_state is set up to operate on a multi-index entry, xas_load() + * may return %NULL or an internal entry, even if there are entries + * present within the range specified by @xas. + * + * Context: Any context. The caller should hold the xa_lock or the RCU lock. + * Return: Usually an entry in the XArray, but see description for exceptions. + */ +void *xas_load(struct xa_state *xas) +{ + void *entry = xas_start(xas); + + while (xa_is_node(entry)) { + struct xa_node *node = xa_to_node(entry); + + if (xas->xa_shift > node->shift) + break; + entry = xas_descend(xas, node); + } + return entry; +} +EXPORT_SYMBOL_GPL(xas_load); + /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. @@ -42,3 +136,104 @@ void xa_init_flags(struct xarray *xa, gfp_t flags) xa->xa_head = NULL; } EXPORT_SYMBOL(xa_init_flags); + +/** + * xa_load() - Load an entry from an XArray. + * @xa: XArray. + * @index: index into array. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry at @index in @xa. + */ +void *xa_load(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + return entry; +} +EXPORT_SYMBOL(xa_load); + +#ifdef XA_DEBUG +void xa_dump_node(const struct xa_node *node) +{ + unsigned i, j; + + if (!node) + return; + if ((unsigned long)node & 3) { + pr_cont("node %px\n", node); + return; + } + + pr_cont("node %px %s %d parent %px shift %d count %d values %d " + "array %px list %px %px marks", + node, node->parent ? "offset" : "max", node->offset, + node->parent, node->shift, node->count, node->nr_values, + node->array, node->private_list.prev, node->private_list.next); + for (i = 0; i < XA_MAX_MARKS; i++) + for (j = 0; j < XA_MARK_LONGS; j++) + pr_cont(" %lx", node->marks[i][j]); + pr_cont("\n"); +} + +void xa_dump_index(unsigned long index, unsigned int shift) +{ + if (!shift) + pr_info("%lu: ", index); + else if (shift >= BITS_PER_LONG) + pr_info("0-%lu: ", ~0UL); + else + pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); +} + +void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) +{ + if (!entry) + return; + + xa_dump_index(index, shift); + + if (xa_is_node(entry)) { + if (shift == 0) { + pr_cont("%px\n", entry); + } else { + unsigned long i; + struct xa_node *node = xa_to_node(entry); + xa_dump_node(node); + for (i = 0; i < XA_CHUNK_SIZE; i++) + xa_dump_entry(node->slots[i], + index + (i << node->shift), node->shift); + } + } else if (xa_is_value(entry)) + pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), + xa_to_value(entry), entry); + else if (!xa_is_internal(entry)) + pr_cont("%px\n", entry); + else if (xa_is_retry(entry)) + pr_cont("retry (%ld)\n", xa_to_internal(entry)); + else if (xa_is_sibling(entry)) + pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); + else + pr_cont("UNKNOWN ENTRY (%px)\n", entry); +} + +void xa_dump(const struct xarray *xa) +{ + void *entry = xa->xa_head; + unsigned int shift = 0; + + pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, + xa->xa_flags, radix_tree_tagged(xa, 0), + radix_tree_tagged(xa, 1), radix_tree_tagged(xa, 2)); + if (xa_is_node(entry)) + shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; + xa_dump_entry(entry, 0, shift); +} +#endif diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 0ad884452c5c..6935ef94e77a 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -70,6 +70,7 @@ #define BUG_ON(cond) assert(!(cond)) #endif #endif +#define BUG() BUG_ON(1) #if __BYTE_ORDER == __BIG_ENDIAN #define cpu_to_le16 bswap_16 diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore index d4706c0ffceb..3834899b6693 100644 --- a/tools/testing/radix-tree/.gitignore +++ b/tools/testing/radix-tree/.gitignore @@ -4,3 +4,4 @@ idr-test main multiorder radix-tree.c +xarray diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index c0cf1c79efd5..1379f1d78d0b 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -4,7 +4,7 @@ CFLAGS += -I. -I../../include -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address \ -fsanitize=undefined LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS+= -lpthread -lurcu -TARGETS = main idr-test multiorder +TARGETS = main idr-test multiorder xarray CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o @@ -25,6 +25,8 @@ main: $(OFILES) idr-test.o: ../../../lib/test_ida.c idr-test: idr-test.o $(CORE_OFILES) +xarray: $(CORE_OFILES) + multiorder: multiorder.o $(CORE_OFILES) clean: @@ -45,7 +47,7 @@ radix-tree.c: ../../../lib/radix-tree.c idr.c: ../../../lib/idr.c sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ -xarray.o: ../../../lib/xarray.c +xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c generated/map-shift.h: @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 426f32f28547..5d06ac75a14d 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -14,6 +14,7 @@ #include "../../../include/linux/kconfig.h" #define printk printf +#define pr_info printk #define pr_debug printk #define pr_cont printk diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h index 73ed33658203..fd280b070fdb 100644 --- a/tools/testing/radix-tree/linux/rcupdate.h +++ b/tools/testing/radix-tree/linux/rcupdate.h @@ -6,5 +6,7 @@ #define rcu_dereference_raw(p) rcu_dereference(p) #define rcu_dereference_protected(p, cond) rcu_dereference(p) +#define rcu_dereference_check(p, cond) rcu_dereference(p) +#define RCU_INIT_POINTER(p, v) (p) = (v) #endif diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index b741686e53d6..09deaf4f0959 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c @@ -365,6 +365,7 @@ int main(int argc, char **argv) rcu_register_thread(); radix_tree_init(); + xarray_tests(); regression1_test(); regression2_test(); regression3_test(); diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 92d901eacf49..e3dc7a16f09b 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -34,6 +34,7 @@ int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *, unsigned iftag, unsigned thentag); unsigned long find_item(struct radix_tree_root *, void *item); +void xarray_tests(void); void tag_check(void); void multiorder_checks(void); void iteration_test(unsigned order, unsigned duration); diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c index 9bbd667172a7..e61e43efe463 100644 --- a/tools/testing/radix-tree/xarray.c +++ b/tools/testing/radix-tree/xarray.c @@ -4,4 +4,32 @@ * Copyright (c) 2018 Matthew Wilcox */ +#define XA_DEBUG +#include "test.h" + +#define module_init(x) +#define module_exit(x) +#define MODULE_AUTHOR(x) +#define MODULE_LICENSE(x) +#define dump_stack() assert(0) + #include "../../../lib/xarray.c" +#undef XA_DEBUG +#include "../../../lib/test_xarray.c" + +void xarray_tests(void) +{ + xarray_checks(); + xarray_exit(); +} + +int __weak main(void) +{ + radix_tree_init(); + xarray_tests(); + radix_tree_cpu_dead(1); + rcu_barrier(); + if (nr_allocated) + printf("nr_allocated = %d\n", nr_allocated); + return 0; +} -- cgit From 9b89a0355144685a787b0dc5bcf7bdd6f2d02968 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 10 Nov 2017 09:34:31 -0500 Subject: xarray: Add XArray marks XArray marks are like the radix tree tags, only slightly more strongly typed. They are renamed in order to distinguish them from tagged pointers. This commit adds the basic get/set/clear operations. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 63 +++++++ lib/test_xarray.c | 34 ++++ lib/xarray.c | 232 +++++++++++++++++++++++++- tools/include/asm-generic/bitops.h | 1 + tools/include/asm-generic/bitops/atomic.h | 9 - tools/include/asm-generic/bitops/non-atomic.h | 109 ++++++++++++ tools/include/linux/spinlock.h | 10 +- 7 files changed, 445 insertions(+), 13 deletions(-) create mode 100644 tools/include/asm-generic/bitops/non-atomic.h (limited to 'tools/include') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index a0df8217068c..2de504ae9ba4 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -197,6 +198,20 @@ static inline int xa_err(void *entry) return 0; } +typedef unsigned __bitwise xa_mark_t; +#define XA_MARK_0 ((__force xa_mark_t)0U) +#define XA_MARK_1 ((__force xa_mark_t)1U) +#define XA_MARK_2 ((__force xa_mark_t)2U) +#define XA_PRESENT ((__force xa_mark_t)8U) +#define XA_MARK_MAX XA_MARK_2 + +/* + * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, + * and we remain compatible with that. + */ +#define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ + (__force unsigned)(mark))) + /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. @@ -252,6 +267,9 @@ struct xarray { void xa_init_flags(struct xarray *, gfp_t flags); void *xa_load(struct xarray *, unsigned long index); +bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_init() - Initialise an empty XArray. @@ -278,6 +296,19 @@ static inline bool xa_empty(const struct xarray *xa) return xa->xa_head == NULL; } +/** + * xa_marked() - Inquire whether any entry in this array has a mark set + * @xa: Array + * @mark: Mark value + * + * Context: Any context. + * Return: %true if any entry has this mark set. + */ +static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) +{ + return xa->xa_flags & XA_FLAGS_MARK(mark); +} + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) @@ -290,6 +321,12 @@ static inline bool xa_empty(const struct xarray *xa) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) +/* + * Versions of the normal API which require the caller to hold the xa_lock. + */ +void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); + /* Everything below here is the Advanced API. Proceed with caution. */ /* @@ -388,6 +425,22 @@ static inline void *xa_entry_locked(const struct xarray *xa, lockdep_is_held(&xa->xa_lock)); } +/* Private */ +static inline struct xa_node *xa_parent(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_check(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_parent_locked(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_protected(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { @@ -588,6 +641,12 @@ static inline bool xas_valid(const struct xa_state *xas) return !xas_invalid(xas); } +/* True if the pointer is something other than a node */ +static inline bool xas_not_node(struct xa_node *node) +{ + return ((unsigned long)node & 3) || !node; +} + /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. @@ -625,6 +684,10 @@ static inline bool xas_retry(struct xa_state *xas, const void *entry) void *xas_load(struct xa_state *); +bool xas_get_mark(const struct xa_state *, xa_mark_t); +void xas_set_mark(const struct xa_state *, xa_mark_t); +void xas_clear_mark(const struct xa_state *, xa_mark_t); + /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. diff --git a/lib/test_xarray.c b/lib/test_xarray.c index a7248b87617f..f8b0e9ba80a4 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -67,11 +67,45 @@ static noinline void check_xa_load(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } +static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) +{ + /* NULL elements have no marks set */ + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + + /* Storing a pointer will not make a mark appear */ + XA_BUG_ON(xa, xa_store_index(xa, index, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0)); + + /* Setting one mark will not set another mark */ + XA_BUG_ON(xa, xa_get_mark(xa, index + 1, XA_MARK_0)); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_1)); + + /* Storing NULL clears marks, and they can't be set again */ + xa_erase_index(xa, index); + XA_BUG_ON(xa, !xa_empty(xa)); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); +} + +static noinline void check_xa_mark(struct xarray *xa) +{ + unsigned long index; + + for (index = 0; index < 16384; index += 4) + check_xa_mark_1(xa, index); +} + static RADIX_TREE(array, GFP_KERNEL); static int xarray_checks(void) { check_xa_load(&array); + check_xa_mark(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run == tests_passed) ? 0 : -EINVAL; diff --git a/lib/xarray.c b/lib/xarray.c index 19cfcbc69a68..aa86c47e532f 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -5,6 +5,7 @@ * Author: Matthew Wilcox */ +#include #include #include @@ -24,6 +25,48 @@ * @entry refers to something stored in a slot in the xarray */ +static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) +{ + if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) + xa->xa_flags |= XA_FLAGS_MARK(mark); +} + +static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark) +{ + if (xa->xa_flags & XA_FLAGS_MARK(mark)) + xa->xa_flags &= ~(XA_FLAGS_MARK(mark)); +} + +static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark) +{ + return node->marks[(__force unsigned)mark]; +} + +static inline bool node_get_mark(struct xa_node *node, + unsigned int offset, xa_mark_t mark) +{ + return test_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_set_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) +{ + return __test_and_set_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_clear_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) +{ + return __test_and_clear_bit(offset, node_marks(node, mark)); +} + +static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) +{ + return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); +} + /* extracts the offset within this node from the index */ static unsigned int get_offset(unsigned long index, struct xa_node *node) { @@ -118,6 +161,85 @@ void *xas_load(struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_load); +/** + * xas_get_mark() - Returns the state of this mark. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Return: true if the mark is set, false if the mark is clear or @xas + * is in an error state. + */ +bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) +{ + if (xas_invalid(xas)) + return false; + if (!xas->xa_node) + return xa_marked(xas->xa, mark); + return node_get_mark(xas->xa_node, xas->xa_offset, mark); +} +EXPORT_SYMBOL_GPL(xas_get_mark); + +/** + * xas_set_mark() - Sets the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Sets the specified mark on this entry, and walks up the tree setting it + * on all the ancestor entries. Does nothing if @xas has not been walked to + * an entry, or is in an error state. + */ +void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (node_set_mark(node, offset, mark)) + return; + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (!xa_marked(xas->xa, mark)) + xa_mark_set(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_set_mark); + +/** + * xas_clear_mark() - Clears the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Clears the specified mark on this entry, and walks back to the head + * attempting to clear it on all the ancestor entries. Does nothing if + * @xas has not been walked to an entry, or is in an error state. + */ +void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (!node_clear_mark(node, offset, mark)) + return; + if (node_any_mark(node, mark)) + return; + + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (xa_marked(xas->xa, mark)) + xa_mark_clear(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_clear_mark); + /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. @@ -160,6 +282,112 @@ void *xa_load(struct xarray *xa, unsigned long index) } EXPORT_SYMBOL(xa_load); +/** + * __xa_set_mark() - Set this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a NULL entry does not succeed. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_set_mark(&xas, mark); +} +EXPORT_SYMBOL_GPL(__xa_set_mark); + +/** + * __xa_clear_mark() - Clear this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_clear_mark(&xas, mark); +} +EXPORT_SYMBOL_GPL(__xa_clear_mark); + +/** + * xa_get_mark() - Inquire whether this mark is set on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * This function uses the RCU read lock, so the result may be out of date + * by the time it returns. If you need the result to be stable, use a lock. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: True if the entry at @index has this mark set, false if it doesn't. + */ +bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + entry = xas_start(&xas); + while (xas_get_mark(&xas, mark)) { + if (!xa_is_node(entry)) + goto found; + entry = xas_descend(&xas, xa_to_node(entry)); + } + rcu_read_unlock(); + return false; + found: + rcu_read_unlock(); + return true; +} +EXPORT_SYMBOL(xa_get_mark); + +/** + * xa_set_mark() - Set this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a NULL entry does not succeed. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_set_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_set_mark); + +/** + * xa_clear_mark() - Clear this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Clearing a mark always succeeds. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_clear_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_clear_mark); + #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { @@ -230,8 +458,8 @@ void xa_dump(const struct xarray *xa) unsigned int shift = 0; pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, - xa->xa_flags, radix_tree_tagged(xa, 0), - radix_tree_tagged(xa, 1), radix_tree_tagged(xa, 2)); + xa->xa_flags, xa_marked(xa, XA_MARK_0), + xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); if (xa_is_node(entry)) shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; xa_dump_entry(entry, 0, shift); diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h index 9bce3b56b5e7..5d2ab38965cc 100644 --- a/tools/include/asm-generic/bitops.h +++ b/tools/include/asm-generic/bitops.h @@ -27,5 +27,6 @@ #include #include +#include #endif /* __TOOLS_ASM_GENERIC_BITOPS_H */ diff --git a/tools/include/asm-generic/bitops/atomic.h b/tools/include/asm-generic/bitops/atomic.h index 21c41ccd1266..2f6ea28764a7 100644 --- a/tools/include/asm-generic/bitops/atomic.h +++ b/tools/include/asm-generic/bitops/atomic.h @@ -15,13 +15,4 @@ static inline void clear_bit(int nr, unsigned long *addr) addr[nr / __BITS_PER_LONG] &= ~(1UL << (nr % __BITS_PER_LONG)); } -static __always_inline int test_bit(unsigned int nr, const unsigned long *addr) -{ - return ((1UL << (nr % __BITS_PER_LONG)) & - (((unsigned long *)addr)[nr / __BITS_PER_LONG])) != 0; -} - -#define __set_bit(nr, addr) set_bit(nr, addr) -#define __clear_bit(nr, addr) clear_bit(nr, addr) - #endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS_ATOMIC_H_ */ diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h new file mode 100644 index 000000000000..7e10c4b50c5d --- /dev/null +++ b/tools/include/asm-generic/bitops/non-atomic.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ +#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ + +#include + +/** + * __set_bit - Set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * Unlike set_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p |= mask; +} + +static inline void __clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p &= ~mask; +} + +/** + * __change_bit - Toggle a bit in memory + * @nr: the bit to change + * @addr: the address to start counting from + * + * Unlike change_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __change_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p ^= mask; +} + +/** + * __test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old = *p; + + *p = old | mask; + return (old & mask) != 0; +} + +/** + * __test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old = *p; + + *p = old & ~mask; + return (old & mask) != 0; +} + +/* WARNING: non atomic and it can be reordered! */ +static inline int __test_and_change_bit(int nr, + volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old = *p; + + *p = old ^ mask; + return (old & mask) != 0; +} + +/** + * test_bit - Determine whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline int test_bit(int nr, const volatile unsigned long *addr) +{ + return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); +} + +#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 1738c0391da4..622266b197d0 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -8,8 +8,14 @@ #define spinlock_t pthread_mutex_t #define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER #define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER -#define spin_lock_init(x) pthread_mutex_init(x, NULL) - +#define spin_lock_init(x) pthread_mutex_init(x, NULL) + +#define spin_lock(x) pthread_mutex_lock(x) +#define spin_unlock(x) pthread_mutex_unlock(x) +#define spin_lock_bh(x) pthread_mutex_lock(x) +#define spin_unlock_bh(x) pthread_mutex_unlock(x) +#define spin_lock_irq(x) pthread_mutex_lock(x) +#define spin_unlock_irq(x) pthread_mutex_unlock(x) #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) -- cgit From 58d6ea3085f2e53714810a513c61629f6d2be0a6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 10 Nov 2017 15:15:08 -0500 Subject: xarray: Add XArray unconditional store operations xa_store() differs from radix_tree_insert() in that it will overwrite an existing element in the array rather than returning an error. This is the behaviour which most users want, and those that want more complex behaviour generally want to use the xas family of routines anyway. For memory allocation, xa_store() will first attempt to request memory from the slab allocator; if memory is not immediately available, it will drop the xa_lock and allocate memory, keeping a pointer in the xa_state. It does not use the per-CPU cache, although those will continue to exist until all radix tree users are converted to the xarray. This patch also includes xa_erase() and __xa_erase() for a streamlined way to store NULL. Since there is no need to allocate memory in order to store a NULL in the XArray, we do not need to trouble the user with deciding what memory allocation flags to use. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 147 ++++++- lib/radix-tree.c | 4 +- lib/test_xarray.c | 177 +++++++- lib/xarray.c | 693 +++++++++++++++++++++++++++++++ tools/include/linux/bitmap.h | 1 + tools/include/linux/spinlock.h | 2 + tools/testing/radix-tree/Makefile | 2 +- tools/testing/radix-tree/bitmap.c | 23 + tools/testing/radix-tree/linux/kernel.h | 4 + tools/testing/radix-tree/linux/lockdep.h | 11 + 10 files changed, 1055 insertions(+), 9 deletions(-) create mode 100644 tools/testing/radix-tree/bitmap.c create mode 100644 tools/testing/radix-tree/linux/lockdep.h (limited to 'tools/include') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 2de504ae9ba4..15eab63286ed 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -205,10 +205,17 @@ typedef unsigned __bitwise xa_mark_t; #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 +enum xa_lock_type { + XA_LOCK_IRQ = 1, + XA_LOCK_BH = 2, +}; + /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ +#define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) +#define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) @@ -267,6 +274,7 @@ struct xarray { void xa_init_flags(struct xarray *, gfp_t flags); void *xa_load(struct xarray *, unsigned long index); +void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); @@ -309,6 +317,23 @@ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) return xa->xa_flags & XA_FLAGS_MARK(mark); } +/** + * xa_erase() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Process context. Takes and releases the xa_lock. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase(struct xarray *xa, unsigned long index) +{ + return xa_store(xa, index, NULL, 0); +} + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) @@ -322,11 +347,65 @@ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) spin_unlock_irqrestore(&(xa)->xa_lock, flags) /* - * Versions of the normal API which require the caller to hold the xa_lock. - */ + * Versions of the normal API which require the caller to hold the + * xa_lock. If the GFP flags allow it, they will drop the lock to + * allocate memory, then reacquire it afterwards. These functions + * may also re-enable interrupts if the XArray flags indicate the + * locking should be interrupt safe. + */ +void *__xa_erase(struct xarray *, unsigned long index); +void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); +/** + * xa_erase_bh() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_bh(xa); + entry = __xa_erase(xa, index); + xa_unlock_bh(xa); + + return entry; +} + +/** + * xa_erase_irq() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_irq(xa); + entry = __xa_erase(xa, index); + xa_unlock_irq(xa); + + return entry; +} + /* Everything below here is the Advanced API. Proceed with caution. */ /* @@ -441,6 +520,12 @@ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, lockdep_is_held(&xa->xa_lock)); } +/* Private */ +static inline void *xa_mk_node(const struct xa_node *node) +{ + return (void *)((unsigned long)node | 2); +} + /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { @@ -647,6 +732,12 @@ static inline bool xas_not_node(struct xa_node *node) return ((unsigned long)node & 3) || !node; } +/* True if the node represents head-of-tree, RESTART or BOUNDS */ +static inline bool xas_top(struct xa_node *node) +{ + return node <= XAS_RESTART; +} + /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. @@ -683,10 +774,14 @@ static inline bool xas_retry(struct xa_state *xas, const void *entry) } void *xas_load(struct xa_state *); +void *xas_store(struct xa_state *, void *entry); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); +void xas_init_marks(const struct xa_state *); + +bool xas_nomem(struct xa_state *, gfp_t); /** * xas_reload() - Refetch an entry from the xarray. @@ -711,4 +806,52 @@ static inline void *xas_reload(struct xa_state *xas) return xa_head(xas->xa); } +/** + * xas_set() - Set up XArray operation state for a different index. + * @xas: XArray operation state. + * @index: New index into the XArray. + * + * Move the operation state to refer to a different index. This will + * have the effect of starting a walk from the top; see xas_next() + * to move to an adjacent index. + */ +static inline void xas_set(struct xa_state *xas, unsigned long index) +{ + xas->xa_index = index; + xas->xa_node = XAS_RESTART; +} + +/** + * xas_set_order() - Set up XArray operation state for a multislot entry. + * @xas: XArray operation state. + * @index: Target of the operation. + * @order: Entry occupies 2^@order indices. + */ +static inline void xas_set_order(struct xa_state *xas, unsigned long index, + unsigned int order) +{ +#ifdef CONFIG_XARRAY_MULTI + xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; + xas->xa_shift = order - (order % XA_CHUNK_SHIFT); + xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; + xas->xa_node = XAS_RESTART; +#else + BUG_ON(order > 0); + xas_set(xas, index); +#endif +} + +/** + * xas_set_update() - Set up XArray operation state for a callback. + * @xas: XArray operation state. + * @update: Function to call when updating a node. + * + * The XArray can notify a caller after it has updated an xa_node. + * This is advanced functionality and is only needed by the page cache. + */ +static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) +{ + xas->xa_update = update; +} + #endif /* _LINUX_XARRAY_H */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index b8e961428484..3479d93c32b9 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -47,7 +47,7 @@ static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; /* * Radix tree node cache. */ -static struct kmem_cache *radix_tree_node_cachep; +struct kmem_cache *radix_tree_node_cachep; /* * The radix tree is variable-height, so an insert operation not only has @@ -365,7 +365,7 @@ out: return ret; } -static void radix_tree_node_rcu_free(struct rcu_head *head) +void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); diff --git a/lib/test_xarray.c b/lib/test_xarray.c index f8b0e9ba80a4..b711030fbc32 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -30,13 +30,49 @@ void xa_dump(const struct xarray *xa) { } static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp) { - radix_tree_insert(xa, index, xa_mk_value(index)); - return NULL; + return xa_store(xa, index, xa_mk_value(index & LONG_MAX), gfp); } static void xa_erase_index(struct xarray *xa, unsigned long index) { - radix_tree_delete(xa, index); + XA_BUG_ON(xa, xa_erase(xa, index) != xa_mk_value(index & LONG_MAX)); + XA_BUG_ON(xa, xa_load(xa, index) != NULL); +} + +/* + * If anyone needs this, please move it to xarray.c. We have no current + * users outside the test suite because all current multislot users want + * to use the advanced API. + */ +static void *xa_store_order(struct xarray *xa, unsigned long index, + unsigned order, void *entry, gfp_t gfp) +{ + XA_STATE_ORDER(xas, xa, index, order); + void *curr; + + do { + xas_lock(&xas); + curr = xas_store(&xas, entry); + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return curr; +} + +static noinline void check_xa_err(struct xarray *xa) +{ + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 0, GFP_NOWAIT)) != 0); + XA_BUG_ON(xa, xa_err(xa_erase(xa, 0)) != 0); +#ifndef __KERNEL__ + /* The kernel does not fail GFP_NOWAIT allocations */ + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM); + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM); +#endif + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_KERNEL)) != 0); + XA_BUG_ON(xa, xa_err(xa_store(xa, 1, xa_mk_value(0), GFP_KERNEL)) != 0); + XA_BUG_ON(xa, xa_err(xa_erase(xa, 1)) != 0); +// kills the test-suite :-( +// XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL); } static noinline void check_xa_load(struct xarray *xa) @@ -69,6 +105,9 @@ static noinline void check_xa_load(struct xarray *xa) static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) { + unsigned int order; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 8 : 1; + /* NULL elements have no marks set */ XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); xa_set_mark(xa, index, XA_MARK_0); @@ -90,6 +129,37 @@ static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); xa_set_mark(xa, index, XA_MARK_0); XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + + /* + * Storing a multi-index entry over entries with marks gives the + * entire entry the union of the marks + */ + BUG_ON((index % 4) != 0); + for (order = 2; order < max_order; order++) { + unsigned long base = round_down(index, 1UL << order); + unsigned long next = base + (1UL << order); + unsigned long i; + + XA_BUG_ON(xa, xa_store_index(xa, index + 1, GFP_KERNEL)); + xa_set_mark(xa, index + 1, XA_MARK_0); + XA_BUG_ON(xa, xa_store_index(xa, index + 2, GFP_KERNEL)); + xa_set_mark(xa, index + 2, XA_MARK_1); + XA_BUG_ON(xa, xa_store_index(xa, next, GFP_KERNEL)); + xa_store_order(xa, index, order, xa_mk_value(index), + GFP_KERNEL); + for (i = base; i < next; i++) { + XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0)); + XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_1)); + XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_2)); + } + XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_0)); + XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_1)); + XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_2)); + xa_erase_index(xa, index); + xa_erase_index(xa, next); + XA_BUG_ON(xa, !xa_empty(xa)); + } + XA_BUG_ON(xa, !xa_empty(xa)); } static noinline void check_xa_mark(struct xarray *xa) @@ -100,12 +170,111 @@ static noinline void check_xa_mark(struct xarray *xa) check_xa_mark_1(xa, index); } -static RADIX_TREE(array, GFP_KERNEL); +static noinline void check_xa_shrink(struct xarray *xa) +{ + XA_STATE(xas, xa, 1); + struct xa_node *node; + + XA_BUG_ON(xa, !xa_empty(xa)); + XA_BUG_ON(xa, xa_store_index(xa, 0, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL); + + /* + * Check that erasing the entry at 1 shrinks the tree and properly + * marks the node as being deleted. + */ + xas_lock(&xas); + XA_BUG_ON(xa, xas_load(&xas) != xa_mk_value(1)); + node = xas.xa_node; + XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != xa_mk_value(0)); + XA_BUG_ON(xa, xas_store(&xas, NULL) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 1) != NULL); + XA_BUG_ON(xa, xas.xa_node != XAS_BOUNDS); + XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != XA_RETRY_ENTRY); + XA_BUG_ON(xa, xas_load(&xas) != NULL); + xas_unlock(&xas); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); + xa_erase_index(xa, 0); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_multi_store(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned long i, j, k; + unsigned int max_order = (sizeof(long) == 4) ? 30 : 60; + + /* Loading from any position returns the same value */ + xa_store_order(xa, 0, 1, xa_mk_value(0), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 2) != NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 2); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2); + rcu_read_unlock(); + + /* Storing adjacent to the value does not alter the value */ + xa_store(xa, 3, xa, GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 2) != NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 3); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2); + rcu_read_unlock(); + + /* Overwriting multiple indexes works */ + xa_store_order(xa, 0, 2, xa_mk_value(1), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 2) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 3) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 4) != NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 4); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 4); + rcu_read_unlock(); + + /* We can erase multiple values with a single store */ + xa_store_order(xa, 0, 63, NULL, GFP_KERNEL); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Even when the first slot is empty but the others aren't */ + xa_store_index(xa, 1, GFP_KERNEL); + xa_store_index(xa, 2, GFP_KERNEL); + xa_store_order(xa, 0, 2, NULL, GFP_KERNEL); + XA_BUG_ON(xa, !xa_empty(xa)); + + for (i = 0; i < max_order; i++) { + for (j = 0; j < max_order; j++) { + xa_store_order(xa, 0, i, xa_mk_value(i), GFP_KERNEL); + xa_store_order(xa, 0, j, xa_mk_value(j), GFP_KERNEL); + + for (k = 0; k < max_order; k++) { + void *entry = xa_load(xa, (1UL << k) - 1); + if ((i < k) && (j < k)) + XA_BUG_ON(xa, entry != NULL); + else + XA_BUG_ON(xa, entry != xa_mk_value(j)); + } + + xa_erase(xa, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + } + } +#endif +} + +static DEFINE_XARRAY(array); static int xarray_checks(void) { + check_xa_err(&array); check_xa_load(&array); check_xa_mark(&array); + check_xa_shrink(&array); + check_multi_store(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run == tests_passed) ? 0 : -EINVAL; diff --git a/lib/xarray.c b/lib/xarray.c index aa86c47e532f..4596a95ed9cd 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -7,6 +7,8 @@ #include #include +#include +#include #include /* @@ -25,6 +27,31 @@ * @entry refers to something stored in a slot in the xarray */ +static inline unsigned int xa_lock_type(const struct xarray *xa) +{ + return (__force unsigned int)xa->xa_flags & 3; +} + +static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_lock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_lock_bh(xas); + else + xas_lock(xas); +} + +static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_unlock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_unlock_bh(xas); + else + xas_unlock(xas); +} + static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) { if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) @@ -67,6 +94,34 @@ static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); } +#define mark_inc(mark) do { \ + mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ +} while (0) + +/* + * xas_squash_marks() - Merge all marks to the first entry + * @xas: Array operation state. + * + * Set a mark on the first entry if any entry has it set. Clear marks on + * all sibling entries. + */ +static void xas_squash_marks(const struct xa_state *xas) +{ + unsigned int mark = 0; + unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; + + if (!xas->xa_sibs) + return; + + do { + unsigned long *marks = xas->xa_node->marks[mark]; + if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit) + continue; + __set_bit(xas->xa_offset, marks); + bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); + } while (mark++ != (__force unsigned)XA_MARK_MAX); +} + /* extracts the offset within this node from the index */ static unsigned int get_offset(unsigned long index, struct xa_node *node) { @@ -161,6 +216,516 @@ void *xas_load(struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_load); +/* Move the radix tree node cache here */ +extern struct kmem_cache *radix_tree_node_cachep; +extern void radix_tree_node_rcu_free(struct rcu_head *head); + +#define XA_RCU_FREE ((struct xarray *)1) + +static void xa_node_free(struct xa_node *node) +{ + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->array = XA_RCU_FREE; + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); +} + +/* + * xas_destroy() - Free any resources allocated during the XArray operation. + * @xas: XArray operation state. + * + * This function is now internal-only. + */ +static void xas_destroy(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_alloc; + + if (!node) + return; + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + kmem_cache_free(radix_tree_node_cachep, node); + xas->xa_alloc = NULL; +} + +/** + * xas_nomem() - Allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * If we need to add new nodes to the XArray, we try to allocate memory + * with GFP_NOWAIT while holding the lock, which will usually succeed. + * If it fails, @xas is flagged as needing memory to continue. The caller + * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, + * the caller should retry the operation. + * + * Forward progress is guaranteed as one node is allocated here and + * stored in the xa_state where it will be found by xas_alloc(). More + * nodes will likely be found in the slab allocator, but we do not tie + * them up here. + * + * Return: true if memory was needed, and was successfully allocated. + */ +bool xas_nomem(struct xa_state *xas, gfp_t gfp) +{ + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} +EXPORT_SYMBOL_GPL(xas_nomem); + +/* + * __xas_nomem() - Drop locks and allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * Internal variant of xas_nomem(). + * + * Return: true if memory was needed, and was successfully allocated. + */ +static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) + __must_hold(xas->xa->xa_lock) +{ + unsigned int lock_type = xa_lock_type(xas->xa); + + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } + if (gfpflags_allow_blocking(gfp)) { + xas_unlock_type(xas, lock_type); + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + xas_lock_type(xas, lock_type); + } else { + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + } + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} + +static void xas_update(struct xa_state *xas, struct xa_node *node) +{ + if (xas->xa_update) + xas->xa_update(node); + else + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); +} + +static void *xas_alloc(struct xa_state *xas, unsigned int shift) +{ + struct xa_node *parent = xas->xa_node; + struct xa_node *node = xas->xa_alloc; + + if (xas_invalid(xas)) + return NULL; + + if (node) { + xas->xa_alloc = NULL; + } else { + node = kmem_cache_alloc(radix_tree_node_cachep, + GFP_NOWAIT | __GFP_NOWARN); + if (!node) { + xas_set_err(xas, -ENOMEM); + return NULL; + } + } + + if (parent) { + node->offset = xas->xa_offset; + parent->count++; + XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); + xas_update(xas, parent); + } + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->shift = shift; + node->count = 0; + node->nr_values = 0; + RCU_INIT_POINTER(node->parent, xas->xa_node); + node->array = xas->xa; + + return node; +} + +/* + * Use this to calculate the maximum index that will need to be created + * in order to add the entry described by @xas. Because we cannot store a + * multiple-index entry at index 0, the calculation is a little more complex + * than you might expect. + */ +static unsigned long xas_max(struct xa_state *xas) +{ + unsigned long max = xas->xa_index; + +#ifdef CONFIG_XARRAY_MULTI + if (xas->xa_shift || xas->xa_sibs) { + unsigned long mask; + mask = (((xas->xa_sibs + 1UL) << xas->xa_shift) - 1); + max |= mask; + if (mask == max) + max++; + } +#endif + + return max; +} + +/* The maximum index that can be contained in the array without expanding it */ +static unsigned long max_index(void *entry) +{ + if (!xa_is_node(entry)) + return 0; + return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; +} + +static void xas_shrink(struct xa_state *xas) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = xas->xa_node; + + for (;;) { + void *entry; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count != 1) + break; + entry = xa_entry_locked(xa, node, 0); + if (!entry) + break; + if (!xa_is_node(entry) && node->shift) + break; + xas->xa_node = XAS_BOUNDS; + + RCU_INIT_POINTER(xa->xa_head, entry); + + node->count = 0; + node->nr_values = 0; + if (!xa_is_node(entry)) + RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); + xas_update(xas, node); + xa_node_free(node); + if (!xa_is_node(entry)) + break; + node = xa_to_node(entry); + node->parent = NULL; + } +} + +/* + * xas_delete_node() - Attempt to delete an xa_node + * @xas: Array operation state. + * + * Attempts to delete the @xas->xa_node. This will fail if xa->node has + * a non-zero reference count. + */ +static void xas_delete_node(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + for (;;) { + struct xa_node *parent; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count) + break; + + parent = xa_parent_locked(xas->xa, node); + xas->xa_node = parent; + xas->xa_offset = node->offset; + xa_node_free(node); + + if (!parent) { + xas->xa->xa_head = NULL; + xas->xa_node = XAS_BOUNDS; + return; + } + + parent->slots[xas->xa_offset] = NULL; + parent->count--; + XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); + node = parent; + xas_update(xas, node); + } + + if (!node->parent) + xas_shrink(xas); +} + +/** + * xas_free_nodes() - Free this node and all nodes that it references + * @xas: Array operation state. + * @top: Node to free + * + * This node has been removed from the tree. We must now free it and all + * of its subnodes. There may be RCU walkers with references into the tree, + * so we must replace all entries with retry markers. + */ +static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) +{ + unsigned int offset = 0; + struct xa_node *node = top; + + for (;;) { + void *entry = xa_entry_locked(xas->xa, node, offset); + + if (xa_is_node(entry)) { + node = xa_to_node(entry); + offset = 0; + continue; + } + if (entry) + RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); + offset++; + while (offset == XA_CHUNK_SIZE) { + struct xa_node *parent; + + parent = xa_parent_locked(xas->xa, node); + offset = node->offset + 1; + node->count = 0; + node->nr_values = 0; + xas_update(xas, node); + xa_node_free(node); + if (node == top) + return; + node = parent; + } + } +} + +/* + * xas_expand adds nodes to the head of the tree until it has reached + * sufficient height to be able to contain @xas->xa_index + */ +static int xas_expand(struct xa_state *xas, void *head) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = NULL; + unsigned int shift = 0; + unsigned long max = xas_max(xas); + + if (!head) { + if (max == 0) + return 0; + while ((max >> shift) >= XA_CHUNK_SIZE) + shift += XA_CHUNK_SHIFT; + return shift + XA_CHUNK_SHIFT; + } else if (xa_is_node(head)) { + node = xa_to_node(head); + shift = node->shift + XA_CHUNK_SHIFT; + } + xas->xa_node = NULL; + + while (max > max_index(head)) { + xa_mark_t mark = 0; + + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + node = xas_alloc(xas, shift); + if (!node) + return -ENOMEM; + + node->count = 1; + if (xa_is_value(head)) + node->nr_values = 1; + RCU_INIT_POINTER(node->slots[0], head); + + /* Propagate the aggregated mark info to the new child */ + for (;;) { + if (xa_marked(xa, mark)) + node_set_mark(node, 0, mark); + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } + + /* + * Now that the new node is fully initialised, we can add + * it to the tree + */ + if (xa_is_node(head)) { + xa_to_node(head)->offset = 0; + rcu_assign_pointer(xa_to_node(head)->parent, node); + } + head = xa_mk_node(node); + rcu_assign_pointer(xa->xa_head, head); + xas_update(xas, node); + + shift += XA_CHUNK_SHIFT; + } + + xas->xa_node = node; + return shift; +} + +/* + * xas_create() - Create a slot to store an entry in. + * @xas: XArray operation state. + * + * Most users will not need to call this function directly, as it is called + * by xas_store(). It is useful for doing conditional store operations + * (see the xa_cmpxchg() implementation for an example). + * + * Return: If the slot already existed, returns the contents of this slot. + * If the slot was newly created, returns NULL. If it failed to create the + * slot, returns NULL and indicates the error in @xas. + */ +static void *xas_create(struct xa_state *xas) +{ + struct xarray *xa = xas->xa; + void *entry; + void __rcu **slot; + struct xa_node *node = xas->xa_node; + int shift; + unsigned int order = xas->xa_shift; + + if (xas_top(node)) { + entry = xa_head_locked(xa); + xas->xa_node = NULL; + shift = xas_expand(xas, entry); + if (shift < 0) + return NULL; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } else if (xas_error(xas)) { + return NULL; + } else if (node) { + unsigned int offset = xas->xa_offset; + + shift = node->shift; + entry = xa_entry_locked(xa, node, offset); + slot = &node->slots[offset]; + } else { + shift = 0; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } + + while (shift > order) { + shift -= XA_CHUNK_SHIFT; + if (!entry) { + node = xas_alloc(xas, shift); + if (!node) + break; + rcu_assign_pointer(*slot, xa_mk_node(node)); + } else if (xa_is_node(entry)) { + node = xa_to_node(entry); + } else { + break; + } + entry = xas_descend(xas, node); + slot = &node->slots[xas->xa_offset]; + } + + return entry; +} + +static void update_node(struct xa_state *xas, struct xa_node *node, + int count, int values) +{ + if (!node || (!count && !values)) + return; + + node->count += count; + node->nr_values += values; + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); + xas_update(xas, node); + if (count < 0) + xas_delete_node(xas); +} + +/** + * xas_store() - Store this entry in the XArray. + * @xas: XArray operation state. + * @entry: New entry. + * + * If @xas is operating on a multi-index entry, the entry returned by this + * function is essentially meaningless (it may be an internal entry or it + * may be %NULL, even if there are non-NULL entries at some of the indices + * covered by the range). This is not a problem for any current users, + * and can be changed if needed. + * + * Return: The old entry at this index. + */ +void *xas_store(struct xa_state *xas, void *entry) +{ + struct xa_node *node; + void __rcu **slot = &xas->xa->xa_head; + unsigned int offset, max; + int count = 0; + int values = 0; + void *first, *next; + bool value = xa_is_value(entry); + + if (entry) + first = xas_create(xas); + else + first = xas_load(xas); + + if (xas_invalid(xas)) + return first; + node = xas->xa_node; + if (node && (xas->xa_shift < node->shift)) + xas->xa_sibs = 0; + if ((first == entry) && !xas->xa_sibs) + return first; + + next = first; + offset = xas->xa_offset; + max = xas->xa_offset + xas->xa_sibs; + if (node) { + slot = &node->slots[offset]; + if (xas->xa_sibs) + xas_squash_marks(xas); + } + if (!entry) + xas_init_marks(xas); + + for (;;) { + /* + * Must clear the marks before setting the entry to NULL, + * otherwise xas_for_each_marked may find a NULL entry and + * stop early. rcu_assign_pointer contains a release barrier + * so the mark clearing will appear to happen before the + * entry is set to NULL. + */ + rcu_assign_pointer(*slot, entry); + if (xa_is_node(next)) + xas_free_nodes(xas, xa_to_node(next)); + if (!node) + break; + count += !next - !entry; + values += !xa_is_value(first) - !value; + if (entry) { + if (offset == max) + break; + if (!xa_is_sibling(entry)) + entry = xa_mk_sibling(xas->xa_offset); + } else { + if (offset == XA_CHUNK_MASK) + break; + } + next = xa_entry_locked(xas->xa, node, ++offset); + if (!xa_is_sibling(next)) { + if (!entry && (offset > max)) + break; + first = next; + } + slot++; + } + + update_node(xas, node, count, values); + return first; +} +EXPORT_SYMBOL_GPL(xas_store); + /** * xas_get_mark() - Returns the state of this mark. * @xas: XArray operation state. @@ -240,6 +805,30 @@ void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) } EXPORT_SYMBOL_GPL(xas_clear_mark); +/** + * xas_init_marks() - Initialise all marks for the entry + * @xas: Array operations state. + * + * Initialise all marks for the entry specified by @xas. If we're tracking + * free entries with a mark, we need to set it on all entries. All other + * marks are cleared. + * + * This implementation is not as efficient as it could be; we may walk + * up the tree multiple times. + */ +void xas_init_marks(const struct xa_state *xas) +{ + xa_mark_t mark = 0; + + for (;;) { + xas_clear_mark(xas, mark); + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } +} +EXPORT_SYMBOL_GPL(xas_init_marks); + /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. @@ -253,9 +842,19 @@ EXPORT_SYMBOL_GPL(xas_clear_mark); */ void xa_init_flags(struct xarray *xa, gfp_t flags) { + unsigned int lock_type; + static struct lock_class_key xa_lock_irq; + static struct lock_class_key xa_lock_bh; + spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; + + lock_type = xa_lock_type(xa); + if (lock_type == XA_LOCK_IRQ) + lockdep_set_class(&xa->xa_lock, &xa_lock_irq); + else if (lock_type == XA_LOCK_BH) + lockdep_set_class(&xa->xa_lock, &xa_lock_bh); } EXPORT_SYMBOL(xa_init_flags); @@ -282,6 +881,100 @@ void *xa_load(struct xarray *xa, unsigned long index) } EXPORT_SYMBOL(xa_load); +static void *xas_result(struct xa_state *xas, void *curr) +{ + XA_NODE_BUG_ON(xas->xa_node, xa_is_internal(curr)); + if (xas_error(xas)) + curr = xas->xa_node; + return curr; +} + +/** + * __xa_erase() - Erase this entry from the XArray while locked. + * @xa: XArray. + * @index: Index into array. + * + * If the entry at this index is a multi-index entry then all indices will + * be erased, and the entry will no longer be a multi-index entry. + * This function expects the xa_lock to be held on entry. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index. + */ +void *__xa_erase(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + return xas_result(&xas, xas_store(&xas, NULL)); +} +EXPORT_SYMBOL_GPL(__xa_erase); + +/** + * xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * After this function returns, loads from this index will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry + * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation + * failed. + */ +void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + curr = xas_store(&xas, entry); + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(xa_store); + +/** + * __xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + curr = xas_store(&xas, entry); + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_store); + /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index e63662db131b..05dca5c203f3 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -15,6 +15,7 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); +void bitmap_clear(unsigned long *map, unsigned int start, int len); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 622266b197d0..c934572d935c 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -37,4 +37,6 @@ static inline bool arch_spin_is_locked(arch_spinlock_t *mutex) return true; } +#include + #endif diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index 1379f1d78d0b..acf1afa01c5b 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -5,7 +5,7 @@ CFLAGS += -I. -I../../include -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address \ LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS+= -lpthread -lurcu TARGETS = main idr-test multiorder xarray -CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o +CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o diff --git a/tools/testing/radix-tree/bitmap.c b/tools/testing/radix-tree/bitmap.c new file mode 100644 index 000000000000..66ec4a24a203 --- /dev/null +++ b/tools/testing/radix-tree/bitmap.c @@ -0,0 +1,23 @@ +/* lib/bitmap.c pulls in at least two other files. */ + +#include + +void bitmap_clear(unsigned long *map, unsigned int start, int len) +{ + unsigned long *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 5d06ac75a14d..4568248222ae 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -18,4 +18,8 @@ #define pr_debug printk #define pr_cont printk +#define __acquires(x) +#define __releases(x) +#define __must_hold(x) + #endif /* _KERNEL_H */ diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/radix-tree/linux/lockdep.h new file mode 100644 index 000000000000..565fccdfe6e9 --- /dev/null +++ b/tools/testing/radix-tree/linux/lockdep.h @@ -0,0 +1,11 @@ +#ifndef _LINUX_LOCKDEP_H +#define _LINUX_LOCKDEP_H +struct lock_class_key { + unsigned int a; +}; + +static inline void lockdep_set_class(spinlock_t *lock, + struct lock_class_key *key) +{ +} +#endif /* _LINUX_LOCKDEP_H */ -- cgit From f443f38c5789ece6ebe59ae21c27bf861e61c4e2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 24 Oct 2018 14:31:12 -0300 Subject: tools include uapi: Grab a copy of linux/fs.h We'll use it to create tables for the 'flags' argument to the 'mount' and 'umount' syscalls. Add it to check_headers.sh so that when a new protocol gets added we get a notification during the build process. Cc: Adrian Hunter Cc: Benjamin Peterson Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-yacf9jvkwfwg2g95r2us3xb3@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fs.h | 393 ++++++++++++++++++++++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + 2 files changed, 394 insertions(+) create mode 100644 tools/include/uapi/linux/fs.h (limited to 'tools/include') diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h new file mode 100644 index 000000000000..73e01918f996 --- /dev/null +++ b/tools/include/uapi/linux/fs.h @@ -0,0 +1,393 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_FS_H +#define _UAPI_LINUX_FS_H + +/* + * This file has definitions for some important file table structures + * and constants and structures used by various generic file system + * ioctl's. Please do not make any changes in this file before + * sending patches for review to linux-fsdevel@vger.kernel.org and + * linux-api@vger.kernel.org. + */ + +#include +#include +#include + +/* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. + */ + +/* Fixed constants first: */ +#undef NR_OPEN +#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */ +#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */ + +#define BLOCK_SIZE_BITS 10 +#define BLOCK_SIZE (1<string tables with: $ tools/perf/trace/beauty/mount_flags.sh static const char *mount_flags[] = { [4096 ? (ilog2(4096) + 1) : 0] = "BIND", [30 + 1] = "ACTIVE", [31 + 1] = "NOUSER", }; $ # trace -e mount,umount mount --bind /proc /mnt 1.228 ( 2.581 ms): mount/1068 mount(dev_name: /mnt, dir_name: 0x55f011c354a0, type: 0x55f011c38170, flags: BIND) = 0 # trace -e mount,umount umount /proc /mnt umount: /proc: target is busy. 1.587 ( 0.010 ms): umount/1070 umount2(name: /proc) = -1 EBUSY Device or resource busy 1.799 (12.660 ms): umount/1070 umount2(name: /mnt) = 0 # Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Cc: Jason A. Donenfeld Cc: Herbert Xu Link: https://lkml.kernel.org/n/tip-c00bqzclscgah26z2g5zxm73@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index 73e01918f996..a441ea1bfe6d 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -279,8 +279,8 @@ struct fsxattr { #define FS_ENCRYPTION_MODE_AES_256_CTS 4 #define FS_ENCRYPTION_MODE_AES_128_CBC 5 #define FS_ENCRYPTION_MODE_AES_128_CTS 6 -#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 -#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 +#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* Removed, do not use. */ +#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* Removed, do not use. */ struct fscrypt_policy { __u8 version; -- cgit From 8dd4c0f68c0db4c0f01af60a99a7ed34fd3dee2b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 30 Oct 2018 13:10:50 -0300 Subject: tools arch uapi: Update asm-generic/unistd.h and arm64 unistd.h copies To get the changes in: 82b355d161c9 ("y2038: Remove newstat family from default syscall set") Which will make the syscall table used by 'perf trace' for arm64 to be updated from the changes in that patch. This silences these perf build warnings: Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/unistd.h' differs from latest version at 'arch/arm64/include/uapi/asm/unistd.h' diff -u tools/arch/arm64/include/uapi/asm/unistd.h arch/arm64/include/uapi/asm/unistd.h Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Cc: Kim Phillips Cc: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-3euy7c4yy5mvnp5bm16t9vqg@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/unistd.h | 1 + tools/include/uapi/asm-generic/unistd.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'tools/include') diff --git a/tools/arch/arm64/include/uapi/asm/unistd.h b/tools/arch/arm64/include/uapi/asm/unistd.h index 5072cbd15c82..dae1584cf017 100644 --- a/tools/arch/arm64/include/uapi/asm/unistd.h +++ b/tools/arch/arm64/include/uapi/asm/unistd.h @@ -16,5 +16,6 @@ */ #define __ARCH_WANT_RENAMEAT +#define __ARCH_WANT_NEW_STAT #include diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index df4bedb9b01c..538546edbfbd 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -242,10 +242,12 @@ __SYSCALL(__NR_tee, sys_tee) /* fs/stat.c */ #define __NR_readlinkat 78 __SYSCALL(__NR_readlinkat, sys_readlinkat) +#if defined(__ARCH_WANT_NEW_STAT) || defined(__ARCH_WANT_STAT64) #define __NR3264_fstatat 79 __SC_3264(__NR3264_fstatat, sys_fstatat64, sys_newfstatat) #define __NR3264_fstat 80 __SC_3264(__NR3264_fstat, sys_fstat64, sys_newfstat) +#endif /* fs/sync.c */ #define __NR_sync 81 -- cgit From 89eb1f3b7f2a9156ce6f78713d7924c1bb2fab9f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 30 Oct 2018 14:20:07 -0300 Subject: tools include uapi: Update asound.h copy To silence this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/sound/asound.h' differs from latest version at 'include/uapi/sound/asound.h' diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Due to this cset: a98401518def ("ALSA: timer: fix wrong comment to refer to 'SNDRV_TIMER_PSFLG_*'") Cc: Jiri Olsa Cc: Namhyung Kim Cc: Takashi Sakamoto Cc: Takashi Iwai Link: https://lkml.kernel.org/n/tip-76gsvs0w2g0x723ivqa2xua3@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/sound/asound.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index ed0a120d4f08..404d4b9ffe76 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -752,7 +752,7 @@ struct snd_timer_info { #define SNDRV_TIMER_PSFLG_EARLY_EVENT (1<<2) /* write early event to the poll queue */ struct snd_timer_params { - unsigned int flags; /* flags - SNDRV_MIXER_PSFLG_* */ + unsigned int flags; /* flags - SNDRV_TIMER_PSFLG_* */ unsigned int ticks; /* requested resolution in ticks */ unsigned int queue_size; /* total size of queue (32-1024) */ unsigned int reserved0; /* reserved, was: failure locations */ -- cgit From 685626dc26bd9cead850d06520708acbd16bcfda Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 30 Oct 2018 16:50:08 -0300 Subject: tools include uapi: Update linux/mmap.h copy To pick up the changes from: 20916d4636a9 ("mm/hugetlb: add mmap() encodings for 32MB and 512MB page sizes") That do not entail changes in in tools, this just shows that we have to consider bits [26:31] of flags to beautify that in tools like 'perf trace' This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/mman.h' differs from latest version at 'include/uapi/linux/mman.h' diff -u tools/include/uapi/linux/mman.h include/uapi/linux/mman.h Cc: Anshuman Khandual Cc: Greg Kroah-Hartman Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-3rvc39lon93kgt5pl31d8g4x@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/mman.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h index bfd5938fede6..d0f515d53299 100644 --- a/tools/include/uapi/linux/mman.h +++ b/tools/include/uapi/linux/mman.h @@ -28,7 +28,9 @@ #define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB -- cgit From 827758129a0f84fbd0b2dda15e14a77a7604803d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 30 Oct 2018 17:01:46 -0300 Subject: tools headers: Sync the various kvm.h header copies For powerpc, s390, x86 and the main uapi linux/kvm.h header, none of them entail changes in tooling. Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-avn7iy8f4tcm2y40sbsdk31m@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/powerpc/include/uapi/asm/kvm.h | 1 + tools/arch/s390/include/uapi/asm/kvm.h | 2 ++ tools/arch/x86/include/uapi/asm/kvm.h | 6 ++---- tools/include/uapi/linux/kvm.h | 21 +++++++++++++++++++-- 4 files changed, 24 insertions(+), 6 deletions(-) (limited to 'tools/include') diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 1b32b56a03d3..8c876c166ef2 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) +#define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 9a50f02b9894..16511d97e8dc 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -160,6 +160,8 @@ struct kvm_s390_vm_cpu_subfunc { #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 +#define KVM_S390_VM_CRYPTO_ENABLE_APIE 4 +#define KVM_S390_VM_CRYPTO_DISABLE_APIE 5 /* kvm attributes for migration mode */ #define KVM_S390_VM_MIGRATION_STOP 0 diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 8a6eff9c27f3..dabfcf7c3941 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -300,10 +300,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 has_error_code; - union { - __u8 pad; - __u8 pending; - }; + __u8 pending; __u32 error_code; } exception; struct { @@ -387,6 +384,7 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 +#define KVM_STATE_NESTED_EVMCS 0x00000004 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 2875ce85b322..2b7a652c9fa4 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -420,13 +420,19 @@ struct kvm_run { struct kvm_coalesced_mmio_zone { __u64 addr; __u32 size; - __u32 pad; + union { + __u32 pad; + __u32 pio; + }; }; struct kvm_coalesced_mmio { __u64 phys_addr; __u32 len; - __u32 pad; + union { + __u32 pad; + __u32 pio; + }; __u8 data[8]; }; @@ -751,6 +757,15 @@ struct kvm_ppc_resize_hpt { #define KVM_S390_SIE_PAGE_OFFSET 1 +/* + * On arm64, machine type can be used to request the physical + * address size for the VM. Bits[7-0] are reserved for the guest + * PA size shift (i.e, log2(PA_Size)). For backward compatibility, + * value 0 implies the default IPA size, 40bits. + */ +#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL +#define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ + ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) /* * ioctls for /dev/kvm fds: */ @@ -958,6 +973,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYPERV_SEND_IPI 161 #define KVM_CAP_COALESCED_PIO 162 #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 +#define KVM_CAP_EXCEPTION_PAYLOAD 164 +#define KVM_CAP_ARM_VM_IPA_SIZE 165 #ifdef KVM_CAP_IRQ_ROUTING -- cgit From d45a57fff0a657045a77b395ae713ffae0cb4e46 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 30 Oct 2018 17:04:47 -0300 Subject: tools headers uapi: Update linux/netlink.h header copy Picking the changes from: 89d35528d17d ("netlink: Add new socket option to enable strict checking on dumps") To silence this build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/netlink.h' differs from latest version at 'include/uapi/linux/netlink.h' Cc: Alexei Starovoitov Cc: David Ahern Cc: David S. Miller Cc: Eric Leblond Link: https://lkml.kernel.org/n/tip-1xymkfjpmhxfzrs46t8z8mjw@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/netlink.h b/tools/include/uapi/linux/netlink.h index 776bc92e9118..486ed1f0c0bc 100644 --- a/tools/include/uapi/linux/netlink.h +++ b/tools/include/uapi/linux/netlink.h @@ -155,6 +155,7 @@ enum nlmsgerr_attrs { #define NETLINK_LIST_MEMBERSHIPS 9 #define NETLINK_CAP_ACK 10 #define NETLINK_EXT_ACK 11 +#define NETLINK_DUMP_STRICT_CHK 12 struct nl_pktinfo { __u32 group; -- cgit From 76b0b801782b34b3028dcef3de36cb634e3908a8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 30 Oct 2018 17:06:57 -0300 Subject: tools headers uapi: Update linux/if_link.h header copy To pick the changes from: 9163a0fc1f0c ("net: bridge: add support for per-port vlan stats") And silence this build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/if_link.h' differs from latest version at 'include/uapi/linux/if_link.h' Cc: Alexei Starovoitov Cc: David S. Miller Cc: Eric Leblond Cc: Nikolay Aleksandrov Link: https://lkml.kernel.org/n/tip-7p53ghippywz7fqkwo3nkzet@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 58faab897201..1debfa42cba1 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -287,6 +287,7 @@ enum { IFLA_BR_MCAST_STATS_ENABLED, IFLA_BR_MCAST_IGMP_VERSION, IFLA_BR_MCAST_MLD_VERSION, + IFLA_BR_VLAN_STATS_PER_PORT, __IFLA_BR_MAX, }; -- cgit From 4f8f382e635707ddaddf8269a116e4f8cc8835c0 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 30 Oct 2018 22:24:04 -0700 Subject: perf tools: Don't clone maps from parent when synthesizing forks When synthesizing FORK events, we are trying to create thread objects for the already running tasks on the machine. Normally, for a kernel FORK event, we want to clone the parent's maps because that is what the kernel just did. But when synthesizing, this should not be done. If we do, we end up with overlapping maps as we process the sythesized MMAP2 events that get delivered shortly thereafter. Use the FORK event misc flags in an internal way to signal this situation, so we can elide the map clone when appropriate. Signed-off-by: David S. Miller Cc: Don Zickus Cc: Jiri Olsa Cc: Joe Mario Link: http://lkml.kernel.org/r/20181030.222404.2085088822877051075.davem@davemloft.net [ Added comment about flag use in machine__process_fork_event(), use ternary op in thread__clone_map_groups() as suggested by Jiri ] Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 2 ++ tools/include/uapi/linux/perf_event.h | 2 ++ tools/perf/util/event.c | 1 + tools/perf/util/machine.c | 19 ++++++++++++++++++- tools/perf/util/thread.c | 13 +++++-------- tools/perf/util/thread.h | 2 +- 6 files changed, 29 insertions(+), 10 deletions(-) (limited to 'tools/include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index f35eb72739c0..9de8780ac8d9 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -646,10 +646,12 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events * PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event + * PERF_RECORD_MISC_FORK_EXEC - PERF_RECORD_FORK event (perf internal) * PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events */ #define PERF_RECORD_MISC_MMAP_DATA (1 << 13) #define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_FORK_EXEC (1 << 13) #define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) /* * These PERF_RECORD_MISC_* flags below are safely reused diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index f35eb72739c0..9de8780ac8d9 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -646,10 +646,12 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events * PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event + * PERF_RECORD_MISC_FORK_EXEC - PERF_RECORD_FORK event (perf internal) * PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events */ #define PERF_RECORD_MISC_MMAP_DATA (1 << 13) #define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_FORK_EXEC (1 << 13) #define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) /* * These PERF_RECORD_MISC_* flags below are safely reused diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index bc646185f8d9..e9c108a6b1c3 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -308,6 +308,7 @@ static int perf_event__synthesize_fork(struct perf_tool *tool, event->fork.pid = tgid; event->fork.tid = pid; event->fork.header.type = PERF_RECORD_FORK; + event->fork.header.misc = PERF_RECORD_MISC_FORK_EXEC; event->fork.header.size = (sizeof(event->fork) + machine->id_hdr_size); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 8ee8ab39d8ac..8f36ce813bc5 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1708,6 +1708,7 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event struct thread *parent = machine__findnew_thread(machine, event->fork.ppid, event->fork.ptid); + bool do_maps_clone = true; int err = 0; if (dump_trace) @@ -1736,9 +1737,25 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid); + /* + * When synthesizing FORK events, we are trying to create thread + * objects for the already running tasks on the machine. + * + * Normally, for a kernel FORK event, we want to clone the parent's + * maps because that is what the kernel just did. + * + * But when synthesizing, this should not be done. If we do, we end up + * with overlapping maps as we process the sythesized MMAP2 events that + * get delivered shortly thereafter. + * + * Use the FORK event misc flags in an internal way to signal this + * situation, so we can elide the map clone when appropriate. + */ + if (event->fork.header.misc & PERF_RECORD_MISC_FORK_EXEC) + do_maps_clone = false; if (thread == NULL || parent == NULL || - thread__fork(thread, parent, sample->time) < 0) { + thread__fork(thread, parent, sample->time, do_maps_clone) < 0) { dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n"); err = -1; } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 2048d393ece6..3d9ed7d0e281 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -330,7 +330,8 @@ static int thread__prepare_access(struct thread *thread) } static int thread__clone_map_groups(struct thread *thread, - struct thread *parent) + struct thread *parent, + bool do_maps_clone) { /* This is new thread, we share map groups for process. */ if (thread->pid_ == parent->pid_) @@ -341,15 +342,11 @@ static int thread__clone_map_groups(struct thread *thread, thread->pid_, thread->tid, parent->pid_, parent->tid); return 0; } - /* But this one is new process, copy maps. */ - if (map_groups__clone(thread, parent->mg) < 0) - return -ENOMEM; - - return 0; + return do_maps_clone ? map_groups__clone(thread, parent->mg) : 0; } -int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp) +int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bool do_maps_clone) { if (parent->comm_set) { const char *comm = thread__comm_str(parent); @@ -362,7 +359,7 @@ int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp) } thread->ppid = parent->tid; - return thread__clone_map_groups(thread, parent); + return thread__clone_map_groups(thread, parent, do_maps_clone); } void thread__find_cpumode_addr_location(struct thread *thread, u64 addr, diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 36c09a9904e6..30e2b4c165fe 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -89,7 +89,7 @@ struct comm *thread__comm(const struct thread *thread); struct comm *thread__exec_comm(const struct thread *thread); const char *thread__comm_str(const struct thread *thread); int thread__insert_map(struct thread *thread, struct map *map); -int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp); +int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bool do_maps_clone); size_t thread__fprintf(struct thread *thread, FILE *fp); struct thread *thread__main_thread(struct machine *machine, struct thread *thread); -- cgit