diff options
Diffstat (limited to 'include')
48 files changed, 1046 insertions, 158 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 57e9e109257e..8506690dbb9c 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -199,9 +199,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ - if (sk_fullsock(__sk) && \ + if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \ cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f58895830ada..ceaa8c23287f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -228,6 +228,18 @@ struct btf_record { struct btf_field fields[]; }; +/* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */ +struct bpf_rb_node_kern { + struct rb_node rb_node; + void *owner; +} __attribute__((aligned(8))); + +/* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */ +struct bpf_list_node_kern { + struct list_head list_head; + void *owner; +} __attribute__((aligned(8))); + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -275,6 +287,7 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + s64 __percpu *elem_count; }; static inline const char *btf_field_type_name(enum btf_field_type type) @@ -2040,6 +2053,35 @@ bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, } #endif +static inline int +bpf_map_init_elem_count(struct bpf_map *map) +{ + size_t size = sizeof(*map->elem_count), align = size; + gfp_t flags = GFP_USER | __GFP_NOWARN; + + map->elem_count = bpf_map_alloc_percpu(map, size, align, flags); + if (!map->elem_count) + return -ENOMEM; + + return 0; +} + +static inline void +bpf_map_free_elem_count(struct bpf_map *map) +{ + free_percpu(map->elem_count); +} + +static inline void bpf_map_inc_elem_count(struct bpf_map *map) +{ + this_cpu_inc(*map->elem_count); +} + +static inline void bpf_map_dec_elem_count(struct bpf_map *map) +{ + this_cpu_dec(*map->elem_count); +} + extern int sysctl_unprivileged_bpf_disabled; static inline bool bpf_allow_ptr_leaks(void) diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 3929be5743f4..d644bbb298af 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); +void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr); /* kmem_cache_alloc/free equivalent: */ void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); +void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr); void bpf_mem_cache_raw_free(void *ptr); void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags); diff --git a/include/linux/bpf_mprog.h b/include/linux/bpf_mprog.h new file mode 100644 index 000000000000..2b429488f840 --- /dev/null +++ b/include/linux/bpf_mprog.h @@ -0,0 +1,327 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2023 Isovalent */ +#ifndef __BPF_MPROG_H +#define __BPF_MPROG_H + +#include <linux/bpf.h> + +/* bpf_mprog framework: + * + * bpf_mprog is a generic layer for multi-program attachment. In-kernel users + * of the bpf_mprog don't need to care about the dependency resolution + * internals, they can just consume it with few API calls. Currently available + * dependency directives are BPF_F_{BEFORE,AFTER} which enable insertion of + * a BPF program or BPF link relative to an existing BPF program or BPF link + * inside the multi-program array as well as prepend and append behavior if + * no relative object was specified, see corresponding selftests for concrete + * examples (e.g. tc_links and tc_opts test cases of test_progs). + * + * Usage of bpf_mprog_{attach,detach,query}() core APIs with pseudo code: + * + * Attach case: + * + * struct bpf_mprog_entry *entry, *entry_new; + * int ret; + * + * // bpf_mprog user-side lock + * // fetch active @entry from attach location + * [...] + * ret = bpf_mprog_attach(entry, &entry_new, [...]); + * if (!ret) { + * if (entry != entry_new) { + * // swap @entry to @entry_new at attach location + * // ensure there are no inflight users of @entry: + * synchronize_rcu(); + * } + * bpf_mprog_commit(entry); + * } else { + * // error path, bail out, propagate @ret + * } + * // bpf_mprog user-side unlock + * + * Detach case: + * + * struct bpf_mprog_entry *entry, *entry_new; + * int ret; + * + * // bpf_mprog user-side lock + * // fetch active @entry from attach location + * [...] + * ret = bpf_mprog_detach(entry, &entry_new, [...]); + * if (!ret) { + * // all (*) marked is optional and depends on the use-case + * // whether bpf_mprog_bundle should be freed or not + * if (!bpf_mprog_total(entry_new)) (*) + * entry_new = NULL (*) + * // swap @entry to @entry_new at attach location + * // ensure there are no inflight users of @entry: + * synchronize_rcu(); + * bpf_mprog_commit(entry); + * if (!entry_new) (*) + * // free bpf_mprog_bundle (*) + * } else { + * // error path, bail out, propagate @ret + * } + * // bpf_mprog user-side unlock + * + * Query case: + * + * struct bpf_mprog_entry *entry; + * int ret; + * + * // bpf_mprog user-side lock + * // fetch active @entry from attach location + * [...] + * ret = bpf_mprog_query(attr, uattr, entry); + * // bpf_mprog user-side unlock + * + * Data/fast path: + * + * struct bpf_mprog_entry *entry; + * struct bpf_mprog_fp *fp; + * struct bpf_prog *prog; + * int ret = [...]; + * + * rcu_read_lock(); + * // fetch active @entry from attach location + * [...] + * bpf_mprog_foreach_prog(entry, fp, prog) { + * ret = bpf_prog_run(prog, [...]); + * // process @ret from program + * } + * [...] + * rcu_read_unlock(); + * + * bpf_mprog locking considerations: + * + * bpf_mprog_{attach,detach,query}() must be protected by an external lock + * (like RTNL in case of tcx). + * + * bpf_mprog_entry pointer can be an __rcu annotated pointer (in case of tcx + * the netdevice has tcx_ingress and tcx_egress __rcu pointer) which gets + * updated via rcu_assign_pointer() pointing to the active bpf_mprog_entry of + * the bpf_mprog_bundle. + * + * Fast path accesses the active bpf_mprog_entry within RCU critical section + * (in case of tcx it runs in NAPI which provides RCU protection there, + * other users might need explicit rcu_read_lock()). The bpf_mprog_commit() + * assumes that for the old bpf_mprog_entry there are no inflight users + * anymore. + * + * The READ_ONCE()/WRITE_ONCE() pairing for bpf_mprog_fp's prog access is for + * the replacement case where we don't swap the bpf_mprog_entry. + */ + +#define bpf_mprog_foreach_tuple(entry, fp, cp, t) \ + for (fp = &entry->fp_items[0], cp = &entry->parent->cp_items[0];\ + ({ \ + t.prog = READ_ONCE(fp->prog); \ + t.link = cp->link; \ + t.prog; \ + }); \ + fp++, cp++) + +#define bpf_mprog_foreach_prog(entry, fp, p) \ + for (fp = &entry->fp_items[0]; \ + (p = READ_ONCE(fp->prog)); \ + fp++) + +#define BPF_MPROG_MAX 64 + +struct bpf_mprog_fp { + struct bpf_prog *prog; +}; + +struct bpf_mprog_cp { + struct bpf_link *link; +}; + +struct bpf_mprog_entry { + struct bpf_mprog_fp fp_items[BPF_MPROG_MAX]; + struct bpf_mprog_bundle *parent; +}; + +struct bpf_mprog_bundle { + struct bpf_mprog_entry a; + struct bpf_mprog_entry b; + struct bpf_mprog_cp cp_items[BPF_MPROG_MAX]; + struct bpf_prog *ref; + atomic64_t revision; + u32 count; +}; + +struct bpf_tuple { + struct bpf_prog *prog; + struct bpf_link *link; +}; + +static inline struct bpf_mprog_entry * +bpf_mprog_peer(const struct bpf_mprog_entry *entry) +{ + if (entry == &entry->parent->a) + return &entry->parent->b; + else + return &entry->parent->a; +} + +static inline void bpf_mprog_bundle_init(struct bpf_mprog_bundle *bundle) +{ + BUILD_BUG_ON(sizeof(bundle->a.fp_items[0]) > sizeof(u64)); + BUILD_BUG_ON(ARRAY_SIZE(bundle->a.fp_items) != + ARRAY_SIZE(bundle->cp_items)); + + memset(bundle, 0, sizeof(*bundle)); + atomic64_set(&bundle->revision, 1); + bundle->a.parent = bundle; + bundle->b.parent = bundle; +} + +static inline void bpf_mprog_inc(struct bpf_mprog_entry *entry) +{ + entry->parent->count++; +} + +static inline void bpf_mprog_dec(struct bpf_mprog_entry *entry) +{ + entry->parent->count--; +} + +static inline int bpf_mprog_max(void) +{ + return ARRAY_SIZE(((struct bpf_mprog_entry *)NULL)->fp_items) - 1; +} + +static inline int bpf_mprog_total(struct bpf_mprog_entry *entry) +{ + int total = entry->parent->count; + + WARN_ON_ONCE(total > bpf_mprog_max()); + return total; +} + +static inline bool bpf_mprog_exists(struct bpf_mprog_entry *entry, + struct bpf_prog *prog) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *tmp; + + bpf_mprog_foreach_prog(entry, fp, tmp) { + if (tmp == prog) + return true; + } + return false; +} + +static inline void bpf_mprog_mark_for_release(struct bpf_mprog_entry *entry, + struct bpf_tuple *tuple) +{ + WARN_ON_ONCE(entry->parent->ref); + if (!tuple->link) + entry->parent->ref = tuple->prog; +} + +static inline void bpf_mprog_complete_release(struct bpf_mprog_entry *entry) +{ + /* In the non-link case prog deletions can only drop the reference + * to the prog after the bpf_mprog_entry got swapped and the + * bpf_mprog ensured that there are no inflight users anymore. + * + * Paired with bpf_mprog_mark_for_release(). + */ + if (entry->parent->ref) { + bpf_prog_put(entry->parent->ref); + entry->parent->ref = NULL; + } +} + +static inline void bpf_mprog_revision_new(struct bpf_mprog_entry *entry) +{ + atomic64_inc(&entry->parent->revision); +} + +static inline void bpf_mprog_commit(struct bpf_mprog_entry *entry) +{ + bpf_mprog_complete_release(entry); + bpf_mprog_revision_new(entry); +} + +static inline u64 bpf_mprog_revision(struct bpf_mprog_entry *entry) +{ + return atomic64_read(&entry->parent->revision); +} + +static inline void bpf_mprog_entry_copy(struct bpf_mprog_entry *dst, + struct bpf_mprog_entry *src) +{ + memcpy(dst->fp_items, src->fp_items, sizeof(src->fp_items)); +} + +static inline void bpf_mprog_entry_grow(struct bpf_mprog_entry *entry, int idx) +{ + int total = bpf_mprog_total(entry); + + memmove(entry->fp_items + idx + 1, + entry->fp_items + idx, + (total - idx) * sizeof(struct bpf_mprog_fp)); + + memmove(entry->parent->cp_items + idx + 1, + entry->parent->cp_items + idx, + (total - idx) * sizeof(struct bpf_mprog_cp)); +} + +static inline void bpf_mprog_entry_shrink(struct bpf_mprog_entry *entry, int idx) +{ + /* Total array size is needed in this case to enure the NULL + * entry is copied at the end. + */ + int total = ARRAY_SIZE(entry->fp_items); + + memmove(entry->fp_items + idx, + entry->fp_items + idx + 1, + (total - idx - 1) * sizeof(struct bpf_mprog_fp)); + + memmove(entry->parent->cp_items + idx, + entry->parent->cp_items + idx + 1, + (total - idx - 1) * sizeof(struct bpf_mprog_cp)); +} + +static inline void bpf_mprog_read(struct bpf_mprog_entry *entry, u32 idx, + struct bpf_mprog_fp **fp, + struct bpf_mprog_cp **cp) +{ + *fp = &entry->fp_items[idx]; + *cp = &entry->parent->cp_items[idx]; +} + +static inline void bpf_mprog_write(struct bpf_mprog_fp *fp, + struct bpf_mprog_cp *cp, + struct bpf_tuple *tuple) +{ + WRITE_ONCE(fp->prog, tuple->prog); + cp->link = tuple->link; +} + +int bpf_mprog_attach(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_prog *prog_new, struct bpf_link *link, + struct bpf_prog *prog_old, + u32 flags, u32 id_or_fd, u64 revision); + +int bpf_mprog_detach(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_prog *prog, struct bpf_link *link, + u32 flags, u32 id_or_fd, u64 revision); + +int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, + struct bpf_mprog_entry *entry); + +static inline bool bpf_mprog_supported(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SCHED_CLS: + return true; + default: + return false; + } +} +#endif /* __BPF_MPROG_H */ diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 5d732f48f787..c55810a43541 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -44,6 +44,7 @@ #define PHY_ID_BCM7366 0x600d8490 #define PHY_ID_BCM7346 0x600d8650 #define PHY_ID_BCM7362 0x600d84b0 +#define PHY_ID_BCM74165 0x359052c0 #define PHY_ID_BCM7425 0x600d86b0 #define PHY_ID_BCM7429 0x600d8730 #define PHY_ID_BCM7435 0x600d8750 diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 00950cc03bff..a3462a9b8e18 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -267,5 +267,6 @@ MAX_BTF_TRACING_TYPE, extern u32 btf_tracing_ids[]; extern u32 bpf_cgroup_btf_id[]; extern u32 bpf_local_storage_map_btf_id[]; +extern u32 btf_bpf_map_id[]; #endif diff --git a/include/linux/connector.h b/include/linux/connector.h index 487350bb19c3..cec2d99ae902 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -90,13 +90,19 @@ void cn_del_callback(const struct cb_id *id); * If @group is not zero, then message will be delivered * to the specified group. * @gfp_mask: GFP mask. + * @filter: Filter function to be used at netlink layer. + * @filter_data:Filter data to be supplied to the filter function * * It can be safely called from softirq context, but may silently * fail under strong memory pressure. * * If there are no listeners for given group %-ESRCH can be returned. */ -int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 group, gfp_t gfp_mask); +int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, + u32 group, gfp_t gfp_mask, + int (*filter)(struct sock *dsk, struct sk_buff *skb, + void *data), + void *filter_data); /** * cn_netlink_send - Sends message to the specified groups. diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index db0f4fcfdaf4..e3b3b0fa2a8f 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -85,12 +85,10 @@ extern void icmpv6_param_prob_reason(struct sk_buff *skb, struct flowi6; struct in6_addr; -extern void icmpv6_flow_init(struct sock *sk, - struct flowi6 *fl6, - u8 type, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - int oif); + +void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int oif); static inline void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 839247a4f48e..0295b47c10a3 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -33,6 +33,7 @@ struct ipv6_devconf { __s32 accept_ra_defrtr; __u32 ra_defrtr_metric; __s32 accept_ra_min_hop_limit; + __s32 accept_ra_min_rtr_lft; __s32 accept_ra_pinfo; __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF @@ -199,14 +200,7 @@ struct inet6_cork { u8 tclass; }; -/** - * struct ipv6_pinfo - ipv6 private area - * - * In the struct sock hierarchy (tcp6_sock, upd6_sock, etc) - * this _must_ be the last member, so that inet6_sk_generic - * is able to calculate its offset from the base struct sock - * by using the struct proto->slab_obj_size member. -acme - */ +/* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { struct in6_addr saddr; struct in6_pktinfo sticky_pktinfo; @@ -306,19 +300,19 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - /* ipv6_pinfo has to be the last member of raw6_sock, see inet6_sk_generic */ + struct ipv6_pinfo inet6; }; struct udp6_sock { struct udp_sock udp; - /* ipv6_pinfo has to be the last member of udp6_sock, see inet6_sk_generic */ + struct ipv6_pinfo inet6; }; struct tcp6_sock { struct tcp_sock tcp; - /* ipv6_pinfo has to be the last member of tcp6_sock, see inet6_sk_generic */ + struct ipv6_pinfo inet6; }; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 7308a1a7599b..4f2621e87634 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -316,7 +316,7 @@ LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority) LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk) LSM_HOOK(void, LSM_RET_VOID, sk_clone_security, const struct sock *sk, struct sock *newsk) -LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, struct sock *sk, u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, const struct sock *sk, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, sock_graft, struct sock *sk, struct socket *parent) LSM_HOOK(int, 0, inet_conn_request, const struct sock *sk, struct sk_buff *skb, struct request_sock *req) diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index 0f06c2287b52..9b54c4f0677f 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -25,6 +25,7 @@ #define MARVELL_PHY_ID_88X3310 0x002b09a0 #define MARVELL_PHY_ID_88E2110 0x002b09b0 #define MARVELL_PHY_ID_88X2222 0x01410f10 +#define MARVELL_PHY_ID_88Q2110 0x002b0980 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */ #define MARVELL_PHY_ID_88E1111_FINISAR 0x01ff0cc0 diff --git a/include/linux/mdio.h b/include/linux/mdio.h index c1b7008826e5..8fa23bdcedbf 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -537,6 +537,8 @@ static inline void mii_c73_mod_linkmode(unsigned long *adv, u16 *lpa) int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); +int __mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, + u16 set); int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set); @@ -564,6 +566,30 @@ int mdiobus_c45_modify(struct mii_bus *bus, int addr, int devad, u32 regnum, int mdiobus_c45_modify_changed(struct mii_bus *bus, int addr, int devad, u32 regnum, u16 mask, u16 set); +static inline int __mdiodev_read(struct mdio_device *mdiodev, u32 regnum) +{ + return __mdiobus_read(mdiodev->bus, mdiodev->addr, regnum); +} + +static inline int __mdiodev_write(struct mdio_device *mdiodev, u32 regnum, + u16 val) +{ + return __mdiobus_write(mdiodev->bus, mdiodev->addr, regnum, val); +} + +static inline int __mdiodev_modify(struct mdio_device *mdiodev, u32 regnum, + u16 mask, u16 set) +{ + return __mdiobus_modify(mdiodev->bus, mdiodev->addr, regnum, mask, set); +} + +static inline int __mdiodev_modify_changed(struct mdio_device *mdiodev, + u32 regnum, u16 mask, u16 set) +{ + return __mdiobus_modify_changed(mdiodev->bus, mdiodev->addr, regnum, + mask, set); +} + static inline int mdiodev_read(struct mdio_device *mdiodev, u32 regnum) { return mdiobus_read(mdiodev->bus, mdiodev->addr, regnum); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 33344a71c3e3..b3ad6b9852ec 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -464,10 +464,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reformat_add_esp_trasport[0x1]; u8 reformat_l2_to_l3_esp_tunnel[0x1]; - u8 reserved_at_42[0x1]; + u8 reformat_add_esp_transport_over_udp[0x1]; u8 reformat_del_esp_trasport[0x1]; u8 reformat_l3_esp_tunnel_to_l2[0x1]; - u8 reserved_at_45[0x1]; + u8 reformat_del_esp_transport_over_udp[0x1]; u8 execute_aso[0x1]; u8 reserved_at_47[0x19]; @@ -6665,9 +6665,12 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4 = 0x5, MLX5_REFORMAT_TYPE_L2_TO_L3_ESP_TUNNEL = 0x6, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_UDPV4 = 0x7, MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT = 0x8, MLX5_REFORMAT_TYPE_L3_ESP_TUNNEL_TO_L2 = 0x9, + MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT_OVER_UDP = 0xa, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_UDPV6 = 0xc, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b828c7a75be2..11652e464f5d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1930,8 +1930,7 @@ enum netdev_ml_priv_type { * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one - * @miniq_ingress: ingress/clsact qdisc specific data for - * ingress processing + * @tcx_ingress: BPF & clsact qdisc specific data for ingress processing * @ingress_queue: XXX: need comments on this one * @nf_hooks_ingress: netfilter hooks executed for ingress packets * @broadcast: hw bcast address @@ -1952,8 +1951,7 @@ enum netdev_ml_priv_type { * @xps_maps: all CPUs/RXQs maps for XPS device * * @xps_maps: XXX: need comments on this one - * @miniq_egress: clsact qdisc specific data for - * egress processing + * @tcx_egress: BPF & clsact qdisc specific data for egress processing * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by @@ -2045,6 +2043,8 @@ enum netdev_ml_priv_type { * receive offload (GRO) * @gro_ipv4_max_size: Maximum size of aggregated packet in generic * receive offload (GRO), for IPv4. + * @xdp_zc_max_segs: Maximum number of segments supported by AF_XDP + * zero copy driver * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. @@ -2250,11 +2250,11 @@ struct net_device { #define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; unsigned int gro_ipv4_max_size; + unsigned int xdp_zc_max_segs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; - -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc __rcu *miniq_ingress; +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_ingress; #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS @@ -2282,8 +2282,8 @@ struct net_device { #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; #endif -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc __rcu *miniq_egress; +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_egress; #endif #ifdef CONFIG_NETFILTER_EGRESS struct nf_hook_entries __rcu *nf_hooks_egress; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 9eec3f4f5351..75d7de34c908 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -50,6 +50,7 @@ struct netlink_kernel_cfg { struct mutex *cb_mutex; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release) (struct sock *sk, unsigned long *groups); }; struct sock *__netlink_kernel_create(struct net *net, int unit, @@ -227,6 +228,11 @@ bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); +int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, + __u32 portid, __u32 group, gfp_t allocation, + int (*filter)(struct sock *dsk, + struct sk_buff *skb, void *data), + void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); diff --git a/include/linux/phy.h b/include/linux/phy.h index 11c1e91563d4..b254848a9c99 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1826,6 +1826,7 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); +int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev); int genphy_c45_read_eee_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 1817940a3418..789c516c6b4a 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -9,6 +9,7 @@ struct device_node; struct ethtool_cmd; struct fwnode_handle; struct net_device; +struct phylink; enum { MLO_PAUSE_NONE, @@ -200,8 +201,6 @@ enum phylink_op_type { * struct phylink_config - PHYLINK configuration structure * @dev: a pointer to a struct device associated with the MAC * @type: operation type of PHYLINK instance - * @legacy_pre_march2020: driver has not been updated for March 2020 updates - * (See commit 7cceb599d15d ("net: phylink: avoid mac_config calls") * @poll_fixed_state: if true, starts link_poll, * if MAC link is at %MLO_AN_FIXED mode. * @mac_managed_pm: if true, indicate the MAC driver is responsible for PHY PM. @@ -215,7 +214,6 @@ enum phylink_op_type { struct phylink_config { struct device *dev; enum phylink_op_type type; - bool legacy_pre_march2020; bool poll_fixed_state; bool mac_managed_pm; bool ovr_an_inband; @@ -229,11 +227,9 @@ struct phylink_config { * struct phylink_mac_ops - MAC operations structure. * @validate: Validate and update the link configuration. * @mac_select_pcs: Select a PCS for the interface mode. - * @mac_pcs_get_state: Read the current link state from the hardware. * @mac_prepare: prepare for a major reconfiguration of the interface. * @mac_config: configure the MAC for the selected mode and state. * @mac_finish: finish a major reconfiguration of the interface. - * @mac_an_restart: restart 802.3z BaseX autonegotiation. * @mac_link_down: take the link down. * @mac_link_up: allow the link to come up. * @@ -245,15 +241,12 @@ struct phylink_mac_ops { struct phylink_link_state *state); struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config, phy_interface_t interface); - void (*mac_pcs_get_state)(struct phylink_config *config, - struct phylink_link_state *state); int (*mac_prepare)(struct phylink_config *config, unsigned int mode, phy_interface_t iface); void (*mac_config)(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state); int (*mac_finish)(struct phylink_config *config, unsigned int mode, phy_interface_t iface); - void (*mac_an_restart)(struct phylink_config *config); void (*mac_link_down)(struct phylink_config *config, unsigned int mode, phy_interface_t interface); void (*mac_link_up)(struct phylink_config *config, @@ -314,25 +307,6 @@ struct phylink_pcs *mac_select_pcs(struct phylink_config *config, phy_interface_t interface); /** - * mac_pcs_get_state() - Read the current inband link state from the hardware - * @config: a pointer to a &struct phylink_config. - * @state: a pointer to a &struct phylink_link_state. - * - * Read the current inband link state from the MAC PCS, reporting the - * current speed in @state->speed, duplex mode in @state->duplex, pause - * mode in @state->pause using the %MLO_PAUSE_RX and %MLO_PAUSE_TX bits, - * negotiation completion state in @state->an_complete, and link up state - * in @state->link. If possible, @state->lp_advertising should also be - * populated. - * - * Note: This is a legacy method. This function will not be called unless - * legacy_pre_march2020 is set in &struct phylink_config and there is no - * PCS attached. - */ -void mac_pcs_get_state(struct phylink_config *config, - struct phylink_link_state *state); - -/** * mac_prepare() - prepare to change the PHY interface mode * @config: a pointer to a &struct phylink_config. * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND. @@ -368,17 +342,9 @@ int mac_prepare(struct phylink_config *config, unsigned int mode, * guaranteed to be correct, and so any mac_config() implementation must * never reference these fields. * - * Note: For legacy March 2020 drivers (drivers with legacy_pre_march2020 set - * in their &phylnk_config and which don't have a PCS), this function will be - * called on each link up event, and to also change the in-band advert. For - * non-legacy drivers, it will only be called to reconfigure the MAC for a - * "major" change in e.g. interface mode. It will not be called for changes - * in speed, duplex or pause modes or to change the in-band advertisement. - * In any case, it is strongly preferred that speed, duplex and pause settings - * are handled in the mac_link_up() method and not in this method. - * - * (this requires a rewrite - please refer to mac_link_up() for situations - * where the PCS and MAC are not tightly integrated.) + * This will only be called to reconfigure the MAC for a "major" change in + * e.g. interface mode. It will not be called for changes in speed, duplex + * or pause modes or to change the in-band advertisement. * * In all negotiation modes, as defined by @mode, @state->pause indicates the * pause settings which should be applied as follows. If %MLO_PAUSE_AN is not @@ -410,7 +376,7 @@ int mac_prepare(struct phylink_config *config, unsigned int mode, * 1000base-X or Cisco SGMII mode depending on the @state->interface * mode). In both cases, link state management (whether the link * is up or not) is performed by the MAC, and reported via the - * mac_pcs_get_state() callback. Changes in link state must be made + * pcs_get_state() callback. Changes in link state must be made * by calling phylink_mac_change(). * * Interface mode specific details are mentioned below. @@ -459,16 +425,6 @@ int mac_finish(struct phylink_config *config, unsigned int mode, phy_interface_t iface); /** - * mac_an_restart() - restart 802.3z BaseX autonegotiation - * @config: a pointer to a &struct phylink_config. - * - * Note: This is a legacy method. This function will not be called unless - * legacy_pre_march2020 is set in &struct phylink_config and there is no - * PCS attached. - */ -void mac_an_restart(struct phylink_config *config); - -/** * mac_link_down() - take the link down * @config: a pointer to a &struct phylink_config. * @mode: link autonegotiation mode @@ -520,14 +476,19 @@ struct phylink_pcs_ops; /** * struct phylink_pcs - PHYLINK PCS instance * @ops: a pointer to the &struct phylink_pcs_ops structure + * @phylink: pointer to &struct phylink_config * @neg_mode: provide PCS neg mode via "mode" argument * @poll: poll the PCS for link changes * * This structure is designed to be embedded within the PCS private data, * and will be passed between phylink and the PCS. + * + * The @phylink member is private to phylink and must not be touched by + * the PCS driver. */ struct phylink_pcs { const struct phylink_pcs_ops *ops; + struct phylink *phylink; bool neg_mode; bool poll; }; @@ -535,6 +496,10 @@ struct phylink_pcs { /** * struct phylink_pcs_ops - MAC PCS operations structure. * @pcs_validate: validate the link configuration. + * @pcs_enable: enable the PCS. + * @pcs_disable: disable the PCS. + * @pcs_pre_config: pre-mac_config method (for errata) + * @pcs_post_config: post-mac_config method (for arrata) * @pcs_get_state: read the current MAC PCS link state from the hardware. * @pcs_config: configure the MAC PCS for the selected mode and state. * @pcs_an_restart: restart 802.3z BaseX autonegotiation. @@ -544,6 +509,12 @@ struct phylink_pcs { struct phylink_pcs_ops { int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported, const struct phylink_link_state *state); + int (*pcs_enable)(struct phylink_pcs *pcs); + void (*pcs_disable)(struct phylink_pcs *pcs); + void (*pcs_pre_config)(struct phylink_pcs *pcs, + phy_interface_t interface); + int (*pcs_post_config)(struct phylink_pcs *pcs, + phy_interface_t interface); void (*pcs_get_state)(struct phylink_pcs *pcs, struct phylink_link_state *state); int (*pcs_config)(struct phylink_pcs *pcs, unsigned int neg_mode, @@ -574,6 +545,18 @@ int pcs_validate(struct phylink_pcs *pcs, unsigned long *supported, const struct phylink_link_state *state); /** + * pcs_enable() - enable the PCS. + * @pcs: a pointer to a &struct phylink_pcs. + */ +int pcs_enable(struct phylink_pcs *pcs); + +/** + * pcs_disable() - disable the PCS. + * @pcs: a pointer to a &struct phylink_pcs. + */ +void pcs_disable(struct phylink_pcs *pcs); + +/** * pcs_get_state() - Read the current inband link state from the hardware * @pcs: a pointer to a &struct phylink_pcs. * @state: a pointer to a &struct phylink_link_state. @@ -585,8 +568,8 @@ int pcs_validate(struct phylink_pcs *pcs, unsigned long *supported, * in @state->link. If possible, @state->lp_advertising should also be * populated. * - * When present, this overrides mac_pcs_get_state() in &struct - * phylink_mac_ops. + * When present, this overrides pcs_get_state() in &struct + * phylink_pcs_ops. */ void pcs_get_state(struct phylink_pcs *pcs, struct phylink_link_state *state); @@ -677,6 +660,7 @@ int phylink_fwnode_phy_connect(struct phylink *pl, void phylink_disconnect_phy(struct phylink *); void phylink_mac_change(struct phylink *, bool up); +void phylink_pcs_change(struct phylink_pcs *, bool up); void phylink_start(struct phylink *); void phylink_stop(struct phylink *); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 7f17acf29dda..7b949292908a 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -138,6 +138,8 @@ static inline int rcu_needs_cpu(void) return 0; } +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } + /* * Take advantage of the fact that there is only one CPU, which * allows us to ignore virtualization-based context switches. diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 56bccb5a8fde..126f6b418f6a 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -21,6 +21,7 @@ void rcu_softirq_qs(void); void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(void); void rcu_cpu_stall_reset(void); +void rcu_request_urgent_qs_task(struct task_struct *t); /* * Note a virtualization-based context switch. This is simply a diff --git a/include/linux/security.h b/include/linux/security.h index 32828502f09e..994cf099d9ac 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1439,7 +1439,8 @@ int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u int security_sk_alloc(struct sock *sk, int family, gfp_t priority); void security_sk_free(struct sock *sk); void security_sk_clone(const struct sock *sk, struct sock *newsk); -void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic); +void security_sk_classify_flow(const struct sock *sk, + struct flowi_common *flic); void security_req_classify_flow(const struct request_sock *req, struct flowi_common *flic); void security_sock_graft(struct sock*sk, struct socket *parent); @@ -1597,7 +1598,7 @@ static inline void security_sk_clone(const struct sock *sk, struct sock *newsk) { } -static inline void security_sk_classify_flow(struct sock *sk, +static inline void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic) { } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 91ed66952580..16a49ba534e4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -441,8 +441,6 @@ static inline bool skb_frag_must_loop(struct page *p) copied += p_len, p++, p_off = 0, \ p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \ -#define HAVE_HW_TIME_STAMP - /** * struct skb_shared_hwtstamps - hardware time stamps * @hwtstamp: hardware time stamp transformed into duration @@ -944,7 +942,7 @@ struct sk_buff { __u8 __mono_tc_offset[0]; /* public: */ __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; #endif @@ -993,7 +991,7 @@ struct sk_buff { __u8 csum_not_inet:1; #endif -#ifdef CONFIG_NET_SCHED +#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif @@ -4023,7 +4021,7 @@ __skb_header_pointer(const struct sk_buff *skb, int offset, int len, if (likely(hlen - offset >= len)) return (void *)data + offset; - if (!skb || !buffer || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0)) + if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0)) return NULL; return buffer; @@ -4036,6 +4034,14 @@ skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) skb_headlen(skb), buffer); } +static inline void * __must_check +skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len) +{ + if (likely(skb_headlen(skb) - offset >= len)) + return skb->data + offset; + return NULL; +} + /** * skb_needs_linearize - check if we need to linearize a given skb * depending on the given device features. diff --git a/include/linux/smscphy.h b/include/linux/smscphy.h index e1c88627755a..1a6a851d2cf8 100644 --- a/include/linux/smscphy.h +++ b/include/linux/smscphy.h @@ -38,4 +38,38 @@ int smsc_phy_set_tunable(struct phy_device *phydev, struct ethtool_tunable *tuna, const void *data); int smsc_phy_probe(struct phy_device *phydev); +#define MII_LAN874X_PHY_MMD_WOL_WUCSR 0x8010 +#define MII_LAN874X_PHY_MMD_WOL_WUF_CFGA 0x8011 +#define MII_LAN874X_PHY_MMD_WOL_WUF_CFGB 0x8012 +#define MII_LAN874X_PHY_MMD_WOL_WUF_MASK0 0x8021 +#define MII_LAN874X_PHY_MMD_WOL_WUF_MASK1 0x8022 +#define MII_LAN874X_PHY_MMD_WOL_WUF_MASK2 0x8023 +#define MII_LAN874X_PHY_MMD_WOL_WUF_MASK3 0x8024 +#define MII_LAN874X_PHY_MMD_WOL_WUF_MASK4 0x8025 +#define MII_LAN874X_PHY_MMD_WOL_WUF_MASK5 0x8026 +#define MII_LAN874X_PHY_MMD_WOL_WUF_MASK6 0x8027 +#define MII_LAN874X_PHY_MMD_WOL_WUF_MASK7 0x8028 +#define MII_LAN874X_PHY_MMD_WOL_RX_ADDRA 0x8061 +#define MII_LAN874X_PHY_MMD_WOL_RX_ADDRB 0x8062 +#define MII_LAN874X_PHY_MMD_WOL_RX_ADDRC 0x8063 +#define MII_LAN874X_PHY_MMD_MCFGR 0x8064 + +#define MII_LAN874X_PHY_PME1_SET (2 << 13) +#define MII_LAN874X_PHY_PME2_SET (2 << 11) +#define MII_LAN874X_PHY_PME_SELF_CLEAR BIT(9) +#define MII_LAN874X_PHY_WOL_PFDA_FR BIT(7) +#define MII_LAN874X_PHY_WOL_WUFR BIT(6) +#define MII_LAN874X_PHY_WOL_MPR BIT(5) +#define MII_LAN874X_PHY_WOL_BCAST_FR BIT(4) +#define MII_LAN874X_PHY_WOL_PFDAEN BIT(3) +#define MII_LAN874X_PHY_WOL_WUEN BIT(2) +#define MII_LAN874X_PHY_WOL_MPEN BIT(1) +#define MII_LAN874X_PHY_WOL_BCSTEN BIT(0) + +#define MII_LAN874X_PHY_WOL_FILTER_EN BIT(15) +#define MII_LAN874X_PHY_WOL_FILTER_MCASTTEN BIT(9) +#define MII_LAN874X_PHY_WOL_FILTER_BCSTEN BIT(8) + +#define MII_LAN874X_PHY_PME_SELF_CLEAR_DELAY 0x1000 /* 81 milliseconds */ + #endif /* __LINUX_SMSCPHY_H__ */ diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 06090538fe2d..ef67dba775d0 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -204,6 +204,19 @@ struct dwmac4_addrs { u32 mtl_low_cred_offset; }; +#define STMMAC_FLAG_HAS_INTEGRATED_PCS BIT(0) +#define STMMAC_FLAG_SPH_DISABLE BIT(1) +#define STMMAC_FLAG_USE_PHY_WOL BIT(2) +#define STMMAC_FLAG_HAS_SUN8I BIT(3) +#define STMMAC_FLAG_TSO_EN BIT(4) +#define STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP BIT(5) +#define STMMAC_FLAG_VLAN_FAIL_Q_EN BIT(6) +#define STMMAC_FLAG_MULTI_MSI_EN BIT(7) +#define STMMAC_FLAG_EXT_SNAPSHOT_EN BIT(8) +#define STMMAC_FLAG_INT_SNAPSHOT_EN BIT(9) +#define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI BIT(10) +#define STMMAC_FLAG_EN_TX_LPI_CLOCKGATING BIT(11) + struct plat_stmmacenet_data { int bus_id; int phy_addr; @@ -266,22 +279,14 @@ struct plat_stmmacenet_data { struct reset_control *stmmac_ahb_rst; struct stmmac_axi *axi; int has_gmac4; - bool has_sun8i; - bool tso_en; int rss_en; int mac_port_sel_speed; - bool en_tx_lpi_clockgating; - bool rx_clk_runs_in_lpi; int has_xgmac; - bool vlan_fail_q_en; u8 vlan_fail_q; unsigned int eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int ext_snapshot_num; - bool int_snapshot_en; - bool ext_snapshot_en; - bool multi_msi_en; int msi_mac_vec; int msi_wol_vec; int msi_lpi_vec; @@ -289,10 +294,7 @@ struct plat_stmmacenet_data { int msi_sfty_ue_vec; int msi_rx_base_vec; int msi_tx_base_vec; - bool use_phy_wol; - bool sph_disable; - bool serdes_up_after_phy_linkup; const struct dwmac4_addrs *dwmac4_addrs; - bool has_integrated_pcs; + unsigned int flags; }; #endif diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 91a37c99ba66..d16abdb3541a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -172,6 +172,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) return (struct tcp_request_sock *)req; } +#define TCP_RMEM_TO_WIN_SCALE 8 + struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -238,7 +240,7 @@ struct tcp_sock { u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ - + u8 scaling_ratio; /* see tcp_win_from_space() */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3930e676436c..e66d04dbe56a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -867,7 +867,8 @@ extern int perf_uprobe_init(struct perf_event *event, extern void perf_uprobe_destroy(struct perf_event *event); extern int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, const char **filename, - u64 *probe_offset, bool perf_type_tracepoint); + u64 *probe_offset, u64 *probe_addr, + bool perf_type_tracepoint); #endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index a2b953b57689..f291a3b0f9e5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -30,6 +30,7 @@ FN(TCP_OVERWINDOW) \ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ + FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ FN(TCP_RESET) \ FN(TCP_INVALID_SYN) \ @@ -188,6 +189,8 @@ enum skb_drop_reason { * LINUX_MIB_PAWSESTABREJECTED */ SKB_DROP_REASON_TCP_RFC7323_PAWS, + /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ + SKB_DROP_REASON_TCP_OLD_SEQUENCE, /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */ diff --git a/include/net/dsa.h b/include/net/dsa.h index d309ee7ed04b..0b9c6aa27047 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -873,8 +873,6 @@ struct dsa_switch_ops { struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, int port, phy_interface_t iface); - int (*phylink_mac_link_state)(struct dsa_switch *ds, int port, - struct phylink_link_state *state); int (*phylink_mac_prepare)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); @@ -884,7 +882,6 @@ struct dsa_switch_ops { int (*phylink_mac_finish)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); - void (*phylink_mac_an_restart)(struct dsa_switch *ds, int port); void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index ed4b6ad3fcac..e8750b4ef7e1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -52,6 +52,7 @@ struct ip_tunnel_key { u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be32 label; /* Flow Label for IPv6 */ + u32 nhid; __be16 tp_src; __be16 tp_dst; __u8 flow_flags; diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index cf0d81be5a96..165e7a03b8e9 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -100,7 +100,7 @@ nf_ct_expect_find_get(struct net *net, struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple); + const struct nf_conntrack_tuple *tuple, bool unlink); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f00374718159..7a41c4791536 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -152,7 +152,7 @@ struct netns_ipv4 { u8 sysctl_tcp_abort_on_overflow; u8 sysctl_tcp_fack; /* obsolete */ int sysctl_tcp_max_reordering; - int sysctl_tcp_adv_win_scale; + int sysctl_tcp_adv_win_scale; /* obsolete */ u8 sysctl_tcp_dsack; u8 sysctl_tcp_app_win; u8 sysctl_tcp_frto; diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 126f9e294389..f1d5cc1fa13b 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -18,9 +18,8 @@ * * API keeps track of in-flight pages, in-order to let API user know * when it is safe to dealloactor page_pool object. Thus, API users - * must make sure to call page_pool_release_page() when a page is - * "leaving" the page_pool. Or call page_pool_put_page() where - * appropiate. For maintaining correct accounting. + * must call page_pool_put_page() where appropriate and only attach + * the page to a page_pool-aware objects, like skbs marked for recycling. * * API user must only call page_pool_put_page() once on a page, as it * will either recycle the page, or in case of elevated refcnt, it @@ -251,7 +250,6 @@ void page_pool_unlink_napi(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), struct xdp_mem_info *mem); -void page_pool_release_page(struct page_pool *pool, struct page *page); void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count); #else @@ -268,10 +266,6 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, struct xdp_mem_info *mem) { } -static inline void page_pool_release_page(struct page_pool *pool, - struct page *page) -{ -} static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a2ea45c7b53e..139cd09828af 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -866,6 +866,7 @@ struct tc_htb_qopt_offload { u32 parent_classid; u16 classid; u16 qid; + u32 quantum; u64 rate; u64 ceil; u8 prio; diff --git a/include/net/route.h b/include/net/route.h index 5a5c726472bd..d8d150155195 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -163,7 +163,7 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, } static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4, - struct sock *sk, + const struct sock *sk, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, __u8 proto, __u8 tos, int oif) @@ -309,7 +309,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, - struct sock *sk) + const struct sock *sk) { struct net *net = sock_net(sk); struct rtable *rt; @@ -330,7 +330,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt, __be16 orig_sport, __be16 orig_dport, __be16 sport, __be16 dport, - struct sock *sk) + const struct sock *sk) { if (sport != orig_sport || dport != orig_dport) { fl4->fl4_dport = dport; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e92f73bb3198..15be2d96b06d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -703,7 +703,7 @@ int skb_do_redirect(struct sk_buff *); static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS return skb->tc_at_ingress; #else return false; diff --git a/include/net/sock.h b/include/net/sock.h index 2eb916d1ff64..7ae44bf866af 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1339,6 +1339,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; + unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ diff --git a/include/net/switchdev.h b/include/net/switchdev.h index ca0312b78294..4d324e2a2eef 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -231,6 +231,7 @@ enum switchdev_notifier_type { SWITCHDEV_BRPORT_OFFLOADED, SWITCHDEV_BRPORT_UNOFFLOADED, + SWITCHDEV_BRPORT_REPLAY, }; struct switchdev_notifier_info { @@ -299,6 +300,11 @@ void switchdev_bridge_port_unoffload(struct net_device *brport_dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb); +int switchdev_bridge_port_replay(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack); void switchdev_deferred_process(void); int switchdev_port_attr_set(struct net_device *dev, diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ca972ebd3dd..6ebf54992ffe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -350,7 +350,6 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule); -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { @@ -606,7 +605,6 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); -void tcp_send_partial(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); @@ -1432,13 +1430,39 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); +static inline int __tcp_win_from_space(u8 scaling_ratio, int space) +{ + s64 scaled_space = (s64)space * scaling_ratio; + + return scaled_space >> TCP_RMEM_TO_WIN_SCALE; +} + static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale); + return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space); +} + +/* inverse of __tcp_win_from_space() */ +static inline int __tcp_space_from_win(u8 scaling_ratio, int win) +{ + u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; - return tcp_adv_win_scale <= 0 ? - (space>>(-tcp_adv_win_scale)) : - space - (space>>tcp_adv_win_scale); + do_div(val, scaling_ratio); + return val; +} + +static inline int tcp_space_from_win(const struct sock *sk, int win) +{ + return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); +} + +static inline void tcp_scaling_ratio_init(struct sock *sk) +{ + /* Assume a conservative default of 1200 bytes of payload per 4K page. + * This may be adjusted later in tcp_measure_rcv_mss(). + */ + tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) / + SKB_TRUESIZE(4096); } /* Note: caller must be prepared to deal with negative returns */ diff --git a/include/net/tcx.h b/include/net/tcx.h new file mode 100644 index 000000000000..264f147953ba --- /dev/null +++ b/include/net/tcx.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2023 Isovalent */ +#ifndef __NET_TCX_H +#define __NET_TCX_H + +#include <linux/bpf.h> +#include <linux/bpf_mprog.h> + +#include <net/sch_generic.h> + +struct mini_Qdisc; + +struct tcx_entry { + struct mini_Qdisc __rcu *miniq; + struct bpf_mprog_bundle bundle; + bool miniq_active; + struct rcu_head rcu; +}; + +struct tcx_link { + struct bpf_link link; + struct net_device *dev; + u32 location; +}; + +static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) +{ +#ifdef CONFIG_NET_XGRESS + skb->tc_at_ingress = ingress; +#endif +} + +#ifdef CONFIG_NET_XGRESS +static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) +{ + struct bpf_mprog_bundle *bundle = entry->parent; + + return container_of(bundle, struct tcx_entry, bundle); +} + +static inline struct tcx_link *tcx_link(struct bpf_link *link) +{ + return container_of(link, struct tcx_link, link); +} + +static inline const struct tcx_link *tcx_link_const(const struct bpf_link *link) +{ + return tcx_link((struct bpf_link *)link); +} + +void tcx_inc(void); +void tcx_dec(void); + +static inline void tcx_entry_sync(void) +{ + /* bpf_mprog_entry got a/b swapped, therefore ensure that + * there are no inflight users on the old one anymore. + */ + synchronize_rcu(); +} + +static inline void +tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry, + bool ingress) +{ + ASSERT_RTNL(); + if (ingress) + rcu_assign_pointer(dev->tcx_ingress, entry); + else + rcu_assign_pointer(dev->tcx_egress, entry); +} + +static inline struct bpf_mprog_entry * +tcx_entry_fetch(struct net_device *dev, bool ingress) +{ + ASSERT_RTNL(); + if (ingress) + return rcu_dereference_rtnl(dev->tcx_ingress); + else + return rcu_dereference_rtnl(dev->tcx_egress); +} + +static inline struct bpf_mprog_entry *tcx_entry_create(void) +{ + struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL); + + if (tcx) { + bpf_mprog_bundle_init(&tcx->bundle); + return &tcx->bundle.a; + } + return NULL; +} + +static inline void tcx_entry_free(struct bpf_mprog_entry *entry) +{ + kfree_rcu(tcx_entry(entry), rcu); +} + +static inline struct bpf_mprog_entry * +tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created) +{ + struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress); + + *created = false; + if (!entry) { + entry = tcx_entry_create(); + if (!entry) + return NULL; + *created = true; + } + return entry; +} + +static inline void tcx_skeys_inc(bool ingress) +{ + tcx_inc(); + if (ingress) + net_inc_ingress_queue(); + else + net_inc_egress_queue(); +} + +static inline void tcx_skeys_dec(bool ingress) +{ + if (ingress) + net_dec_ingress_queue(); + else + net_dec_egress_queue(); + tcx_dec(); +} + +static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry, + const bool active) +{ + ASSERT_RTNL(); + tcx_entry(entry)->miniq_active = active; +} + +static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) +{ + ASSERT_RTNL(); + return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active; +} + +static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb, + int code) +{ + switch (code) { + case TCX_PASS: + skb->tc_index = qdisc_skb_cb(skb)->tc_classid; + fallthrough; + case TCX_DROP: + case TCX_REDIRECT: + return code; + case TCX_NEXT: + default: + return TCX_NEXT; + } +} +#endif /* CONFIG_NET_XGRESS */ + +#if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL) +int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); +void tcx_uninstall(struct net_device *dev, bool ingress); + +int tcx_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); + +static inline void dev_tcx_uninstall(struct net_device *dev) +{ + ASSERT_RTNL(); + tcx_uninstall(dev, true); + tcx_uninstall(dev, false); +} +#else +static inline int tcx_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_prog_detach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} + +static inline void dev_tcx_uninstall(struct net_device *dev) +{ +} +#endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */ +#endif /* __NET_TCX_H */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e96a1151ec75..1617af380162 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -52,6 +52,7 @@ struct xdp_sock { struct xsk_buff_pool *pool; u16 queue_id; bool zc; + bool sg; enum { XSK_READY = 0, XSK_BOUND, @@ -67,6 +68,12 @@ struct xdp_sock { u64 rx_dropped; u64 rx_queue_full; + /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current + * packet, the partially built skb is saved here so that packet building can resume in next + * call of __xsk_generic_xmit(). + */ + struct sk_buff *skb; + struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index c243f906ebed..1f6fc8c7a84c 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -89,6 +89,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return !xp_mb_desc(desc); +} + /* Returns as many entries as possible up to max. 0 <= N <= max. */ static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { @@ -103,10 +108,45 @@ static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) static inline void xsk_buff_free(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + struct list_head *xskb_list = &xskb->pool->xskb_list; + struct xdp_buff_xsk *pos, *tmp; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + list_del(&pos->xskb_list_node); + xp_free(pos); + } + + xdp_get_shared_info_from_buff(xdp)->nr_frags = 0; +out: xp_free(xskb); } +static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); + + list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); +} + +static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff *ret = NULL; + struct xdp_buff_xsk *frag; + + frag = list_first_entry_or_null(&xskb->pool->xskb_list, + struct xdp_buff_xsk, xskb_list_node); + if (frag) { + list_del(&frag->xskb_list_node); + ret = &frag->xdp; + } + + return ret; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; @@ -241,6 +281,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return false; +} + static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return 0; @@ -255,6 +300,15 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } +static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +{ + return NULL; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index a8d7b8a3688a..b0bdff26fc88 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,6 +29,7 @@ struct xdp_buff_xsk { struct xsk_buff_pool *pool; u64 orig_addr; struct list_head free_list_node; + struct list_head xskb_list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) @@ -54,6 +55,7 @@ struct xsk_buff_pool { struct xdp_umem *umem; struct work_struct work; struct list_head free_list; + struct list_head xskb_list; u32 heads_cnt; u16 queue_id; @@ -184,6 +186,11 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } +static inline bool xp_mb_desc(struct xdp_desc *desc) +{ + return desc->options & XDP_PKT_CONTD; +} + static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) { return addr & pool->chunk_mask; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 60a9d59beeab..739c15906a65 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1036,6 +1036,8 @@ enum bpf_attach_type { BPF_LSM_CGROUP, BPF_STRUCT_OPS, BPF_NETFILTER, + BPF_TCX_INGRESS, + BPF_TCX_EGRESS, __MAX_BPF_ATTACH_TYPE }; @@ -1053,10 +1055,20 @@ enum bpf_link_type { BPF_LINK_TYPE_KPROBE_MULTI = 8, BPF_LINK_TYPE_STRUCT_OPS = 9, BPF_LINK_TYPE_NETFILTER = 10, - + BPF_LINK_TYPE_TCX = 11, MAX_BPF_LINK_TYPE, }; +enum bpf_perf_event_type { + BPF_PERF_EVENT_UNSPEC = 0, + BPF_PERF_EVENT_UPROBE = 1, + BPF_PERF_EVENT_URETPROBE = 2, + BPF_PERF_EVENT_KPROBE = 3, + BPF_PERF_EVENT_KRETPROBE = 4, + BPF_PERF_EVENT_TRACEPOINT = 5, + BPF_PERF_EVENT_EVENT = 6, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -1103,7 +1115,12 @@ enum bpf_link_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) #define BPF_F_ALLOW_MULTI (1U << 1) +/* Generic attachment flags. */ #define BPF_F_REPLACE (1U << 2) +#define BPF_F_BEFORE (1U << 3) +#define BPF_F_AFTER (1U << 4) +#define BPF_F_ID (1U << 5) +#define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel @@ -1434,14 +1451,19 @@ union bpf_attr { }; struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ - __u32 target_fd; /* container object to attach to */ - __u32 attach_bpf_fd; /* eBPF program to attach */ + union { + __u32 target_fd; /* target object to attach to or ... */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_bpf_fd; __u32 attach_type; __u32 attach_flags; - __u32 replace_bpf_fd; /* previously attached eBPF - * program to replace if - * BPF_F_REPLACE is used - */ + __u32 replace_bpf_fd; + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -1487,16 +1509,26 @@ union bpf_attr { } info; struct { /* anonymous struct used by BPF_PROG_QUERY command */ - __u32 target_fd; /* container object to query */ + union { + __u32 target_fd; /* target object to query or ... */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; __u32 query_flags; __u32 attach_flags; __aligned_u64 prog_ids; - __u32 prog_cnt; + union { + __u32 prog_cnt; + __u32 count; + }; + __u32 :32; /* output: per-program attach_flags. * not allowed to be set during effective query. */ __aligned_u64 prog_attach_flags; + __aligned_u64 link_ids; + __aligned_u64 link_attach_flags; + __u64 revision; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ @@ -1539,13 +1571,13 @@ union bpf_attr { __u32 map_fd; /* struct_ops to attach */ }; union { - __u32 target_fd; /* object to attach to */ - __u32 target_ifindex; /* target ifindex */ + __u32 target_fd; /* target object to attach to or ... */ + __u32 target_ifindex; /* target ifindex */ }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ union { - __u32 target_btf_id; /* btf_id of target to attach to */ + __u32 target_btf_id; /* btf_id of target to attach to */ struct { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ @@ -1579,6 +1611,13 @@ union bpf_attr { __s32 priority; __u32 flags; } netfilter; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } tcx; }; } link_create; @@ -6187,6 +6226,19 @@ struct bpf_sock_tuple { }; }; +/* (Simplified) user return codes for tcx prog type. + * A valid tcx program must return one of these defined values. All other + * return codes are reserved for future use. Must remain compatible with + * their TC_ACT_* counter-parts. For compatibility in behavior, unknown + * return codes are mapped to TCX_NEXT. + */ +enum tcx_action_base { + TCX_NEXT = -1, + TCX_PASS = 0, + TCX_DROP = 2, + TCX_REDIRECT = 7, +}; + struct bpf_xdp_sock { __u32 queue_id; }; @@ -6439,6 +6491,40 @@ struct bpf_link_info { __s32 priority; __u32 flags; } netfilter; + struct { + __aligned_u64 addrs; + __u32 count; /* in/out: kprobe_multi function count */ + __u32 flags; + } kprobe_multi; + struct { + __u32 type; /* enum bpf_perf_event_type */ + __u32 :32; + union { + struct { + __aligned_u64 file_name; /* in/out */ + __u32 name_len; + __u32 offset; /* offset from file_name */ + } uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */ + struct { + __aligned_u64 func_name; /* in/out */ + __u32 name_len; + __u32 offset; /* offset from func_name */ + __u64 addr; + } kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */ + struct { + __aligned_u64 tp_name; /* in/out */ + __u32 name_len; + } tracepoint; /* BPF_PERF_EVENT_TRACEPOINT */ + struct { + __u64 config; + __u32 type; + } event; /* BPF_PERF_EVENT_EVENT */ + }; + } perf_event; + struct { + __u32 ifindex; + __u32 attach_type; + } tcx; }; } __attribute__((aligned(8))); @@ -7012,6 +7098,7 @@ struct bpf_list_head { struct bpf_list_node { __u64 :64; __u64 :64; + __u64 :64; } __attribute__((aligned(8))); struct bpf_rb_root { @@ -7023,6 +7110,7 @@ struct bpf_rb_node { __u64 :64; __u64 :64; __u64 :64; + __u64 :64; } __attribute__((aligned(8))); struct bpf_refcount { diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h index db210625cee8..f2afb7cc4926 100644 --- a/include/uapi/linux/cn_proc.h +++ b/include/uapi/linux/cn_proc.h @@ -30,6 +30,49 @@ enum proc_cn_mcast_op { PROC_CN_MCAST_IGNORE = 2 }; +#define PROC_EVENT_ALL (PROC_EVENT_FORK | PROC_EVENT_EXEC | PROC_EVENT_UID | \ + PROC_EVENT_GID | PROC_EVENT_SID | PROC_EVENT_PTRACE | \ + PROC_EVENT_COMM | PROC_EVENT_NONZERO_EXIT | \ + PROC_EVENT_COREDUMP | PROC_EVENT_EXIT) + +/* + * If you add an entry in proc_cn_event, make sure you add it in + * PROC_EVENT_ALL above as well. + */ +enum proc_cn_event { + /* Use successive bits so the enums can be used to record + * sets of events as well + */ + PROC_EVENT_NONE = 0x00000000, + PROC_EVENT_FORK = 0x00000001, + PROC_EVENT_EXEC = 0x00000002, + PROC_EVENT_UID = 0x00000004, + PROC_EVENT_GID = 0x00000040, + PROC_EVENT_SID = 0x00000080, + PROC_EVENT_PTRACE = 0x00000100, + PROC_EVENT_COMM = 0x00000200, + /* "next" should be 0x00000400 */ + /* "last" is the last process event: exit, + * while "next to last" is coredumping event + * before that is report only if process dies + * with non-zero exit status + */ + PROC_EVENT_NONZERO_EXIT = 0x20000000, + PROC_EVENT_COREDUMP = 0x40000000, + PROC_EVENT_EXIT = 0x80000000 +}; + +struct proc_input { + enum proc_cn_mcast_op mcast_op; + enum proc_cn_event event_type; +}; + +static inline enum proc_cn_event valid_event(enum proc_cn_event ev_type) +{ + ev_type &= PROC_EVENT_ALL; + return ev_type; +} + /* * From the user's point of view, the process * ID is the thread group ID and thread ID is the internal @@ -44,24 +87,7 @@ enum proc_cn_mcast_op { */ struct proc_event { - enum what { - /* Use successive bits so the enums can be used to record - * sets of events as well - */ - PROC_EVENT_NONE = 0x00000000, - PROC_EVENT_FORK = 0x00000001, - PROC_EVENT_EXEC = 0x00000002, - PROC_EVENT_UID = 0x00000004, - PROC_EVENT_GID = 0x00000040, - PROC_EVENT_SID = 0x00000080, - PROC_EVENT_PTRACE = 0x00000100, - PROC_EVENT_COMM = 0x00000200, - /* "next" should be 0x00000400 */ - /* "last" is the last process event: exit, - * while "next to last" is coredumping event */ - PROC_EVENT_COREDUMP = 0x40000000, - PROC_EVENT_EXIT = 0x80000000 - } what; + enum proc_cn_event what; __u32 cpu; __u64 __attribute__((aligned(8))) timestamp_ns; /* Number of nano seconds since system boot */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 0f6a0fe09bdb..ce3117df9cec 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -570,6 +570,7 @@ enum { IFLA_BRPORT_MCAST_N_GROUPS, IFLA_BRPORT_MCAST_MAX_GROUPS, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, + IFLA_BRPORT_BACKUP_NHID, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index a78a8096f4ce..8d48863472b9 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -25,6 +25,12 @@ * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* By setting this option, userspace application indicates that it can + * handle multiple descriptors per packet thus enabling AF_XDP to split + * multi-buffer XDP frames into multiple Rx descriptors. Without this set + * such frames will be dropped. + */ +#define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) @@ -108,4 +114,11 @@ struct xdp_desc { /* UMEM descriptor is __u64 */ +/* Flag indicating that the packet continues with the buffer pointed out by the + * next frame in the ring. The end of the packet is signalled by setting this + * bit to zero. For single buffer packets, every descriptor has 'options' set + * to 0 and this maintains backward compatibility. + */ +#define XDP_PKT_CONTD (1 << 0) + #endif /* _LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index ac56605fe9bc..8b6bcbf6ed4a 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -198,6 +198,7 @@ enum { DEVCONF_IOAM6_ID_WIDE, DEVCONF_NDISC_EVICT_NOCARRIER, DEVCONF_ACCEPT_UNTRACKED_NA, + DEVCONF_ACCEPT_RA_MIN_RTR_LFT, DEVCONF_MAX }; diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index b826598d1e94..d03863da180e 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -82,6 +82,8 @@ #define MDIO_AN_10BT1_AN_CTRL 526 /* 10BASE-T1 AN control register */ #define MDIO_AN_10BT1_AN_STAT 527 /* 10BASE-T1 AN status register */ #define MDIO_PMA_PMD_BT1_CTRL 2100 /* BASE-T1 PMA/PMD control register */ +#define MDIO_PCS_1000BT1_CTRL 2304 /* 1000BASE-T1 PCS control register */ +#define MDIO_PCS_1000BT1_STAT 2305 /* 1000BASE-T1 PCS status register */ /* LASI (Link Alarm Status Interrupt) registers, defined by XENPAK MSA. */ #define MDIO_PMA_LASI_RXCTRL 0x9000 /* RX_ALARM control */ @@ -332,6 +334,8 @@ #define MDIO_PCS_10T1L_CTRL_RESET 0x8000 /* PCS reset */ /* BASE-T1 PMA/PMD extended ability register. */ +#define MDIO_PMA_PMD_BT1_B100_ABLE 0x0001 /* 100BASE-T1 Ability */ +#define MDIO_PMA_PMD_BT1_B1000_ABLE 0x0002 /* 1000BASE-T1 Ability */ #define MDIO_PMA_PMD_BT1_B10L_ABLE 0x0004 /* 10BASE-T1L Ability */ /* BASE-T1 auto-negotiation advertisement register [15:0] */ @@ -373,7 +377,19 @@ #define MDIO_AN_10BT1_AN_STAT_LPA_EEE_T1L 0x4000 /* 10BASE-T1L LP EEE ability advertisement */ /* BASE-T1 PMA/PMD control register */ -#define MDIO_PMA_PMD_BT1_CTRL_CFG_MST 0x4000 /* MASTER-SLAVE config value */ +#define MDIO_PMA_PMD_BT1_CTRL_STRAP 0x000F /* Type selection (Strap) */ +#define MDIO_PMA_PMD_BT1_CTRL_STRAP_B1000 0x0001 /* Select 1000BASE-T1 */ +#define MDIO_PMA_PMD_BT1_CTRL_CFG_MST 0x4000 /* MASTER-SLAVE config value */ + +/* 1000BASE-T1 PCS control register */ +#define MDIO_PCS_1000BT1_CTRL_LOW_POWER 0x0800 /* Low power mode */ +#define MDIO_PCS_1000BT1_CTRL_DISABLE_TX 0x4000 /* Global PMA transmit disable */ +#define MDIO_PCS_1000BT1_CTRL_RESET 0x8000 /* Software reset value */ + +/* 1000BASE-T1 PCS status register */ +#define MDIO_PCS_1000BT1_STAT_LINK 0x0004 /* PCS Link is up */ +#define MDIO_PCS_1000BT1_STAT_FAULT 0x0080 /* There is a fault condition */ + /* EEE Supported/Advertisement/LP Advertisement registers. * diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 639524b59930..bf71698a1e82 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -41,6 +41,7 @@ enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, + NETDEV_A_DEV_XDP_ZC_MAX_SEGS, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) |