diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 15:47:48 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 15:47:48 -0800 |
commit | 7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91 (patch) | |
tree | ae0427c5a3b905f24b3a44b510a9bcf35d9b67a3 /arch/x86/net/bpf_jit_comp.c | |
parent | 1ca06f1c1acecbe02124f14a37cce347b8c1a90c (diff) | |
parent | 7c4a6309e27f411743817fe74a832ec2d2798a4b (diff) |
Merge tag 'net-next-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni:
"Core:
- Allow live renaming when an interface is up
- Add retpoline wrappers for tc, improving considerably the
performances of complex queue discipline configurations
- Add inet drop monitor support
- A few GRO performance improvements
- Add infrastructure for atomic dev stats, addressing long standing
data races
- De-duplicate common code between OVS and conntrack offloading
infrastructure
- A bunch of UBSAN_BOUNDS/FORTIFY_SOURCE improvements
- Netfilter: introduce packet parser for tunneled packets
- Replace IPVS timer-based estimators with kthreads to scale up the
workload with the number of available CPUs
- Add the helper support for connection-tracking OVS offload
BPF:
- Support for user defined BPF objects: the use case is to allocate
own objects, build own object hierarchies and use the building
blocks to build own data structures flexibly, for example, linked
lists in BPF
- Make cgroup local storage available to non-cgroup attached BPF
programs
- Avoid unnecessary deadlock detection and failures wrt BPF task
storage helpers
- A relevant bunch of BPF verifier fixes and improvements
- Veristat tool improvements to support custom filtering, sorting,
and replay of results
- Add LLVM disassembler as default library for dumping JITed code
- Lots of new BPF documentation for various BPF maps
- Add bpf_rcu_read_{,un}lock() support for sleepable programs
- Add RCU grace period chaining to BPF to wait for the completion of
access from both sleepable and non-sleepable BPF programs
- Add support storing struct task_struct objects as kptrs in maps
- Improve helper UAPI by explicitly defining BPF_FUNC_xxx integer
values
- Add libbpf *_opts API-variants for bpf_*_get_fd_by_id() functions
Protocols:
- TCP: implement Protective Load Balancing across switch links
- TCP: allow dynamically disabling TCP-MD5 static key, reverting back
to fast[er]-path
- UDP: Introduce optional per-netns hash lookup table
- IPv6: simplify and cleanup sockets disposal
- Netlink: support different type policies for each generic netlink
operation
- MPTCP: add MSG_FASTOPEN and FastOpen listener side support
- MPTCP: add netlink notification support for listener sockets events
- SCTP: add VRF support, allowing sctp sockets binding to VRF devices
- Add bridging MAC Authentication Bypass (MAB) support
- Extensions for Ethernet VPN bridging implementation to better
support multicast scenarios
- More work for Wi-Fi 7 support, comprising conversion of all the
existing drivers to internal TX queue usage
- IPSec: introduce a new offload type (packet offload) allowing
complete header processing and crypto offloading
- IPSec: extended ack support for more descriptive XFRM error
reporting
- RXRPC: increase SACK table size and move processing into a
per-local endpoint kernel thread, reducing considerably the
required locking
- IEEE 802154: synchronous send frame and extended filtering support,
initial support for scanning available 15.4 networks
- Tun: bump the link speed from 10Mbps to 10Gbps
- Tun/VirtioNet: implement UDP segmentation offload support
Driver API:
- PHY/SFP: improve power level switching between standard level 1 and
the higher power levels
- New API for netdev <-> devlink_port linkage
- PTP: convert existing drivers to new frequency adjustment
implementation
- DSA: add support for rx offloading
- Autoload DSA tagging driver when dynamically changing protocol
- Add new PCP and APPTRUST attributes to Data Center Bridging
- Add configuration support for 800Gbps link speed
- Add devlink port function attribute to enable/disable RoCE and
migratable
- Extend devlink-rate to support strict prioriry and weighted fair
queuing
- Add devlink support to directly reading from region memory
- New device tree helper to fetch MAC address from nvmem
- New big TCP helper to simplify temporary header stripping
New hardware / drivers:
- Ethernet:
- Marvel Octeon CNF95N and CN10KB Ethernet Switches
- Marvel Prestera AC5X Ethernet Switch
- WangXun 10 Gigabit NIC
- Motorcomm yt8521 Gigabit Ethernet
- Microchip ksz9563 Gigabit Ethernet Switch
- Microsoft Azure Network Adapter
- Linux Automation 10Base-T1L adapter
- PHY:
- Aquantia AQR112 and AQR412
- Motorcomm YT8531S
- PTP:
- Orolia ART-CARD
- WiFi:
- MediaTek Wi-Fi 7 (802.11be) devices
- RealTek rtw8821cu, rtw8822bu, rtw8822cu and rtw8723du USB
devices
- Bluetooth:
- Broadcom BCM4377/4378/4387 Bluetooth chipsets
- Realtek RTL8852BE and RTL8723DS
- Cypress.CYW4373A0 WiFi + Bluetooth combo device
Drivers:
- CAN:
- gs_usb: bus error reporting support
- kvaser_usb: listen only and bus error reporting support
- Ethernet NICs:
- Intel (100G):
- extend action skbedit to RX queue mapping
- implement devlink-rate support
- support direct read from memory
- nVidia/Mellanox (mlx5):
- SW steering improvements, increasing rules update rate
- Support for enhanced events compression
- extend H/W offload packet manipulation capabilities
- implement IPSec packet offload mode
- nVidia/Mellanox (mlx4):
- better big TCP support
- Netronome Ethernet NICs (nfp):
- IPsec offload support
- add support for multicast filter
- Broadcom:
- RSS and PTP support improvements
- AMD/SolarFlare:
- netlink extened ack improvements
- add basic flower matches to offload, and related stats
- Virtual NICs:
- ibmvnic: introduce affinity hint support
- small / embedded:
- FreeScale fec: add initial XDP support
- Marvel mv643xx_eth: support MII/GMII/RGMII modes for Kirkwood
- TI am65-cpsw: add suspend/resume support
- Mediatek MT7986: add RX wireless wthernet dispatch support
- Realtek 8169: enable GRO software interrupt coalescing per
default
- Ethernet high-speed switches:
- Microchip (sparx5):
- add support for Sparx5 TC/flower H/W offload via VCAP
- Mellanox mlxsw:
- add 802.1X and MAC Authentication Bypass offload support
- add ip6gre support
- Embedded Ethernet switches:
- Mediatek (mtk_eth_soc):
- improve PCS implementation, add DSA untag support
- enable flow offload support
- Renesas:
- add rswitch R-Car Gen4 gPTP support
- Microchip (lan966x):
- add full XDP support
- add TC H/W offload via VCAP
- enable PTP on bridge interfaces
- Microchip (ksz8):
- add MTU support for KSZ8 series
- Qualcomm 802.11ax WiFi (ath11k):
- support configuring channel dwell time during scan
- MediaTek WiFi (mt76):
- enable Wireless Ethernet Dispatch (WED) offload support
- add ack signal support
- enable coredump support
- remain_on_channel support
- Intel WiFi (iwlwifi):
- enable Wi-Fi 7 Extremely High Throughput (EHT) PHY capabilities
- 320 MHz channels support
- RealTek WiFi (rtw89):
- new dynamic header firmware format support
- wake-over-WLAN support"
* tag 'net-next-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2002 commits)
ipvs: fix type warning in do_div() on 32 bit
net: lan966x: Remove a useless test in lan966x_ptp_add_trap()
net: ipa: add IPA v4.7 support
dt-bindings: net: qcom,ipa: Add SM6350 compatible
bnxt: Use generic HBH removal helper in tx path
IPv6/GRO: generic helper to remove temporary HBH/jumbo header in driver
selftests: forwarding: Add bridge MDB test
selftests: forwarding: Rename bridge_mdb test
bridge: mcast: Support replacement of MDB port group entries
bridge: mcast: Allow user space to specify MDB entry routing protocol
bridge: mcast: Allow user space to add (*, G) with a source list and filter mode
bridge: mcast: Add support for (*, G) with a source list and filter mode
bridge: mcast: Avoid arming group timer when (S, G) corresponds to a source
bridge: mcast: Add a flag for user installed source entries
bridge: mcast: Expose __br_multicast_del_group_src()
bridge: mcast: Expose br_multicast_new_group_src()
bridge: mcast: Add a centralized error path
bridge: mcast: Place netlink policy before validation functions
bridge: mcast: Split (*, G) and (S, G) addition into different functions
bridge: mcast: Do not derive entry type from its filter mode
...
Diffstat (limited to 'arch/x86/net/bpf_jit_comp.c')
-rw-r--r-- | arch/x86/net/bpf_jit_comp.c | 128 |
1 files changed, 97 insertions, 31 deletions
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99620428ad78..36ffe67ad6e5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -891,6 +891,65 @@ static void emit_nops(u8 **pprog, int len) *pprog = prog; } +/* emit the 3-byte VEX prefix + * + * r: same as rex.r, extra bit for ModRM reg field + * x: same as rex.x, extra bit for SIB index field + * b: same as rex.b, extra bit for ModRM r/m, or SIB base + * m: opcode map select, encoding escape bytes e.g. 0x0f38 + * w: same as rex.w (32 bit or 64 bit) or opcode specific + * src_reg2: additional source reg (encoded as BPF reg) + * l: vector length (128 bit or 256 bit) or reserved + * pp: opcode prefix (none, 0x66, 0xf2 or 0xf3) + */ +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m, + bool w, u8 src_reg2, bool l, u8 pp) +{ + u8 *prog = *pprog; + const u8 b0 = 0xc4; /* first byte of 3-byte VEX prefix */ + u8 b1, b2; + u8 vvvv = reg2hex[src_reg2]; + + /* reg2hex gives only the lower 3 bit of vvvv */ + if (is_ereg(src_reg2)) + vvvv |= 1 << 3; + + /* + * 2nd byte of 3-byte VEX prefix + * ~ means bit inverted encoding + * + * 7 0 + * +---+---+---+---+---+---+---+---+ + * |~R |~X |~B | m | + * +---+---+---+---+---+---+---+---+ + */ + b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f); + /* + * 3rd byte of 3-byte VEX prefix + * + * 7 0 + * +---+---+---+---+---+---+---+---+ + * | W | ~vvvv | L | pp | + * +---+---+---+---+---+---+---+---+ + */ + b2 = (w << 7) | ((~vvvv & 0xf) << 3) | (l << 2) | (pp & 3); + + EMIT3(b0, b1, b2); + *pprog = prog; +} + +/* emit BMI2 shift instruction */ +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op) +{ + u8 *prog = *pprog; + bool r = is_ereg(dst_reg); + u8 m = 2; /* escape code 0f38 */ + + emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op); + EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg)); + *pprog = prog; +} + #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, @@ -1137,17 +1196,38 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image case BPF_ALU64 | BPF_LSH | BPF_X: case BPF_ALU64 | BPF_RSH | BPF_X: case BPF_ALU64 | BPF_ARSH | BPF_X: + /* BMI2 shifts aren't better when shift count is already in rcx */ + if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) { + /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */ + bool w = (BPF_CLASS(insn->code) == BPF_ALU64); + u8 op; + + switch (BPF_OP(insn->code)) { + case BPF_LSH: + op = 1; /* prefix 0x66 */ + break; + case BPF_RSH: + op = 3; /* prefix 0xf2 */ + break; + case BPF_ARSH: + op = 2; /* prefix 0xf3 */ + break; + } - /* Check for bad case when dst_reg == rcx */ - if (dst_reg == BPF_REG_4) { - /* mov r11, dst_reg */ - EMIT_mov(AUX_REG, dst_reg); - dst_reg = AUX_REG; + emit_shiftx(&prog, dst_reg, src_reg, w, op); + + break; } if (src_reg != BPF_REG_4) { /* common case */ - EMIT1(0x51); /* push rcx */ - + /* Check for bad case when dst_reg == rcx */ + if (dst_reg == BPF_REG_4) { + /* mov r11, dst_reg */ + EMIT_mov(AUX_REG, dst_reg); + dst_reg = AUX_REG; + } else { + EMIT1(0x51); /* push rcx */ + } /* mov rcx, src_reg */ EMIT_mov(BPF_REG_4, src_reg); } @@ -1159,12 +1239,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image b3 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(0xD3, add_1reg(b3, dst_reg)); - if (src_reg != BPF_REG_4) - EMIT1(0x59); /* pop rcx */ + if (src_reg != BPF_REG_4) { + if (insn->dst_reg == BPF_REG_4) + /* mov dst_reg, r11 */ + EMIT_mov(insn->dst_reg, AUX_REG); + else + EMIT1(0x59); /* pop rcx */ + } - if (insn->dst_reg == BPF_REG_4) - /* mov dst_reg, r11 */ - EMIT_mov(insn->dst_reg, AUX_REG); break; case BPF_ALU | BPF_END | BPF_FROM_BE: @@ -1226,8 +1308,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image /* speculation barrier */ case BPF_ST | BPF_NOSPEC: - if (boot_cpu_has(X86_FEATURE_XMM2)) - EMIT_LFENCE(); + EMIT_LFENCE(); break; /* ST: *(u8*)(dst_reg + off) = imm */ @@ -1813,10 +1894,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_link *l, int stack_size, int run_ctx_off, bool save_ret) { - void (*exit)(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit; - u64 (*enter)(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter; u8 *prog = *pprog; u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); @@ -1835,23 +1912,12 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); - if (p->aux->sleepable) { - enter = __bpf_prog_enter_sleepable; - exit = __bpf_prog_exit_sleepable; - } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) { - enter = __bpf_prog_enter_struct_ops; - exit = __bpf_prog_exit_struct_ops; - } else if (p->expected_attach_type == BPF_LSM_CGROUP) { - enter = __bpf_prog_enter_lsm_cgroup; - exit = __bpf_prog_exit_lsm_cgroup; - } - /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: lea rsi, [rbp - ctx_cookie_off] */ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); - if (emit_call(&prog, enter, prog)) + if (emit_call(&prog, bpf_trampoline_enter(p), prog)) return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); @@ -1896,7 +1962,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); - if (emit_call(&prog, exit, prog)) + if (emit_call(&prog, bpf_trampoline_exit(p), prog)) return -EINVAL; *pprog = prog; |