diff options
161 files changed, 8200 insertions, 1514 deletions
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 1bdb1aff0619..f1c95779843b 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func) { + unsigned int i, ctx_idx = ctx->idx; + + /* Load function address into r12 */ + PPC_LI64(12, func); + + /* For bpf-to-bpf function calls, the callee's address is unknown + * until the last extra pass. As seen above, we use PPC_LI64() to + * load the callee's address, but this may optimize the number of + * instructions required based on the nature of the address. + * + * Since we don't want the number of instructions emitted to change, + * we pad the optimized PPC_LI64() call with NOPs to guarantee that + * we always have a five-instruction sequence, which is the maximum + * that PPC_LI64() can emit. + */ + for (i = ctx->idx - ctx_idx; i < 5; i++) + PPC_NOP(); + #ifdef PPC64_ELF_ABI_v1 - /* func points to the function descriptor */ - PPC_LI64(b2p[TMP_REG_2], func); - /* Load actual entry point from function descriptor */ - PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0); - /* ... and move it to LR */ - PPC_MTLR(b2p[TMP_REG_1]); /* * Load TOC from function descriptor at offset 8. * We can clobber r2 since we get called through a * function pointer (so caller will save/restore r2) * and since we don't use a TOC ourself. */ - PPC_BPF_LL(2, b2p[TMP_REG_2], 8); -#else - /* We can clobber r12 */ - PPC_FUNC_ADDR(12, func); - PPC_MTLR(12); + PPC_BPF_LL(2, 12, 8); + /* Load actual entry point from function descriptor */ + PPC_BPF_LL(12, 12, 0); #endif + + PPC_MTLR(12); PPC_BLRL(); } @@ -256,7 +268,7 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 /* Assemble the body code between the prologue & epilogue */ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs) + u32 *addrs, bool extra_pass) { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; @@ -712,11 +724,25 @@ emit_clear: break; /* - * Call kernel helper + * Call kernel helper or bpf function */ case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - func = (u8 *) __bpf_call_base + imm; + + /* bpf function call */ + if (insn[i].src_reg == BPF_PSEUDO_CALL) + if (!extra_pass) + func = NULL; + else if (fp->aux->func && off < fp->aux->func_cnt) + /* use the subprog id from the off + * field to lookup the callee address + */ + func = (u8 *) fp->aux->func[off]->bpf_func; + else + return -EINVAL; + /* kernel helper call */ + else + func = (u8 *) __bpf_call_base + imm; bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -864,6 +890,14 @@ cond_branch: return 0; } +struct powerpc64_jit_data { + struct bpf_binary_header *header; + u32 *addrs; + u8 *image; + u32 proglen; + struct codegen_context ctx; +}; + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { u32 proglen; @@ -871,6 +905,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) u8 *image = NULL; u32 *code_base; u32 *addrs; + struct powerpc64_jit_data *jit_data; struct codegen_context cgctx; int pass; int flen; @@ -878,6 +913,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_prog *org_fp = fp; struct bpf_prog *tmp_fp; bool bpf_blinded = false; + bool extra_pass = false; if (!fp->jit_requested) return org_fp; @@ -891,11 +927,32 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) fp = tmp_fp; } + jit_data = fp->aux->jit_data; + if (!jit_data) { + jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); + if (!jit_data) { + fp = org_fp; + goto out; + } + fp->aux->jit_data = jit_data; + } + flen = fp->len; + addrs = jit_data->addrs; + if (addrs) { + cgctx = jit_data->ctx; + image = jit_data->image; + bpf_hdr = jit_data->header; + proglen = jit_data->proglen; + alloclen = proglen + FUNCTION_DESCR_SIZE; + extra_pass = true; + goto skip_init_ctx; + } + addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL); if (addrs == NULL) { fp = org_fp; - goto out; + goto out_addrs; } memset(&cgctx, 0, sizeof(struct codegen_context)); @@ -904,10 +961,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) cgctx.stack_size = round_up(fp->aux->stack_depth, 16); /* Scouting faux-generate pass 0 */ - if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { /* We hit something illegal or unsupported. */ fp = org_fp; - goto out; + goto out_addrs; } /* @@ -925,9 +982,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) bpf_jit_fill_ill_insns); if (!bpf_hdr) { fp = org_fp; - goto out; + goto out_addrs; } +skip_init_ctx: code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); /* Code generation passes 1-2 */ @@ -935,7 +993,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* Now build the prologue, body code & epilogue for real. */ cgctx.idx = 0; bpf_jit_build_prologue(code_base, &cgctx); - bpf_jit_build_body(fp, code_base, &cgctx, addrs); + bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass); bpf_jit_build_epilogue(code_base, &cgctx); if (bpf_jit_enable > 1) @@ -961,10 +1019,20 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) fp->jited_len = alloclen; bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); + if (!fp->is_func || extra_pass) { +out_addrs: + kfree(addrs); + kfree(jit_data); + fp->aux->jit_data = NULL; + } else { + jit_data->addrs = addrs; + jit_data->ctx = cgctx; + jit_data->proglen = proglen; + jit_data->image = image; + jit_data->header = bpf_hdr; + } out: - kfree(addrs); - if (bpf_blinded) bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index fea17b92b1ae..bd53a71f6b00 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1218,12 +1218,37 @@ static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond) } } +static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond, + enum netdev_lag_tx_type type) +{ + if (type != NETDEV_LAG_TX_TYPE_HASH) + return NETDEV_LAG_HASH_NONE; + + switch (bond->params.xmit_policy) { + case BOND_XMIT_POLICY_LAYER2: + return NETDEV_LAG_HASH_L2; + case BOND_XMIT_POLICY_LAYER34: + return NETDEV_LAG_HASH_L34; + case BOND_XMIT_POLICY_LAYER23: + return NETDEV_LAG_HASH_L23; + case BOND_XMIT_POLICY_ENCAP23: + return NETDEV_LAG_HASH_E23; + case BOND_XMIT_POLICY_ENCAP34: + return NETDEV_LAG_HASH_E34; + default: + return NETDEV_LAG_HASH_UNKNOWN; + } +} + static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; + enum netdev_lag_tx_type type; - lag_upper_info.tx_type = bond_lag_tx_type(bond); + type = bond_lag_tx_type(bond); + lag_upper_info.tx_type = type; + lag_upper_info.hash_type = bond_lag_hash_type(bond, type); return netdev_master_upper_dev_link(slave->dev, bond->dev, slave, &lag_upper_info, extack); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 0f305d97f4ee..0dbe2d9e22d6 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -1738,6 +1738,9 @@ int t4_set_addr_hash(struct adapter *adap, unsigned int mbox, unsigned int viid, bool ucast, u64 vec, bool sleep_ok); int t4_enable_vi_params(struct adapter *adap, unsigned int mbox, unsigned int viid, bool rx_en, bool tx_en, bool dcb_en); +int t4_enable_pi_params(struct adapter *adap, unsigned int mbox, + struct port_info *pi, + bool rx_en, bool tx_en, bool dcb_en); int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid, bool rx_en, bool tx_en); int t4_identify_port(struct adapter *adap, unsigned int mbox, unsigned int viid, diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 59d04d73c672..f7eef93ffc87 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -800,24 +800,20 @@ static int set_link_ksettings(struct net_device *dev, if (base->duplex != DUPLEX_FULL) return -EINVAL; - if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) { - /* PHY offers a single speed. See if that's what's - * being requested. - */ - if (base->autoneg == AUTONEG_DISABLE && - (lc->pcaps & speed_to_fw_caps(base->speed))) - return 0; - return -EINVAL; - } - old_lc = *lc; - if (base->autoneg == AUTONEG_DISABLE) { + if (!(lc->pcaps & FW_PORT_CAP32_ANEG) || + base->autoneg == AUTONEG_DISABLE) { fw_caps = speed_to_fw_caps(base->speed); - if (!(lc->pcaps & fw_caps)) + /* Must only specify a single speed which must be supported + * as part of the Physical Port Capabilities. + */ + if ((fw_caps & (fw_caps - 1)) != 0 || + !(lc->pcaps & fw_caps)) return -EINVAL; + lc->speed_caps = fw_caps; - lc->acaps = 0; + lc->acaps = fw_caps; } else { fw_caps = lmm_to_fw_caps(link_ksettings->link_modes.advertising); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 513e1d356384..0efae2030e71 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -465,7 +465,7 @@ static int link_start(struct net_device *dev) &pi->link_cfg); if (ret == 0) { local_bh_disable(); - ret = t4_enable_vi_params(pi->adapter, mb, pi->viid, true, + ret = t4_enable_pi_params(pi->adapter, mb, pi, true, true, CXGB4_DCB_ENABLED); local_bh_enable(); } @@ -2344,7 +2344,8 @@ static int cxgb_close(struct net_device *dev) netif_tx_stop_all_queues(dev); netif_carrier_off(dev); - ret = t4_enable_vi(adapter, adapter->pf, pi->viid, false, false); + ret = t4_enable_pi_params(adapter, adapter->pf, pi, + false, false, false); #ifdef CONFIG_CHELSIO_T4_DCB cxgb4_dcb_reset(dev); dcb_tx_queue_prio_enable(dev, false); @@ -4140,6 +4141,10 @@ static int adap_init0(struct adapter *adap) * card */ card_fw = kvzalloc(sizeof(*card_fw), GFP_KERNEL); + if (!card_fw) { + ret = -ENOMEM; + goto bye; + } /* Get FW from from /lib/firmware/ */ ret = request_firmware(&fw, fw_info->fw_mod_name, @@ -5236,14 +5241,11 @@ static void free_some_resources(struct adapter *adapter) NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA) #define SEGMENT_SIZE 128 -static int get_chip_type(struct pci_dev *pdev, u32 pl_rev) +static int t4_get_chip_type(struct adapter *adap, int ver) { - u16 device_id; + u32 pl_rev = REV_G(t4_read_reg(adap, PL_REV_A)); - /* Retrieve adapter's device ID */ - pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id); - - switch (device_id >> 12) { + switch (ver) { case CHELSIO_T4: return CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev); case CHELSIO_T5: @@ -5251,8 +5253,7 @@ static int get_chip_type(struct pci_dev *pdev, u32 pl_rev) case CHELSIO_T6: return CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev); default: - dev_err(&pdev->dev, "Device %d is not supported\n", - device_id); + break; } return -EINVAL; } @@ -5422,15 +5423,18 @@ static int cxgb4_iov_configure(struct pci_dev *pdev, int num_vfs) static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { - int func, i, err, s_qpp, qpp, num_seg; + struct net_device *netdev; + struct adapter *adapter; + static int adap_idx = 1; + int s_qpp, qpp, num_seg; struct port_info *pi; bool highdma = false; - struct adapter *adapter = NULL; - struct net_device *netdev; - void __iomem *regs; - u32 whoami, pl_rev; enum chip_type chip; - static int adap_idx = 1; + void __iomem *regs; + int func, chip_ver; + u16 device_id; + int i, err; + u32 whoami; printk_once(KERN_INFO "%s - version %s\n", DRV_DESC, DRV_VERSION); @@ -5466,11 +5470,17 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) goto out_free_adapter; /* We control everything through one PF */ - whoami = readl(regs + PL_WHOAMI_A); - pl_rev = REV_G(readl(regs + PL_REV_A)); - chip = get_chip_type(pdev, pl_rev); - func = CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5 ? - SOURCEPF_G(whoami) : T6_SOURCEPF_G(whoami); + whoami = t4_read_reg(adapter, PL_WHOAMI_A); + pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id); + chip = t4_get_chip_type(adapter, CHELSIO_PCI_ID_VER(device_id)); + if (chip < 0) { + dev_err(&pdev->dev, "Device %d is not supported\n", device_id); + err = chip; + goto out_free_adapter; + } + chip_ver = CHELSIO_CHIP_VERSION(chip); + func = chip_ver <= CHELSIO_T5 ? + SOURCEPF_G(whoami) : T6_SOURCEPF_G(whoami); adapter->pdev = pdev; adapter->pdev_dev = &pdev->dev; @@ -5636,7 +5646,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_TC; - if (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5) { + if (chip_ver > CHELSIO_T5) { netdev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM | @@ -5716,7 +5726,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev_warn(&pdev->dev, "could not allocate MPS Encap entries, continuing\n"); #if IS_ENABLED(CONFIG_IPV6) - if ((CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) && + if (chip_ver <= CHELSIO_T5 && (!(t4_read_reg(adapter, LE_DB_CONFIG_A) & ASLIPCOMPEN_F))) { /* CLIP functionality is not present in hardware, * hence disable all offload features diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h b/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h index 54b718111e3f..721c77577ec5 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h @@ -34,6 +34,8 @@ #ifndef __T4_CHIP_TYPE_H__ #define __T4_CHIP_TYPE_H__ +#define CHELSIO_PCI_ID_VER(__DeviceID) ((__DeviceID) >> 12) + #define CHELSIO_T4 0x4 #define CHELSIO_T5 0x5 #define CHELSIO_T6 0x6 diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 704f6960a6ea..39da7e3c804b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -4066,6 +4066,7 @@ int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox, fw_port_cap32_t fw_fc, cc_fec, fw_fec, rcap; struct fw_port_cmd cmd; unsigned int fw_mdi; + int ret; fw_mdi = (FW_PORT_CAP32_MDI_V(FW_PORT_CAP32_MDI_AUTO) & lc->pcaps); /* Convert driver coding of Pause Frame Flow Control settings into the @@ -4100,6 +4101,13 @@ int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox, rcap = lc->acaps | fw_fc | fw_fec | fw_mdi; } + if (rcap & ~lc->pcaps) { + dev_err(adapter->pdev_dev, + "Requested Port Capabilities %#x exceed Physical Port Capabilities %#x\n", + rcap, lc->pcaps); + return -EINVAL; + } + /* And send that on to the Firmware ... */ memset(&cmd, 0, sizeof(cmd)); @@ -4110,13 +4118,21 @@ int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox, cpu_to_be32(FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16 ? FW_PORT_ACTION_L1_CFG : FW_PORT_ACTION_L1_CFG32) | - FW_LEN16(cmd)); + FW_LEN16(cmd)); if (fw_caps == FW_CAPS16) cmd.u.l1cfg.rcap = cpu_to_be32(fwcaps32_to_caps16(rcap)); else cmd.u.l1cfg32.rcap32 = cpu_to_be32(rcap); - return t4_wr_mbox_meat_timeout(adapter, mbox, &cmd, sizeof(cmd), NULL, - sleep_ok, timeout); + + ret = t4_wr_mbox_meat_timeout(adapter, mbox, &cmd, sizeof(cmd), NULL, + sleep_ok, timeout); + if (ret) { + dev_err(adapter->pdev_dev, + "Requested Port Capabilities %#x rejected, error %d\n", + rcap, -ret); + return ret; + } + return ret; } /** @@ -7998,6 +8014,34 @@ int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid, } /** + * t4_enable_pi_params - enable/disable a Port's Virtual Interface + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pi: the Port Information structure + * @rx_en: 1=enable Rx, 0=disable Rx + * @tx_en: 1=enable Tx, 0=disable Tx + * @dcb_en: 1=enable delivery of Data Center Bridging messages. + * + * Enables/disables a Port's Virtual Interface. Note that setting DCB + * Enable only makes sense when enabling a Virtual Interface ... + * If the Virtual Interface enable/disable operation is successful, + * we notify the OS-specific code of a potential Link Status change + * via the OS Contract API t4_os_link_changed(). + */ +int t4_enable_pi_params(struct adapter *adap, unsigned int mbox, + struct port_info *pi, + bool rx_en, bool tx_en, bool dcb_en) +{ + int ret = t4_enable_vi_params(adap, mbox, pi->viid, + rx_en, tx_en, dcb_en); + if (ret) + return ret; + t4_os_link_changed(adap, pi->port_id, + rx_en && tx_en && pi->link_cfg.link_ok); + return 0; +} + +/** * t4_identify_port - identify a VI's port by blinking its LED * @adap: the adapter * @mbox: mailbox to use for the FW command @@ -8395,7 +8439,9 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) lc->lpacaps = lpacaps; lc->acaps = acaps & ADVERT_MASK; - if (lc->acaps & FW_PORT_CAP32_ANEG) { + if (!(lc->acaps & FW_PORT_CAP32_ANEG)) { + lc->autoneg = AUTONEG_DISABLE; + } else if (lc->acaps & FW_PORT_CAP32_ANEG) { lc->autoneg = AUTONEG_ENABLE; } else { /* When Autoneg is disabled, user needs to set @@ -8600,6 +8646,13 @@ static void init_link_config(struct link_config *lc, fw_port_cap32_t pcaps, lc->requested_fec = FEC_AUTO; lc->fec = fwcap_to_cc_fec(lc->def_acaps); + /* If the Port is capable of Auto-Negtotiation, initialize it as + * "enabled" and copy over all of the Physical Port Capabilities + * to the Advertised Port Capabilities. Otherwise mark it as + * Auto-Negotiate disabled and select the highest supported speed + * for the link. Note parallel structure in t4_link_l1cfg_core() + * and t4_handle_get_port_info(). + */ if (lc->pcaps & FW_PORT_CAP32_ANEG) { lc->acaps = lc->pcaps & ADVERT_MASK; lc->autoneg = AUTONEG_ENABLE; @@ -8607,6 +8660,7 @@ static void init_link_config(struct link_config *lc, fw_port_cap32_t pcaps, } else { lc->acaps = 0; lc->autoneg = AUTONEG_DISABLE; + lc->speed_caps = fwcap_to_fwspeed(acaps); } } diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index 71f13bd2b5e4..ff84791a0ff8 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -274,7 +274,7 @@ static int link_start(struct net_device *dev) * is enabled on a port. */ if (ret == 0) - ret = t4vf_enable_vi(pi->adapter, pi->viid, true, true); + ret = t4vf_enable_pi(pi->adapter, pi, true, true); /* The Virtual Interfaces are connected to an internal switch on the * chip which allows VIs attached to the same port to talk to each @@ -822,8 +822,7 @@ static int cxgb4vf_stop(struct net_device *dev) netif_tx_stop_all_queues(dev); netif_carrier_off(dev); - t4vf_enable_vi(adapter, pi->viid, false, false); - pi->link_cfg.link_ok = 0; + t4vf_enable_pi(adapter, pi, false, false); clear_bit(pi->port_id, &adapter->open_device_map); if (adapter->open_device_map == 0) diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h index 712e8f0c71b4..ccca67cf4487 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h @@ -391,7 +391,10 @@ int t4vf_config_rss_range(struct adapter *, unsigned int, int, int, int t4vf_alloc_vi(struct adapter *, int); int t4vf_free_vi(struct adapter *, int); -int t4vf_enable_vi(struct adapter *, unsigned int, bool, bool); +int t4vf_enable_vi(struct adapter *adapter, unsigned int viid, bool rx_en, + bool tx_en); +int t4vf_enable_pi(struct adapter *adapter, struct port_info *pi, bool rx_en, + bool tx_en); int t4vf_identify_port(struct adapter *, unsigned int, unsigned int); int t4vf_set_rxmode(struct adapter *, unsigned int, int, int, int, int, int, diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c index 3017f7873ff9..5b8c08cf523f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c @@ -405,6 +405,36 @@ static unsigned int fwcap_to_speed(fw_port_cap32_t caps) return 0; } +/** + * fwcap_to_fwspeed - return highest speed in Port Capabilities + * @acaps: advertised Port Capabilities + * + * Get the highest speed for the port from the advertised Port + * Capabilities. It will be either the highest speed from the list of + * speeds or whatever user has set using ethtool. + */ +static fw_port_cap32_t fwcap_to_fwspeed(fw_port_cap32_t acaps) +{ + #define TEST_SPEED_RETURN(__caps_speed) \ + do { \ + if (acaps & FW_PORT_CAP32_SPEED_##__caps_speed) \ + return FW_PORT_CAP32_SPEED_##__caps_speed; \ + } while (0) + + TEST_SPEED_RETURN(400G); + TEST_SPEED_RETURN(200G); + TEST_SPEED_RETURN(100G); + TEST_SPEED_RETURN(50G); + TEST_SPEED_RETURN(40G); + TEST_SPEED_RETURN(25G); + TEST_SPEED_RETURN(10G); + TEST_SPEED_RETURN(1G); + TEST_SPEED_RETURN(100M); + + #undef TEST_SPEED_RETURN + return 0; +} + /* * init_link_config - initialize a link's SW state * @lc: structure holding the link state @@ -431,6 +461,13 @@ static void init_link_config(struct link_config *lc, lc->requested_fec = FEC_AUTO; lc->fec = lc->auto_fec; + /* If the Port is capable of Auto-Negtotiation, initialize it as + * "enabled" and copy over all of the Physical Port Capabilities + * to the Advertised Port Capabilities. Otherwise mark it as + * Auto-Negotiate disabled and select the highest supported speed + * for the link. Note parallel structure in t4_link_l1cfg_core() + * and t4_handle_get_port_info(). + */ if (lc->pcaps & FW_PORT_CAP32_ANEG) { lc->acaps = acaps & ADVERT_MASK; lc->autoneg = AUTONEG_ENABLE; @@ -438,6 +475,7 @@ static void init_link_config(struct link_config *lc, } else { lc->acaps = 0; lc->autoneg = AUTONEG_DISABLE; + lc->speed_caps = fwcap_to_fwspeed(acaps); } } @@ -1362,6 +1400,30 @@ int t4vf_enable_vi(struct adapter *adapter, unsigned int viid, } /** + * t4vf_enable_pi - enable/disable a Port's virtual interface + * @adapter: the adapter + * @pi: the Port Information structure + * @rx_en: 1=enable Rx, 0=disable Rx + * @tx_en: 1=enable Tx, 0=disable Tx + * + * Enables/disables a Port's virtual interface. If the Virtual + * Interface enable/disable operation is successful, we notify the + * OS-specific code of a potential Link Status change via the OS Contract + * API t4vf_os_link_changed(). + */ +int t4vf_enable_pi(struct adapter *adapter, struct port_info *pi, + bool rx_en, bool tx_en) +{ + int ret = t4vf_enable_vi(adapter, pi->viid, rx_en, tx_en); + + if (ret) + return ret; + t4vf_os_link_changed(adapter, pi->pidx, + rx_en && tx_en && pi->link_cfg.link_ok); + return 0; +} + +/** * t4vf_identify_port - identify a VI's port by blinking its LED * @adapter: the adapter * @viid: the Virtual Interface ID @@ -1955,7 +2017,14 @@ static void t4vf_handle_get_port_info(struct port_info *pi, lc->lpacaps = lpacaps; lc->acaps = acaps & ADVERT_MASK; - if (lc->acaps & FW_PORT_CAP32_ANEG) { + /* If we're not physically capable of Auto-Negotiation, note + * this as Auto-Negotiation disabled. Otherwise, we track + * what Auto-Negotiation settings we have. Note parallel + * structure in init_link_config(). + */ + if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) { + lc->autoneg = AUTONEG_DISABLE; + } else if (lc->acaps & FW_PORT_CAP32_ANEG) { lc->autoneg = AUTONEG_ENABLE; } else { /* When Autoneg is disabled, user needs to set diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index f81439796ac7..d438ef8a371d 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -466,12 +466,6 @@ static int fec_ptp_enable(struct ptp_clock_info *ptp, return -EOPNOTSUPP; } -/** - * fec_ptp_hwtstamp_ioctl - control hardware time stamping - * @ndev: pointer to net_device - * @ifreq: ioctl data - * @cmd: particular ioctl requested - */ int fec_ptp_set(struct net_device *ndev, struct ifreq *ifr) { struct fec_enet_private *fep = netdev_priv(ndev); diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 4bb4646a5f92..09f8e6baf049 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -109,13 +109,14 @@ static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *, struct ibmvnic_sub_crq_queue *); static int ibmvnic_poll(struct napi_struct *napi, int data); static void send_map_query(struct ibmvnic_adapter *adapter); -static void send_request_map(struct ibmvnic_adapter *, dma_addr_t, __be32, u8); -static void send_request_unmap(struct ibmvnic_adapter *, u8); +static int send_request_map(struct ibmvnic_adapter *, dma_addr_t, __be32, u8); +static int send_request_unmap(struct ibmvnic_adapter *, u8); static int send_login(struct ibmvnic_adapter *adapter); static void send_cap_queries(struct ibmvnic_adapter *adapter); static int init_sub_crqs(struct ibmvnic_adapter *); static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter); static int ibmvnic_init(struct ibmvnic_adapter *); +static int ibmvnic_reset_init(struct ibmvnic_adapter *); static void release_crq_queue(struct ibmvnic_adapter *); static int __ibmvnic_set_mac(struct net_device *netdev, struct sockaddr *p); static int init_crq_queue(struct ibmvnic_adapter *adapter); @@ -172,6 +173,7 @@ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter, struct ibmvnic_long_term_buff *ltb, int size) { struct device *dev = &adapter->vdev->dev; + int rc; ltb->size = size; ltb->buff = dma_alloc_coherent(dev, ltb->size, <b->addr, @@ -185,8 +187,12 @@ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter, adapter->map_id++; init_completion(&adapter->fw_done); - send_request_map(adapter, ltb->addr, - ltb->size, ltb->map_id); + rc = send_request_map(adapter, ltb->addr, + ltb->size, ltb->map_id); + if (rc) { + dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); + return rc; + } wait_for_completion(&adapter->fw_done); if (adapter->fw_done_rc) { @@ -215,10 +221,14 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter, static int reset_long_term_buff(struct ibmvnic_adapter *adapter, struct ibmvnic_long_term_buff *ltb) { + int rc; + memset(ltb->buff, 0, ltb->size); init_completion(&adapter->fw_done); - send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); + rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); + if (rc) + return rc; wait_for_completion(&adapter->fw_done); if (adapter->fw_done_rc) { @@ -789,6 +799,7 @@ static void release_napi(struct ibmvnic_adapter *adapter) kfree(adapter->napi); adapter->napi = NULL; adapter->num_active_rx_napi = 0; + adapter->napi_enabled = false; } static int ibmvnic_login(struct net_device *netdev) @@ -919,6 +930,10 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state) /* Partuial success, delay and re-send */ mdelay(1000); resend = true; + } else if (adapter->init_done_rc) { + netdev_warn(netdev, "Unable to set link state, rc=%d\n", + adapter->init_done_rc); + return adapter->init_done_rc; } } while (resend); @@ -951,6 +966,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter) struct device *dev = &adapter->vdev->dev; union ibmvnic_crq crq; int len = 0; + int rc; if (adapter->vpd->buff) len = adapter->vpd->len; @@ -958,7 +974,9 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter) init_completion(&adapter->fw_done); crq.get_vpd_size.first = IBMVNIC_CRQ_CMD; crq.get_vpd_size.cmd = GET_VPD_SIZE; - ibmvnic_send_crq(adapter, &crq); + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) + return rc; wait_for_completion(&adapter->fw_done); if (!adapter->vpd->len) @@ -991,7 +1009,12 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter) crq.get_vpd.cmd = GET_VPD; crq.get_vpd.ioba = cpu_to_be32(adapter->vpd->dma_addr); crq.get_vpd.len = cpu_to_be32((u32)adapter->vpd->len); - ibmvnic_send_crq(adapter, &crq); + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) { + kfree(adapter->vpd->buff); + adapter->vpd->buff = NULL; + return rc; + } wait_for_completion(&adapter->fw_done); return 0; @@ -1690,6 +1713,7 @@ static int __ibmvnic_set_mac(struct net_device *netdev, struct sockaddr *p) struct ibmvnic_adapter *adapter = netdev_priv(netdev); struct sockaddr *addr = p; union ibmvnic_crq crq; + int rc; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; @@ -1700,7 +1724,9 @@ static int __ibmvnic_set_mac(struct net_device *netdev, struct sockaddr *p) ether_addr_copy(&crq.change_mac_addr.mac_addr[0], addr->sa_data); init_completion(&adapter->fw_done); - ibmvnic_send_crq(adapter, &crq); + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) + return rc; wait_for_completion(&adapter->fw_done); /* netdev->dev_addr is changed in handle_change_mac_rsp function */ return adapter->fw_done_rc ? -EIO : 0; @@ -1782,7 +1808,7 @@ static int do_reset(struct ibmvnic_adapter *adapter, return rc; } - rc = ibmvnic_init(adapter); + rc = ibmvnic_reset_init(adapter); if (rc) return IBMVNIC_INIT_FAILED; @@ -1852,6 +1878,85 @@ static int do_reset(struct ibmvnic_adapter *adapter, return 0; } +static int do_hard_reset(struct ibmvnic_adapter *adapter, + struct ibmvnic_rwi *rwi, u32 reset_state) +{ + struct net_device *netdev = adapter->netdev; + int rc; + + netdev_dbg(adapter->netdev, "Hard resetting driver (%d)\n", + rwi->reset_reason); + + netif_carrier_off(netdev); + adapter->reset_reason = rwi->reset_reason; + + ibmvnic_cleanup(netdev); + release_resources(adapter); + release_sub_crqs(adapter, 0); + release_crq_queue(adapter); + + /* remove the closed state so when we call open it appears + * we are coming from the probed state. + */ + adapter->state = VNIC_PROBED; + + rc = init_crq_queue(adapter); + if (rc) { + netdev_err(adapter->netdev, + "Couldn't initialize crq. rc=%d\n", rc); + return rc; + } + + rc = ibmvnic_init(adapter); + if (rc) + return rc; + + /* If the adapter was in PROBE state prior to the reset, + * exit here. + */ + if (reset_state == VNIC_PROBED) + return 0; + + rc = ibmvnic_login(netdev); + if (rc) { + adapter->state = VNIC_PROBED; + return 0; + } + /* netif_set_real_num_xx_queues needs to take rtnl lock here + * unless wait_for_reset is set, in which case the rtnl lock + * has already been taken before initializing the reset + */ + if (!adapter->wait_for_reset) { + rtnl_lock(); + rc = init_resources(adapter); + rtnl_unlock(); + } else { + rc = init_resources(adapter); + } + if (rc) + return rc; + + ibmvnic_disable_irqs(adapter); + adapter->state = VNIC_CLOSED; + + if (reset_state == VNIC_CLOSED) + return 0; + + rc = __ibmvnic_open(netdev); + if (rc) { + if (list_empty(&adapter->rwi_list)) + adapter->state = VNIC_CLOSED; + else + adapter->state = reset_state; + + return 0; + } + + netif_carrier_on(netdev); + + return 0; +} + static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter) { struct ibmvnic_rwi *rwi; @@ -1893,14 +1998,19 @@ static void __ibmvnic_reset(struct work_struct *work) netdev = adapter->netdev; mutex_lock(&adapter->reset_lock); - adapter->resetting = true; reset_state = adapter->state; rwi = get_next_rwi(adapter); while (rwi) { - rc = do_reset(adapter, rwi, reset_state); + if (adapter->force_reset_recovery) { + adapter->force_reset_recovery = false; + rc = do_hard_reset(adapter, rwi, reset_state); + } else { + rc = do_reset(adapter, rwi, reset_state); + } kfree(rwi); - if (rc && rc != IBMVNIC_INIT_FAILED) + if (rc && rc != IBMVNIC_INIT_FAILED && + !adapter->force_reset_recovery) break; rwi = get_next_rwi(adapter); @@ -1926,9 +2036,9 @@ static void __ibmvnic_reset(struct work_struct *work) static int ibmvnic_reset(struct ibmvnic_adapter *adapter, enum ibmvnic_reset_reason reason) { + struct list_head *entry, *tmp_entry; struct ibmvnic_rwi *rwi, *tmp; struct net_device *netdev = adapter->netdev; - struct list_head *entry; int ret; if (adapter->state == VNIC_REMOVING || @@ -1964,11 +2074,17 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, ret = ENOMEM; goto err; } - + /* if we just received a transport event, + * flush reset queue and process this reset + */ + if (adapter->force_reset_recovery && !list_empty(&adapter->rwi_list)) { + list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) + list_del(entry); + } rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); mutex_unlock(&adapter->rwi_lock); - + adapter->resetting = true; netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason); schedule_work(&adapter->ibmvnic_reset); @@ -2364,6 +2480,7 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev, struct ibmvnic_adapter *adapter = netdev_priv(dev); union ibmvnic_crq crq; int i, j; + int rc; memset(&crq, 0, sizeof(crq)); crq.request_statistics.first = IBMVNIC_CRQ_CMD; @@ -2374,7 +2491,9 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev, /* Wait for data to be written */ init_completion(&adapter->stats_done); - ibmvnic_send_crq(adapter, &crq); + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) + return; wait_for_completion(&adapter->stats_done); for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) @@ -3146,6 +3265,12 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter, (unsigned long int)cpu_to_be64(u64_crq[0]), (unsigned long int)cpu_to_be64(u64_crq[1])); + if (!adapter->crq.active && + crq->generic.first != IBMVNIC_CRQ_INIT_CMD) { + dev_warn(dev, "Invalid request detected while CRQ is inactive, possible device state change during reset\n"); + return -EINVAL; + } + /* Make sure the hypervisor sees the complete request */ mb(); @@ -3370,8 +3495,8 @@ buf_alloc_failed: return -1; } -static void send_request_map(struct ibmvnic_adapter *adapter, dma_addr_t addr, - u32 len, u8 map_id) +static int send_request_map(struct ibmvnic_adapter *adapter, dma_addr_t addr, + u32 len, u8 map_id) { union ibmvnic_crq crq; @@ -3381,10 +3506,10 @@ static void send_request_map(struct ibmvnic_adapter *adapter, dma_addr_t addr, crq.request_map.map_id = map_id; crq.request_map.ioba = cpu_to_be32(addr); crq.request_map.len = cpu_to_be32(len); - ibmvnic_send_crq(adapter, &crq); + return ibmvnic_send_crq(adapter, &crq); } -static void send_request_unmap(struct ibmvnic_adapter *adapter, u8 map_id) +static int send_request_unmap(struct ibmvnic_adapter *adapter, u8 map_id) { union ibmvnic_crq crq; @@ -3392,7 +3517,7 @@ static void send_request_unmap(struct ibmvnic_adapter *adapter, u8 map_id) crq.request_unmap.first = IBMVNIC_CRQ_CMD; crq.request_unmap.cmd = REQUEST_UNMAP; crq.request_unmap.map_id = map_id; - ibmvnic_send_crq(adapter, &crq); + return ibmvnic_send_crq(adapter, &crq); } static void send_map_query(struct ibmvnic_adapter *adapter) @@ -4219,11 +4344,15 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, dev_info(dev, "Partner initialized\n"); adapter->from_passive_init = true; adapter->failover_pending = false; - complete(&adapter->init_done); + if (!completion_done(&adapter->init_done)) { + complete(&adapter->init_done); + adapter->init_done_rc = -EIO; + } ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); break; case IBMVNIC_CRQ_INIT_COMPLETE: dev_info(dev, "Partner initialization complete\n"); + adapter->crq.active = true; send_version_xchg(adapter); break; default: @@ -4232,6 +4361,9 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, return; case IBMVNIC_CRQ_XPORT_EVENT: netif_carrier_off(netdev); + adapter->crq.active = false; + if (adapter->resetting) + adapter->force_reset_recovery = true; if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) { dev_info(dev, "Migrated, re-enabling adapter\n"); ibmvnic_reset(adapter, VNIC_RESET_MOBILITY); @@ -4419,6 +4551,7 @@ static int ibmvnic_reset_crq(struct ibmvnic_adapter *adapter) /* Clean out the queue */ memset(crq->msgs, 0, PAGE_SIZE); crq->cur = 0; + crq->active = false; /* And re-open it again */ rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address, @@ -4453,6 +4586,7 @@ static void release_crq_queue(struct ibmvnic_adapter *adapter) DMA_BIDIRECTIONAL); free_page((unsigned long)crq->msgs); crq->msgs = NULL; + crq->active = false; } static int init_crq_queue(struct ibmvnic_adapter *adapter) @@ -4530,7 +4664,7 @@ map_failed: return retrc; } -static int ibmvnic_init(struct ibmvnic_adapter *adapter) +static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; unsigned long timeout = msecs_to_jiffies(30000); @@ -4589,6 +4723,49 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter) return rc; } +static int ibmvnic_init(struct ibmvnic_adapter *adapter) +{ + struct device *dev = &adapter->vdev->dev; + unsigned long timeout = msecs_to_jiffies(30000); + int rc; + + adapter->from_passive_init = false; + + init_completion(&adapter->init_done); + adapter->init_done_rc = 0; + ibmvnic_send_crq_init(adapter); + if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { + dev_err(dev, "Initialization sequence timed out\n"); + return -1; + } + + if (adapter->init_done_rc) { + release_crq_queue(adapter); + return adapter->init_done_rc; + } + + if (adapter->from_passive_init) { + adapter->state = VNIC_OPEN; + adapter->from_passive_init = false; + return -1; + } + + rc = init_sub_crqs(adapter); + if (rc) { + dev_err(dev, "Initialization of sub crqs failed\n"); + release_crq_queue(adapter); + return rc; + } + + rc = init_sub_crq_irqs(adapter); + if (rc) { + dev_err(dev, "Failed to initialize sub crq irqs\n"); + release_crq_queue(adapter); + } + + return rc; +} + static struct device_attribute dev_attr_failover; static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 22391e8805f6..f9fb780102ac 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -865,6 +865,7 @@ struct ibmvnic_crq_queue { int size, cur; dma_addr_t msg_token; spinlock_t lock; + bool active; }; union sub_crq { @@ -1108,6 +1109,7 @@ struct ibmvnic_adapter { bool mac_change_pending; bool failover_pending; + bool force_reset_recovery; struct ibmvnic_tunables desired; struct ibmvnic_tunables fallback; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 5efa68de935b..9b698c5acd05 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev) * @dev: netdev * @xdp: XDP buffer * - * Returns Zero if sent, else an error code + * Returns number of frames successfully sent. Frames that fail are + * free'ed via XDP return API. + * + * For error cases, a negative errno code is returned and no-frames + * are transmitted (caller must handle freeing frames). **/ -int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) { struct i40e_netdev_priv *np = netdev_priv(dev); unsigned int queue_index = smp_processor_id(); struct i40e_vsi *vsi = np->vsi; - int err; + int drops = 0; + int i; if (test_bit(__I40E_VSI_DOWN, vsi->state)) return -ENETDOWN; @@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) return -ENXIO; - err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]); - if (err != I40E_XDP_TX) - return -ENOSPC; + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + int err; - return 0; + err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]); + if (err != I40E_XDP_TX) { + xdp_return_frame_rx_napi(xdpf); + drops++; + } + } + + return n - drops; } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index fdd2c55f03a6..eb8804b3d7b6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw); void i40e_detect_recover_hung(struct i40e_vsi *vsi); int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); -int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf); +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames); void i40e_xdp_flush(struct net_device *dev); /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index a52d92e182ee..031d65c4178d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10022,11 +10022,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) +static int ixgbe_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames) { struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_ring *ring; - int err; + int drops = 0; + int i; if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state))) return -ENETDOWN; @@ -10038,11 +10040,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) if (unlikely(!ring)) return -ENXIO; - err = ixgbe_xmit_xdp_ring(adapter, xdpf); - if (err != IXGBE_XDP_TX) - return -ENOSPC; + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + int err; - return 0; + err = ixgbe_xmit_xdp_ring(adapter, xdpf); + if (err != IXGBE_XDP_TX) { + xdp_return_frame_rx_napi(xdpf); + drops++; + } + } + + return n - drops; } static void ixgbe_xdp_flush(struct net_device *dev) diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile index 6373f56205fd..4afb10375397 100644 --- a/drivers/net/ethernet/netronome/nfp/Makefile +++ b/drivers/net/ethernet/netronome/nfp/Makefile @@ -37,6 +37,7 @@ ifeq ($(CONFIG_NFP_APP_FLOWER),y) nfp-objs += \ flower/action.o \ flower/cmsg.o \ + flower/lag_conf.o \ flower/main.o \ flower/match.o \ flower/metadata.o \ diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index a4d3da215863..8a92088df0d7 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -212,6 +212,60 @@ emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer) } static void +__emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer, + bool set, bool src_lmextn) +{ + u16 addr_lo, addr_hi; + u64 insn; + + addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO)); + addr_hi = addr != addr_lo; + + insn = OP_BR_BIT_BASE | + FIELD_PREP(OP_BR_BIT_A_SRC, areg) | + FIELD_PREP(OP_BR_BIT_B_SRC, breg) | + FIELD_PREP(OP_BR_BIT_BV, set) | + FIELD_PREP(OP_BR_BIT_DEFBR, defer) | + FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) | + FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) | + FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn); + + nfp_prog_push(nfp_prog, insn); +} + +static void +emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, + u8 defer, bool set, enum nfp_relo_type relo) +{ + struct nfp_insn_re_regs reg; + int err; + + /* NOTE: The bit to test is specified as an rotation amount, such that + * the bit to test will be placed on the MSB of the result when + * doing a rotate right. For bit X, we need right rotate X + 1. + */ + bit += 1; + + err = swreg_to_restricted(reg_none(), src, reg_imm(bit), ®, false); + if (err) { + nfp_prog->error = err; + return; + } + + __emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set, + reg.src_lmextn); + + nfp_prog->prog[nfp_prog->prog_len - 1] |= + FIELD_PREP(OP_RELO_TYPE, relo); +} + +static void +emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer) +{ + emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL); +} + +static void __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi, enum immed_width width, bool invert, enum immed_shift shift, bool wr_both, @@ -310,6 +364,19 @@ emit_shf(struct nfp_prog *nfp_prog, swreg dst, } static void +emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst, + swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc) +{ + if (sc == SHF_SC_R_ROT) { + pr_err("indirect shift is not allowed on rotation\n"); + nfp_prog->error = -EFAULT; + return; + } + + emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0); +} + +static void __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab, u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both, bool dst_lmextn, bool src_lmextn) @@ -1629,26 +1696,142 @@ static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) return 0; } +/* Pseudo code: + * if shift_amt >= 32 + * dst_high = dst_low << shift_amt[4:0] + * dst_low = 0; + * else + * dst_high = (dst_high, dst_low) >> (32 - shift_amt) + * dst_low = dst_low << shift_amt + * + * The indirect shift will use the same logic at runtime. + */ +static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt) +{ + if (shift_amt < 32) { + emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), + SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF, + 32 - shift_amt); + emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, + reg_b(dst), SHF_SC_L_SHF, shift_amt); + } else if (shift_amt == 32) { + wrp_reg_mov(nfp_prog, dst + 1, dst); + wrp_immed(nfp_prog, reg_both(dst), 0); + } else if (shift_amt > 32) { + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE, + reg_b(dst), SHF_SC_L_SHF, shift_amt - 32); + wrp_immed(nfp_prog, reg_both(dst), 0); + } + + return 0; +} + static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { const struct bpf_insn *insn = &meta->insn; u8 dst = insn->dst_reg * 2; - if (insn->imm < 32) { - emit_shf(nfp_prog, reg_both(dst + 1), - reg_a(dst + 1), SHF_OP_NONE, reg_b(dst), - SHF_SC_R_DSHF, 32 - insn->imm); - emit_shf(nfp_prog, reg_both(dst), - reg_none(), SHF_OP_NONE, reg_b(dst), - SHF_SC_L_SHF, insn->imm); - } else if (insn->imm == 32) { - wrp_reg_mov(nfp_prog, dst + 1, dst); - wrp_immed(nfp_prog, reg_both(dst), 0); - } else if (insn->imm > 32) { - emit_shf(nfp_prog, reg_both(dst + 1), - reg_none(), SHF_OP_NONE, reg_b(dst), - SHF_SC_L_SHF, insn->imm - 32); - wrp_immed(nfp_prog, reg_both(dst), 0); + return __shl_imm64(nfp_prog, dst, insn->imm); +} + +static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB, + reg_b(src)); + emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0)); + emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE, + reg_b(dst), SHF_SC_R_DSHF); +} + +/* NOTE: for indirect left shift, HIGH part should be calculated first. */ +static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); + emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, + reg_b(dst), SHF_SC_L_SHF); +} + +static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + shl_reg64_lt32_high(nfp_prog, dst, src); + shl_reg64_lt32_low(nfp_prog, dst, src); +} + +static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); + emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE, + reg_b(dst), SHF_SC_L_SHF); + wrp_immed(nfp_prog, reg_both(dst), 0); +} + +static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + const struct bpf_insn *insn = &meta->insn; + u64 umin, umax; + u8 dst, src; + + dst = insn->dst_reg * 2; + umin = meta->umin; + umax = meta->umax; + if (umin == umax) + return __shl_imm64(nfp_prog, dst, umin); + + src = insn->src_reg * 2; + if (umax < 32) { + shl_reg64_lt32(nfp_prog, dst, src); + } else if (umin >= 32) { + shl_reg64_ge32(nfp_prog, dst, src); + } else { + /* Generate different instruction sequences depending on runtime + * value of shift amount. + */ + u16 label_ge32, label_end; + + label_ge32 = nfp_prog_current_offset(nfp_prog) + 7; + emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0); + + shl_reg64_lt32_high(nfp_prog, dst, src); + label_end = nfp_prog_current_offset(nfp_prog) + 6; + emit_br(nfp_prog, BR_UNC, label_end, 2); + /* shl_reg64_lt32_low packed in delay slot. */ + shl_reg64_lt32_low(nfp_prog, dst, src); + + if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32)) + return -EINVAL; + shl_reg64_ge32(nfp_prog, dst, src); + + if (!nfp_prog_confirm_current_offset(nfp_prog, label_end)) + return -EINVAL; + } + + return 0; +} + +/* Pseudo code: + * if shift_amt >= 32 + * dst_high = 0; + * dst_low = dst_high >> shift_amt[4:0] + * else + * dst_high = dst_high >> shift_amt + * dst_low = (dst_high, dst_low) >> shift_amt + * + * The indirect shift will use the same logic at runtime. + */ +static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt) +{ + if (shift_amt < 32) { + emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE, + reg_b(dst), SHF_SC_R_DSHF, shift_amt); + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE, + reg_b(dst + 1), SHF_SC_R_SHF, shift_amt); + } else if (shift_amt == 32) { + wrp_reg_mov(nfp_prog, dst, dst + 1); + wrp_immed(nfp_prog, reg_both(dst + 1), 0); + } else if (shift_amt > 32) { + emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, + reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32); + wrp_immed(nfp_prog, reg_both(dst + 1), 0); } return 0; @@ -1659,21 +1842,186 @@ static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) const struct bpf_insn *insn = &meta->insn; u8 dst = insn->dst_reg * 2; - if (insn->imm < 32) { - emit_shf(nfp_prog, reg_both(dst), - reg_a(dst + 1), SHF_OP_NONE, reg_b(dst), - SHF_SC_R_DSHF, insn->imm); - emit_shf(nfp_prog, reg_both(dst + 1), - reg_none(), SHF_OP_NONE, reg_b(dst + 1), - SHF_SC_R_SHF, insn->imm); - } else if (insn->imm == 32) { + return __shr_imm64(nfp_prog, dst, insn->imm); +} + +/* NOTE: for indirect right shift, LOW part should be calculated first. */ +static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); + emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE, + reg_b(dst + 1), SHF_SC_R_SHF); +} + +static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); + emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE, + reg_b(dst), SHF_SC_R_DSHF); +} + +static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + shr_reg64_lt32_low(nfp_prog, dst, src); + shr_reg64_lt32_high(nfp_prog, dst, src); +} + +static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); + emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, + reg_b(dst + 1), SHF_SC_R_SHF); + wrp_immed(nfp_prog, reg_both(dst + 1), 0); +} + +static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + const struct bpf_insn *insn = &meta->insn; + u64 umin, umax; + u8 dst, src; + + dst = insn->dst_reg * 2; + umin = meta->umin; + umax = meta->umax; + if (umin == umax) + return __shr_imm64(nfp_prog, dst, umin); + + src = insn->src_reg * 2; + if (umax < 32) { + shr_reg64_lt32(nfp_prog, dst, src); + } else if (umin >= 32) { + shr_reg64_ge32(nfp_prog, dst, src); + } else { + /* Generate different instruction sequences depending on runtime + * value of shift amount. + */ + u16 label_ge32, label_end; + + label_ge32 = nfp_prog_current_offset(nfp_prog) + 6; + emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0); + shr_reg64_lt32_low(nfp_prog, dst, src); + label_end = nfp_prog_current_offset(nfp_prog) + 6; + emit_br(nfp_prog, BR_UNC, label_end, 2); + /* shr_reg64_lt32_high packed in delay slot. */ + shr_reg64_lt32_high(nfp_prog, dst, src); + + if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32)) + return -EINVAL; + shr_reg64_ge32(nfp_prog, dst, src); + + if (!nfp_prog_confirm_current_offset(nfp_prog, label_end)) + return -EINVAL; + } + + return 0; +} + +/* Code logic is the same as __shr_imm64 except ashr requires signedness bit + * told through PREV_ALU result. + */ +static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt) +{ + if (shift_amt < 32) { + emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE, + reg_b(dst), SHF_SC_R_DSHF, shift_amt); + /* Set signedness bit. */ + emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR, + reg_imm(0)); + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, + reg_b(dst + 1), SHF_SC_R_SHF, shift_amt); + } else if (shift_amt == 32) { + /* NOTE: this also helps setting signedness bit. */ wrp_reg_mov(nfp_prog, dst, dst + 1); - wrp_immed(nfp_prog, reg_both(dst + 1), 0); - } else if (insn->imm > 32) { - emit_shf(nfp_prog, reg_both(dst), - reg_none(), SHF_OP_NONE, reg_b(dst + 1), - SHF_SC_R_SHF, insn->imm - 32); - wrp_immed(nfp_prog, reg_both(dst + 1), 0); + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, + reg_b(dst + 1), SHF_SC_R_SHF, 31); + } else if (shift_amt > 32) { + emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR, + reg_imm(0)); + emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR, + reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32); + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, + reg_b(dst + 1), SHF_SC_R_SHF, 31); + } + + return 0; +} + +static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + const struct bpf_insn *insn = &meta->insn; + u8 dst = insn->dst_reg * 2; + + return __ashr_imm64(nfp_prog, dst, insn->imm); +} + +static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + /* NOTE: the first insn will set both indirect shift amount (source A) + * and signedness bit (MSB of result). + */ + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1)); + emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, + reg_b(dst + 1), SHF_SC_R_SHF); +} + +static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + /* NOTE: it is the same as logic shift because we don't need to shift in + * signedness bit when the shift amount is less than 32. + */ + return shr_reg64_lt32_low(nfp_prog, dst, src); +} + +static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + ashr_reg64_lt32_low(nfp_prog, dst, src); + ashr_reg64_lt32_high(nfp_prog, dst, src); +} + +static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src) +{ + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1)); + emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR, + reg_b(dst + 1), SHF_SC_R_SHF); + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, + reg_b(dst + 1), SHF_SC_R_SHF, 31); +} + +/* Like ashr_imm64, but need to use indirect shift. */ +static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + const struct bpf_insn *insn = &meta->insn; + u64 umin, umax; + u8 dst, src; + + dst = insn->dst_reg * 2; + umin = meta->umin; + umax = meta->umax; + if (umin == umax) + return __ashr_imm64(nfp_prog, dst, umin); + + src = insn->src_reg * 2; + if (umax < 32) { + ashr_reg64_lt32(nfp_prog, dst, src); + } else if (umin >= 32) { + ashr_reg64_ge32(nfp_prog, dst, src); + } else { + u16 label_ge32, label_end; + + label_ge32 = nfp_prog_current_offset(nfp_prog) + 6; + emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0); + ashr_reg64_lt32_low(nfp_prog, dst, src); + label_end = nfp_prog_current_offset(nfp_prog) + 6; + emit_br(nfp_prog, BR_UNC, label_end, 2); + /* ashr_reg64_lt32_high packed in delay slot. */ + ashr_reg64_lt32_high(nfp_prog, dst, src); + + if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32)) + return -EINVAL; + ashr_reg64_ge32(nfp_prog, dst, src); + + if (!nfp_prog_confirm_current_offset(nfp_prog, label_end)) + return -EINVAL; } return 0; @@ -2501,8 +2849,12 @@ static const instr_cb_t instr_cb[256] = { [BPF_ALU64 | BPF_SUB | BPF_X] = sub_reg64, [BPF_ALU64 | BPF_SUB | BPF_K] = sub_imm64, [BPF_ALU64 | BPF_NEG] = neg_reg64, + [BPF_ALU64 | BPF_LSH | BPF_X] = shl_reg64, [BPF_ALU64 | BPF_LSH | BPF_K] = shl_imm64, + [BPF_ALU64 | BPF_RSH | BPF_X] = shr_reg64, [BPF_ALU64 | BPF_RSH | BPF_K] = shr_imm64, + [BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64, + [BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64, [BPF_ALU | BPF_MOV | BPF_X] = mov_reg, [BPF_ALU | BPF_MOV | BPF_K] = mov_imm, [BPF_ALU | BPF_XOR | BPF_X] = xor_reg, diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 8b143546ae85..654fe7823e5e 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -263,6 +263,8 @@ struct nfp_bpf_reg_state { * @func_id: function id for call instructions * @arg1: arg1 for call instructions * @arg2: arg2 for call instructions + * @umin: copy of core verifier umin_value. + * @umax: copy of core verifier umax_value. * @off: index of first generated machine instruction (in nfp_prog.prog) * @n: eBPF instruction number * @flags: eBPF instruction extra optimization flags @@ -298,6 +300,13 @@ struct nfp_insn_meta { struct bpf_reg_state arg1; struct nfp_bpf_reg_state arg2; }; + /* We are interested in range info for some operands, + * for example, the shift amount. + */ + struct { + u64 umin; + u64 umax; + }; }; unsigned int off; unsigned short n; @@ -375,6 +384,25 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta) return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD); } +static inline bool is_mbpf_indir_shift(const struct nfp_insn_meta *meta) +{ + u8 code = meta->insn.code; + bool is_alu, is_shift; + u8 opclass, opcode; + + opclass = BPF_CLASS(code); + is_alu = opclass == BPF_ALU64 || opclass == BPF_ALU; + if (!is_alu) + return false; + + opcode = BPF_OP(code); + is_shift = opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH; + if (!is_shift) + return false; + + return BPF_SRC(code) == BPF_X; +} + /** * struct nfp_prog - nfp BPF program * @bpf: backpointer to the bpf app priv structure diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 4db0ac1e42a8..7eae4c0266f8 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -190,6 +190,8 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog, meta->insn = prog[i]; meta->n = i; + if (is_mbpf_indir_shift(meta)) + meta->umin = U64_MAX; list_add_tail(&meta->l, &nfp_prog->insns); } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 844a9be6e55a..4bfeba7b21b2 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -551,6 +551,14 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) if (is_mbpf_xadd(meta)) return nfp_bpf_check_xadd(nfp_prog, meta, env); + if (is_mbpf_indir_shift(meta)) { + const struct bpf_reg_state *sreg = + cur_regs(env) + meta->insn.src_reg; + + meta->umin = min(meta->umin, sreg->umin_value); + meta->umax = max(meta->umax, sreg->umax_value); + } + return 0; } diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c index 80df9a5d4217..4a6d2db75071 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/action.c +++ b/drivers/net/ethernet/netronome/nfp/flower/action.c @@ -72,6 +72,42 @@ nfp_fl_push_vlan(struct nfp_fl_push_vlan *push_vlan, push_vlan->vlan_tci = cpu_to_be16(tmp_push_vlan_tci); } +static int +nfp_fl_pre_lag(struct nfp_app *app, const struct tc_action *action, + struct nfp_fl_payload *nfp_flow, int act_len) +{ + size_t act_size = sizeof(struct nfp_fl_pre_lag); + struct nfp_fl_pre_lag *pre_lag; + struct net_device *out_dev; + int err; + + out_dev = tcf_mirred_dev(action); + if (!out_dev || !netif_is_lag_master(out_dev)) + return 0; + + if (act_len + act_size > NFP_FL_MAX_A_SIZ) + return -EOPNOTSUPP; + + /* Pre_lag action must be first on action list. + * If other actions already exist they need pushed forward. + */ + if (act_len) + memmove(nfp_flow->action_data + act_size, + nfp_flow->action_data, act_len); + + pre_lag = (struct nfp_fl_pre_lag *)nfp_flow->action_data; + err = nfp_flower_lag_populate_pre_action(app, out_dev, pre_lag); + if (err) + return err; + + pre_lag->head.jump_id = NFP_FL_ACTION_OPCODE_PRE_LAG; + pre_lag->head.len_lw = act_size >> NFP_FL_LW_SIZ; + + nfp_flow->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_NULL); + + return act_size; +} + static bool nfp_fl_netdev_is_tunnel_type(struct net_device *out_dev, enum nfp_flower_tun_type tun_type) { @@ -88,12 +124,13 @@ static bool nfp_fl_netdev_is_tunnel_type(struct net_device *out_dev, } static int -nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action, - struct nfp_fl_payload *nfp_flow, bool last, - struct net_device *in_dev, enum nfp_flower_tun_type tun_type, - int *tun_out_cnt) +nfp_fl_output(struct nfp_app *app, struct nfp_fl_output *output, + const struct tc_action *action, struct nfp_fl_payload *nfp_flow, + bool last, struct net_device *in_dev, + enum nfp_flower_tun_type tun_type, int *tun_out_cnt) { size_t act_size = sizeof(struct nfp_fl_output); + struct nfp_flower_priv *priv = app->priv; struct net_device *out_dev; u16 tmp_flags; @@ -118,6 +155,15 @@ nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action, output->flags = cpu_to_be16(tmp_flags | NFP_FL_OUT_FLAGS_USE_TUN); output->port = cpu_to_be32(NFP_FL_PORT_TYPE_TUN | tun_type); + } else if (netif_is_lag_master(out_dev) && + priv->flower_ext_feats & NFP_FL_FEATS_LAG) { + int gid; + + output->flags = cpu_to_be16(tmp_flags); + gid = nfp_flower_lag_get_output_id(app, out_dev); + if (gid < 0) + return gid; + output->port = cpu_to_be32(NFP_FL_LAG_OUT | gid); } else { /* Set action output parameters. */ output->flags = cpu_to_be16(tmp_flags); @@ -164,7 +210,7 @@ static struct nfp_fl_pre_tunnel *nfp_fl_pre_tunnel(char *act_data, int act_len) struct nfp_fl_pre_tunnel *pre_tun_act; /* Pre_tunnel action must be first on action list. - * If other actions already exist they need pushed forward. + * If other actions already exist they need to be pushed forward. */ if (act_len) memmove(act_data + act_size, act_data, act_len); @@ -443,42 +489,73 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len) } static int -nfp_flower_loop_action(const struct tc_action *a, +nfp_flower_output_action(struct nfp_app *app, const struct tc_action *a, + struct nfp_fl_payload *nfp_fl, int *a_len, + struct net_device *netdev, bool last, + enum nfp_flower_tun_type *tun_type, int *tun_out_cnt, + int *out_cnt) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_fl_output *output; + int err, prelag_size; + + if (*a_len + sizeof(struct nfp_fl_output) > NFP_FL_MAX_A_SIZ) + return -EOPNOTSUPP; + + output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len]; + err = nfp_fl_output(app, output, a, nfp_fl, last, netdev, *tun_type, + tun_out_cnt); + if (err) + return err; + + *a_len += sizeof(struct nfp_fl_output); + + if (priv->flower_ext_feats & NFP_FL_FEATS_LAG) { + /* nfp_fl_pre_lag returns -err or size of prelag action added. + * This will be 0 if it is not egressing to a lag dev. + */ + prelag_size = nfp_fl_pre_lag(app, a, nfp_fl, *a_len); + if (prelag_size < 0) + return prelag_size; + else if (prelag_size > 0 && (!last || *out_cnt)) + return -EOPNOTSUPP; + + *a_len += prelag_size; + } + (*out_cnt)++; + + return 0; +} + +static int +nfp_flower_loop_action(struct nfp_app *app, const struct tc_action *a, struct nfp_fl_payload *nfp_fl, int *a_len, struct net_device *netdev, - enum nfp_flower_tun_type *tun_type, int *tun_out_cnt) + enum nfp_flower_tun_type *tun_type, int *tun_out_cnt, + int *out_cnt) { struct nfp_fl_set_ipv4_udp_tun *set_tun; struct nfp_fl_pre_tunnel *pre_tun; struct nfp_fl_push_vlan *psh_v; struct nfp_fl_pop_vlan *pop_v; - struct nfp_fl_output *output; int err; if (is_tcf_gact_shot(a)) { nfp_fl->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_DROP); } else if (is_tcf_mirred_egress_redirect(a)) { - if (*a_len + sizeof(struct nfp_fl_output) > NFP_FL_MAX_A_SIZ) - return -EOPNOTSUPP; - - output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len]; - err = nfp_fl_output(output, a, nfp_fl, true, netdev, *tun_type, - tun_out_cnt); + err = nfp_flower_output_action(app, a, nfp_fl, a_len, netdev, + true, tun_type, tun_out_cnt, + out_cnt); if (err) return err; - *a_len += sizeof(struct nfp_fl_output); } else if (is_tcf_mirred_egress_mirror(a)) { - if (*a_len + sizeof(struct nfp_fl_output) > NFP_FL_MAX_A_SIZ) - return -EOPNOTSUPP; - - output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len]; - err = nfp_fl_output(output, a, nfp_fl, false, netdev, *tun_type, - tun_out_cnt); + err = nfp_flower_output_action(app, a, nfp_fl, a_len, netdev, + false, tun_type, tun_out_cnt, + out_cnt); if (err) return err; - *a_len += sizeof(struct nfp_fl_output); } else if (is_tcf_vlan(a) && tcf_vlan_action(a) == TCA_VLAN_ACT_POP) { if (*a_len + sizeof(struct nfp_fl_pop_vlan) > NFP_FL_MAX_A_SIZ) return -EOPNOTSUPP; @@ -535,11 +612,12 @@ nfp_flower_loop_action(const struct tc_action *a, return 0; } -int nfp_flower_compile_action(struct tc_cls_flower_offload *flow, +int nfp_flower_compile_action(struct nfp_app *app, + struct tc_cls_flower_offload *flow, struct net_device *netdev, struct nfp_fl_payload *nfp_flow) { - int act_len, act_cnt, err, tun_out_cnt; + int act_len, act_cnt, err, tun_out_cnt, out_cnt; enum nfp_flower_tun_type tun_type; const struct tc_action *a; LIST_HEAD(actions); @@ -550,11 +628,12 @@ int nfp_flower_compile_action(struct tc_cls_flower_offload *flow, act_len = 0; act_cnt = 0; tun_out_cnt = 0; + out_cnt = 0; tcf_exts_to_list(flow->exts, &actions); list_for_each_entry(a, &actions, list) { - err = nfp_flower_loop_action(a, nfp_flow, &act_len, netdev, - &tun_type, &tun_out_cnt); + err = nfp_flower_loop_action(app, a, nfp_flow, &act_len, netdev, + &tun_type, &tun_out_cnt, &out_cnt); if (err) return err; act_cnt++; diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c index 577659f332e4..cb8565222621 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c @@ -239,8 +239,10 @@ nfp_flower_cmsg_portreify_rx(struct nfp_app *app, struct sk_buff *skb) static void nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb) { + struct nfp_flower_priv *app_priv = app->priv; struct nfp_flower_cmsg_hdr *cmsg_hdr; enum nfp_flower_cmsg_type_port type; + bool skb_stored = false; cmsg_hdr = nfp_flower_cmsg_get_hdr(skb); @@ -258,13 +260,20 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb) case NFP_FLOWER_CMSG_TYPE_ACTIVE_TUNS: nfp_tunnel_keep_alive(app, skb); break; + case NFP_FLOWER_CMSG_TYPE_LAG_CONFIG: + if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG) { + skb_stored = nfp_flower_lag_unprocessed_msg(app, skb); + break; + } + /* fall through */ default: nfp_flower_cmsg_warn(app, "Cannot handle invalid repr control type %u\n", type); goto out; } - dev_consume_skb_any(skb); + if (!skb_stored) + dev_consume_skb_any(skb); return; out: dev_kfree_skb_any(skb); diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h index bee4367a2c38..4a7f3510a296 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h @@ -92,6 +92,7 @@ #define NFP_FL_ACTION_OPCODE_SET_IPV6_DST 12 #define NFP_FL_ACTION_OPCODE_SET_UDP 14 #define NFP_FL_ACTION_OPCODE_SET_TCP 15 +#define NFP_FL_ACTION_OPCODE_PRE_LAG 16 #define NFP_FL_ACTION_OPCODE_PRE_TUNNEL 17 #define NFP_FL_ACTION_OPCODE_NUM 32 @@ -103,6 +104,9 @@ #define NFP_FL_PUSH_VLAN_CFI BIT(12) #define NFP_FL_PUSH_VLAN_VID GENMASK(11, 0) +/* LAG ports */ +#define NFP_FL_LAG_OUT 0xC0DE0000 + /* Tunnel ports */ #define NFP_FL_PORT_TYPE_TUN 0x50000000 #define NFP_FL_IPV4_TUNNEL_TYPE GENMASK(7, 4) @@ -177,6 +181,15 @@ struct nfp_fl_pop_vlan { __be16 reserved; }; +struct nfp_fl_pre_lag { + struct nfp_fl_act_head head; + __be16 group_id; + u8 lag_version[3]; + u8 instance; +}; + +#define NFP_FL_PRE_LAG_VER_OFF 8 + struct nfp_fl_pre_tunnel { struct nfp_fl_act_head head; __be16 reserved; @@ -366,6 +379,7 @@ struct nfp_flower_cmsg_hdr { enum nfp_flower_cmsg_type_port { NFP_FLOWER_CMSG_TYPE_FLOW_ADD = 0, NFP_FLOWER_CMSG_TYPE_FLOW_DEL = 2, + NFP_FLOWER_CMSG_TYPE_LAG_CONFIG = 4, NFP_FLOWER_CMSG_TYPE_PORT_REIFY = 6, NFP_FLOWER_CMSG_TYPE_MAC_REPR = 7, NFP_FLOWER_CMSG_TYPE_PORT_MOD = 8, diff --git a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c new file mode 100644 index 000000000000..0c4c957717ea --- /dev/null +++ b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c @@ -0,0 +1,726 @@ +/* + * Copyright (C) 2018 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "main.h" + +/* LAG group config flags. */ +#define NFP_FL_LAG_LAST BIT(1) +#define NFP_FL_LAG_FIRST BIT(2) +#define NFP_FL_LAG_DATA BIT(3) +#define NFP_FL_LAG_XON BIT(4) +#define NFP_FL_LAG_SYNC BIT(5) +#define NFP_FL_LAG_SWITCH BIT(6) +#define NFP_FL_LAG_RESET BIT(7) + +/* LAG port state flags. */ +#define NFP_PORT_LAG_LINK_UP BIT(0) +#define NFP_PORT_LAG_TX_ENABLED BIT(1) +#define NFP_PORT_LAG_CHANGED BIT(2) + +enum nfp_fl_lag_batch { + NFP_FL_LAG_BATCH_FIRST, + NFP_FL_LAG_BATCH_MEMBER, + NFP_FL_LAG_BATCH_FINISHED +}; + +/** + * struct nfp_flower_cmsg_lag_config - control message payload for LAG config + * @ctrl_flags: Configuration flags + * @reserved: Reserved for future use + * @ttl: Time to live of packet - host always sets to 0xff + * @pkt_number: Config message packet number - increment for each message + * @batch_ver: Batch version of messages - increment for each batch of messages + * @group_id: Group ID applicable + * @group_inst: Group instance number - increment when group is reused + * @members: Array of 32-bit words listing all active group members + */ +struct nfp_flower_cmsg_lag_config { + u8 ctrl_flags; + u8 reserved[2]; + u8 ttl; + __be32 pkt_number; + __be32 batch_ver; + __be32 group_id; + __be32 group_inst; + __be32 members[]; +}; + +/** + * struct nfp_fl_lag_group - list entry for each LAG group + * @group_id: Assigned group ID for host/kernel sync + * @group_inst: Group instance in case of ID reuse + * @list: List entry + * @master_ndev: Group master Netdev + * @dirty: Marked if the group needs synced to HW + * @offloaded: Marked if the group is currently offloaded to NIC + * @to_remove: Marked if the group should be removed from NIC + * @to_destroy: Marked if the group should be removed from driver + * @slave_cnt: Number of slaves in group + */ +struct nfp_fl_lag_group { + unsigned int group_id; + u8 group_inst; + struct list_head list; + struct net_device *master_ndev; + bool dirty; + bool offloaded; + bool to_remove; + bool to_destroy; + unsigned int slave_cnt; +}; + +#define NFP_FL_LAG_PKT_NUMBER_MASK GENMASK(30, 0) +#define NFP_FL_LAG_VERSION_MASK GENMASK(22, 0) +#define NFP_FL_LAG_HOST_TTL 0xff + +/* Use this ID with zero members to ack a batch config */ +#define NFP_FL_LAG_SYNC_ID 0 +#define NFP_FL_LAG_GROUP_MIN 1 /* ID 0 reserved */ +#define NFP_FL_LAG_GROUP_MAX 32 /* IDs 1 to 31 are valid */ + +/* wait for more config */ +#define NFP_FL_LAG_DELAY (msecs_to_jiffies(2)) + +#define NFP_FL_LAG_RETRANS_LIMIT 100 /* max retrans cmsgs to store */ + +static unsigned int nfp_fl_get_next_pkt_number(struct nfp_fl_lag *lag) +{ + lag->pkt_num++; + lag->pkt_num &= NFP_FL_LAG_PKT_NUMBER_MASK; + + return lag->pkt_num; +} + +static void nfp_fl_increment_version(struct nfp_fl_lag *lag) +{ + /* LSB is not considered by firmware so add 2 for each increment. */ + lag->batch_ver += 2; + lag->batch_ver &= NFP_FL_LAG_VERSION_MASK; + + /* Zero is reserved by firmware. */ + if (!lag->batch_ver) + lag->batch_ver += 2; +} + +static struct nfp_fl_lag_group * +nfp_fl_lag_group_create(struct nfp_fl_lag *lag, struct net_device *master) +{ + struct nfp_fl_lag_group *group; + struct nfp_flower_priv *priv; + int id; + + priv = container_of(lag, struct nfp_flower_priv, nfp_lag); + + id = ida_simple_get(&lag->ida_handle, NFP_FL_LAG_GROUP_MIN, + NFP_FL_LAG_GROUP_MAX, GFP_KERNEL); + if (id < 0) { + nfp_flower_cmsg_warn(priv->app, + "No more bonding groups available\n"); + return ERR_PTR(id); + } + + group = kmalloc(sizeof(*group), GFP_KERNEL); + if (!group) { + ida_simple_remove(&lag->ida_handle, id); + return ERR_PTR(-ENOMEM); + } + + group->group_id = id; + group->master_ndev = master; + group->dirty = true; + group->offloaded = false; + group->to_remove = false; + group->to_destroy = false; + group->slave_cnt = 0; + group->group_inst = ++lag->global_inst; + list_add_tail(&group->list, &lag->group_list); + + return group; +} + +static struct nfp_fl_lag_group * +nfp_fl_lag_find_group_for_master_with_lag(struct nfp_fl_lag *lag, + struct net_device *master) +{ + struct nfp_fl_lag_group *entry; + + if (!master) + return NULL; + + list_for_each_entry(entry, &lag->group_list, list) + if (entry->master_ndev == master) + return entry; + + return NULL; +} + +int nfp_flower_lag_populate_pre_action(struct nfp_app *app, + struct net_device *master, + struct nfp_fl_pre_lag *pre_act) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_fl_lag_group *group = NULL; + __be32 temp_vers; + + mutex_lock(&priv->nfp_lag.lock); + group = nfp_fl_lag_find_group_for_master_with_lag(&priv->nfp_lag, + master); + if (!group) { + mutex_unlock(&priv->nfp_lag.lock); + return -ENOENT; + } + + pre_act->group_id = cpu_to_be16(group->group_id); + temp_vers = cpu_to_be32(priv->nfp_lag.batch_ver << + NFP_FL_PRE_LAG_VER_OFF); + memcpy(pre_act->lag_version, &temp_vers, 3); + pre_act->instance = group->group_inst; + mutex_unlock(&priv->nfp_lag.lock); + + return 0; +} + +int nfp_flower_lag_get_output_id(struct nfp_app *app, struct net_device *master) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_fl_lag_group *group = NULL; + int group_id = -ENOENT; + + mutex_lock(&priv->nfp_lag.lock); + group = nfp_fl_lag_find_group_for_master_with_lag(&priv->nfp_lag, + master); + if (group) + group_id = group->group_id; + mutex_unlock(&priv->nfp_lag.lock); + + return group_id; +} + +static int +nfp_fl_lag_config_group(struct nfp_fl_lag *lag, struct nfp_fl_lag_group *group, + struct net_device **active_members, + unsigned int member_cnt, enum nfp_fl_lag_batch *batch) +{ + struct nfp_flower_cmsg_lag_config *cmsg_payload; + struct nfp_flower_priv *priv; + unsigned long int flags; + unsigned int size, i; + struct sk_buff *skb; + + priv = container_of(lag, struct nfp_flower_priv, nfp_lag); + size = sizeof(*cmsg_payload) + sizeof(__be32) * member_cnt; + skb = nfp_flower_cmsg_alloc(priv->app, size, + NFP_FLOWER_CMSG_TYPE_LAG_CONFIG, + GFP_KERNEL); + if (!skb) + return -ENOMEM; + + cmsg_payload = nfp_flower_cmsg_get_data(skb); + flags = 0; + + /* Increment batch version for each new batch of config messages. */ + if (*batch == NFP_FL_LAG_BATCH_FIRST) { + flags |= NFP_FL_LAG_FIRST; + nfp_fl_increment_version(lag); + *batch = NFP_FL_LAG_BATCH_MEMBER; + } + + /* If it is a reset msg then it is also the end of the batch. */ + if (lag->rst_cfg) { + flags |= NFP_FL_LAG_RESET; + *batch = NFP_FL_LAG_BATCH_FINISHED; + } + + /* To signal the end of a batch, both the switch and last flags are set + * and the the reserved SYNC group ID is used. + */ + if (*batch == NFP_FL_LAG_BATCH_FINISHED) { + flags |= NFP_FL_LAG_SWITCH | NFP_FL_LAG_LAST; + lag->rst_cfg = false; + cmsg_payload->group_id = cpu_to_be32(NFP_FL_LAG_SYNC_ID); + cmsg_payload->group_inst = 0; + } else { + cmsg_payload->group_id = cpu_to_be32(group->group_id); + cmsg_payload->group_inst = cpu_to_be32(group->group_inst); + } + + cmsg_payload->reserved[0] = 0; + cmsg_payload->reserved[1] = 0; + cmsg_payload->ttl = NFP_FL_LAG_HOST_TTL; + cmsg_payload->ctrl_flags = flags; + cmsg_payload->batch_ver = cpu_to_be32(lag->batch_ver); + cmsg_payload->pkt_number = cpu_to_be32(nfp_fl_get_next_pkt_number(lag)); + + for (i = 0; i < member_cnt; i++) + cmsg_payload->members[i] = + cpu_to_be32(nfp_repr_get_port_id(active_members[i])); + + nfp_ctrl_tx(priv->app->ctrl, skb); + return 0; +} + +static void nfp_fl_lag_do_work(struct work_struct *work) +{ + enum nfp_fl_lag_batch batch = NFP_FL_LAG_BATCH_FIRST; + struct nfp_fl_lag_group *entry, *storage; + struct delayed_work *delayed_work; + struct nfp_flower_priv *priv; + struct nfp_fl_lag *lag; + int err; + + delayed_work = to_delayed_work(work); + lag = container_of(delayed_work, struct nfp_fl_lag, work); + priv = container_of(lag, struct nfp_flower_priv, nfp_lag); + + mutex_lock(&lag->lock); + list_for_each_entry_safe(entry, storage, &lag->group_list, list) { + struct net_device *iter_netdev, **acti_netdevs; + struct nfp_flower_repr_priv *repr_priv; + int active_count = 0, slaves = 0; + struct nfp_repr *repr; + unsigned long *flags; + + if (entry->to_remove) { + /* Active count of 0 deletes group on hw. */ + err = nfp_fl_lag_config_group(lag, entry, NULL, 0, + &batch); + if (!err) { + entry->to_remove = false; + entry->offloaded = false; + } else { + nfp_flower_cmsg_warn(priv->app, + "group delete failed\n"); + schedule_delayed_work(&lag->work, + NFP_FL_LAG_DELAY); + continue; + } + + if (entry->to_destroy) { + ida_simple_remove(&lag->ida_handle, + entry->group_id); + list_del(&entry->list); + kfree(entry); + } + continue; + } + + acti_netdevs = kmalloc_array(entry->slave_cnt, + sizeof(*acti_netdevs), GFP_KERNEL); + + /* Include sanity check in the loop. It may be that a bond has + * changed between processing the last notification and the + * work queue triggering. If the number of slaves has changed + * or it now contains netdevs that cannot be offloaded, ignore + * the group until pending notifications are processed. + */ + rcu_read_lock(); + for_each_netdev_in_bond_rcu(entry->master_ndev, iter_netdev) { + if (!nfp_netdev_is_nfp_repr(iter_netdev)) { + slaves = 0; + break; + } + + repr = netdev_priv(iter_netdev); + + if (repr->app != priv->app) { + slaves = 0; + break; + } + + slaves++; + if (slaves > entry->slave_cnt) + break; + + /* Check the ports for state changes. */ + repr_priv = repr->app_priv; + flags = &repr_priv->lag_port_flags; + + if (*flags & NFP_PORT_LAG_CHANGED) { + *flags &= ~NFP_PORT_LAG_CHANGED; + entry->dirty = true; + } + + if ((*flags & NFP_PORT_LAG_TX_ENABLED) && + (*flags & NFP_PORT_LAG_LINK_UP)) + acti_netdevs[active_count++] = iter_netdev; + } + rcu_read_unlock(); + + if (slaves != entry->slave_cnt || !entry->dirty) { + kfree(acti_netdevs); + continue; + } + + err = nfp_fl_lag_config_group(lag, entry, acti_netdevs, + active_count, &batch); + if (!err) { + entry->offloaded = true; + entry->dirty = false; + } else { + nfp_flower_cmsg_warn(priv->app, + "group offload failed\n"); + schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY); + } + + kfree(acti_netdevs); + } + + /* End the config batch if at least one packet has been batched. */ + if (batch == NFP_FL_LAG_BATCH_MEMBER) { + batch = NFP_FL_LAG_BATCH_FINISHED; + err = nfp_fl_lag_config_group(lag, NULL, NULL, 0, &batch); + if (err) + nfp_flower_cmsg_warn(priv->app, + "group batch end cmsg failed\n"); + } + + mutex_unlock(&lag->lock); +} + +static int +nfp_fl_lag_put_unprocessed(struct nfp_fl_lag *lag, struct sk_buff *skb) +{ + struct nfp_flower_cmsg_lag_config *cmsg_payload; + + cmsg_payload = nfp_flower_cmsg_get_data(skb); + if (be32_to_cpu(cmsg_payload->group_id) >= NFP_FL_LAG_GROUP_MAX) + return -EINVAL; + + /* Drop cmsg retrans if storage limit is exceeded to prevent + * overloading. If the fw notices that expected messages have not been + * received in a given time block, it will request a full resync. + */ + if (skb_queue_len(&lag->retrans_skbs) >= NFP_FL_LAG_RETRANS_LIMIT) + return -ENOSPC; + + __skb_queue_tail(&lag->retrans_skbs, skb); + + return 0; +} + +static void nfp_fl_send_unprocessed(struct nfp_fl_lag *lag) +{ + struct nfp_flower_priv *priv; + struct sk_buff *skb; + + priv = container_of(lag, struct nfp_flower_priv, nfp_lag); + + while ((skb = __skb_dequeue(&lag->retrans_skbs))) + nfp_ctrl_tx(priv->app->ctrl, skb); +} + +bool nfp_flower_lag_unprocessed_msg(struct nfp_app *app, struct sk_buff *skb) +{ + struct nfp_flower_cmsg_lag_config *cmsg_payload; + struct nfp_flower_priv *priv = app->priv; + struct nfp_fl_lag_group *group_entry; + unsigned long int flags; + bool store_skb = false; + int err; + + cmsg_payload = nfp_flower_cmsg_get_data(skb); + flags = cmsg_payload->ctrl_flags; + + /* Note the intentional fall through below. If DATA and XON are both + * set, the message will stored and sent again with the rest of the + * unprocessed messages list. + */ + + /* Store */ + if (flags & NFP_FL_LAG_DATA) + if (!nfp_fl_lag_put_unprocessed(&priv->nfp_lag, skb)) + store_skb = true; + + /* Send stored */ + if (flags & NFP_FL_LAG_XON) + nfp_fl_send_unprocessed(&priv->nfp_lag); + + /* Resend all */ + if (flags & NFP_FL_LAG_SYNC) { + /* To resend all config: + * 1) Clear all unprocessed messages + * 2) Mark all groups dirty + * 3) Reset NFP group config + * 4) Schedule a LAG config update + */ + + __skb_queue_purge(&priv->nfp_lag.retrans_skbs); + + mutex_lock(&priv->nfp_lag.lock); + list_for_each_entry(group_entry, &priv->nfp_lag.group_list, + list) + group_entry->dirty = true; + + err = nfp_flower_lag_reset(&priv->nfp_lag); + if (err) + nfp_flower_cmsg_warn(priv->app, + "mem err in group reset msg\n"); + mutex_unlock(&priv->nfp_lag.lock); + + schedule_delayed_work(&priv->nfp_lag.work, 0); + } + + return store_skb; +} + +static void +nfp_fl_lag_schedule_group_remove(struct nfp_fl_lag *lag, + struct nfp_fl_lag_group *group) +{ + group->to_remove = true; + + schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY); +} + +static int +nfp_fl_lag_schedule_group_delete(struct nfp_fl_lag *lag, + struct net_device *master) +{ + struct nfp_fl_lag_group *group; + + mutex_lock(&lag->lock); + group = nfp_fl_lag_find_group_for_master_with_lag(lag, master); + if (!group) { + mutex_unlock(&lag->lock); + return -ENOENT; + } + + group->to_remove = true; + group->to_destroy = true; + mutex_unlock(&lag->lock); + + schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY); + return 0; +} + +static int +nfp_fl_lag_changeupper_event(struct nfp_fl_lag *lag, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *upper = info->upper_dev, *iter_netdev; + struct netdev_lag_upper_info *lag_upper_info; + struct nfp_fl_lag_group *group; + struct nfp_flower_priv *priv; + unsigned int slave_count = 0; + bool can_offload = true; + struct nfp_repr *repr; + + if (!netif_is_lag_master(upper)) + return 0; + + priv = container_of(lag, struct nfp_flower_priv, nfp_lag); + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper, iter_netdev) { + if (!nfp_netdev_is_nfp_repr(iter_netdev)) { + can_offload = false; + break; + } + repr = netdev_priv(iter_netdev); + + /* Ensure all ports are created by the same app/on same card. */ + if (repr->app != priv->app) { + can_offload = false; + break; + } + + slave_count++; + } + rcu_read_unlock(); + + lag_upper_info = info->upper_info; + + /* Firmware supports active/backup and L3/L4 hash bonds. */ + if (lag_upper_info && + lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && + (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH || + (lag_upper_info->hash_type != NETDEV_LAG_HASH_L34 && + lag_upper_info->hash_type != NETDEV_LAG_HASH_E34))) { + can_offload = false; + nfp_flower_cmsg_warn(priv->app, + "Unable to offload tx_type %u hash %u\n", + lag_upper_info->tx_type, + lag_upper_info->hash_type); + } + + mutex_lock(&lag->lock); + group = nfp_fl_lag_find_group_for_master_with_lag(lag, upper); + + if (slave_count == 0 || !can_offload) { + /* Cannot offload the group - remove if previously offloaded. */ + if (group && group->offloaded) + nfp_fl_lag_schedule_group_remove(lag, group); + + mutex_unlock(&lag->lock); + return 0; + } + + if (!group) { + group = nfp_fl_lag_group_create(lag, upper); + if (IS_ERR(group)) { + mutex_unlock(&lag->lock); + return PTR_ERR(group); + } + } + + group->dirty = true; + group->slave_cnt = slave_count; + + /* Group may have been on queue for removal but is now offfloable. */ + group->to_remove = false; + mutex_unlock(&lag->lock); + + schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY); + return 0; +} + +static int +nfp_fl_lag_changels_event(struct nfp_fl_lag *lag, struct net_device *netdev, + struct netdev_notifier_changelowerstate_info *info) +{ + struct netdev_lag_lower_state_info *lag_lower_info; + struct nfp_flower_repr_priv *repr_priv; + struct nfp_flower_priv *priv; + struct nfp_repr *repr; + unsigned long *flags; + + if (!netif_is_lag_port(netdev) || !nfp_netdev_is_nfp_repr(netdev)) + return 0; + + lag_lower_info = info->lower_state_info; + if (!lag_lower_info) + return 0; + + priv = container_of(lag, struct nfp_flower_priv, nfp_lag); + repr = netdev_priv(netdev); + + /* Verify that the repr is associated with this app. */ + if (repr->app != priv->app) + return 0; + + repr_priv = repr->app_priv; + flags = &repr_priv->lag_port_flags; + + mutex_lock(&lag->lock); + if (lag_lower_info->link_up) + *flags |= NFP_PORT_LAG_LINK_UP; + else + *flags &= ~NFP_PORT_LAG_LINK_UP; + + if (lag_lower_info->tx_enabled) + *flags |= NFP_PORT_LAG_TX_ENABLED; + else + *flags &= ~NFP_PORT_LAG_TX_ENABLED; + + *flags |= NFP_PORT_LAG_CHANGED; + mutex_unlock(&lag->lock); + + schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY); + return 0; +} + +static int +nfp_fl_lag_netdev_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *netdev; + struct nfp_fl_lag *lag; + int err; + + netdev = netdev_notifier_info_to_dev(ptr); + lag = container_of(nb, struct nfp_fl_lag, lag_nb); + + switch (event) { + case NETDEV_CHANGEUPPER: + err = nfp_fl_lag_changeupper_event(lag, ptr); + if (err) + return NOTIFY_BAD; + return NOTIFY_OK; + case NETDEV_CHANGELOWERSTATE: + err = nfp_fl_lag_changels_event(lag, netdev, ptr); + if (err) + return NOTIFY_BAD; + return NOTIFY_OK; + case NETDEV_UNREGISTER: + if (netif_is_bond_master(netdev)) { + err = nfp_fl_lag_schedule_group_delete(lag, netdev); + if (err) + return NOTIFY_BAD; + return NOTIFY_OK; + } + } + + return NOTIFY_DONE; +} + +int nfp_flower_lag_reset(struct nfp_fl_lag *lag) +{ + enum nfp_fl_lag_batch batch = NFP_FL_LAG_BATCH_FIRST; + + lag->rst_cfg = true; + return nfp_fl_lag_config_group(lag, NULL, NULL, 0, &batch); +} + +void nfp_flower_lag_init(struct nfp_fl_lag *lag) +{ + INIT_DELAYED_WORK(&lag->work, nfp_fl_lag_do_work); + INIT_LIST_HEAD(&lag->group_list); + mutex_init(&lag->lock); + ida_init(&lag->ida_handle); + + __skb_queue_head_init(&lag->retrans_skbs); + + /* 0 is a reserved batch version so increment to first valid value. */ + nfp_fl_increment_version(lag); + + lag->lag_nb.notifier_call = nfp_fl_lag_netdev_event; +} + +void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag) +{ + struct nfp_fl_lag_group *entry, *storage; + + cancel_delayed_work_sync(&lag->work); + + __skb_queue_purge(&lag->retrans_skbs); + + /* Remove all groups. */ + mutex_lock(&lag->lock); + list_for_each_entry_safe(entry, storage, &lag->group_list, list) { + list_del(&entry->list); + kfree(entry); + } + mutex_unlock(&lag->lock); + mutex_destroy(&lag->lock); + ida_destroy(&lag->ida_handle); +} diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index 4e67c0cbf9f0..19cfa162ac65 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -185,6 +185,10 @@ nfp_flower_repr_netdev_init(struct nfp_app *app, struct net_device *netdev) static void nfp_flower_repr_netdev_clean(struct nfp_app *app, struct net_device *netdev) { + struct nfp_repr *repr = netdev_priv(netdev); + + kfree(repr->app_priv); + tc_setup_cb_egdev_unregister(netdev, nfp_flower_setup_tc_egress_cb, netdev_priv(netdev)); } @@ -225,7 +229,9 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, u8 nfp_pcie = nfp_cppcore_pcie_unit(app->pf->cpp); struct nfp_flower_priv *priv = app->priv; atomic_t *replies = &priv->reify_replies; + struct nfp_flower_repr_priv *repr_priv; enum nfp_port_type port_type; + struct nfp_repr *nfp_repr; struct nfp_reprs *reprs; int i, err, reify_cnt; const u8 queue = 0; @@ -248,6 +254,15 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app, goto err_reprs_clean; } + repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL); + if (!repr_priv) { + err = -ENOMEM; + goto err_reprs_clean; + } + + nfp_repr = netdev_priv(repr); + nfp_repr->app_priv = repr_priv; + /* For now we only support 1 PF */ WARN_ON(repr_type == NFP_REPR_TYPE_PF && i); @@ -324,6 +339,8 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) { struct nfp_eth_table *eth_tbl = app->pf->eth_tbl; atomic_t *replies = &priv->reify_replies; + struct nfp_flower_repr_priv *repr_priv; + struct nfp_repr *nfp_repr; struct sk_buff *ctrl_skb; struct nfp_reprs *reprs; int err, reify_cnt; @@ -351,6 +368,15 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv) goto err_reprs_clean; } + repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL); + if (!repr_priv) { + err = -ENOMEM; + goto err_reprs_clean; + } + + nfp_repr = netdev_priv(repr); + nfp_repr->app_priv = repr_priv; + port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, repr); if (IS_ERR(port)) { err = PTR_ERR(port); @@ -546,8 +572,22 @@ static int nfp_flower_init(struct nfp_app *app) else app_priv->flower_ext_feats = features; + /* Tell the firmware that the driver supports lag. */ + err = nfp_rtsym_write_le(app->pf->rtbl, + "_abi_flower_balance_sync_enable", 1); + if (!err) { + app_priv->flower_ext_feats |= NFP_FL_FEATS_LAG; + nfp_flower_lag_init(&app_priv->nfp_lag); + } else if (err == -ENOENT) { + nfp_warn(app->cpp, "LAG not supported by FW.\n"); + } else { + goto err_cleanup_metadata; + } + return 0; +err_cleanup_metadata: + nfp_flower_metadata_cleanup(app); err_free_app_priv: vfree(app->priv); return err; @@ -561,6 +601,9 @@ static void nfp_flower_clean(struct nfp_app *app) skb_queue_purge(&app_priv->cmsg_skbs_low); flush_work(&app_priv->cmsg_work); + if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG) + nfp_flower_lag_cleanup(&app_priv->nfp_lag); + nfp_flower_metadata_cleanup(app); vfree(app->priv); app->priv = NULL; @@ -627,11 +670,29 @@ nfp_flower_repr_change_mtu(struct nfp_app *app, struct net_device *netdev, static int nfp_flower_start(struct nfp_app *app) { + struct nfp_flower_priv *app_priv = app->priv; + int err; + + if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG) { + err = nfp_flower_lag_reset(&app_priv->nfp_lag); + if (err) + return err; + + err = register_netdevice_notifier(&app_priv->nfp_lag.lag_nb); + if (err) + return err; + } + return nfp_tunnel_config_start(app); } static void nfp_flower_stop(struct nfp_app *app) { + struct nfp_flower_priv *app_priv = app->priv; + + if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG) + unregister_netdevice_notifier(&app_priv->nfp_lag.lag_nb); + nfp_tunnel_config_stop(app); } diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index 733ff53cc601..bbe5764d26cb 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -43,7 +43,9 @@ #include <net/pkt_cls.h> #include <net/tcp.h> #include <linux/workqueue.h> +#include <linux/idr.h> +struct nfp_fl_pre_lag; struct net_device; struct nfp_app; @@ -67,6 +69,7 @@ struct nfp_app; /* Extra features bitmap. */ #define NFP_FL_FEATS_GENEVE BIT(0) #define NFP_FL_NBI_MTU_SETTING BIT(1) +#define NFP_FL_FEATS_LAG BIT(31) struct nfp_fl_mask_id { struct circ_buf mask_id_free_list; @@ -97,6 +100,33 @@ struct nfp_mtu_conf { }; /** + * struct nfp_fl_lag - Flower APP priv data for link aggregation + * @lag_nb: Notifier to track master/slave events + * @work: Work queue for writing configs to the HW + * @lock: Lock to protect lag_group_list + * @group_list: List of all master/slave groups offloaded + * @ida_handle: IDA to handle group ids + * @pkt_num: Incremented for each config packet sent + * @batch_ver: Incremented for each batch of config packets + * @global_inst: Instance allocator for groups + * @rst_cfg: Marker to reset HW LAG config + * @retrans_skbs: Cmsgs that could not be processed by HW and require + * retransmission + */ +struct nfp_fl_lag { + struct notifier_block lag_nb; + struct delayed_work work; + struct mutex lock; + struct list_head group_list; + struct ida ida_handle; + unsigned int pkt_num; + unsigned int batch_ver; + u8 global_inst; + bool rst_cfg; + struct sk_buff_head retrans_skbs; +}; + +/** * struct nfp_flower_priv - Flower APP per-vNIC priv data * @app: Back pointer to app * @nn: Pointer to vNIC @@ -128,6 +158,7 @@ struct nfp_mtu_conf { * from firmware for repr reify * @reify_wait_queue: wait queue for repr reify response counting * @mtu_conf: Configuration of repr MTU value + * @nfp_lag: Link aggregation data block */ struct nfp_flower_priv { struct nfp_app *app; @@ -157,6 +188,15 @@ struct nfp_flower_priv { atomic_t reify_replies; wait_queue_head_t reify_wait_queue; struct nfp_mtu_conf mtu_conf; + struct nfp_fl_lag nfp_lag; +}; + +/** + * struct nfp_flower_repr_priv - Flower APP per-repr priv data + * @lag_port_flags: Extended port flags to record lag state of repr + */ +struct nfp_flower_repr_priv { + unsigned long lag_port_flags; }; struct nfp_fl_key_ls { @@ -214,7 +254,8 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, struct net_device *netdev, struct nfp_fl_payload *nfp_flow, enum nfp_flower_tun_type tun_type); -int nfp_flower_compile_action(struct tc_cls_flower_offload *flow, +int nfp_flower_compile_action(struct nfp_app *app, + struct tc_cls_flower_offload *flow, struct net_device *netdev, struct nfp_fl_payload *nfp_flow); int nfp_compile_flow_metadata(struct nfp_app *app, @@ -241,5 +282,14 @@ void nfp_tunnel_request_route(struct nfp_app *app, struct sk_buff *skb); void nfp_tunnel_keep_alive(struct nfp_app *app, struct sk_buff *skb); int nfp_flower_setup_tc_egress_cb(enum tc_setup_type type, void *type_data, void *cb_priv); +void nfp_flower_lag_init(struct nfp_fl_lag *lag); +void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag); +int nfp_flower_lag_reset(struct nfp_fl_lag *lag); +bool nfp_flower_lag_unprocessed_msg(struct nfp_app *app, struct sk_buff *skb); +int nfp_flower_lag_populate_pre_action(struct nfp_app *app, + struct net_device *master, + struct nfp_fl_pre_lag *pre_act); +int nfp_flower_lag_get_output_id(struct nfp_app *app, + struct net_device *master); #endif diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 70ec9d821b91..c42e64f32333 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -440,7 +440,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, if (err) goto err_destroy_flow; - err = nfp_flower_compile_action(flow, netdev, flow_pay); + err = nfp_flower_compile_action(app, flow, netdev, flow_pay); if (err) goto err_destroy_flow; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h index faa4e131c136..f6677bc9875a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h @@ -72,8 +72,21 @@ #define OP_BR_ADDR_LO 0x007ffc00000ULL #define OP_BR_ADDR_HI 0x10000000000ULL -#define nfp_is_br(_insn) \ - (((_insn) & OP_BR_BASE_MASK) == OP_BR_BASE) +#define OP_BR_BIT_BASE 0x0d000000000ULL +#define OP_BR_BIT_BASE_MASK 0x0f800080300ULL +#define OP_BR_BIT_A_SRC 0x000000000ffULL +#define OP_BR_BIT_B_SRC 0x0000003fc00ULL +#define OP_BR_BIT_BV 0x00000040000ULL +#define OP_BR_BIT_SRC_LMEXTN 0x40000000000ULL +#define OP_BR_BIT_DEFBR OP_BR_DEFBR +#define OP_BR_BIT_ADDR_LO OP_BR_ADDR_LO +#define OP_BR_BIT_ADDR_HI OP_BR_ADDR_HI + +static inline bool nfp_is_br(u64 insn) +{ + return (insn & OP_BR_BASE_MASK) == OP_BR_BASE || + (insn & OP_BR_BIT_BASE_MASK) == OP_BR_BIT_BASE; +} enum br_mask { BR_BEQ = 0x00, @@ -161,6 +174,7 @@ enum shf_op { SHF_OP_NONE = 0, SHF_OP_AND = 2, SHF_OP_OR = 5, + SHF_OP_ASHR = 6, }; enum shf_sc { diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 09e87d5f4f72..117eca6819de 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -277,6 +277,7 @@ const struct net_device_ops nfp_repr_netdev_ops = { .ndo_get_vf_config = nfp_app_get_vf_config, .ndo_set_vf_link_state = nfp_app_set_vf_link_state, .ndo_set_features = nfp_port_set_features, + .ndo_set_mac_address = eth_mac_addr, }; static void nfp_repr_clean(struct nfp_repr *repr) diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h index c9724fb7ea4b..df599d5b6bb3 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h @@ -100,6 +100,8 @@ nfp_rtsym_lookup(struct nfp_rtsym_table *rtbl, const char *name); u64 nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name, int *error); +int nfp_rtsym_write_le(struct nfp_rtsym_table *rtbl, const char *name, + u64 value); u8 __iomem * nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name, const char *id, unsigned int min_size, struct nfp_cpp_area **area); diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c index 46107aefad1c..9e34216578da 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c @@ -286,6 +286,49 @@ exit: return val; } +/** + * nfp_rtsym_write_le() - Write an unsigned scalar value to a symbol + * @rtbl: NFP RTsym table + * @name: Symbol name + * @value: Value to write + * + * Lookup a symbol and write a value to it. Symbol can be 4 or 8 bytes in size. + * If 4 bytes then the lower 32-bits of 'value' are used. Value will be + * written as simple little-endian unsigned value. + * + * Return: 0 on success or error code. + */ +int nfp_rtsym_write_le(struct nfp_rtsym_table *rtbl, const char *name, + u64 value) +{ + const struct nfp_rtsym *sym; + int err; + u32 id; + + sym = nfp_rtsym_lookup(rtbl, name); + if (!sym) + return -ENOENT; + + id = NFP_CPP_ISLAND_ID(sym->target, NFP_CPP_ACTION_RW, 0, sym->domain); + + switch (sym->size) { + case 4: + err = nfp_cpp_writel(rtbl->cpp, id, sym->addr, value); + break; + case 8: + err = nfp_cpp_writeq(rtbl->cpp, id, sym->addr, value); + break; + default: + nfp_err(rtbl->cpp, + "rtsym '%s' unsupported or non-scalar size: %lld\n", + name, sym->size); + err = -EINVAL; + break; + } + + return err; +} + u8 __iomem * nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name, const char *id, unsigned int min_size, struct nfp_cpp_area **area) diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index 5e655c3601cf..1c0d0c217936 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -1677,6 +1677,8 @@ static void __qed_get_vport_tstats(struct qed_hwfn *p_hwfn, HILO_64_REGPAIR(tstats.mftag_filter_discard); p_stats->common.mac_filter_discards += HILO_64_REGPAIR(tstats.eth_mac_filter_discard); + p_stats->common.gft_filter_drop += + HILO_64_REGPAIR(tstats.eth_gft_drop_pkt); } static void __qed_get_vport_ustats_addrlen(struct qed_hwfn *p_hwfn, @@ -1973,6 +1975,8 @@ qed_arfs_mode_to_hsi(enum qed_filter_config_mode mode) return GFT_PROFILE_TYPE_4_TUPLE; if (mode == QED_FILTER_CONFIG_MODE_IP_DEST) return GFT_PROFILE_TYPE_IP_DST_ADDR; + if (mode == QED_FILTER_CONFIG_MODE_IP_SRC) + return GFT_PROFILE_TYPE_IP_SRC_ADDR; return GFT_PROFILE_TYPE_L4_DST_PORT; } @@ -2013,16 +2017,6 @@ qed_configure_rfs_ntuple_filter(struct qed_hwfn *p_hwfn, u8 abs_vport_id = 0; int rc = -EINVAL; - rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id); - if (rc) - return rc; - - if (p_params->qid != QED_RFS_NTUPLE_QID_RSS) { - rc = qed_fw_l2_queue(p_hwfn, p_params->qid, &abs_rx_q_id); - if (rc) - return rc; - } - /* Get SPQ entry */ memset(&init_data, 0, sizeof(init_data)); init_data.cid = qed_spq_get_cid(p_hwfn); @@ -2047,15 +2041,28 @@ qed_configure_rfs_ntuple_filter(struct qed_hwfn *p_hwfn, DMA_REGPAIR_LE(p_ramrod->pkt_hdr_addr, p_params->addr); p_ramrod->pkt_hdr_length = cpu_to_le16(p_params->length); - if (p_params->qid != QED_RFS_NTUPLE_QID_RSS) { - p_ramrod->rx_qid_valid = 1; - p_ramrod->rx_qid = cpu_to_le16(abs_rx_q_id); + if (p_params->b_is_drop) { + p_ramrod->vport_id = cpu_to_le16(ETH_GFT_TRASHCAN_VPORT); + } else { + rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id); + if (rc) + return rc; + + if (p_params->qid != QED_RFS_NTUPLE_QID_RSS) { + rc = qed_fw_l2_queue(p_hwfn, p_params->qid, + &abs_rx_q_id); + if (rc) + return rc; + + p_ramrod->rx_qid_valid = 1; + p_ramrod->rx_qid = cpu_to_le16(abs_rx_q_id); + } + + p_ramrod->vport_id = cpu_to_le16((u16)abs_vport_id); } p_ramrod->flow_id_valid = 0; p_ramrod->flow_id = 0; - - p_ramrod->vport_id = cpu_to_le16((u16)abs_vport_id); p_ramrod->filter_action = p_params->b_is_add ? GFT_ADD_FILTER : GFT_DELETE_FILTER; diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index 2d3f09ed413b..81c5c8dfa2ef 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -75,6 +75,7 @@ struct qede_stats_common { u64 rx_bcast_pkts; u64 mftag_filter_discards; u64 mac_filter_discards; + u64 gft_filter_drop; u64 tx_ucast_bytes; u64 tx_mcast_bytes; u64 tx_bcast_bytes; diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 8c6fdad91986..6906e04b609e 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -161,6 +161,7 @@ static const struct { QEDE_STAT(no_buff_discards), QEDE_PF_STAT(mftag_filter_discards), QEDE_PF_STAT(mac_filter_discards), + QEDE_PF_STAT(gft_filter_drop), QEDE_STAT(tx_err_drop_pkts), QEDE_STAT(ttl0_discard), QEDE_STAT(packet_too_big_discard), diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 43569b1839be..e9e088d9c815 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -38,6 +38,7 @@ #include <linux/qed/qed_if.h> #include "qede.h" +#define QEDE_FILTER_PRINT_MAX_LEN (64) struct qede_arfs_tuple { union { __be32 src_ipv4; @@ -51,6 +52,18 @@ struct qede_arfs_tuple { __be16 dst_port; __be16 eth_proto; u8 ip_proto; + + /* Describe filtering mode needed for this kind of filter */ + enum qed_filter_config_mode mode; + + /* Used to compare new/old filters. Return true if IPs match */ + bool (*ip_comp)(struct qede_arfs_tuple *a, struct qede_arfs_tuple *b); + + /* Given an address into ethhdr build a header from tuple info */ + void (*build_hdr)(struct qede_arfs_tuple *t, void *header); + + /* Stringify the tuple for a print into the provided buffer */ + void (*stringify)(struct qede_arfs_tuple *t, void *buffer); }; struct qede_arfs_fltr_node { @@ -73,9 +86,11 @@ struct qede_arfs_fltr_node { u16 sw_id; u16 rxq_id; u16 next_rxq_id; + u8 vfid; bool filter_op; bool used; u8 fw_rc; + bool b_is_drop; struct hlist_node node; }; @@ -90,7 +105,9 @@ struct qede_arfs { spinlock_t arfs_list_lock; unsigned long *arfs_fltr_bmap; int filter_count; - bool enable; + + /* Currently configured filtering mode */ + enum qed_filter_config_mode mode; }; static void qede_configure_arfs_fltr(struct qede_dev *edev, @@ -109,12 +126,22 @@ static void qede_configure_arfs_fltr(struct qede_dev *edev, params.length = n->buf_len; params.qid = rxq_id; params.b_is_add = add_fltr; + params.b_is_drop = n->b_is_drop; + + if (n->vfid) { + params.b_is_vf = true; + params.vf_id = n->vfid - 1; + } - DP_VERBOSE(edev, NETIF_MSG_RX_STATUS, - "%s arfs filter flow_id=%d, sw_id=%d, src_port=%d, dst_port=%d, rxq=%d\n", - add_fltr ? "Adding" : "Deleting", - n->flow_id, n->sw_id, ntohs(n->tuple.src_port), - ntohs(n->tuple.dst_port), rxq_id); + if (n->tuple.stringify) { + char tuple_buffer[QEDE_FILTER_PRINT_MAX_LEN]; + + n->tuple.stringify(&n->tuple, tuple_buffer); + DP_VERBOSE(edev, NETIF_MSG_RX_STATUS, + "%s sw_id[0x%x]: %s [vf %u queue %d]\n", + add_fltr ? "Adding" : "Deleting", + n->sw_id, tuple_buffer, n->vfid, rxq_id); + } n->used = true; n->filter_op = add_fltr; @@ -145,14 +172,13 @@ qede_enqueue_fltr_and_config_searcher(struct qede_dev *edev, INIT_HLIST_NODE(&fltr->node); hlist_add_head(&fltr->node, QEDE_ARFS_BUCKET_HEAD(edev, bucket_idx)); - edev->arfs->filter_count++; - - if (edev->arfs->filter_count == 1 && !edev->arfs->enable) { - enum qed_filter_config_mode mode; - mode = QED_FILTER_CONFIG_MODE_5_TUPLE; - edev->ops->configure_arfs_searcher(edev->cdev, mode); - edev->arfs->enable = true; + edev->arfs->filter_count++; + if (edev->arfs->filter_count == 1 && + edev->arfs->mode == QED_FILTER_CONFIG_MODE_DISABLE) { + edev->ops->configure_arfs_searcher(edev->cdev, + fltr->tuple.mode); + edev->arfs->mode = fltr->tuple.mode; } return 0; @@ -167,14 +193,15 @@ qede_dequeue_fltr_and_config_searcher(struct qede_dev *edev, fltr->buf_len, DMA_TO_DEVICE); qede_free_arfs_filter(edev, fltr); - edev->arfs->filter_count--; - if (!edev->arfs->filter_count && edev->arfs->enable) { + edev->arfs->filter_count--; + if (!edev->arfs->filter_count && + edev->arfs->mode != QED_FILTER_CONFIG_MODE_DISABLE) { enum qed_filter_config_mode mode; mode = QED_FILTER_CONFIG_MODE_DISABLE; - edev->arfs->enable = false; edev->ops->configure_arfs_searcher(edev->cdev, mode); + edev->arfs->mode = QED_FILTER_CONFIG_MODE_DISABLE; } } @@ -264,25 +291,17 @@ void qede_process_arfs_filters(struct qede_dev *edev, bool free_fltr) } } +#ifdef CONFIG_RFS_ACCEL spin_lock_bh(&edev->arfs->arfs_list_lock); - if (!edev->arfs->filter_count) { - if (edev->arfs->enable) { - enum qed_filter_config_mode mode; - - mode = QED_FILTER_CONFIG_MODE_DISABLE; - edev->arfs->enable = false; - edev->ops->configure_arfs_searcher(edev->cdev, mode); - } -#ifdef CONFIG_RFS_ACCEL - } else { + if (edev->arfs->filter_count) { set_bit(QEDE_SP_ARFS_CONFIG, &edev->sp_flags); schedule_delayed_work(&edev->sp_task, QEDE_SP_TASK_POLL_DELAY); -#endif } spin_unlock_bh(&edev->arfs->arfs_list_lock); +#endif } /* This function waits until all aRFS filters get deleted and freed. @@ -512,6 +531,7 @@ int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, eth->h_proto = skb->protocol; n->tuple.eth_proto = skb->protocol; n->tuple.ip_proto = ip_proto; + n->tuple.mode = QED_FILTER_CONFIG_MODE_5_TUPLE; memcpy(n->data + ETH_HLEN, skb->data, skb_headlen(skb)); rc = qede_enqueue_fltr_and_config_searcher(edev, n, tbl_idx); @@ -1339,38 +1359,6 @@ qede_get_arfs_fltr_by_loc(struct hlist_head *head, u32 location) return NULL; } -static bool -qede_compare_user_flow_ips(struct qede_arfs_fltr_node *tpos, - struct ethtool_rx_flow_spec *fsp, - __be16 proto) -{ - if (proto == htons(ETH_P_IP)) { - struct ethtool_tcpip4_spec *ip; - - ip = &fsp->h_u.tcp_ip4_spec; - - if (tpos->tuple.src_ipv4 == ip->ip4src && - tpos->tuple.dst_ipv4 == ip->ip4dst) - return true; - else - return false; - } else { - struct ethtool_tcpip6_spec *ip6; - struct in6_addr *src; - - ip6 = &fsp->h_u.tcp_ip6_spec; - src = &tpos->tuple.src_ipv6; - - if (!memcmp(src, &ip6->ip6src, sizeof(struct in6_addr)) && - !memcmp(&tpos->tuple.dst_ipv6, &ip6->ip6dst, - sizeof(struct in6_addr))) - return true; - else - return false; - } - return false; -} - int qede_get_cls_rule_all(struct qede_dev *edev, struct ethtool_rxnfc *info, u32 *rule_locs) { @@ -1455,102 +1443,444 @@ int qede_get_cls_rule_entry(struct qede_dev *edev, struct ethtool_rxnfc *cmd) fsp->ring_cookie = fltr->rxq_id; + if (fltr->vfid) { + fsp->ring_cookie |= ((u64)fltr->vfid) << + ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; + } + + if (fltr->b_is_drop) + fsp->ring_cookie = RX_CLS_FLOW_DISC; unlock: __qede_unlock(edev); return rc; } static int -qede_validate_and_check_flow_exist(struct qede_dev *edev, - struct ethtool_rx_flow_spec *fsp, - int *min_hlen) +qede_poll_arfs_filter_config(struct qede_dev *edev, + struct qede_arfs_fltr_node *fltr) { - __be16 src_port = 0x0, dst_port = 0x0; - struct qede_arfs_fltr_node *fltr; - struct hlist_node *temp; - struct hlist_head *head; - __be16 eth_proto; - u8 ip_proto; + int count = QEDE_ARFS_POLL_COUNT; - if (fsp->location >= QEDE_RFS_MAX_FLTR || - fsp->ring_cookie >= QEDE_RSS_COUNT(edev)) - return -EINVAL; + while (fltr->used && count) { + msleep(20); + count--; + } + + if (count == 0 || fltr->fw_rc) { + DP_NOTICE(edev, "Timeout in polling filter config\n"); + qede_dequeue_fltr_and_config_searcher(edev, fltr); + return -EIO; + } + + return fltr->fw_rc; +} + +static int qede_flow_get_min_header_size(struct qede_arfs_tuple *t) +{ + int size = ETH_HLEN; + + if (t->eth_proto == htons(ETH_P_IP)) + size += sizeof(struct iphdr); + else + size += sizeof(struct ipv6hdr); + + if (t->ip_proto == IPPROTO_TCP) + size += sizeof(struct tcphdr); + else + size += sizeof(struct udphdr); + + return size; +} + +static bool qede_flow_spec_ipv4_cmp(struct qede_arfs_tuple *a, + struct qede_arfs_tuple *b) +{ + if (a->eth_proto != htons(ETH_P_IP) || + b->eth_proto != htons(ETH_P_IP)) + return false; + + return (a->src_ipv4 == b->src_ipv4) && + (a->dst_ipv4 == b->dst_ipv4); +} + +static void qede_flow_build_ipv4_hdr(struct qede_arfs_tuple *t, + void *header) +{ + __be16 *ports = (__be16 *)(header + ETH_HLEN + sizeof(struct iphdr)); + struct iphdr *ip = (struct iphdr *)(header + ETH_HLEN); + struct ethhdr *eth = (struct ethhdr *)header; + + eth->h_proto = t->eth_proto; + ip->saddr = t->src_ipv4; + ip->daddr = t->dst_ipv4; + ip->version = 0x4; + ip->ihl = 0x5; + ip->protocol = t->ip_proto; + ip->tot_len = cpu_to_be16(qede_flow_get_min_header_size(t) - ETH_HLEN); + + /* ports is weakly typed to suit both TCP and UDP ports */ + ports[0] = t->src_port; + ports[1] = t->dst_port; +} + +static void qede_flow_stringify_ipv4_hdr(struct qede_arfs_tuple *t, + void *buffer) +{ + const char *prefix = t->ip_proto == IPPROTO_TCP ? "TCP" : "UDP"; + + snprintf(buffer, QEDE_FILTER_PRINT_MAX_LEN, + "%s %pI4 (%04x) -> %pI4 (%04x)", + prefix, &t->src_ipv4, t->src_port, + &t->dst_ipv4, t->dst_port); +} + +static bool qede_flow_spec_ipv6_cmp(struct qede_arfs_tuple *a, + struct qede_arfs_tuple *b) +{ + if (a->eth_proto != htons(ETH_P_IPV6) || + b->eth_proto != htons(ETH_P_IPV6)) + return false; + + if (memcmp(&a->src_ipv6, &b->src_ipv6, sizeof(struct in6_addr))) + return false; - if (fsp->flow_type == TCP_V4_FLOW) { - *min_hlen += sizeof(struct iphdr) + - sizeof(struct tcphdr); - eth_proto = htons(ETH_P_IP); - ip_proto = IPPROTO_TCP; - } else if (fsp->flow_type == UDP_V4_FLOW) { - *min_hlen += sizeof(struct iphdr) + - sizeof(struct udphdr); - eth_proto = htons(ETH_P_IP); - ip_proto = IPPROTO_UDP; - } else if (fsp->flow_type == TCP_V6_FLOW) { - *min_hlen += sizeof(struct ipv6hdr) + - sizeof(struct tcphdr); - eth_proto = htons(ETH_P_IPV6); - ip_proto = IPPROTO_TCP; - } else if (fsp->flow_type == UDP_V6_FLOW) { - *min_hlen += sizeof(struct ipv6hdr) + - sizeof(struct udphdr); - eth_proto = htons(ETH_P_IPV6); - ip_proto = IPPROTO_UDP; + if (memcmp(&a->dst_ipv6, &b->dst_ipv6, sizeof(struct in6_addr))) + return false; + + return true; +} + +static void qede_flow_build_ipv6_hdr(struct qede_arfs_tuple *t, + void *header) +{ + __be16 *ports = (__be16 *)(header + ETH_HLEN + sizeof(struct ipv6hdr)); + struct ipv6hdr *ip6 = (struct ipv6hdr *)(header + ETH_HLEN); + struct ethhdr *eth = (struct ethhdr *)header; + + eth->h_proto = t->eth_proto; + memcpy(&ip6->saddr, &t->src_ipv6, sizeof(struct in6_addr)); + memcpy(&ip6->daddr, &t->dst_ipv6, sizeof(struct in6_addr)); + ip6->version = 0x6; + + if (t->ip_proto == IPPROTO_TCP) { + ip6->nexthdr = NEXTHDR_TCP; + ip6->payload_len = cpu_to_be16(sizeof(struct tcphdr)); } else { - DP_NOTICE(edev, "Unsupported flow type = 0x%x\n", - fsp->flow_type); - return -EPROTONOSUPPORT; + ip6->nexthdr = NEXTHDR_UDP; + ip6->payload_len = cpu_to_be16(sizeof(struct udphdr)); } - if (eth_proto == htons(ETH_P_IP)) { - src_port = fsp->h_u.tcp_ip4_spec.psrc; - dst_port = fsp->h_u.tcp_ip4_spec.pdst; + /* ports is weakly typed to suit both TCP and UDP ports */ + ports[0] = t->src_port; + ports[1] = t->dst_port; +} + +/* Validate fields which are set and not accepted by the driver */ +static int qede_flow_spec_validate_unused(struct qede_dev *edev, + struct ethtool_rx_flow_spec *fs) +{ + if (fs->flow_type & FLOW_MAC_EXT) { + DP_INFO(edev, "Don't support MAC extensions\n"); + return -EOPNOTSUPP; + } + + if ((fs->flow_type & FLOW_EXT) && + (fs->h_ext.vlan_etype || fs->h_ext.vlan_tci)) { + DP_INFO(edev, "Don't support vlan-based classification\n"); + return -EOPNOTSUPP; + } + + if ((fs->flow_type & FLOW_EXT) && + (fs->h_ext.data[0] || fs->h_ext.data[1])) { + DP_INFO(edev, "Don't support user defined data\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static int qede_flow_spec_to_tuple_ipv4_common(struct qede_dev *edev, + struct qede_arfs_tuple *t, + struct ethtool_rx_flow_spec *fs) +{ + if ((fs->h_u.tcp_ip4_spec.ip4src & + fs->m_u.tcp_ip4_spec.ip4src) != fs->h_u.tcp_ip4_spec.ip4src) { + DP_INFO(edev, "Don't support IP-masks\n"); + return -EOPNOTSUPP; + } + + if ((fs->h_u.tcp_ip4_spec.ip4dst & + fs->m_u.tcp_ip4_spec.ip4dst) != fs->h_u.tcp_ip4_spec.ip4dst) { + DP_INFO(edev, "Don't support IP-masks\n"); + return -EOPNOTSUPP; + } + + if ((fs->h_u.tcp_ip4_spec.psrc & + fs->m_u.tcp_ip4_spec.psrc) != fs->h_u.tcp_ip4_spec.psrc) { + DP_INFO(edev, "Don't support port-masks\n"); + return -EOPNOTSUPP; + } + + if ((fs->h_u.tcp_ip4_spec.pdst & + fs->m_u.tcp_ip4_spec.pdst) != fs->h_u.tcp_ip4_spec.pdst) { + DP_INFO(edev, "Don't support port-masks\n"); + return -EOPNOTSUPP; + } + + if (fs->h_u.tcp_ip4_spec.tos) { + DP_INFO(edev, "Don't support tos\n"); + return -EOPNOTSUPP; + } + + t->eth_proto = htons(ETH_P_IP); + t->src_ipv4 = fs->h_u.tcp_ip4_spec.ip4src; + t->dst_ipv4 = fs->h_u.tcp_ip4_spec.ip4dst; + t->src_port = fs->h_u.tcp_ip4_spec.psrc; + t->dst_port = fs->h_u.tcp_ip4_spec.pdst; + + /* We must either have a valid 4-tuple or only dst port + * or only src ip as an input + */ + if (t->src_port && t->dst_port && t->src_ipv4 && t->dst_ipv4) { + t->mode = QED_FILTER_CONFIG_MODE_5_TUPLE; + } else if (!t->src_port && t->dst_port && + !t->src_ipv4 && !t->dst_ipv4) { + t->mode = QED_FILTER_CONFIG_MODE_L4_PORT; + } else if (!t->src_port && !t->dst_port && + !t->dst_ipv4 && t->src_ipv4) { + t->mode = QED_FILTER_CONFIG_MODE_IP_SRC; } else { - src_port = fsp->h_u.tcp_ip6_spec.psrc; - dst_port = fsp->h_u.tcp_ip6_spec.pdst; + DP_INFO(edev, "Invalid N-tuple\n"); + return -EOPNOTSUPP; } - head = QEDE_ARFS_BUCKET_HEAD(edev, 0); - hlist_for_each_entry_safe(fltr, temp, head, node) { - if ((fltr->tuple.ip_proto == ip_proto && - fltr->tuple.eth_proto == eth_proto && - qede_compare_user_flow_ips(fltr, fsp, eth_proto) && - fltr->tuple.src_port == src_port && - fltr->tuple.dst_port == dst_port) || - fltr->sw_id == fsp->location) - return -EEXIST; + t->ip_comp = qede_flow_spec_ipv4_cmp; + t->build_hdr = qede_flow_build_ipv4_hdr; + t->stringify = qede_flow_stringify_ipv4_hdr; + + return 0; +} + +static int qede_flow_spec_to_tuple_tcpv4(struct qede_dev *edev, + struct qede_arfs_tuple *t, + struct ethtool_rx_flow_spec *fs) +{ + t->ip_proto = IPPROTO_TCP; + + if (qede_flow_spec_to_tuple_ipv4_common(edev, t, fs)) + return -EINVAL; + + return 0; +} + +static int qede_flow_spec_to_tuple_udpv4(struct qede_dev *edev, + struct qede_arfs_tuple *t, + struct ethtool_rx_flow_spec *fs) +{ + t->ip_proto = IPPROTO_UDP; + + if (qede_flow_spec_to_tuple_ipv4_common(edev, t, fs)) + return -EINVAL; + + return 0; +} + +static int qede_flow_spec_to_tuple_ipv6_common(struct qede_dev *edev, + struct qede_arfs_tuple *t, + struct ethtool_rx_flow_spec *fs) +{ + struct in6_addr zero_addr; + void *p; + + p = &zero_addr; + memset(p, 0, sizeof(zero_addr)); + + if ((fs->h_u.tcp_ip6_spec.psrc & + fs->m_u.tcp_ip6_spec.psrc) != fs->h_u.tcp_ip6_spec.psrc) { + DP_INFO(edev, "Don't support port-masks\n"); + return -EOPNOTSUPP; + } + + if ((fs->h_u.tcp_ip6_spec.pdst & + fs->m_u.tcp_ip6_spec.pdst) != fs->h_u.tcp_ip6_spec.pdst) { + DP_INFO(edev, "Don't support port-masks\n"); + return -EOPNOTSUPP; + } + + if (fs->h_u.tcp_ip6_spec.tclass) { + DP_INFO(edev, "Don't support tclass\n"); + return -EOPNOTSUPP; } + t->eth_proto = htons(ETH_P_IPV6); + memcpy(&t->src_ipv6, &fs->h_u.tcp_ip6_spec.ip6src, + sizeof(struct in6_addr)); + memcpy(&t->dst_ipv6, &fs->h_u.tcp_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + t->src_port = fs->h_u.tcp_ip6_spec.psrc; + t->dst_port = fs->h_u.tcp_ip6_spec.pdst; + + /* We must make sure we have a valid 4-tuple or only dest port + * or only src ip as an input + */ + if (t->src_port && t->dst_port && + memcmp(&t->src_ipv6, p, sizeof(struct in6_addr)) && + memcmp(&t->dst_ipv6, p, sizeof(struct in6_addr))) { + t->mode = QED_FILTER_CONFIG_MODE_5_TUPLE; + } else if (!t->src_port && t->dst_port && + !memcmp(&t->src_ipv6, p, sizeof(struct in6_addr)) && + !memcmp(&t->dst_ipv6, p, sizeof(struct in6_addr))) { + t->mode = QED_FILTER_CONFIG_MODE_L4_PORT; + } else if (!t->src_port && !t->dst_port && + !memcmp(&t->dst_ipv6, p, sizeof(struct in6_addr)) && + memcmp(&t->src_ipv6, p, sizeof(struct in6_addr))) { + t->mode = QED_FILTER_CONFIG_MODE_IP_SRC; + } else { + DP_INFO(edev, "Invalid N-tuple\n"); + return -EOPNOTSUPP; + } + + t->ip_comp = qede_flow_spec_ipv6_cmp; + t->build_hdr = qede_flow_build_ipv6_hdr; + return 0; } -static int -qede_poll_arfs_filter_config(struct qede_dev *edev, - struct qede_arfs_fltr_node *fltr) +static int qede_flow_spec_to_tuple_tcpv6(struct qede_dev *edev, + struct qede_arfs_tuple *t, + struct ethtool_rx_flow_spec *fs) { - int count = QEDE_ARFS_POLL_COUNT; + t->ip_proto = IPPROTO_TCP; - while (fltr->used && count) { - msleep(20); - count--; + if (qede_flow_spec_to_tuple_ipv6_common(edev, t, fs)) + return -EINVAL; + + return 0; +} + +static int qede_flow_spec_to_tuple_udpv6(struct qede_dev *edev, + struct qede_arfs_tuple *t, + struct ethtool_rx_flow_spec *fs) +{ + t->ip_proto = IPPROTO_UDP; + + if (qede_flow_spec_to_tuple_ipv6_common(edev, t, fs)) + return -EINVAL; + + return 0; +} + +static int qede_flow_spec_to_tuple(struct qede_dev *edev, + struct qede_arfs_tuple *t, + struct ethtool_rx_flow_spec *fs) +{ + memset(t, 0, sizeof(*t)); + + if (qede_flow_spec_validate_unused(edev, fs)) + return -EOPNOTSUPP; + + switch ((fs->flow_type & ~FLOW_EXT)) { + case TCP_V4_FLOW: + return qede_flow_spec_to_tuple_tcpv4(edev, t, fs); + case UDP_V4_FLOW: + return qede_flow_spec_to_tuple_udpv4(edev, t, fs); + case TCP_V6_FLOW: + return qede_flow_spec_to_tuple_tcpv6(edev, t, fs); + case UDP_V6_FLOW: + return qede_flow_spec_to_tuple_udpv6(edev, t, fs); + default: + DP_VERBOSE(edev, NETIF_MSG_IFUP, + "Can't support flow of type %08x\n", fs->flow_type); + return -EOPNOTSUPP; } - if (count == 0 || fltr->fw_rc) { - qede_dequeue_fltr_and_config_searcher(edev, fltr); - return -EIO; + return 0; +} + +static int qede_flow_spec_validate(struct qede_dev *edev, + struct ethtool_rx_flow_spec *fs, + struct qede_arfs_tuple *t) +{ + if (fs->location >= QEDE_RFS_MAX_FLTR) { + DP_INFO(edev, "Location out-of-bounds\n"); + return -EINVAL; } - return fltr->fw_rc; + /* Check location isn't already in use */ + if (test_bit(fs->location, edev->arfs->arfs_fltr_bmap)) { + DP_INFO(edev, "Location already in use\n"); + return -EINVAL; + } + + /* Check if the filtering-mode could support the filter */ + if (edev->arfs->filter_count && + edev->arfs->mode != t->mode) { + DP_INFO(edev, + "flow_spec would require filtering mode %08x, but %08x is configured\n", + t->mode, edev->arfs->filter_count); + return -EINVAL; + } + + /* If drop requested then no need to validate other data */ + if (fs->ring_cookie == RX_CLS_FLOW_DISC) + return 0; + + if (ethtool_get_flow_spec_ring_vf(fs->ring_cookie)) + return 0; + + if (fs->ring_cookie >= QEDE_RSS_COUNT(edev)) { + DP_INFO(edev, "Queue out-of-bounds\n"); + return -EINVAL; + } + + return 0; +} + +/* Must be called while qede lock is held */ +static struct qede_arfs_fltr_node * +qede_flow_find_fltr(struct qede_dev *edev, struct qede_arfs_tuple *t) +{ + struct qede_arfs_fltr_node *fltr; + struct hlist_node *temp; + struct hlist_head *head; + + head = QEDE_ARFS_BUCKET_HEAD(edev, 0); + + hlist_for_each_entry_safe(fltr, temp, head, node) { + if (fltr->tuple.ip_proto == t->ip_proto && + fltr->tuple.src_port == t->src_port && + fltr->tuple.dst_port == t->dst_port && + t->ip_comp(&fltr->tuple, t)) + return fltr; + } + + return NULL; +} + +static void qede_flow_set_destination(struct qede_dev *edev, + struct qede_arfs_fltr_node *n, + struct ethtool_rx_flow_spec *fs) +{ + if (fs->ring_cookie == RX_CLS_FLOW_DISC) { + n->b_is_drop = true; + return; + } + + n->vfid = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); + n->rxq_id = ethtool_get_flow_spec_ring(fs->ring_cookie); + n->next_rxq_id = n->rxq_id; + + if (n->vfid) + DP_VERBOSE(edev, QED_MSG_SP, + "Configuring N-tuple for VF 0x%02x\n", n->vfid - 1); } int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info) { struct ethtool_rx_flow_spec *fsp = &info->fs; struct qede_arfs_fltr_node *n; - int min_hlen = ETH_HLEN, rc; - struct ethhdr *eth; - struct iphdr *ip; - __be16 *ports; + struct qede_arfs_tuple t; + int min_hlen, rc; __qede_lock(edev); @@ -1559,16 +1889,28 @@ int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info) goto unlock; } - rc = qede_validate_and_check_flow_exist(edev, fsp, &min_hlen); + /* Translate the flow specification into something fittign our DB */ + rc = qede_flow_spec_to_tuple(edev, &t, fsp); if (rc) goto unlock; + /* Make sure location is valid and filter isn't already set */ + rc = qede_flow_spec_validate(edev, fsp, &t); + if (rc) + goto unlock; + + if (qede_flow_find_fltr(edev, &t)) { + rc = -EINVAL; + goto unlock; + } + n = kzalloc(sizeof(*n), GFP_KERNEL); if (!n) { rc = -ENOMEM; goto unlock; } + min_hlen = qede_flow_get_min_header_size(&t); n->data = kzalloc(min_hlen, GFP_KERNEL); if (!n->data) { kfree(n); @@ -1579,68 +1921,13 @@ int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info) n->sw_id = fsp->location; set_bit(n->sw_id, edev->arfs->arfs_fltr_bmap); n->buf_len = min_hlen; - n->rxq_id = fsp->ring_cookie; - n->next_rxq_id = n->rxq_id; - eth = (struct ethhdr *)n->data; - if (info->fs.flow_type == TCP_V4_FLOW || - info->fs.flow_type == UDP_V4_FLOW) { - ports = (__be16 *)(n->data + ETH_HLEN + - sizeof(struct iphdr)); - eth->h_proto = htons(ETH_P_IP); - n->tuple.eth_proto = htons(ETH_P_IP); - n->tuple.src_ipv4 = info->fs.h_u.tcp_ip4_spec.ip4src; - n->tuple.dst_ipv4 = info->fs.h_u.tcp_ip4_spec.ip4dst; - n->tuple.src_port = info->fs.h_u.tcp_ip4_spec.psrc; - n->tuple.dst_port = info->fs.h_u.tcp_ip4_spec.pdst; - ports[0] = n->tuple.src_port; - ports[1] = n->tuple.dst_port; - ip = (struct iphdr *)(n->data + ETH_HLEN); - ip->saddr = info->fs.h_u.tcp_ip4_spec.ip4src; - ip->daddr = info->fs.h_u.tcp_ip4_spec.ip4dst; - ip->version = 0x4; - ip->ihl = 0x5; - - if (info->fs.flow_type == TCP_V4_FLOW) { - n->tuple.ip_proto = IPPROTO_TCP; - ip->protocol = IPPROTO_TCP; - } else { - n->tuple.ip_proto = IPPROTO_UDP; - ip->protocol = IPPROTO_UDP; - } - ip->tot_len = cpu_to_be16(min_hlen - ETH_HLEN); - } else { - struct ipv6hdr *ip6; - - ip6 = (struct ipv6hdr *)(n->data + ETH_HLEN); - ports = (__be16 *)(n->data + ETH_HLEN + - sizeof(struct ipv6hdr)); - eth->h_proto = htons(ETH_P_IPV6); - n->tuple.eth_proto = htons(ETH_P_IPV6); - memcpy(&n->tuple.src_ipv6, &info->fs.h_u.tcp_ip6_spec.ip6src, - sizeof(struct in6_addr)); - memcpy(&n->tuple.dst_ipv6, &info->fs.h_u.tcp_ip6_spec.ip6dst, - sizeof(struct in6_addr)); - n->tuple.src_port = info->fs.h_u.tcp_ip6_spec.psrc; - n->tuple.dst_port = info->fs.h_u.tcp_ip6_spec.pdst; - ports[0] = n->tuple.src_port; - ports[1] = n->tuple.dst_port; - memcpy(&ip6->saddr, &n->tuple.src_ipv6, - sizeof(struct in6_addr)); - memcpy(&ip6->daddr, &n->tuple.dst_ipv6, - sizeof(struct in6_addr)); - ip6->version = 0x6; + memcpy(&n->tuple, &t, sizeof(n->tuple)); - if (info->fs.flow_type == TCP_V6_FLOW) { - n->tuple.ip_proto = IPPROTO_TCP; - ip6->nexthdr = NEXTHDR_TCP; - ip6->payload_len = cpu_to_be16(sizeof(struct tcphdr)); - } else { - n->tuple.ip_proto = IPPROTO_UDP; - ip6->nexthdr = NEXTHDR_UDP; - ip6->payload_len = cpu_to_be16(sizeof(struct udphdr)); - } - } + qede_flow_set_destination(edev, n, fsp); + + /* Build a minimal header according to the flow */ + n->tuple.build_hdr(&n->tuple, n->data); rc = qede_enqueue_fltr_and_config_searcher(edev, n, 0); if (rc) @@ -1650,6 +1937,7 @@ int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info) rc = qede_poll_arfs_filter_config(edev, n); unlock: __qede_unlock(edev); + return rc; } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 9e70f713c3c5..d118771e1a7b 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -347,6 +347,7 @@ void qede_fill_by_demand_stats(struct qede_dev *edev) p_common->rx_bcast_pkts = stats.common.rx_bcast_pkts; p_common->mftag_filter_discards = stats.common.mftag_filter_discards; p_common->mac_filter_discards = stats.common.mac_filter_discards; + p_common->gft_filter_drop = stats.common.gft_filter_drop; p_common->tx_ucast_bytes = stats.common.tx_ucast_bytes; p_common->tx_mcast_bytes = stats.common.tx_mcast_bytes; diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index d118da5a10a2..ffd68a7bc9e1 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -1104,7 +1104,6 @@ static int rtl8139_init_one(struct pci_dev *pdev, return 0; err_out: - netif_napi_del(&tp->napi); __rtl8139_cleanup_dev (dev); pci_disable_device (pdev); return i; @@ -1119,7 +1118,6 @@ static void rtl8139_remove_one(struct pci_dev *pdev) assert (dev != NULL); cancel_delayed_work_sync(&tp->thread); - netif_napi_del(&tp->napi); unregister_netdev (dev); diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index cece961f2e82..c3ad564ac4c0 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -435,17 +435,18 @@ static int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb, } while (1); } -/* Remove buffers put into a tx_queue. None of the buffers must have - * an skb attached. +/* Remove buffers put into a tx_queue for the current packet. + * None of the buffers must have an skb attached. */ -static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue) +static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue, + unsigned int insert_count) { struct efx_tx_buffer *buffer; unsigned int bytes_compl = 0; unsigned int pkts_compl = 0; /* Work backwards until we hit the original insert pointer value */ - while (tx_queue->insert_count != tx_queue->write_count) { + while (tx_queue->insert_count != insert_count) { --tx_queue->insert_count; buffer = __efx_tx_queue_get_insert_buffer(tx_queue); efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); @@ -504,6 +505,8 @@ static int efx_tx_tso_fallback(struct efx_tx_queue *tx_queue, */ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) { + unsigned int old_insert_count = tx_queue->insert_count; + bool xmit_more = skb->xmit_more; bool data_mapped = false; unsigned int segments; unsigned int skb_len; @@ -553,8 +556,10 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) /* Update BQL */ netdev_tx_sent_queue(tx_queue->core_txq, skb_len); + efx_tx_maybe_stop_queue(tx_queue); + /* Pass off to hardware */ - if (!skb->xmit_more || netif_xmit_stopped(tx_queue->core_txq)) { + if (!xmit_more || netif_xmit_stopped(tx_queue->core_txq)) { struct efx_tx_queue *txq2 = efx_tx_queue_partner(tx_queue); /* There could be packets left on the partner queue if those @@ -577,14 +582,26 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) tx_queue->tx_packets++; } - efx_tx_maybe_stop_queue(tx_queue); - return NETDEV_TX_OK; err: - efx_enqueue_unwind(tx_queue); + efx_enqueue_unwind(tx_queue, old_insert_count); dev_kfree_skb_any(skb); + + /* If we're not expecting another transmit and we had something to push + * on this queue or a partner queue then we need to push here to get the + * previous packets out. + */ + if (!xmit_more) { + struct efx_tx_queue *txq2 = efx_tx_queue_partner(tx_queue); + + if (txq2->xmit_more_available) + efx_nic_push_buffers(txq2); + + efx_nic_push_buffers(tx_queue); + } + return NETDEV_TX_OK; } diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 7f3dab4b4cbc..5428bb261102 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1237,7 +1237,10 @@ static void rndis_get_friendly_name(struct net_device *net, if (rndis_filter_query_device(rndis_device, net_device, RNDIS_OID_GEN_FRIENDLY_NAME, wname, &size) != 0) - return; + return; /* ignore if host does not support */ + + if (size == 0) + return; /* name not set */ /* Convert Windows Unicode string to UTF-8 */ len = ucs2_as_utf8(ifalias, wname, sizeof(ifalias)); diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index d6ff881165d0..e6730a01d130 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1129,6 +1129,7 @@ static int team_upper_dev_link(struct team *team, struct team_port *port, int err; lag_upper_info.tx_type = team->mode->lag_tx_type; + lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN; err = netdev_master_upper_dev_link(port->dev, team->dev, NULL, &lag_upper_info, extack); if (err) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 99bf3cee1345..33a9c5661038 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -70,6 +70,7 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/sock.h> +#include <net/xdp.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> @@ -1284,34 +1285,44 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_get_stats64 = tun_net_get_stats64, }; -static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame) +static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; - int ret = 0; + int drops = 0; + int cnt = n; + int i; rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { - ret = -ENOSPC; - goto out; + rcu_read_unlock(); + return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); - /* Encode the XDP flag into lowest bit for consumer to differ - * XDP buffer from sk_buff. - */ - if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) { - this_cpu_inc(tun->pcpu_stats->tx_dropped); - ret = -ENOSPC; + + spin_lock(&tfile->tx_ring.producer_lock); + for (i = 0; i < n; i++) { + struct xdp_frame *xdp = frames[i]; + /* Encode the XDP flag into lowest bit for consumer to differ + * XDP buffer from sk_buff. + */ + void *frame = tun_xdp_to_ptr(xdp); + + if (__ptr_ring_produce(&tfile->tx_ring, frame)) { + this_cpu_inc(tun->pcpu_stats->tx_dropped); + xdp_return_frame_rx_napi(xdp); + drops++; + } } + spin_unlock(&tfile->tx_ring.producer_lock); -out: rcu_read_unlock(); - return ret; + return cnt - drops; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) @@ -1321,7 +1332,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) if (unlikely(!frame)) return -EOVERFLOW; - return tun_xdp_xmit(dev, frame); + return tun_xdp_xmit(dev, 1, &frame); } static void tun_xdp_flush(struct net_device *dev) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f34794a76c4d..39a0783d1cde 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev) virtqueue_kick(sq->vq); } -static int __virtnet_xdp_xmit(struct virtnet_info *vi, - struct xdp_frame *xdpf) +static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, + struct send_queue *sq, + struct xdp_frame *xdpf) { struct virtio_net_hdr_mrg_rxbuf *hdr; - struct xdp_frame *xdpf_sent; - struct send_queue *sq; - unsigned int len; - unsigned int qp; int err; - qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); - sq = &vi->sq[qp]; - - /* Free up any pending old buffers before queueing new ones. */ - while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) - xdp_return_frame(xdpf_sent); - /* virtqueue want to use data area in-front of packet */ if (unlikely(xdpf->metasize > 0)) return -EOPNOTSUPP; @@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi, return 0; } -static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) +static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi, + struct xdp_frame *xdpf) +{ + struct xdp_frame *xdpf_sent; + struct send_queue *sq; + unsigned int len; + unsigned int qp; + + qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); + sq = &vi->sq[qp]; + + /* Free up any pending old buffers before queueing new ones. */ + while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) + xdp_return_frame(xdpf_sent); + + return __virtnet_xdp_xmit_one(vi, sq, xdpf); +} + +static int virtnet_xdp_xmit(struct net_device *dev, + int n, struct xdp_frame **frames) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq = vi->rq; + struct xdp_frame *xdpf_sent; struct bpf_prog *xdp_prog; + struct send_queue *sq; + unsigned int len; + unsigned int qp; + int drops = 0; + int err; + int i; + + qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); + sq = &vi->sq[qp]; /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this * indicate XDP resources have been successfully allocated. @@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) if (!xdp_prog) return -ENXIO; - return __virtnet_xdp_xmit(vi, xdpf); + /* Free up any pending old buffers before queueing new ones. */ + while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) + xdp_return_frame(xdpf_sent); + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + + err = __virtnet_xdp_xmit_one(vi, sq, xdpf); + if (err) { + xdp_return_frame_rx_napi(xdpf); + drops++; + } + } + return n - drops; } static unsigned int virtnet_get_headroom(struct virtnet_info *vi) @@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdpf = convert_to_xdp_frame(&xdp); if (unlikely(!xdpf)) goto err_xdp; - err = __virtnet_xdp_xmit(vi, xdpf); + err = __virtnet_xdp_tx_xmit(vi, xdpf); if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); goto err_xdp; @@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, xdpf = convert_to_xdp_frame(&xdp); if (unlikely(!xdpf)) goto err_xdp; - err = __virtnet_xdp_xmit(vi, xdpf); + err = __virtnet_xdp_tx_xmit(vi, xdpf); if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); if (unlikely(xdp_page != page)) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ed0122b45b63..bbe297436e5d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -69,8 +69,8 @@ struct bpf_map { u32 pages; u32 id; int numa_node; - u32 btf_key_id; - u32 btf_value_id; + u32 btf_key_type_id; + u32 btf_value_type_id; struct btf *btf; bool unpriv_array; /* 55 bytes hole */ @@ -463,6 +463,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); +int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, + size_t actual_size); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. @@ -485,14 +487,17 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct xdp_buff; + +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); -struct xdp_buff; int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -571,6 +576,16 @@ static inline void __dev_map_flush(struct bpf_map *map) { } +struct xdp_buff; +struct bpf_dtab_netdev; + +static inline +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return 0; +} + static inline struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -585,7 +600,6 @@ static inline void __cpu_map_flush(struct bpf_map *map) { } -struct xdp_buff; static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b67f8793de0d..b161e506dcfc 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -9,9 +9,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) diff --git a/include/linux/filter.h b/include/linux/filter.h index 9dbcb9d55921..d358d1815c16 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -517,6 +517,7 @@ struct sk_msg_buff { bool sg_copy[MAX_SKB_FRAGS]; __u32 flags; struct sock *sk_redir; + struct sock *sk; struct sk_buff *skb; struct list_head list; }; diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 585d27182425..7843b98e1c6e 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -50,6 +50,7 @@ struct br_ip_list { #define BR_VLAN_TUNNEL BIT(13) #define BR_BCAST_FLOOD BIT(14) #define BR_NEIGH_SUPPRESS BIT(15) +#define BR_ISOLATED BIT(16) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03ed492c4e14..8452f72087ef 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1185,9 +1185,13 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); - * This function is used to submit a XDP packet for transmit on a - * netdevice. + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); + * This function is used to submit @n XDP packets for transmit on a + * netdevice. Returns number of frames successfully transmitted, frames + * that got dropped are freed/returned via xdp_return_frame(). + * Returns negative number, means general error invoking ndo, meaning + * no frames were xmit'ed and core-caller will free all frames. + * TODO: Consider add flag to allow sending flush operation. * void (*ndo_xdp_flush)(struct net_device *dev); * This function is used to inform the driver to flush a particular * xdp tx queue. Must be called on same CPU as xdp_xmit. @@ -1375,8 +1379,8 @@ struct net_device_ops { int needed_headroom); int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); - int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_frame *xdp); + int (*ndo_xdp_xmit)(struct net_device *dev, int n, + struct xdp_frame **xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; @@ -2328,8 +2332,19 @@ enum netdev_lag_tx_type { NETDEV_LAG_TX_TYPE_HASH, }; +enum netdev_lag_hash { + NETDEV_LAG_HASH_NONE, + NETDEV_LAG_HASH_L2, + NETDEV_LAG_HASH_L34, + NETDEV_LAG_HASH_L23, + NETDEV_LAG_HASH_E23, + NETDEV_LAG_HASH_E34, + NETDEV_LAG_HASH_UNKNOWN, +}; + struct netdev_lag_upper_info { enum netdev_lag_tx_type tx_type; + enum netdev_lag_hash hash_type; }; struct netdev_lag_lower_state_info { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e71e99eb9a4e..eec302b61cfe 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); +extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); @@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline const struct perf_event *perf_get_event(struct file *file) +{ + return ERR_PTR(-EINVAL); +} static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 7f9756fe9180..2978fa4add42 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -66,6 +66,7 @@ enum qed_filter_config_mode { QED_FILTER_CONFIG_MODE_5_TUPLE, QED_FILTER_CONFIG_MODE_L4_PORT, QED_FILTER_CONFIG_MODE_IP_DEST, + QED_FILTER_CONFIG_MODE_IP_SRC, }; struct qed_ntuple_filter_params { @@ -88,6 +89,9 @@ struct qed_ntuple_filter_params { /* true iff this filter is to be added. Else to be removed */ bool b_is_add; + + /* If flow needs to be dropped */ + bool b_is_drop; }; struct qed_dev_eth_info { diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 44af652e7407..ac991a3b8f03 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -1129,6 +1129,7 @@ struct qed_eth_stats_common { u64 rx_bcast_pkts; u64 mftag_filter_discards; u64 mac_filter_discards; + u64 gft_filter_drop; u64 tx_ucast_bytes; u64 tx_mcast_bytes; u64 tx_bcast_bytes; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2bde3eff564c..d34144a3c5b5 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *fd_type, const char **buf, + u64 *probe_offset, u64 *probe_addr); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name { return NULL; } +static inline int bpf_get_perf_event_info(const struct perf_event *event, + u32 *prog_id, u32 *fd_type, + const char **buf, u64 *probe_offset, + u64 *probe_addr) +{ + return -EOPNOTSUPP; +} #endif enum { @@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags); #ifdef CONFIG_KPROBE_EVENTS extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); +extern int bpf_get_kprobe_info(const struct perf_event *event, + u32 *fd_type, const char **symbol, + u64 *probe_offset, u64 *probe_addr, + bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_uprobe_destroy(struct perf_event *event); +extern int bpf_get_uprobe_info(const struct perf_event *event, + u32 *fd_type, const char **filename, + u64 *probe_offset, bool perf_type_tracepoint); #endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index ff766ab207e0..c07d4dd09361 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -236,6 +236,8 @@ struct ipv6_stub { struct flowi6 *fl6, int oif, const struct sk_buff *skb, int strict); + u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index cc70f6da8462..7897efe80727 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -412,6 +412,12 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) return f6i->fib6_nh.nh_dev; } +static inline +struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) +{ + return f6i->fib6_nh.nh_lwtstate; +} + void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9e4d0f0aeb6d..59656fc580df 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -294,6 +294,9 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } +u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); + struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 81d0f2107ff1..69c91d1934c1 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -449,4 +449,6 @@ static inline void fib_proc_exit(struct net *net) } #endif +u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); + #endif /* _NET_FIB_H */ diff --git a/include/net/page_pool.h b/include/net/page_pool.h index c79087153148..694d055e01ef 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool); void __page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct); -static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, false); + __page_pool_put_page(pool, page, allow_direct); #endif } /* Very limited use-cases allow recycle direct */ diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 0005f0b40fe9..f3ec43725724 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -33,7 +33,7 @@ struct tcf_block_ext_info { }; struct tcf_block_cb; -bool tcf_queue_work(struct work_struct *work); +bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, diff --git a/include/net/seg6.h b/include/net/seg6.h index 099bad59dc90..e029e301faa5 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -49,7 +49,11 @@ struct seg6_pernet_data { static inline struct seg6_pernet_data *seg6_pernet(struct net *net) { +#if IS_ENABLED(CONFIG_IPV6) return net->ipv6.seg6_data; +#else + return NULL; +#endif } extern int seg6_init(void); @@ -63,5 +67,6 @@ extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); - +extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id); #endif diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h new file mode 100644 index 000000000000..661fd5b4d3e0 --- /dev/null +++ b/include/net/seg6_local.h @@ -0,0 +1,32 @@ +/* + * SR-IPv6 implementation + * + * Authors: + * David Lebrun <[email protected]> + * eBPF support: Mathieu Xhonneux <[email protected]> + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _NET_SEG6_LOCAL_H +#define _NET_SEG6_LOCAL_H + +#include <linux/percpu.h> +#include <linux/net.h> +#include <linux/ipv6.h> + +extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id); + +struct seg6_bpf_srh_state { + bool valid; + u16 hdrlen; +}; + +DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); + +#endif diff --git a/include/net/xdp.h b/include/net/xdp.h index 0b689cf561c7..7ad779237ae8 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) } void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 185f4928fbda..7a647c56ec15 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * AF_XDP internal functions +/* SPDX-License-Identifier: GPL-2.0 */ +/* AF_XDP internal functions * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_XDP_SOCK_H diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index 81b7e985bb45..9763cddd0594 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -12,12 +12,14 @@ TRACE_EVENT(fib_table_lookup, - TP_PROTO(u32 tb_id, const struct flowi4 *flp), + TP_PROTO(u32 tb_id, const struct flowi4 *flp, + const struct fib_nh *nh, int err), - TP_ARGS(tb_id, flp), + TP_ARGS(tb_id, flp, nh, err), TP_STRUCT__entry( __field( u32, tb_id ) + __field( int, err ) __field( int, oif ) __field( int, iif ) __field( __u8, tos ) @@ -25,12 +27,19 @@ TRACE_EVENT(fib_table_lookup, __field( __u8, flags ) __array( __u8, src, 4 ) __array( __u8, dst, 4 ) + __array( __u8, gw, 4 ) + __array( __u8, saddr, 4 ) + __field( u16, sport ) + __field( u16, dport ) + __field( u8, proto ) + __dynamic_array(char, name, IFNAMSIZ ) ), TP_fast_assign( __be32 *p32; __entry->tb_id = tb_id; + __entry->err = err; __entry->oif = flp->flowi4_oif; __entry->iif = flp->flowi4_iif; __entry->tos = flp->flowi4_tos; @@ -42,71 +51,41 @@ TRACE_EVENT(fib_table_lookup, p32 = (__be32 *) __entry->dst; *p32 = flp->daddr; - ), - - TP_printk("table %u oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x", - __entry->tb_id, __entry->oif, __entry->iif, - __entry->src, __entry->dst, __entry->tos, __entry->scope, - __entry->flags) -); - -TRACE_EVENT(fib_table_lookup_nh, - - TP_PROTO(const struct fib_nh *nh), - - TP_ARGS(nh), - - TP_STRUCT__entry( - __string( name, nh->nh_dev->name) - __field( int, oif ) - __array( __u8, src, 4 ) - ), - - TP_fast_assign( - __be32 *p32 = (__be32 *) __entry->src; - - __assign_str(name, nh->nh_dev ? nh->nh_dev->name : "not set"); - __entry->oif = nh->nh_oif; - *p32 = nh->nh_saddr; - ), - - TP_printk("nexthop dev %s oif %d src %pI4", - __get_str(name), __entry->oif, __entry->src) -); - -TRACE_EVENT(fib_validate_source, - - TP_PROTO(const struct net_device *dev, const struct flowi4 *flp), - - TP_ARGS(dev, flp), - TP_STRUCT__entry( - __string( name, dev->name ) - __field( int, oif ) - __field( int, iif ) - __field( __u8, tos ) - __array( __u8, src, 4 ) - __array( __u8, dst, 4 ) - ), - - TP_fast_assign( - __be32 *p32; - - __assign_str(name, dev ? dev->name : "not set"); - __entry->oif = flp->flowi4_oif; - __entry->iif = flp->flowi4_iif; - __entry->tos = flp->flowi4_tos; - - p32 = (__be32 *) __entry->src; - *p32 = flp->saddr; - - p32 = (__be32 *) __entry->dst; - *p32 = flp->daddr; + __entry->proto = flp->flowi4_proto; + if (__entry->proto == IPPROTO_TCP || + __entry->proto == IPPROTO_UDP) { + __entry->sport = ntohs(flp->fl4_sport); + __entry->dport = ntohs(flp->fl4_dport); + } else { + __entry->sport = 0; + __entry->dport = 0; + } + + if (nh) { + p32 = (__be32 *) __entry->saddr; + *p32 = nh->nh_saddr; + + p32 = (__be32 *) __entry->gw; + *p32 = nh->nh_gw; + + __assign_str(name, nh->nh_dev ? nh->nh_dev->name : "-"); + } else { + p32 = (__be32 *) __entry->saddr; + *p32 = 0; + + p32 = (__be32 *) __entry->gw; + *p32 = 0; + + __assign_str(name, "-"); + } ), - TP_printk("dev %s oif %d iif %d tos %d src %pI4 dst %pI4", - __get_str(name), __entry->oif, __entry->iif, __entry->tos, - __entry->src, __entry->dst) + TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4 src %pI4 err %d", + __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, + __entry->src, __entry->sport, __entry->dst, __entry->dport, + __entry->tos, __entry->scope, __entry->flags, + __get_str(name), __entry->gw, __entry->saddr, __entry->err) ); #endif /* _TRACE_FIB_H */ diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index 1b8d951e3c12..b088b54d699c 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -19,7 +19,7 @@ TRACE_EVENT(fib6_table_lookup, TP_STRUCT__entry( __field( u32, tb_id ) - + __field( int, err ) __field( int, oif ) __field( int, iif ) __field( __u8, tos ) @@ -27,7 +27,10 @@ TRACE_EVENT(fib6_table_lookup, __field( __u8, flags ) __array( __u8, src, 16 ) __array( __u8, dst, 16 ) - + __field( u16, sport ) + __field( u16, dport ) + __field( u8, proto ) + __field( u8, rt_type ) __dynamic_array( char, name, IFNAMSIZ ) __array( __u8, gw, 16 ) ), @@ -36,6 +39,7 @@ TRACE_EVENT(fib6_table_lookup, struct in6_addr *in6; __entry->tb_id = table->tb6_id; + __entry->err = ip6_rt_type_to_error(f6i->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); @@ -48,10 +52,20 @@ TRACE_EVENT(fib6_table_lookup, in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; + __entry->proto = flp->flowi6_proto; + if (__entry->proto == IPPROTO_TCP || + __entry->proto == IPPROTO_UDP) { + __entry->sport = ntohs(flp->fl6_sport); + __entry->dport = ntohs(flp->fl6_dport); + } else { + __entry->sport = 0; + __entry->dport = 0; + } + if (f6i->fib6_nh.nh_dev) { __assign_str(name, f6i->fib6_nh.nh_dev); } else { - __assign_str(name, ""); + __assign_str(name, "-"); } if (f6i == net->ipv6.fib6_null_entry) { struct in6_addr in6_zero = {}; @@ -65,10 +79,11 @@ TRACE_EVENT(fib6_table_lookup, } ), - TP_printk("table %3u oif %d iif %d src %pI6c dst %pI6c tos %d scope %d flags %x ==> dev %s gw %pI6c", - __entry->tb_id, __entry->oif, __entry->iif, - __entry->src, __entry->dst, __entry->tos, __entry->scope, - __entry->flags, __get_str(name), __entry->gw) + TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", + __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, + __entry->src, __entry->sport, __entry->dst, __entry->dport, + __entry->tos, __entry->scope, __entry->flags, + __get_str(name), __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 8989a92c571a..1ecf4c67fcf7 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, __entry->map_id, __entry->map_index) ); +#ifndef __DEVMAP_OBJ_TYPE +#define __DEVMAP_OBJ_TYPE +struct _bpf_dtab_netdev { + struct net_device *dev; +}; +#endif /* __DEVMAP_OBJ_TYPE */ + #define devmap_ifindex(fwd, map) \ (!fwd ? 0 : \ (!map ? 0 : \ ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct net_device *)fwd)->ifindex : 0))) + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ @@ -222,6 +229,47 @@ TRACE_EVENT(xdp_cpumap_enqueue, __entry->to_cpu) ); +TRACE_EVENT(xdp_devmap_xmit, + + TP_PROTO(const struct bpf_map *map, u32 map_index, + int sent, int drops, + const struct net_device *from_dev, + const struct net_device *to_dev, int err), + + TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(u32, map_index) + __field(int, drops) + __field(int, sent) + __field(int, from_ifindex) + __field(int, to_ifindex) + __field(int, err) + ), + + TP_fast_assign( + __entry->map_id = map->id; + __entry->act = XDP_REDIRECT; + __entry->map_index = map_index; + __entry->drops = drops; + __entry->sent = sent; + __entry->from_ifindex = from_dev->ifindex; + __entry->to_ifindex = to_dev->ifindex; + __entry->err = err; + ), + + TP_printk("ndo_xdp_xmit" + " map_id=%d map_index=%d action=%s" + " sent=%d drops=%d" + " from_ifindex=%d to_ifindex=%d err=%d", + __entry->map_id, __entry->map_index, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->sent, __entry->drops, + __entry->from_ifindex, __entry->to_ifindex, __entry->err) +); + #endif /* _TRACE_XDP_H */ #include <trace/define_trace.h> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d94d333a8225..9b8c6e310e9a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -97,6 +97,7 @@ enum bpf_cmd { BPF_RAW_TRACEPOINT_OPEN, BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, }; enum bpf_map_type { @@ -141,6 +142,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, }; enum bpf_attach_type { @@ -284,8 +286,8 @@ union bpf_attr { char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ __u32 btf_fd; /* fd pointing to a BTF type data */ - __u32 btf_key_id; /* BTF type_id of the key */ - __u32 btf_value_id; /* BTF type_id of the value */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -379,6 +381,22 @@ union bpf_attr { __u32 btf_log_size; __u32 btf_log_level; }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF @@ -1902,6 +1920,90 @@ union bpf_attr { * egress otherwise). This is the only flag supported for now. * Return * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of param: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of param: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1976,7 +2078,11 @@ union bpf_attr { FN(fib_lookup), \ FN(sock_hash_update), \ FN(msg_redirect_hash), \ - FN(sk_redirect_hash), + FN(sk_redirect_hash), \ + FN(lwt_push_encap), \ + FN(lwt_seg6_store_bytes), \ + FN(lwt_seg6_adjust_srh), \ + FN(lwt_seg6_action), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2043,6 +2149,12 @@ enum bpf_hdr_start_off { BPF_HDR_START_NET, }; +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2176,6 +2288,14 @@ enum sk_action { struct sk_msg_md { void *data; void *data_end; + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; #define BPF_TAG_SIZE 8 @@ -2197,6 +2317,10 @@ struct bpf_prog_info { __u32 gpl_compatible:1; __u64 netns_dev; __u64 netns_ino; + __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; + __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; } __attribute__((aligned(8))); struct bpf_map_info { @@ -2211,8 +2335,8 @@ struct bpf_map_info { __u64 netns_dev; __u64 netns_ino; __u32 btf_id; - __u32 btf_key_id; - __u32 btf_value_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; } __attribute__((aligned(8))); struct bpf_btf_info { @@ -2450,4 +2574,13 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index bcb56ee47014..0b5ddbe135a4 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -12,42 +12,29 @@ struct btf_header { __u16 magic; __u8 version; __u8 flags; - - __u32 parent_label; - __u32 parent_name; + __u32 hdr_len; /* All offsets are in bytes relative to the end of this header */ - __u32 label_off; /* offset of label section */ - __u32 object_off; /* offset of data object section*/ - __u32 func_off; /* offset of function section */ __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ __u32 str_off; /* offset of string section */ __u32 str_len; /* length of string section */ }; /* Max # of type identifier */ -#define BTF_MAX_TYPE 0x7fffffff +#define BTF_MAX_TYPE 0x0000ffff /* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x7fffffff +#define BTF_MAX_NAME_OFFSET 0x0000ffff /* Max # of struct/union/enum members or func args */ #define BTF_MAX_VLEN 0xffff -/* The type id is referring to a parent BTF */ -#define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) -#define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) - -/* String is in the ELF string section */ -#define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) -#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) - struct btf_type { __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-28: kind (e.g. int, ptr, array...etc) - * bits 29-30: unused - * bits 31: root + * bits 24-27: kind (e.g. int, ptr, array...etc) + * bits 28-31: unused */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. @@ -62,8 +49,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) -#define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_KIND_UNKN 0 /* Unknown */ @@ -88,15 +74,14 @@ struct btf_type { /* BTF_KIND_INT is followed by a u32 and the following * is the 32 bits arrangement: */ -#define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) +#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) #define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) /* Attributes stored in the BTF_INT_ENCODING */ -#define BTF_INT_SIGNED 0x1 -#define BTF_INT_CHAR 0x2 -#define BTF_INT_BOOL 0x4 -#define BTF_INT_VARARGS 0x8 +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) /* BTF_KIND_ENUM is followed by multiple "struct btf_enum". * The exact number of btf_enum is stored in the vlen (of the diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b85266420bfb..cf01b6824244 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -333,6 +333,7 @@ enum { IFLA_BRPORT_BCAST_FLOOD, IFLA_BRPORT_GROUP_FWD_MASK, IFLA_BRPORT_NEIGH_SUPPRESS, + IFLA_BRPORT_ISOLATED, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 77b88c4efe98..4737cfe222f5 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -1,17 +1,8 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note - * +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* * if_xdp: XDP socket user-space interface * Copyright(c) 2018 Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Author(s): Björn Töpel <[email protected]> * Magnus Karlsson <[email protected]> */ @@ -26,19 +17,33 @@ struct sockaddr_xdp { __u16 sxdp_family; + __u16 sxdp_flags; __u32 sxdp_ifindex; __u32 sxdp_queue_id; __u32 sxdp_shared_umem_fd; - __u16 sxdp_flags; +}; + +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ }; /* XDP socket options */ -#define XDP_RX_RING 1 -#define XDP_TX_RING 2 -#define XDP_UMEM_REG 3 -#define XDP_UMEM_FILL_RING 4 -#define XDP_UMEM_COMPLETION_RING 5 -#define XDP_STATISTICS 6 +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -59,6 +64,7 @@ struct xdp_statistics { #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 +/* Rx/Tx descriptor */ struct xdp_desc { __u32 idx; __u32 len; @@ -67,21 +73,6 @@ struct xdp_desc { __u8 padding[5]; }; -struct xdp_ring { - __u32 producer __attribute__((aligned(64))); - __u32 consumer __attribute__((aligned(64))); -}; - -/* Used for the RX and TX queues for packets */ -struct xdp_rxtx_ring { - struct xdp_ring ptrs; - struct xdp_desc desc[0] __attribute__((aligned(64))); -}; - -/* Used for the fill and completion queues for buffers */ -struct xdp_umem_ring { - struct xdp_ring ptrs; - __u32 desc[0] __attribute__((aligned(64))); -}; +/* UMEM descriptor is __u32 */ #endif /* _LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index ef2d8c3e76c1..edc138bdc56d 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -25,6 +25,7 @@ enum { SEG6_LOCAL_NH6, SEG6_LOCAL_IIF, SEG6_LOCAL_OIF, + SEG6_LOCAL_BPF, __SEG6_LOCAL_MAX, }; #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) @@ -59,10 +60,21 @@ enum { SEG6_LOCAL_ACTION_END_AS = 13, /* forward to SR-unaware VNF with masquerading */ SEG6_LOCAL_ACTION_END_AM = 14, + /* custom BPF action */ + SEG6_LOCAL_ACTION_END_BPF = 15, __SEG6_LOCAL_ACTION_MAX, }; #define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1) +enum { + SEG6_LOCAL_BPF_PROG_UNSPEC, + SEG6_LOCAL_BPF_PROG, + SEG6_LOCAL_BPF_PROG_NAME, + __SEG6_LOCAL_BPF_PROG_MAX, +}; + +#define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1) + #endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0fd8d8f1a398..544e58f5f642 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -352,7 +352,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, } seq_printf(m, "%u: ", *(u32 *)key); - btf_type_seq_show(map->btf, map->btf_value_id, value, m); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); rcu_read_unlock(); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ded10ab47b8a..7e90fd13b5b5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -12,6 +12,7 @@ #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/idr.h> +#include <linux/sort.h> #include <linux/bpf_verifier.h> #include <linux/btf.h> @@ -162,13 +163,16 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) +#define BTF_INFO_MASK 0x0f00ffff +#define BTF_INT_MASK 0x0fffffff +#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) +#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) + /* 16MB for 64k structs and each has 16 members and * a few MB spaces for the string section. * The hard limit is S32_MAX. */ #define BTF_MAX_SIZE (16 * 1024 * 1024) -/* 64k. We can raise it later. The hard limit is S32_MAX. */ -#define BTF_MAX_NR_TYPES 65535 #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ @@ -184,15 +188,13 @@ static DEFINE_IDR(btf_idr); static DEFINE_SPINLOCK(btf_idr_lock); struct btf { - union { - struct btf_header *hdr; - void *data; - }; + void *data; struct btf_type **types; u32 *resolved_ids; u32 *resolved_sizes; const char *strings; void *nohdr_data; + struct btf_header hdr; u32 nr_types; u32 types_size; u32 data_size; @@ -228,6 +230,11 @@ enum resolve_mode { #define MAX_RESOLVE_DEPTH 32 +struct btf_sec_info { + u32 off; + u32 len; +}; + struct btf_verifier_env { struct btf *btf; u8 *visit_states; @@ -379,8 +386,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "CHAR"; else if (encoding == BTF_INT_BOOL) return "BOOL"; - else if (encoding == BTF_INT_VARARGS) - return "VARARGS"; else return "UNKN"; } @@ -417,16 +422,16 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { - return !BTF_STR_TBL_ELF_ID(offset) && - BTF_STR_OFFSET(offset) < btf->hdr->str_len; + return BTF_STR_OFFSET_VALID(offset) && + offset < btf->hdr.str_len; } static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { - if (!BTF_STR_OFFSET(offset)) + if (!offset) return "(anon)"; - else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len) - return &btf->strings[BTF_STR_OFFSET(offset)]; + else if (offset < btf->hdr.str_len) + return &btf->strings[offset]; else return "(invalid-name-offset)"; } @@ -439,6 +444,28 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) return btf->types[type_id]; } +/* + * Regular int is not a bit field and it must be either + * u8/u16/u32/u64. + */ +static bool btf_type_int_is_regular(const struct btf_type *t) +{ + u16 nr_bits, nr_bytes; + u32 int_data; + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data); + nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_data) || + (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + return false; + } + + return true; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -536,7 +563,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, __btf_verifier_log(log, "\n"); } -static void btf_verifier_log_hdr(struct btf_verifier_env *env) +static void btf_verifier_log_hdr(struct btf_verifier_env *env, + u32 btf_data_size) { struct bpf_verifier_log *log = &env->log; const struct btf *btf = env->btf; @@ -545,19 +573,16 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env) if (!bpf_verifier_log_needed(log)) return; - hdr = btf->hdr; + hdr = &btf->hdr; __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); __btf_verifier_log(log, "version: %u\n", hdr->version); __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); - __btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label); - __btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name); - __btf_verifier_log(log, "label_off: %u\n", hdr->label_off); - __btf_verifier_log(log, "object_off: %u\n", hdr->object_off); - __btf_verifier_log(log, "func_off: %u\n", hdr->func_off); + __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); + __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); - __btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size); + __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); } static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) @@ -574,13 +599,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) struct btf_type **new_types; u32 expand_by, new_size; - if (btf->types_size == BTF_MAX_NR_TYPES) { + if (btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } expand_by = max_t(u32, btf->types_size >> 2, 16); - new_size = min_t(u32, BTF_MAX_NR_TYPES, + new_size = min_t(u32, BTF_MAX_TYPE, btf->types_size + expand_by); new_types = kvzalloc(new_size * sizeof(*new_types), @@ -910,6 +935,12 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, } int_data = btf_type_int(t); + if (int_data & ~BTF_INT_MASK) { + btf_verifier_log_basic(env, t, "Invalid int_data:%x", + int_data); + return -EINVAL; + } + nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); if (nr_bits > BITS_PER_U64) { @@ -923,12 +954,17 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* + * Only one of the encoding bits is allowed and it + * should be sufficient for the pretty print purpose (i.e. decoding). + * Multiple bits can be allowed later if it is found + * to be insufficient. + */ encoding = BTF_INT_ENCODING(int_data); if (encoding && encoding != BTF_INT_SIGNED && encoding != BTF_INT_CHAR && - encoding != BTF_INT_BOOL && - encoding != BTF_INT_VARARGS) { + encoding != BTF_INT_BOOL) { btf_verifier_log_type(env, t, "Unsupported encoding"); return -ENOTSUPP; } @@ -1102,7 +1138,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (BTF_TYPE_PARENT(t->type)) { + if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } @@ -1306,14 +1342,16 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } - /* We are a little forgiving on array->index_type since - * the kernel is not using it. - */ - /* Array elem cannot be in type void, - * so !array->type is not allowed. + /* Array elem type and index type cannot be in type void, + * so !array->type and !array->index_type are not allowed. */ - if (!array->type || BTF_TYPE_PARENT(array->type)) { - btf_verifier_log_type(env, t, "Invalid type_id"); + if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { + btf_verifier_log_type(env, t, "Invalid elem"); + return -EINVAL; + } + + if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { + btf_verifier_log_type(env, t, "Invalid index"); return -EINVAL; } @@ -1326,11 +1364,32 @@ static int btf_array_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_array *array = btf_type_array(v->t); - const struct btf_type *elem_type; - u32 elem_type_id = array->type; + const struct btf_type *elem_type, *index_type; + u32 elem_type_id, index_type_id; struct btf *btf = env->btf; u32 elem_size; + /* Check array->index_type */ + index_type_id = array->index_type; + index_type = btf_type_by_id(btf, index_type_id); + if (btf_type_is_void_or_null(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, index_type) && + !env_type_is_resolved(env, index_type_id)) + return env_stack_push(env, index_type, index_type_id); + + index_type = btf_type_id_size(btf, &index_type_id, NULL); + if (!index_type || !btf_type_is_int(index_type) || + !btf_type_int_is_regular(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + /* Check array->type */ + elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); if (btf_type_is_void_or_null(elem_type)) { btf_verifier_log_type(env, v->t, @@ -1348,22 +1407,9 @@ static int btf_array_resolve(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_is_int(elem_type)) { - int int_type_data = btf_type_int(elem_type); - u16 nr_bits = BTF_INT_BITS(int_type_data); - u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - - /* Put more restriction on array of int. The int cannot - * be a bit field and it must be either u8/u16/u32/u64. - */ - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_type_data) || - (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { - btf_verifier_log_type(env, v->t, - "Invalid array of int"); - return -EINVAL; - } + if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { + btf_verifier_log_type(env, v->t, "Invalid array of int"); + return -EINVAL; } if (array->nelems && elem_size > U32_MAX / array->nelems) { @@ -1473,7 +1519,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, } /* A member cannot be in type void */ - if (!member->type || BTF_TYPE_PARENT(member->type)) { + if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, "Invalid type_id"); return -EINVAL; @@ -1726,6 +1772,12 @@ static s32 btf_check_meta(struct btf_verifier_env *env, } meta_left -= sizeof(*t); + if (t->info & ~BTF_INFO_MASK) { + btf_verifier_log(env, "[%u] Invalid btf_info:%x", + env->log_type_id, t->info); + return -EINVAL; + } + if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", @@ -1754,9 +1806,9 @@ static int btf_check_all_metas(struct btf_verifier_env *env) struct btf_header *hdr; void *cur, *end; - hdr = btf->hdr; + hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->str_off; + end = btf->nohdr_data + hdr->type_len; env->log_type_id = 1; while (cur < end) { @@ -1866,8 +1918,20 @@ static int btf_check_all_types(struct btf_verifier_env *env) static int btf_parse_type_sec(struct btf_verifier_env *env) { + const struct btf_header *hdr = &env->btf->hdr; int err; + /* Type section must align to 4 bytes */ + if (hdr->type_off & (sizeof(u32) - 1)) { + btf_verifier_log(env, "Unaligned type_off"); + return -EINVAL; + } + + if (!hdr->type_len) { + btf_verifier_log(env, "No type found"); + return -EINVAL; + } + err = btf_check_all_metas(env); if (err) return err; @@ -1881,10 +1945,15 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) struct btf *btf = env->btf; const char *start, *end; - hdr = btf->hdr; + hdr = &btf->hdr; start = btf->nohdr_data + hdr->str_off; end = start + hdr->str_len; + if (end != btf->data + btf->data_size) { + btf_verifier_log(env, "String section is not at the end"); + return -EINVAL; + } + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || start[0] || end[-1]) { btf_verifier_log(env, "Invalid string section"); @@ -1896,20 +1965,121 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) return 0; } -static int btf_parse_hdr(struct btf_verifier_env *env) +static const size_t btf_sec_info_offset[] = { + offsetof(struct btf_header, type_off), + offsetof(struct btf_header, str_off), +}; + +static int btf_sec_info_cmp(const void *a, const void *b) +{ + const struct btf_sec_info *x = a; + const struct btf_sec_info *y = b; + + return (int)(x->off - y->off) ? : (int)(x->len - y->len); +} + +static int btf_check_sec_info(struct btf_verifier_env *env, + u32 btf_data_size) { + struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; + u32 total, expected_total, i; const struct btf_header *hdr; - struct btf *btf = env->btf; - u32 meta_left; + const struct btf *btf; + + btf = env->btf; + hdr = &btf->hdr; - if (btf->data_size < sizeof(*hdr)) { + /* Populate the secs from hdr */ + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) + secs[i] = *(struct btf_sec_info *)((void *)hdr + + btf_sec_info_offset[i]); + + sort(secs, ARRAY_SIZE(btf_sec_info_offset), + sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); + + /* Check for gaps and overlap among sections */ + total = 0; + expected_total = btf_data_size - hdr->hdr_len; + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { + if (expected_total < secs[i].off) { + btf_verifier_log(env, "Invalid section offset"); + return -EINVAL; + } + if (total < secs[i].off) { + /* gap */ + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + if (total > secs[i].off) { + btf_verifier_log(env, "Section overlap found"); + return -EINVAL; + } + if (expected_total - total < secs[i].len) { + btf_verifier_log(env, + "Total section length too long"); + return -EINVAL; + } + total += secs[i].len; + } + + /* There is data other than hdr and known sections */ + if (expected_total != total) { + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + + return 0; +} + +static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, + u32 btf_data_size) +{ + const struct btf_header *hdr; + u32 hdr_len, hdr_copy; + /* + * Minimal part of the "struct btf_header" that + * contains the hdr_len. + */ + struct btf_min_header { + u16 magic; + u8 version; + u8 flags; + u32 hdr_len; + } __user *min_hdr; + struct btf *btf; + int err; + + btf = env->btf; + min_hdr = btf_data; + + if (btf_data_size < sizeof(*min_hdr)) { + btf_verifier_log(env, "hdr_len not found"); + return -EINVAL; + } + + if (get_user(hdr_len, &min_hdr->hdr_len)) + return -EFAULT; + + if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } - btf_verifier_log_hdr(env); + err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); + if (err) { + if (err == -E2BIG) + btf_verifier_log(env, "Unsupported btf_header"); + return err; + } + + hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); + if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) + return -EFAULT; + + hdr = &btf->hdr; + + btf_verifier_log_hdr(env, btf_data_size); - hdr = btf->hdr; if (hdr->magic != BTF_MAGIC) { btf_verifier_log(env, "Invalid magic"); return -EINVAL; @@ -1925,26 +2095,14 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return -ENOTSUPP; } - meta_left = btf->data_size - sizeof(*hdr); - if (!meta_left) { + if (btf_data_size == hdr->hdr_len) { btf_verifier_log(env, "No data"); return -EINVAL; } - if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off || - /* Type section must align to 4 bytes */ - hdr->type_off & (sizeof(u32) - 1)) { - btf_verifier_log(env, "Invalid type_off"); - return -EINVAL; - } - - if (meta_left < hdr->str_off || - meta_left - hdr->str_off < hdr->str_len) { - btf_verifier_log(env, "Invalid str_off or str_len"); - return -EINVAL; - } - - btf->nohdr_data = btf->hdr + 1; + err = btf_check_sec_info(env, btf_data_size); + if (err) + return err; return 0; } @@ -1987,6 +2145,11 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, err = -ENOMEM; goto errout; } + env->btf = btf; + + err = btf_parse_hdr(env, btf_data, btf_data_size); + if (err) + goto errout; data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { @@ -1996,18 +2159,13 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, btf->data = data; btf->data_size = btf_data_size; + btf->nohdr_data = btf->data + btf->hdr.hdr_len; if (copy_from_user(data, btf_data, btf_data_size)) { err = -EFAULT; goto errout; } - env->btf = btf; - - err = btf_parse_hdr(env); - if (err) - goto errout; - err = btf_parse_str_sec(env); if (err) goto errout; @@ -2016,16 +2174,14 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, if (err) goto errout; - if (!err && log->level && bpf_verifier_log_full(log)) { + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; goto errout; } - if (!err) { - btf_verifier_env_free(env); - refcount_set(&btf->refcnt, 1); - return btf; - } + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; errout: btf_verifier_env_free(env); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index c95b04ec103e..e0918d180f08 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 565f9ece9115..ae16d0c373ef 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,15 +48,25 @@ * calls will fail at this point. */ #include <linux/bpf.h> +#include <net/xdp.h> #include <linux/filter.h> +#include <trace/events/xdp.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define DEV_MAP_BULK_SIZE 16 +struct xdp_bulk_queue { + struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct net_device *dev_rx; + unsigned int count; +}; + struct bpf_dtab_netdev { - struct net_device *dev; + struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; unsigned int bit; + struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; }; @@ -206,6 +216,50 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) __set_bit(bit, bitmap); } +static int bq_xmit_all(struct bpf_dtab_netdev *obj, + struct xdp_bulk_queue *bq) +{ + struct net_device *dev = obj->dev; + int sent = 0, drops = 0, err = 0; + int i; + + if (unlikely(!bq->count)) + return 0; + + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + prefetch(xdpf); + } + + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); + if (sent < 0) { + err = sent; + sent = 0; + goto error; + } + drops = bq->count - sent; +out: + bq->count = 0; + + trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + sent, drops, bq->dev_rx, dev, err); + bq->dev_rx = NULL; + return 0; +error: + /* If ndo_xdp_xmit fails with an errno, no frames have been + * xmit'ed and it's our responsibility to them free all. + */ + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + /* RX path under NAPI protection, can return frames faster */ + xdp_return_frame_rx_napi(xdpf); + drops++; + } + goto out; +} + /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled @@ -221,6 +275,7 @@ void __dev_map_flush(struct bpf_map *map) for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); + struct xdp_bulk_queue *bq; struct net_device *netdev; /* This is possible if the dev entry is removed by user space @@ -230,6 +285,9 @@ void __dev_map_flush(struct bpf_map *map) continue; __clear_bit(bit, bitmap); + + bq = this_cpu_ptr(dev->bulkq); + bq_xmit_all(dev, bq); netdev = dev->dev; if (likely(netdev->netdev_ops->ndo_xdp_flush)) netdev->netdev_ops->ndo_xdp_flush(netdev); @@ -240,21 +298,61 @@ void __dev_map_flush(struct bpf_map *map) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_dtab_netdev *dev; + struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; - dev = READ_ONCE(dtab->netdev_map[key]); - return dev ? dev->dev : NULL; + obj = READ_ONCE(dtab->netdev_map[key]); + return obj; +} + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, + struct net_device *dev_rx) + +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + + if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) + bq_xmit_all(obj, bq); + + /* Ingress dev_rx will be the same for all xdp_frame's in + * bulk_queue, because bq stored per-CPU and must be flushed + * from net_device drivers NAPI func end. + */ + if (!bq->dev_rx) + bq->dev_rx = dev_rx; + + bq->q[bq->count++] = xdpf; + return 0; +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct net_device *dev = dst->dev; + struct xdp_frame *xdpf; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + return bq_enqueue(dst, xdpf, dev_rx); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { - struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key); + struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); + struct net_device *dev = dev = obj ? obj->dev : NULL; return dev ? &dev->ifindex : NULL; } @@ -263,13 +361,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_flush) { struct net_device *fl = dev->dev; + struct xdp_bulk_queue *bq; unsigned long *bitmap; + int cpu; for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); + bq = per_cpu_ptr(dev->bulkq, cpu); + bq_xmit_all(dev, bq); + fl->netdev_ops->ndo_xdp_flush(dev->dev); } } @@ -281,6 +384,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu) dev = container_of(rcu, struct bpf_dtab_netdev, rcu); dev_map_flush_old(dev); + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -313,6 +417,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct net *net = current->nsproxy->net_ns; + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; @@ -327,13 +432,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - map->numa_node); + dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); if (!dev) return -ENOMEM; + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return -ENOMEM; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { + free_percpu(dev->bulkq); kfree(dev); return -EINVAL; } @@ -405,6 +517,9 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ + BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != + offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cd832250a478..52a91d816c0e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -523,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk, } bpf_compute_data_pointers_sg(md); + md->sk = sk; rc = (*prog->bpf_func)(md, prog->insnsi); psock->apply_bytes = md->apply_bytes; @@ -1713,7 +1714,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; bool new = false; - int err; + int err = 0; /* 1. If sock map has BPF programs those will be inherited by the * sock being added. If the sock is already attached to BPF programs @@ -1823,7 +1824,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); return err; out_free: - kfree(e); smap_release_sock(psock, sock); out_progs: if (parse && verdict) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bfcde949c7f8..388d4feda348 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,7 +18,9 @@ #include <linux/vmalloc.h> #include <linux/mmzone.h> #include <linux/anon_inodes.h> +#include <linux/fdtable.h> #include <linux/file.h> +#include <linux/fs.h> #include <linux/license.h> #include <linux/filter.h> #include <linux/version.h> @@ -65,9 +67,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = { * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. */ -static int check_uarg_tail_zero(void __user *uaddr, - size_t expected_size, - size_t actual_size) +int bpf_check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) { unsigned char __user *addr; unsigned char __user *end; @@ -422,7 +424,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD btf_value_id +#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -457,10 +459,10 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->usercnt, 1); if (bpf_map_support_seq_show(map) && - (attr->btf_key_id || attr->btf_value_id)) { + (attr->btf_key_type_id || attr->btf_value_type_id)) { struct btf *btf; - if (!attr->btf_key_id || !attr->btf_value_id) { + if (!attr->btf_key_type_id || !attr->btf_value_type_id) { err = -EINVAL; goto free_map_nouncharge; } @@ -471,16 +473,16 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = map->ops->map_check_btf(map, btf, attr->btf_key_id, - attr->btf_value_id); + err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; } map->btf = btf; - map->btf_key_id = attr->btf_key_id; - map->btf_value_id = attr->btf_value_id; + map->btf_key_type_id = attr->btf_key_type_id; + map->btf_value_type_id = attr->btf_value_type_id; } err = security_bpf_map_alloc(map); @@ -1899,7 +1901,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, u32 ulen; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -1933,6 +1935,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; + info.nr_jited_ksyms = 0; goto done; } @@ -1969,18 +1972,93 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, * for offload. */ ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; + if (prog->aux->func_cnt) { + u32 i; + + info.jited_prog_len = 0; + for (i = 0; i < prog->aux->func_cnt; i++) + info.jited_prog_len += prog->aux->func[i]->jited_len; + } else { + info.jited_prog_len = prog->jited_len; + } + if (info.jited_prog_len && ulen) { if (bpf_dump_raw_ok()) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; + + /* for multi-function programs, copy the JITed + * instructions for all the functions + */ + if (prog->aux->func_cnt) { + u32 len, free, i; + u8 *img; + + free = ulen; + for (i = 0; i < prog->aux->func_cnt; i++) { + len = prog->aux->func[i]->jited_len; + len = min_t(u32, len, free); + img = (u8 *) prog->aux->func[i]->bpf_func; + if (copy_to_user(uinsns, img, len)) + return -EFAULT; + uinsns += len; + free -= len; + if (!free) + break; + } + } else { + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } } else { info.jited_prog_insns = 0; } } + ulen = info.nr_jited_ksyms; + info.nr_jited_ksyms = prog->aux->func_cnt; + if (info.nr_jited_ksyms && ulen) { + if (bpf_dump_raw_ok()) { + u64 __user *user_ksyms; + ulong ksym_addr; + u32 i; + + /* copy the address of the kernel symbol + * corresponding to each function + */ + ulen = min_t(u32, info.nr_jited_ksyms, ulen); + user_ksyms = u64_to_user_ptr(info.jited_ksyms); + for (i = 0; i < ulen; i++) { + ksym_addr = (ulong) prog->aux->func[i]->bpf_func; + ksym_addr &= PAGE_MASK; + if (put_user((u64) ksym_addr, &user_ksyms[i])) + return -EFAULT; + } + } else { + info.jited_ksyms = 0; + } + } + + ulen = info.nr_jited_func_lens; + info.nr_jited_func_lens = prog->aux->func_cnt; + if (info.nr_jited_func_lens && ulen) { + if (bpf_dump_raw_ok()) { + u32 __user *user_lens; + u32 func_len, i; + + /* copy the JITed image lengths for each function */ + ulen = min_t(u32, info.nr_jited_func_lens, ulen); + user_lens = u64_to_user_ptr(info.jited_func_lens); + for (i = 0; i < ulen; i++) { + func_len = prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + info.jited_func_lens = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -1998,7 +2076,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, u32 info_len = attr->info.info_len; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -2013,8 +2091,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, if (map->btf) { info.btf_id = btf_id(map->btf); - info.btf_key_id = map->btf_key_id; - info.btf_value_id = map->btf_value_id; + info.btf_key_type_id = map->btf_key_type_id; + info.btf_value_type_id = map->btf_value_type_id; } if (bpf_map_is_dev_bound(map)) { @@ -2038,7 +2116,7 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, u32 info_len = attr->info.info_len; int err; - err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); if (err) return err; @@ -2102,6 +2180,132 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) return btf_get_fd_by_id(attr->btf_id); } +static int bpf_task_fd_query_copy(const union bpf_attr *attr, + union bpf_attr __user *uattr, + u32 prog_id, u32 fd_type, + const char *buf, u64 probe_offset, + u64 probe_addr) +{ + char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); + u32 len = buf ? strlen(buf) : 0, input_len; + int err = 0; + + if (put_user(len, &uattr->task_fd_query.buf_len)) + return -EFAULT; + input_len = attr->task_fd_query.buf_len; + if (input_len && ubuf) { + if (!len) { + /* nothing to copy, just make ubuf NULL terminated */ + char zero = '\0'; + + if (put_user(zero, ubuf)) + return -EFAULT; + } else if (input_len >= len + 1) { + /* ubuf can hold the string with NULL terminator */ + if (copy_to_user(ubuf, buf, len + 1)) + return -EFAULT; + } else { + /* ubuf cannot hold the string with NULL terminator, + * do a partial copy with NULL terminator. + */ + char zero = '\0'; + + err = -ENOSPC; + if (copy_to_user(ubuf, buf, input_len - 1)) + return -EFAULT; + if (put_user(zero, ubuf + input_len - 1)) + return -EFAULT; + } + } + + if (put_user(prog_id, &uattr->task_fd_query.prog_id) || + put_user(fd_type, &uattr->task_fd_query.fd_type) || + put_user(probe_offset, &uattr->task_fd_query.probe_offset) || + put_user(probe_addr, &uattr->task_fd_query.probe_addr)) + return -EFAULT; + + return err; +} + +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr + +static int bpf_task_fd_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + pid_t pid = attr->task_fd_query.pid; + u32 fd = attr->task_fd_query.fd; + const struct perf_event *event; + struct files_struct *files; + struct task_struct *task; + struct file *file; + int err; + + if (CHECK_ATTR(BPF_TASK_FD_QUERY)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->task_fd_query.flags != 0) + return -EINVAL; + + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); + if (!files) + return -ENOENT; + + err = 0; + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (!file) + err = -EBADF; + else + get_file(file); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (err) + goto out; + + if (file->f_op == &bpf_raw_tp_fops) { + struct bpf_raw_tracepoint *raw_tp = file->private_data; + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + + event = perf_get_event(file); + if (!IS_ERR(event)) { + u64 probe_offset, probe_addr; + u32 prog_id, fd_type; + const char *buf; + + err = bpf_get_perf_event_info(event, &prog_id, &fd_type, + &buf, &probe_offset, + &probe_addr); + if (!err) + err = bpf_task_fd_query_copy(attr, uattr, prog_id, + fd_type, buf, + probe_offset, + probe_addr); + goto put_file; + } + + err = -ENOTSUPP; +put_file: + fput(file); +out: + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -2110,7 +2314,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; - err = check_uarg_tail_zero(uattr, sizeof(attr), size); + err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; size = min_t(u32, size, sizeof(attr)); @@ -2188,6 +2392,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); break; + case BPF_TASK_FD_QUERY: + err = bpf_task_fd_query(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a9e4b1372da6..967cacf286ea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1262,6 +1262,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, switch (env->prog->type) { case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -5383,11 +5384,24 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->off = 0; insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) func[subprog]->bpf_func - __bpf_call_base; } + + /* we use the aux data to keep a list of the start addresses + * of the JITed images for each function in the program + * + * for some architectures, such as powerpc64, the imm field + * might not be large enough to hold the offset of the start + * address of the callee's JITed image from __bpf_call_base + * + * in such cases, we can lookup the start address of a callee + * by using its subprog id, available from the off field of + * the call instruction, as an index for this list + */ + func[i]->aux->func = func; + func[i]->aux->func_cnt = env->subprog_cnt; } for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; @@ -5413,17 +5427,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - unsigned long addr; - if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); - addr = (unsigned long)func[subprog]->bpf_func; - addr &= PAGE_MASK; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - addr - __bpf_call_base; + insn->imm = subprog; } prog->jited = 1; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index cb3a12137404..b3c557476a8d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XSKMAP used for AF_XDP sockets * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/bpf.h> diff --git a/kernel/events/core.c b/kernel/events/core.c index 67612ce359ad..6eeab86d24ba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd) return file; } +const struct perf_event *perf_get_event(struct file *file) +{ + if (file->f_op != &perf_fops) + return ERR_PTR(-EINVAL); + + return file->private_data; +} + const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ce2cbbff27e4..81fdf2fc94ac 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,6 +14,7 @@ #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/kprobes.h> +#include <linux/syscalls.h> #include <linux/error-injection.h> #include "trace_probe.h" @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) mutex_unlock(&bpf_event_mutex); return err; } + +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *fd_type, const char **buf, + u64 *probe_offset, u64 *probe_addr) +{ + bool is_tracepoint, is_syscall_tp; + struct bpf_prog *prog; + int flags, err = 0; + + prog = event->prog; + if (!prog) + return -ENOENT; + + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) + return -EOPNOTSUPP; + + *prog_id = prog->aux->id; + flags = event->tp_event->flags; + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; + is_syscall_tp = is_syscall_trace_event(event->tp_event); + + if (is_tracepoint || is_syscall_tp) { + *buf = is_tracepoint ? event->tp_event->tp->name + : event->tp_event->name; + *fd_type = BPF_FD_TYPE_TRACEPOINT; + *probe_offset = 0x0; + *probe_addr = 0x0; + } else { + /* kprobe/uprobe */ + err = -EOPNOTSUPP; +#ifdef CONFIG_KPROBE_EVENTS + if (flags & TRACE_EVENT_FL_KPROBE) + err = bpf_get_kprobe_info(event, fd_type, buf, + probe_offset, probe_addr, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (flags & TRACE_EVENT_FL_UPROBE) + err = bpf_get_uprobe_info(event, fd_type, buf, + probe_offset, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif + } + + return err; +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 02aed76e0978..daa81571b22a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); + +int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, + const char **symbol, u64 *probe_offset, + u64 *probe_addr, bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_kprobe *tk; + + if (perf_type_tracepoint) + tk = find_trace_kprobe(pevent, group); + else + tk = event->tp_event->data; + if (!tk) + return -EINVAL; + + *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE + : BPF_FD_TYPE_KPROBE; + if (tk->symbol) { + *symbol = tk->symbol; + *probe_offset = tk->rp.kp.offset; + *probe_addr = 0; + } else { + *symbol = NULL; + *probe_offset = 0; + *probe_addr = (unsigned long)tk->rp.kp.addr; + } + return 0; +} #endif /* CONFIG_PERF_EVENTS */ /* diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ac892878dbe6..bf89a51e740d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, { __uprobe_perf_func(tu, func, regs, ucb, dsize); } + +int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, + const char **filename, u64 *probe_offset, + bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_uprobe *tu; + + if (perf_type_tracepoint) + tu = find_probe_event(pevent, group); + else + tu = event->tp_event->data; + if (!tu) + return -EINVAL; + + *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE + : BPF_FD_TYPE_UPROBE; + *filename = tu->filename; + *probe_offset = tu->offset; + return 0; +} #endif /* CONFIG_PERF_EVENTS */ static int diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 7a7fd672ccf2..9019f326fe81 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -30,7 +30,8 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && - nbp_switchdev_allowed_egress(p, skb); + nbp_switchdev_allowed_egress(p, skb) && + !br_skb_isolated(p, skb); } int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7f98a7d25866..72074276c088 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -114,6 +114,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb goto drop; BR_INPUT_SKB_CB(skb)->brdev = br->dev; + BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); if (IS_ENABLED(CONFIG_INET) && (skb->protocol == htons(ETH_P_ARP) || diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 015f465c514b..9f5eb05b0373 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -139,6 +139,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -213,7 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, BR_VLAN_TUNNEL)) || nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, - !!(p->flags & BR_NEIGH_SUPPRESS))) + !!(p->flags & BR_NEIGH_SUPPRESS)) || + nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -660,6 +662,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 }, [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, + [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -810,6 +813,10 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) if (err) return err; + err = br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); + if (err) + return err; + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 742f40aefdaf..11520ed528b0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -423,6 +423,7 @@ struct br_input_skb_cb { #endif bool proxyarp_replied; + bool src_port_isolated; #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; @@ -574,6 +575,14 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig); +/* return true if both source port and dest port are isolated */ +static inline bool br_skb_isolated(const struct net_bridge_port *to, + const struct sk_buff *skb) +{ + return BR_INPUT_SKB_CB(skb)->src_port_isolated && + (to->flags & BR_ISOLATED); +} + /* br_if.c */ void br_port_carrier_check(struct net_bridge_port *p, bool *notified); int br_add_bridge(struct net *net, const char *name); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index fd31ad83ec7b..f99c5bf5c906 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -192,6 +192,7 @@ BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); +BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -243,6 +244,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_broadcast_flood, &brport_attr_group_fwd_mask, &brport_attr_neigh_suppress, + &brport_attr_isolated, NULL }; diff --git a/net/core/filter.c b/net/core/filter.c index 51ea7ddb2d8d..acf1f4fb99d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -64,6 +64,10 @@ #include <net/ip_fib.h> #include <net/flow.h> #include <net/arp.h> +#include <net/ipv6.h> +#include <linux/seg6_local.h> +#include <net/seg6.h> +#include <net/seg6_local.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3042,7 +3046,7 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int err; + int sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; @@ -3052,9 +3056,9 @@ static int __bpf_tx_xdp(struct net_device *dev, if (unlikely(!xdpf)) return -EOVERFLOW; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) - return err; + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf); + if (sent <= 0) + return sent; dev->netdev_ops->ndo_xdp_flush(dev); return 0; } @@ -3068,20 +3072,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: { - struct net_device *dev = fwd; - struct xdp_frame *xdpf; + struct bpf_dtab_netdev *dst = fwd; - if (!dev->netdev_ops->ndo_xdp_xmit) - return -EOPNOTSUPP; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - - /* TODO: move to inside map code instead, for bulk support - * err = dev_map_enqueue(dev, xdp); - */ - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; __dev_map_insert_ctx(map, index); @@ -3370,28 +3363,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == bpf_skb_change_tail || - func == bpf_skb_adjust_room || - func == bpf_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_xdp_adjust_tail) - return true; - - return false; -} - static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { @@ -4096,7 +4067,7 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, - u32 flags) + u32 flags, bool check_mtu) { struct in_device *in_dev; struct neighbour *neigh; @@ -4105,6 +4076,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct fib_nh *nh; struct flowi4 fl4; int err; + u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4156,6 +4128,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); + if (check_mtu) { + mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); + if (params->tot_len > mtu) + return 0; + } + nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ @@ -4184,7 +4162,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, - u32 flags) + u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; @@ -4195,6 +4173,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct flowi6 fl6; int strict = 0; int oif; + u32 mtu; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) @@ -4257,6 +4236,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl6.flowi6_oif, NULL, strict); + if (check_mtu) { + mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); + if (params->tot_len > mtu) + return 0; + } + if (f6i->fib6_nh.nh_lwtstate) return 0; @@ -4289,12 +4274,12 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, - flags); + flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, - flags); + flags, true); #endif } return 0; @@ -4313,20 +4298,34 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { + struct net *net = dev_net(skb->dev); + int index = 0; + if (plen < sizeof(*params)) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); + index = bpf_ipv4_fib_lookup(net, params, flags, false); + break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); + index = bpf_ipv6_fib_lookup(net, params, flags, false); + break; #endif } - return -ENOTSUPP; + + if (index > 0) { + struct net_device *dev; + + dev = dev_get_by_index_rcu(net, index); + if (!is_skb_forwardable(dev, skb)) + index = 0; + } + + return index; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { @@ -4339,6 +4338,264 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .arg4_type = ARG_ANYTHING, }; +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) +{ + int err; + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; + + if (!seg6_validate_srh(srh, len)) + return -EINVAL; + + switch (type) { + case BPF_LWT_ENCAP_SEG6_INLINE: + if (skb->protocol != htons(ETH_P_IPV6)) + return -EBADMSG; + + err = seg6_do_srh_inline(skb, srh); + break; + case BPF_LWT_ENCAP_SEG6: + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); + break; + default: + return -EINVAL; + } + + bpf_compute_data_pointers(skb); + if (err) + return err; + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + return seg6_lookup_nexthop(skb, NULL, 0); +} +#endif /* CONFIG_IPV6_SEG6_BPF */ + +BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, + u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + case BPF_LWT_ENCAP_SEG6: + case BPF_LWT_ENCAP_SEG6_INLINE: + return bpf_push_seg6_encap(skb, type, hdr, len); +#endif + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_push_encap_proto = { + .func = bpf_lwt_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_tlvs, *srh_end, *ptr; + struct ipv6_sr_hdr *srh; + int srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); + + ptr = skb->data + offset; + if (ptr >= srh_tlvs && ptr + len <= srh_end) + srh_state->valid = 0; + else if (ptr < (void *)&srh->flags || + ptr + len > (void *)&srh->segments) + return -EFAULT; + + if (unlikely(bpf_try_make_writable(skb, offset + len))) + return -EFAULT; + + memcpy(skb->data + offset, from, len); + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { + .func = bpf_lwt_seg6_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, + u32, action, void *, param, u32, param_len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh; + int srhoff = 0; + int err; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + if (!srh_state->valid) { + if (unlikely((srh_state->hdrlen & 7) != 0)) + return -EBADMSG; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); + if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + return -EBADMSG; + + srh_state->valid = 1; + } + + switch (action) { + case SEG6_LOCAL_ACTION_END_X: + if (param_len != sizeof(struct in6_addr)) + return -EINVAL; + return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); + case SEG6_LOCAL_ACTION_END_T: + if (param_len != sizeof(int)) + return -EINVAL; + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_B6: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + case SEG6_LOCAL_ACTION_END_B6_ENCAP: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + default: + return -EINVAL; + } +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { + .func = bpf_lwt_seg6_action, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, + s32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_end, *srh_tlvs, *ptr; + struct ipv6_sr_hdr *srh; + struct ipv6hdr *hdr; + int srhoff = 0; + int ret; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + + srh_state->hdrlen); + ptr = skb->data + offset; + + if (unlikely(ptr < srh_tlvs || ptr > srh_end)) + return -EFAULT; + if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) + return -EFAULT; + + if (len > 0) { + ret = skb_cow_head(skb, len); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, offset, len); + } else { + ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); + } + + bpf_compute_data_pointers(skb); + if (unlikely(ret < 0)) + return ret; + + hdr = (struct ipv6hdr *)skb->data; + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + srh_state->hdrlen += len; + srh_state->valid = 0; + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { + .func = bpf_lwt_seg6_adjust_srh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) +{ + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_head || + func == bpf_skb_change_tail || + func == bpf_skb_adjust_room || + func == bpf_skb_pull_data || + func == bpf_clone_redirect || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data || + func == bpf_xdp_adjust_tail || + func == bpf_lwt_push_encap || + func == bpf_lwt_seg6_store_bytes || + func == bpf_lwt_seg6_adjust_srh || + func == bpf_lwt_seg6_action + ) + return true; + + return false; +} + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -4523,33 +4780,6 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_skb_load_bytes: - return &bpf_skb_load_bytes_proto; - case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; - case BPF_FUNC_csum_diff: - return &bpf_csum_diff_proto; - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_proto; - case BPF_FUNC_get_route_realm: - return &bpf_get_route_realm_proto; - case BPF_FUNC_get_hash_recalc: - return &bpf_get_hash_recalc_proto; - case BPF_FUNC_perf_event_output: - return &bpf_skb_event_output_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_skb_under_cgroup: - return &bpf_skb_under_cgroup_proto; - default: - return bpf_base_func_proto(func_id); - } -} - -static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -4615,6 +4845,44 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } static const struct bpf_func_proto * +lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_push_encap_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -4645,7 +4913,22 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id, prog); + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_seg6_store_bytes: + return &bpf_lwt_seg6_store_bytes_proto; + case BPF_FUNC_lwt_seg6_action: + return &bpf_lwt_seg6_action_proto; + case BPF_FUNC_lwt_seg6_adjust_srh: + return &bpf_lwt_seg6_adjust_srh_proto; + default: + return lwt_out_func_proto(func_id, prog); } } @@ -4753,7 +5036,6 @@ static bool lwt_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } - /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, @@ -5155,18 +5437,23 @@ static bool sk_msg_is_valid_access(int off, int size, switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; + if (size != sizeof(__u64)) + return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; + if (size != sizeof(__u64)) + return false; break; + default: + if (size != sizeof(__u32)) + return false; } if (off < 0 || off >= sizeof(struct sk_msg_md)) return false; if (off % size != 0) return false; - if (size != sizeof(__u64)) - return false; return true; } @@ -5842,7 +6129,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, local_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -6159,6 +6447,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct sk_msg_md, data): @@ -6171,6 +6460,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, data_end)); break; + case offsetof(struct sk_msg_md, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct sk_msg_md, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct sk_msg_md, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct sk_msg_md, remote_ip6[0]) ... + offsetof(struct sk_msg_md, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, local_ip6[0]) ... + offsetof(struct sk_msg_md, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct sk_msg_md, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; } return insn - insn_buf; @@ -6219,13 +6609,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_verifier_ops = { - .get_func_proto = lwt_inout_func_proto, +const struct bpf_verifier_ops lwt_in_verifier_ops = { + .get_func_proto = lwt_in_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_prog_ops lwt_inout_prog_ops = { +const struct bpf_prog_ops lwt_in_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_out_verifier_ops = { + .get_func_proto = lwt_out_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -6240,6 +6640,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; +const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { + .get_func_proto = lwt_seg6local_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_seg6local_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 380934580fa1..419af6dfe29f 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -35,10 +35,6 @@ #include <trace/events/tcp.h> #include <trace/events/fib.h> #include <trace/events/qdisc.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <trace/events/fib6.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); -#endif #if IS_ENABLED(CONFIG_BRIDGE) #include <trace/events/bridge.h> EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); diff --git a/net/core/xdp.c b/net/core/xdp.c index bf6758f74339..cb8c4e061a5a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,7 +308,13 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -static void xdp_return(void *data, struct xdp_mem_info *mem) +/* XDP RX runs under NAPI protection, and in different delivery error + * scenarios (e.g. queue full), it is possible to return the xdp_frame + * while still leveraging this protection. The @napi_direct boolian + * is used for those calls sites. Thus, allowing for faster recycling + * of xdp_frames/pages in those cases. + */ +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); if (xa) - page_pool_put_page(xa->page_pool, page); + page_pool_put_page(xa->page_pool, page, napi_direct); else put_page(page); rcu_read_unlock(); @@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) void xdp_return_frame(struct xdp_frame *xdpf) { - xdp_return(xdpf->data, &xdpf->mem); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) +{ + __xdp_return(xdpf->data, &xdpf->mem, true); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); + void xdp_return_buff(struct xdp_buff *xdp) { - xdp_return(xdp->data, &xdp->rxq->mem); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 897ae92dff0f..045c43a27c12 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -354,8 +354,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.fl4_dport = 0; } - trace_fib_validate_source(dev, &fl4); - if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3dcffd3ce98c..65c340f230ae 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1326,14 +1326,14 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, unsigned long index; t_key cindex; - trace_fib_table_lookup(tb->tb_id, flp); - pn = t->kv; cindex = 0; n = get_child_rcu(pn, cindex); - if (!n) + if (!n) { + trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; + } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->gets); @@ -1416,8 +1416,11 @@ backtrace: * nothing for us to do as we do not have any * further nodes to parse. */ - if (IS_TRIE(pn)) + if (IS_TRIE(pn)) { + trace_fib_table_lookup(tb->tb_id, flp, + NULL, -EAGAIN); return -EAGAIN; + } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif @@ -1459,6 +1462,7 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif + trace_fib_table_lookup(tb->tb_id, flp, NULL, err); return err; } if (fi->fib_flags & RTNH_F_DEAD) @@ -1494,7 +1498,7 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif - trace_fib_table_lookup_nh(nh); + trace_fib_table_lookup(tb->tb_id, flp, nh, err); return err; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0e401dc4e1bd..45ad2585eb28 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1352,6 +1352,37 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) return NULL; } +/* MTU selection: + * 1. mtu on route is locked - use it + * 2. mtu from nexthop exception + * 3. mtu from egress device + */ + +u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) +{ + struct fib_info *fi = res->fi; + struct fib_nh *nh = &fi->fib_nh[res->nh_sel]; + struct net_device *dev = nh->nh_dev; + u32 mtu = 0; + + if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || + fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) + mtu = fi->fib_mtu; + + if (likely(!mtu)) { + struct fib_nh_exception *fnhe; + + fnhe = find_exception(nh, daddr); + if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) + mtu = fnhe->fnhe_pmtu; + } + + if (likely(!mtu)) + mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); + + return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu); +} + static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr, const bool do_cache) { diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 11e4e80cf7e9..0eff75525da1 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -329,4 +329,9 @@ config IPV6_SEG6_HMAC If unsure, say N. +config IPV6_SEG6_BPF + def_bool y + depends on IPV6_SEG6_LWTUNNEL + depends on IPV6 = y + endif # IPV6 diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 2fe754fd4f5e..5cd0029d930e 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -161,12 +161,20 @@ eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i, return f6i; } +static u32 +eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + return 0; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, .fib6_lookup = eafnosupport_fib6_lookup, .fib6_multipath_select = eafnosupport_fib6_multipath_select, + .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 50de8b0d4f70..9ed0eae91758 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -894,6 +894,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .fib6_table_lookup = fib6_table_lookup, .fib6_lookup = fib6_lookup, .fib6_multipath_select = fib6_multipath_select, + .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 038d661d5ffc..22c4de2317d0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -64,14 +64,19 @@ #include <net/ip_tunnels.h> #include <net/l3mdev.h> #include <net/ip.h> -#include <trace/events/fib6.h> - #include <linux/uaccess.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif +static int ip6_rt_type_to_error(u8 fib6_type); + +#define CREATE_TRACE_POINTS +#include <trace/events/fib6.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); +#undef CREATE_TRACE_POINTS + enum rt6_nud_state { RT6_NUD_FAIL_HARD = -3, RT6_NUD_FAIL_PROBE = -2, @@ -2604,6 +2609,54 @@ out: return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } +/* MTU selection: + * 1. mtu on route is locked - use it + * 2. mtu from nexthop exception + * 3. mtu from egress device + * + * based on ip6_dst_mtu_forward and exception logic of + * rt6_find_cached_rt; called with rcu_read_lock + */ +u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct in6_addr *src_key; + struct inet6_dev *idev; + u32 mtu = 0; + + if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { + mtu = f6i->fib6_pmtu; + if (mtu) + goto out; + } + + src_key = NULL; +#ifdef CONFIG_IPV6_SUBTREES + if (f6i->fib6_src.plen) + src_key = saddr; +#endif + + bucket = rcu_dereference(f6i->rt6i_exception_bucket); + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) + mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); + + if (likely(!mtu)) { + struct net_device *dev = fib6_info_nh_dev(f6i); + + mtu = IPV6_MIN_MTU; + idev = __in6_dev_get(dev); + if (idev && idev->cnf.mtu6 > mtu) + mtu = idev->cnf.mtu6; + } + + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); +out: + return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); +} + struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 45722327375a..cd6e4cab63f6 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1,8 +1,9 @@ /* * SR-IPv6 implementation * - * Author: + * Authors: * David Lebrun <[email protected]> + * eBPF support: Mathieu Xhonneux <[email protected]> * * * This program is free software; you can redistribute it and/or @@ -30,7 +31,9 @@ #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif +#include <net/seg6_local.h> #include <linux/etherdevice.h> +#include <linux/bpf.h> struct seg6_local_lwt; @@ -41,6 +44,11 @@ struct seg6_action_desc { int static_headroom; }; +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + struct seg6_local_lwt { int action; struct ipv6_sr_hdr *srh; @@ -49,6 +57,7 @@ struct seg6_local_lwt { struct in6_addr nh6; int iif; int oif; + struct bpf_lwt_prog bpf; int headroom; struct seg6_action_desc *desc; @@ -140,8 +149,8 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr) *daddr = *addr; } -static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, - u32 tbl_id) +int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id) { struct net *net = dev_net(skb->dev); struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -187,6 +196,7 @@ out: skb_dst_drop(skb); skb_dst_set(skb, dst); + return dst->error; } /* regular endpoint function */ @@ -200,7 +210,7 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - lookup_nexthop(skb, NULL, 0); + seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); @@ -220,7 +230,7 @@ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - lookup_nexthop(skb, &slwt->nh6, 0); + seg6_lookup_nexthop(skb, &slwt->nh6, 0); return dst_input(skb); @@ -239,7 +249,7 @@ static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt) advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - lookup_nexthop(skb, NULL, slwt->table); + seg6_lookup_nexthop(skb, NULL, slwt->table); return dst_input(skb); @@ -331,7 +341,7 @@ static int input_action_end_dx6(struct sk_buff *skb, if (!ipv6_addr_any(&slwt->nh6)) nhaddr = &slwt->nh6; - lookup_nexthop(skb, nhaddr, 0); + seg6_lookup_nexthop(skb, nhaddr, 0); return dst_input(skb); drop: @@ -380,7 +390,7 @@ static int input_action_end_dt6(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - lookup_nexthop(skb, NULL, slwt->table); + seg6_lookup_nexthop(skb, NULL, slwt->table); return dst_input(skb); @@ -406,7 +416,7 @@ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt) ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - lookup_nexthop(skb, NULL, 0); + seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); @@ -438,7 +448,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb, ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - lookup_nexthop(skb, NULL, 0); + seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); @@ -447,6 +457,71 @@ drop: return err; } +DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); + +static int input_action_end_bpf(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct seg6_bpf_srh_state local_srh_state; + struct ipv6_sr_hdr *srh; + int srhoff = 0; + int ret; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + /* preempt_disable is needed to protect the per-CPU buffer srh_state, + * which is also accessed by the bpf_lwt_seg6_* helpers + */ + preempt_disable(); + srh_state->hdrlen = srh->hdrlen << 3; + srh_state->valid = 1; + + rcu_read_lock(); + bpf_compute_data_pointers(skb); + ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); + rcu_read_unlock(); + + local_srh_state = *srh_state; + preempt_enable(); + + switch (ret) { + case BPF_OK: + case BPF_REDIRECT: + break; + case BPF_DROP: + goto drop; + default: + pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret); + goto drop; + } + + if (unlikely((local_srh_state.hdrlen & 7) != 0)) + goto drop; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + goto drop; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3); + + if (!local_srh_state.valid && + unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + goto drop; + + if (ret != BPF_REDIRECT) + seg6_lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + static struct seg6_action_desc seg6_action_table[] = { { .action = SEG6_LOCAL_ACTION_END, @@ -493,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = { .attrs = (1 << SEG6_LOCAL_SRH), .input = input_action_end_b6_encap, .static_headroom = sizeof(struct ipv6hdr), - } + }, + { + .action = SEG6_LOCAL_ACTION_END_BPF, + .attrs = (1 << SEG6_LOCAL_BPF), + .input = input_action_end_bpf, + }, + }; static struct seg6_action_desc *__get_action_desc(int action) @@ -538,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { .len = sizeof(struct in6_addr) }, [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, + [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, }; static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) @@ -715,6 +797,75 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return 0; } +#define MAX_PROG_NAME 256 +static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = { + [SEG6_LOCAL_BPF_PROG] = { .type = NLA_U32, }, + [SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX, + attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL); + if (ret < 0) + return ret; + + if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME]) + return -EINVAL; + + slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL); + if (!slwt->bpf.name) + return -ENOMEM; + + fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]); + p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL); + if (IS_ERR(p)) { + kfree(slwt->bpf.name); + return PTR_ERR(p); + } + + slwt->bpf.prog = p; + return 0; +} + +static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct nlattr *nest; + + if (!slwt->bpf.prog) + return 0; + + nest = nla_nest_start(skb, SEG6_LOCAL_BPF); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, slwt->bpf.prog->aux->id)) + return -EMSGSIZE; + + if (slwt->bpf.name && + nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, slwt->bpf.name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + if (!a->bpf.name && !b->bpf.name) + return 0; + + if (!a->bpf.name || !b->bpf.name) + return 1; + + return strcmp(a->bpf.name, b->bpf.name); +} + struct seg6_action_param { int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); @@ -745,6 +896,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif, .put = put_nla_oif, .cmp = cmp_nla_oif }, + + [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf, + .put = put_nla_bpf, + .cmp = cmp_nla_bpf }, + }; static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) @@ -830,6 +986,13 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt) struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); kfree(slwt->srh); + + if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) { + kfree(slwt->bpf.name); + bpf_prog_put(slwt->bpf.prog); + } + + return; } static int seg6_local_fill_encap(struct sk_buff *skb, @@ -882,6 +1045,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) if (attrs & (1 << SEG6_LOCAL_OIF)) nlsize += nla_total_size(4); + if (attrs & (1 << SEG6_LOCAL_BPF)) + nlsize += nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + + nla_total_size(4); + return nlsize; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 963e4bf0aab8..a4a5ace834c3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -103,9 +103,10 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) } EXPORT_SYMBOL(unregister_tcf_proto_ops); -bool tcf_queue_work(struct work_struct *work) +bool tcf_queue_work(struct rcu_work *rwork, work_func_t func) { - return queue_work(tc_filter_wq, work); + INIT_RCU_WORK(rwork, func); + return queue_rcu_work(tc_filter_wq, rwork); } EXPORT_SYMBOL(tcf_queue_work); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 6b7ab3512f5b..95367f37098d 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -35,10 +35,7 @@ struct basic_filter { struct tcf_result res; struct tcf_proto *tp; struct list_head link; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -97,21 +94,14 @@ static void __basic_delete_filter(struct basic_filter *f) static void basic_delete_filter_work(struct work_struct *work) { - struct basic_filter *f = container_of(work, struct basic_filter, work); - + struct basic_filter *f = container_of(to_rcu_work(work), + struct basic_filter, + rwork); rtnl_lock(); __basic_delete_filter(f); rtnl_unlock(); } -static void basic_delete_filter(struct rcu_head *head) -{ - struct basic_filter *f = container_of(head, struct basic_filter, rcu); - - INIT_WORK(&f->work, basic_delete_filter_work); - tcf_queue_work(&f->work); -} - static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct basic_head *head = rtnl_dereference(tp->root); @@ -122,7 +112,7 @@ static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) tcf_unbind_filter(tp, &f->res); idr_remove(&head->handle_idr, f->handle); if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, basic_delete_filter); + tcf_queue_work(&f->rwork, basic_delete_filter_work); else __basic_delete_filter(f); } @@ -140,7 +130,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last, tcf_unbind_filter(tp, &f->res); idr_remove(&head->handle_idr, f->handle); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, basic_delete_filter); + tcf_queue_work(&f->rwork, basic_delete_filter_work); *last = list_empty(&head->flist); return 0; } @@ -234,7 +224,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, list_replace_rcu(&fold->link, &fnew->link); tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); - call_rcu(&fold->rcu, basic_delete_filter); + tcf_queue_work(&fold->rwork, basic_delete_filter_work); } else { list_add_rcu(&fnew->link, &head->flist); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index b07c1fa8bc0d..1aa7f6511065 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -49,10 +49,7 @@ struct cls_bpf_prog { struct sock_filter *bpf_ops; const char *bpf_name; struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { @@ -275,21 +272,14 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) static void cls_bpf_delete_prog_work(struct work_struct *work) { - struct cls_bpf_prog *prog = container_of(work, struct cls_bpf_prog, work); - + struct cls_bpf_prog *prog = container_of(to_rcu_work(work), + struct cls_bpf_prog, + rwork); rtnl_lock(); __cls_bpf_delete_prog(prog); rtnl_unlock(); } -static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) -{ - struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu); - - INIT_WORK(&prog->work, cls_bpf_delete_prog_work); - tcf_queue_work(&prog->work); -} - static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct netlink_ext_ack *extack) { @@ -300,7 +290,7 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog, list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); if (tcf_exts_get_net(&prog->exts)) - call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu); + tcf_queue_work(&prog->rwork, cls_bpf_delete_prog_work); else __cls_bpf_delete_prog(prog); } @@ -526,7 +516,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); tcf_exts_get_net(&oldprog->exts); - call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu); + tcf_queue_work(&oldprog->rwork, cls_bpf_delete_prog_work); } else { list_add_rcu(&prog->link, &head->plist); } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 762da5c0cf5e..3bc01bdde165 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -23,10 +23,7 @@ struct cls_cgroup_head { struct tcf_exts exts; struct tcf_ematch_tree ematches; struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -70,24 +67,14 @@ static void __cls_cgroup_destroy(struct cls_cgroup_head *head) static void cls_cgroup_destroy_work(struct work_struct *work) { - struct cls_cgroup_head *head = container_of(work, + struct cls_cgroup_head *head = container_of(to_rcu_work(work), struct cls_cgroup_head, - work); + rwork); rtnl_lock(); __cls_cgroup_destroy(head); rtnl_unlock(); } -static void cls_cgroup_destroy_rcu(struct rcu_head *root) -{ - struct cls_cgroup_head *head = container_of(root, - struct cls_cgroup_head, - rcu); - - INIT_WORK(&head->work, cls_cgroup_destroy_work); - tcf_queue_work(&head->work); -} - static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -134,7 +121,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, rcu_assign_pointer(tp->root, new); if (head) { tcf_exts_get_net(&head->exts); - call_rcu(&head->rcu, cls_cgroup_destroy_rcu); + tcf_queue_work(&head->rwork, cls_cgroup_destroy_work); } return 0; errout: @@ -151,7 +138,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp, /* Head can still be NULL due to cls_cgroup_init(). */ if (head) { if (tcf_exts_get_net(&head->exts)) - call_rcu(&head->rcu, cls_cgroup_destroy_rcu); + tcf_queue_work(&head->rwork, cls_cgroup_destroy_work); else __cls_cgroup_destroy(head); } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index cd5fe383afdd..2bb043cd436b 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -57,10 +57,7 @@ struct flow_filter { u32 divisor; u32 baseclass; u32 hashrnd; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static inline u32 addr_fold(void *addr) @@ -383,21 +380,14 @@ static void __flow_destroy_filter(struct flow_filter *f) static void flow_destroy_filter_work(struct work_struct *work) { - struct flow_filter *f = container_of(work, struct flow_filter, work); - + struct flow_filter *f = container_of(to_rcu_work(work), + struct flow_filter, + rwork); rtnl_lock(); __flow_destroy_filter(f); rtnl_unlock(); } -static void flow_destroy_filter(struct rcu_head *head) -{ - struct flow_filter *f = container_of(head, struct flow_filter, rcu); - - INIT_WORK(&f->work, flow_destroy_filter_work); - tcf_queue_work(&f->work); -} - static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -563,7 +553,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (fold) { tcf_exts_get_net(&fold->exts); - call_rcu(&fold->rcu, flow_destroy_filter); + tcf_queue_work(&fold->rwork, flow_destroy_filter_work); } return 0; @@ -583,7 +573,7 @@ static int flow_delete(struct tcf_proto *tp, void *arg, bool *last, list_del_rcu(&f->list); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, flow_destroy_filter); + tcf_queue_work(&f->rwork, flow_destroy_filter_work); *last = list_empty(&head->filters); return 0; } @@ -608,7 +598,7 @@ static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) list_for_each_entry_safe(f, next, &head->filters, list) { list_del_rcu(&f->list); if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, flow_destroy_filter); + tcf_queue_work(&f->rwork, flow_destroy_filter_work); else __flow_destroy_filter(f); } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index eacaaf803914..4e74508515f4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -73,10 +73,7 @@ struct fl_flow_mask { struct cls_fl_head { struct rhashtable ht; struct list_head masks; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; struct idr handle_idr; }; @@ -90,10 +87,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; struct net_device *hw_dev; }; @@ -235,21 +229,14 @@ static void __fl_destroy_filter(struct cls_fl_filter *f) static void fl_destroy_filter_work(struct work_struct *work) { - struct cls_fl_filter *f = container_of(work, struct cls_fl_filter, work); + struct cls_fl_filter *f = container_of(to_rcu_work(work), + struct cls_fl_filter, rwork); rtnl_lock(); __fl_destroy_filter(f); rtnl_unlock(); } -static void fl_destroy_filter(struct rcu_head *head) -{ - struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); - - INIT_WORK(&f->work, fl_destroy_filter_work); - tcf_queue_work(&f->work); -} - static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, struct netlink_ext_ack *extack) { @@ -327,7 +314,7 @@ static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, fl_hw_destroy_filter(tp, f, extack); tcf_unbind_filter(tp, &f->res); if (async) - call_rcu(&f->rcu, fl_destroy_filter); + tcf_queue_work(&f->rwork, fl_destroy_filter_work); else __fl_destroy_filter(f); @@ -336,20 +323,13 @@ static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, static void fl_destroy_sleepable(struct work_struct *work) { - struct cls_fl_head *head = container_of(work, struct cls_fl_head, - work); + struct cls_fl_head *head = container_of(to_rcu_work(work), + struct cls_fl_head, + rwork); kfree(head); module_put(THIS_MODULE); } -static void fl_destroy_rcu(struct rcu_head *rcu) -{ - struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu); - - INIT_WORK(&head->work, fl_destroy_sleepable); - schedule_work(&head->work); -} - static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); @@ -365,7 +345,7 @@ static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) idr_destroy(&head->handle_idr); __module_get(THIS_MODULE); - call_rcu(&head->rcu, fl_destroy_rcu); + tcf_queue_work(&head->rwork, fl_destroy_sleepable); } static void *fl_get(struct tcf_proto *tp, u32 handle) @@ -1036,7 +1016,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, list_replace_rcu(&fold->list, &fnew->list); tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); - call_rcu(&fold->rcu, fl_destroy_filter); + tcf_queue_work(&fold->rwork, fl_destroy_filter_work); } else { list_add_tail_rcu(&fnew->list, &fnew->mask->filters); } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 8b207723fbc2..29eeeaf3ea44 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -47,10 +47,7 @@ struct fw_filter { #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static u32 fw_hash(u32 handle) @@ -134,21 +131,14 @@ static void __fw_delete_filter(struct fw_filter *f) static void fw_delete_filter_work(struct work_struct *work) { - struct fw_filter *f = container_of(work, struct fw_filter, work); - + struct fw_filter *f = container_of(to_rcu_work(work), + struct fw_filter, + rwork); rtnl_lock(); __fw_delete_filter(f); rtnl_unlock(); } -static void fw_delete_filter(struct rcu_head *head) -{ - struct fw_filter *f = container_of(head, struct fw_filter, rcu); - - INIT_WORK(&f->work, fw_delete_filter_work); - tcf_queue_work(&f->work); -} - static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); @@ -164,7 +154,7 @@ static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, fw_delete_filter); + tcf_queue_work(&f->rwork, fw_delete_filter_work); else __fw_delete_filter(f); } @@ -193,7 +183,7 @@ static int fw_delete(struct tcf_proto *tp, void *arg, bool *last, RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, fw_delete_filter); + tcf_queue_work(&f->rwork, fw_delete_filter_work); ret = 0; break; } @@ -316,7 +306,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, rcu_assign_pointer(*fp, fnew); tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, fw_delete_filter); + tcf_queue_work(&f->rwork, fw_delete_filter_work); *arg = fnew; return err; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 2ba721a590a7..47b207ef7762 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,10 +21,7 @@ struct cls_mall_head { struct tcf_result res; u32 handle; u32 flags; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -53,22 +50,14 @@ static void __mall_destroy(struct cls_mall_head *head) static void mall_destroy_work(struct work_struct *work) { - struct cls_mall_head *head = container_of(work, struct cls_mall_head, - work); + struct cls_mall_head *head = container_of(to_rcu_work(work), + struct cls_mall_head, + rwork); rtnl_lock(); __mall_destroy(head); rtnl_unlock(); } -static void mall_destroy_rcu(struct rcu_head *rcu) -{ - struct cls_mall_head *head = container_of(rcu, struct cls_mall_head, - rcu); - - INIT_WORK(&head->work, mall_destroy_work); - tcf_queue_work(&head->work); -} - static void mall_destroy_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, unsigned long cookie, @@ -126,7 +115,7 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) mall_destroy_hw_filter(tp, head, (unsigned long) head, extack); if (tcf_exts_get_net(&head->exts)) - call_rcu(&head->rcu, mall_destroy_rcu); + tcf_queue_work(&head->rwork, mall_destroy_work); else __mall_destroy(head); } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 21a03a8ee029..0404aa5fa7cb 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -57,10 +57,7 @@ struct route4_filter { u32 handle; struct route4_bucket *bkt; struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) @@ -266,19 +263,17 @@ static void __route4_delete_filter(struct route4_filter *f) static void route4_delete_filter_work(struct work_struct *work) { - struct route4_filter *f = container_of(work, struct route4_filter, work); - + struct route4_filter *f = container_of(to_rcu_work(work), + struct route4_filter, + rwork); rtnl_lock(); __route4_delete_filter(f); rtnl_unlock(); } -static void route4_delete_filter(struct rcu_head *head) +static void route4_queue_work(struct route4_filter *f) { - struct route4_filter *f = container_of(head, struct route4_filter, rcu); - - INIT_WORK(&f->work, route4_delete_filter_work); - tcf_queue_work(&f->work); + tcf_queue_work(&f->rwork, route4_delete_filter_work); } static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) @@ -304,7 +299,7 @@ static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) RCU_INIT_POINTER(b->ht[h2], next); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, route4_delete_filter); + route4_queue_work(f); else __route4_delete_filter(f); } @@ -349,7 +344,7 @@ static int route4_delete(struct tcf_proto *tp, void *arg, bool *last, /* Delete it */ tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); - call_rcu(&f->rcu, route4_delete_filter); + tcf_queue_work(&f->rwork, route4_delete_filter_work); /* Strip RTNL protected tree */ for (i = 0; i <= 32; i++) { @@ -554,7 +549,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (fold) { tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); - call_rcu(&fold->rcu, route4_delete_filter); + tcf_queue_work(&fold->rwork, route4_delete_filter_work); } return 0; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 4f1297657c27..e9ccf7daea7d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -97,10 +97,7 @@ struct rsvp_filter { u32 handle; struct rsvp_session *sess; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) @@ -294,21 +291,14 @@ static void __rsvp_delete_filter(struct rsvp_filter *f) static void rsvp_delete_filter_work(struct work_struct *work) { - struct rsvp_filter *f = container_of(work, struct rsvp_filter, work); - + struct rsvp_filter *f = container_of(to_rcu_work(work), + struct rsvp_filter, + rwork); rtnl_lock(); __rsvp_delete_filter(f); rtnl_unlock(); } -static void rsvp_delete_filter_rcu(struct rcu_head *head) -{ - struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu); - - INIT_WORK(&f->work, rsvp_delete_filter_work); - tcf_queue_work(&f->work); -} - static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) { tcf_unbind_filter(tp, &f->res); @@ -317,7 +307,7 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) * in cleanup() callback */ if (tcf_exts_get_net(&f->exts)) - call_rcu(&f->rcu, rsvp_delete_filter_rcu); + tcf_queue_work(&f->rwork, rsvp_delete_filter_work); else __rsvp_delete_filter(f); } diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index b49cc990a000..32f4bbd82f35 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -28,20 +28,14 @@ struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; struct tcindex_filter { u16 key; struct tcindex_filter_result result; struct tcindex_filter __rcu *next; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; }; @@ -152,21 +146,14 @@ static void tcindex_destroy_rexts_work(struct work_struct *work) { struct tcindex_filter_result *r; - r = container_of(work, struct tcindex_filter_result, work); + r = container_of(to_rcu_work(work), + struct tcindex_filter_result, + rwork); rtnl_lock(); __tcindex_destroy_rexts(r); rtnl_unlock(); } -static void tcindex_destroy_rexts(struct rcu_head *head) -{ - struct tcindex_filter_result *r; - - r = container_of(head, struct tcindex_filter_result, rcu); - INIT_WORK(&r->work, tcindex_destroy_rexts_work); - tcf_queue_work(&r->work); -} - static void __tcindex_destroy_fexts(struct tcindex_filter *f) { tcf_exts_destroy(&f->result.exts); @@ -176,23 +163,15 @@ static void __tcindex_destroy_fexts(struct tcindex_filter *f) static void tcindex_destroy_fexts_work(struct work_struct *work) { - struct tcindex_filter *f = container_of(work, struct tcindex_filter, - work); + struct tcindex_filter *f = container_of(to_rcu_work(work), + struct tcindex_filter, + rwork); rtnl_lock(); __tcindex_destroy_fexts(f); rtnl_unlock(); } -static void tcindex_destroy_fexts(struct rcu_head *head) -{ - struct tcindex_filter *f = container_of(head, struct tcindex_filter, - rcu); - - INIT_WORK(&f->work, tcindex_destroy_fexts_work); - tcf_queue_work(&f->work); -} - static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, struct netlink_ext_ack *extack) { @@ -228,12 +207,12 @@ found: */ if (f) { if (tcf_exts_get_net(&f->result.exts)) - call_rcu(&f->rcu, tcindex_destroy_fexts); + tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work); else __tcindex_destroy_fexts(f); } else { if (tcf_exts_get_net(&r->exts)) - call_rcu(&r->rcu, tcindex_destroy_rexts); + tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); else __tcindex_destroy_rexts(r); } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index bac47b5d18fd..fb861f90fde6 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -68,10 +68,7 @@ struct tc_u_knode { u32 __percpu *pcpu_success; #endif struct tcf_proto *tp; - union { - struct work_struct work; - struct rcu_head rcu; - }; + struct rcu_work rwork; /* The 'sel' field MUST be the last field in structure to allow for * tc_u32_keys allocated at end of structure. */ @@ -436,21 +433,14 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, */ static void u32_delete_key_work(struct work_struct *work) { - struct tc_u_knode *key = container_of(work, struct tc_u_knode, work); - + struct tc_u_knode *key = container_of(to_rcu_work(work), + struct tc_u_knode, + rwork); rtnl_lock(); u32_destroy_key(key->tp, key, false); rtnl_unlock(); } -static void u32_delete_key_rcu(struct rcu_head *rcu) -{ - struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); - - INIT_WORK(&key->work, u32_delete_key_work); - tcf_queue_work(&key->work); -} - /* u32_delete_key_freepf_rcu is the rcu callback variant * that free's the entire structure including the statistics * percpu variables. Only use this if the key is not a copy @@ -460,21 +450,14 @@ static void u32_delete_key_rcu(struct rcu_head *rcu) */ static void u32_delete_key_freepf_work(struct work_struct *work) { - struct tc_u_knode *key = container_of(work, struct tc_u_knode, work); - + struct tc_u_knode *key = container_of(to_rcu_work(work), + struct tc_u_knode, + rwork); rtnl_lock(); u32_destroy_key(key->tp, key, true); rtnl_unlock(); } -static void u32_delete_key_freepf_rcu(struct rcu_head *rcu) -{ - struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); - - INIT_WORK(&key->work, u32_delete_key_freepf_work); - tcf_queue_work(&key->work); -} - static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { struct tc_u_knode __rcu **kp; @@ -491,7 +474,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) tcf_unbind_filter(tp, &key->res); idr_remove(&ht->handle_idr, key->handle); tcf_exts_get_net(&key->exts); - call_rcu(&key->rcu, u32_delete_key_freepf_rcu); + tcf_queue_work(&key->rwork, u32_delete_key_freepf_work); return 0; } } @@ -611,7 +594,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, u32_remove_hw_knode(tp, n, extack); idr_remove(&ht->handle_idr, n->handle); if (tcf_exts_get_net(&n->exts)) - call_rcu(&n->rcu, u32_delete_key_freepf_rcu); + tcf_queue_work(&n->rwork, u32_delete_key_freepf_work); else u32_destroy_key(n->tp, n, true); } @@ -995,7 +978,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); tcf_exts_get_net(&n->exts); - call_rcu(&n->rcu, u32_delete_key_rcu); + tcf_queue_work(&n->rwork, u32_delete_key_work); return 0; } diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 074fb2b2d51c..04f073146256 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,2 +1 @@ obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o - diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 2b47a1dd7c6c..87998818116f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/init.h> @@ -25,39 +16,25 @@ #define XDP_UMEM_MIN_FRAME_SIZE 2048 -int xdp_umem_create(struct xdp_umem **umem) -{ - *umem = kzalloc(sizeof(**umem), GFP_KERNEL); - - if (!(*umem)) - return -ENOMEM; - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unsigned int i; - if (umem->pgs) { - for (i = 0; i < umem->npgs; i++) { - struct page *page = umem->pgs[i]; - - set_page_dirty_lock(page); - put_page(page); - } + for (i = 0; i < umem->npgs; i++) { + struct page *page = umem->pgs[i]; - kfree(umem->pgs); - umem->pgs = NULL; + set_page_dirty_lock(page); + put_page(page); } + + kfree(umem->pgs); + umem->pgs = NULL; } static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { - if (umem->user) { - atomic_long_sub(umem->npgs, &umem->user->locked_vm); - free_uid(umem->user); - } + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); } static void xdp_umem_release(struct xdp_umem *umem) @@ -75,22 +52,18 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - if (umem->pgs) { - xdp_umem_unpin_pages(umem); - - task = get_pid_task(umem->pid, PIDTYPE_PID); - put_pid(umem->pid); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; + xdp_umem_unpin_pages(umem); - mmput(mm); - umem->pgs = NULL; - } + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + mmput(mm); xdp_umem_unaccount_pages(umem); out: kfree(umem); @@ -105,7 +78,7 @@ static void xdp_umem_release_deferred(struct work_struct *work) void xdp_get_umem(struct xdp_umem *umem) { - atomic_inc(&umem->users); + refcount_inc(&umem->users); } void xdp_put_umem(struct xdp_umem *umem) @@ -113,7 +86,7 @@ void xdp_put_umem(struct xdp_umem *umem) if (!umem) return; - if (atomic_dec_and_test(&umem->users)) { + if (refcount_dec_and_test(&umem->users)) { INIT_WORK(&umem->work, xdp_umem_release_deferred); schedule_work(&umem->work); } @@ -176,16 +149,13 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) return 0; } -int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; u64 addr = mr->addr, size = mr->len; unsigned int nframes, nfpp; int size_chk, err; - if (!umem) - return -EINVAL; - if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* @@ -236,7 +206,7 @@ int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->frame_size_log2 = ilog2(frame_size); umem->nfpp_mask = nfpp - 1; umem->nfpplog2 = ilog2(nfpp); - atomic_set(&umem->users, 1); + refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); if (err) @@ -254,7 +224,25 @@ out: return err; } +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) +{ + struct xdp_umem *umem; + int err; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + err = xdp_umem_reg(umem, mr); + if (err) { + kfree(umem); + return ERR_PTR(err); + } + + return umem; +} + bool xdp_umem_validate_queues(struct xdp_umem *umem) { - return (umem->fq && umem->cq); + return umem->fq && umem->cq; } diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 7e0b2fab8522..0881cf456230 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space packet buffer +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef XDP_UMEM_H_ @@ -36,7 +27,7 @@ struct xdp_umem { struct pid *pid; unsigned long address; size_t size; - atomic_t users; + refcount_t users; struct work_struct work; }; @@ -59,9 +50,8 @@ static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, } bool xdp_umem_validate_queues(struct xdp_umem *umem); -int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); -int xdp_umem_create(struct xdp_umem **umem); +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h index 77fb5daf29f3..2cf8ec485fd2 100644 --- a/net/xdp/xdp_umem_props.h +++ b/net/xdp/xdp_umem_props.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space packet buffer +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef XDP_UMEM_PROPS_H_ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 009c5af5bba5..cce0e4f8a536 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -5,15 +5,6 @@ * applications. * Copyright(c) 2018 Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Author(s): Björn Töpel <[email protected]> * Magnus Karlsson <[email protected]> */ @@ -151,6 +142,11 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } + if (xs->queue_id >= xs->dev->real_num_tx_queues) { + err = -ENXIO; + goto out; + } + skb = sock_alloc_send_skb(sk, len, !need_wait, &err); if (unlikely(!skb)) { err = -EAGAIN; @@ -232,18 +228,12 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, if (!q) return -ENOMEM; + /* Make sure queue is ready before it can be seen by others */ + smp_wmb(); *queue = q; return 0; } -static void __xsk_release(struct xdp_sock *xs) -{ - /* Wait for driver to stop using the xdp socket. */ - synchronize_net(); - - dev_put(xs->dev); -} - static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -260,7 +250,9 @@ static int xsk_release(struct socket *sock) local_bh_enable(); if (xs->dev) { - __xsk_release(xs); + /* Wait for driver to stop using the xdp socket. */ + synchronize_net(); + dev_put(xs->dev); xs->dev = NULL; } @@ -294,9 +286,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; - struct net_device *dev, *dev_curr; struct xdp_sock *xs = xdp_sk(sk); - struct xdp_umem *old_umem = NULL; + struct net_device *dev; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) @@ -305,7 +296,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; mutex_lock(&xs->mutex); - dev_curr = xs->dev; + if (xs->dev) { + err = -EBUSY; + goto out_release; + } + dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); if (!dev) { err = -ENODEV; @@ -317,7 +312,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { + if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) || + (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) { err = -EINVAL; goto out_unlock; } @@ -352,7 +348,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } xdp_get_umem(umem_xs->umem); - old_umem = xs->umem; xs->umem = umem_xs->umem; sockfd_put(sock); } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { @@ -364,14 +359,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xskq_set_umem(xs->umem->cq, &xs->umem->props); } - /* Rebind? */ - if (dev_curr && (dev_curr != dev || - xs->queue_id != sxdp->sxdp_queue_id)) { - __xsk_release(xs); - if (old_umem) - xdp_put_umem(old_umem); - } - xs->dev = dev; xs->queue_id = sxdp->sxdp_queue_id; @@ -419,25 +406,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xdp_umem_reg mr; struct xdp_umem *umem; - if (xs->umem) - return -EBUSY; - if (copy_from_user(&mr, optval, sizeof(mr))) return -EFAULT; mutex_lock(&xs->mutex); - err = xdp_umem_create(&umem); + if (xs->umem) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } - err = xdp_umem_reg(umem, &mr); - if (err) { - kfree(umem); + umem = xdp_umem_create(&mr); + if (IS_ERR(umem)) { mutex_unlock(&xs->mutex); - return err; + return PTR_ERR(umem); } /* Make sure umem is ready before it can be seen by others */ smp_wmb(); - xs->umem = umem; mutex_unlock(&xs->mutex); return 0; @@ -448,13 +433,15 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; - if (!xs->umem) - return -EINVAL; - if (copy_from_user(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); + if (!xs->umem) { + mutex_unlock(&xs->mutex); + return -EINVAL; + } + q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); @@ -504,6 +491,35 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, return 0; } + case XDP_MMAP_OFFSETS: + { + struct xdp_mmap_offsets off; + + if (len < sizeof(off)) + return -EINVAL; + + off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + off.rx.desc = offsetof(struct xdp_rxtx_ring, desc); + off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + off.tx.desc = offsetof(struct xdp_rxtx_ring, desc); + + off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); + off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + off.fr.desc = offsetof(struct xdp_umem_ring, desc); + off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); + off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + off.cr.desc = offsetof(struct xdp_umem_ring, desc); + + len = sizeof(off); + if (copy_to_user(optval, &off, len)) + return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; + + return 0; + } default: break; } @@ -518,21 +534,23 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; + struct xdp_umem *umem; unsigned long pfn; struct page *qpg; if (offset == XDP_PGOFF_RX_RING) { - q = xs->rx; + q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { - q = xs->tx; + q = READ_ONCE(xs->tx); } else { - if (!xs->umem) + umem = READ_ONCE(xs->umem); + if (!umem) return -EINVAL; if (offset == XDP_UMEM_PGOFF_FILL_RING) - q = xs->umem->fq; + q = READ_ONCE(umem->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) - q = xs->umem->cq; + q = READ_ONCE(umem->cq); } if (!q) @@ -554,24 +572,24 @@ static struct proto xsk_proto = { }; static const struct proto_ops xsk_proto_ops = { - .family = PF_XDP, - .owner = THIS_MODULE, - .release = xsk_release, - .bind = xsk_bind, - .connect = sock_no_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = sock_no_getname, - .poll = xsk_poll, - .ioctl = sock_no_ioctl, - .listen = sock_no_listen, - .shutdown = sock_no_shutdown, - .setsockopt = xsk_setsockopt, - .getsockopt = xsk_getsockopt, - .sendmsg = xsk_sendmsg, - .recvmsg = sock_no_recvmsg, - .mmap = xsk_mmap, - .sendpage = sock_no_sendpage, + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = xsk_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = xsk_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = xsk_getsockopt, + .sendmsg = xsk_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = xsk_mmap, + .sendpage = sock_no_sendpage, }; static void xsk_destruct(struct sock *sk) diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index d012e5e23591..ebe85e59507e 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/slab.h> @@ -31,8 +22,7 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q) static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) { - return (sizeof(struct xdp_ring) + - q->nentries * sizeof(struct xdp_desc)); + return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 7aa9a535db0e..cb8e5be35110 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space ring structure +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_XSK_QUEUE_H @@ -22,6 +13,23 @@ #define RX_BATCH_SIZE 16 +struct xdp_ring { + u32 producer ____cacheline_aligned_in_smp; + u32 consumer ____cacheline_aligned_in_smp; +}; + +/* Used for the RX and TX queues for packets */ +struct xdp_rxtx_ring { + struct xdp_ring ptrs; + struct xdp_desc desc[0] ____cacheline_aligned_in_smp; +}; + +/* Used for the fill and completion queues for buffers */ +struct xdp_umem_ring { + struct xdp_ring ptrs; + u32 desc[0] ____cacheline_aligned_in_smp; +}; + struct xsk_queue { struct xdp_umem_props umem_props; u32 ring_mask; @@ -232,12 +240,12 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) static inline bool xskq_full_desc(struct xsk_queue *q) { - return (xskq_nb_avail(q, q->nentries) == q->nentries); + return xskq_nb_avail(q, q->nentries) == q->nentries; } static inline bool xskq_empty_desc(struct xsk_queue *q) { - return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); + return xskq_nb_free(q, q->prod_tail, 1) == q->nentries; } void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 62a99ab680e3..1303af10e54d 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -51,6 +51,7 @@ hostprogs-y += cpustat hostprogs-y += xdp_adjust_tail hostprogs-y += xdpsock hostprogs-y += xdp_fwd +hostprogs-y += task_fd_query # Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o xdpsock-objs := bpf_load.o xdpsock_user.o xdp_fwd-objs := bpf_load.o xdp_fwd_user.o +task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -160,6 +162,7 @@ always += cpustat_kern.o always += xdp_adjust_tail_kern.o always += xdpsock_kern.o always += xdp_fwd_kern.o +always += task_fd_query_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/ HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/ HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/ HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/ +HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/ HOST_LOADLIBES += $(LIBBPF) -lelf HOSTLOADLIBES_tracex4 += -lrt diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c new file mode 100644 index 000000000000..f4b0a9ea674d --- /dev/null +++ b/samples/bpf/task_fd_query_kern.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +SEC("kprobe/blk_start_request") +int bpf_prog1(struct pt_regs *ctx) +{ + return 0; +} + +SEC("kretprobe/blk_account_io_completion") +int bpf_prog2(struct pt_regs *ctx) +{ + return 0; +} +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c new file mode 100644 index 000000000000..8381d792f138 --- /dev/null +++ b/samples/bpf/task_fd_query_user.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <stdint.h> +#include <fcntl.h> +#include <linux/bpf.h> +#include <sys/ioctl.h> +#include <sys/resource.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "libbpf.h" +#include "bpf_load.h" +#include "bpf_util.h" +#include "perf-sys.h" +#include "trace_helpers.h" + +#define CHECK_PERROR_RET(condition) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("FAIL: %s:\n", __func__); \ + perror(" "); \ + return -1; \ + } \ +}) + +#define CHECK_AND_RET(condition) ({ \ + int __ret = !!(condition); \ + if (__ret) \ + return -1; \ +}) + +static __u64 ptr_to_u64(void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type" +static int bpf_find_probe_type(const char *event_type) +{ + char buf[256]; + int fd, ret; + + ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + + fd = open(buf, O_RDONLY); + CHECK_PERROR_RET(fd < 0); + + ret = read(fd, buf, sizeof(buf)); + close(fd); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + + errno = 0; + ret = (int)strtol(buf, NULL, 10); + CHECK_PERROR_RET(errno); + return ret; +} + +#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe" +static int bpf_get_retprobe_bit(const char *event_type) +{ + char buf[256]; + int fd, ret; + + ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + + fd = open(buf, O_RDONLY); + CHECK_PERROR_RET(fd < 0); + + ret = read(fd, buf, sizeof(buf)); + close(fd); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + CHECK_PERROR_RET(strlen(buf) < strlen("config:")); + + errno = 0; + ret = (int)strtol(buf + strlen("config:"), NULL, 10); + CHECK_PERROR_RET(errno); + return ret; +} + +static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name, + __u32 expected_fd_type) +{ + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + char buf[256]; + int err; + + len = sizeof(buf); + err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len, + &prog_id, &fd_type, &probe_offset, + &probe_addr); + if (err < 0) { + printf("FAIL: %s, for event_fd idx %d, fn_name %s\n", + __func__, prog_fd_idx, fn_name); + perror(" :"); + return -1; + } + if (strcmp(buf, fn_name) != 0 || + fd_type != expected_fd_type || + probe_offset != 0x0 || probe_addr != 0x0) { + printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n", + prog_fd_idx); + printf("buf: %s, fd_type: %u, probe_offset: 0x%llx," + " probe_addr: 0x%llx\n", + buf, fd_type, probe_offset, probe_addr); + return -1; + } + return 0; +} + +static int test_nondebug_fs_kuprobe_common(const char *event_type, + const char *name, __u64 offset, __u64 addr, bool is_return, + char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, + __u64 *probe_offset, __u64 *probe_addr) +{ + int is_return_bit = bpf_get_retprobe_bit(event_type); + int type = bpf_find_probe_type(event_type); + struct perf_event_attr attr = {}; + int fd; + + if (type < 0 || is_return_bit < 0) { + printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n", + __func__, type, is_return_bit); + return -1; + } + + attr.sample_period = 1; + attr.wakeup_events = 1; + if (is_return) + attr.config |= 1 << is_return_bit; + + if (name) { + attr.config1 = ptr_to_u64((void *)name); + attr.config2 = offset; + } else { + attr.config1 = 0; + attr.config2 = addr; + } + attr.size = sizeof(attr); + attr.type = type; + + fd = sys_perf_event_open(&attr, -1, 0, -1, 0); + CHECK_PERROR_RET(fd < 0); + + CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0); + CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); + CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len, + prog_id, fd_type, probe_offset, probe_addr) < 0); + + return 0; +} + +static int test_nondebug_fs_probe(const char *event_type, const char *name, + __u64 offset, __u64 addr, bool is_return, + __u32 expected_fd_type, + __u32 expected_ret_fd_type, + char *buf, __u32 buf_len) +{ + __u64 probe_offset, probe_addr; + __u32 prog_id, fd_type; + int err; + + err = test_nondebug_fs_kuprobe_common(event_type, name, + offset, addr, is_return, + buf, &buf_len, &prog_id, + &fd_type, &probe_offset, + &probe_addr); + if (err < 0) { + printf("FAIL: %s, " + "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n", + __func__, name ? name : "", offset, addr, is_return); + perror(" :"); + return -1; + } + if ((is_return && fd_type != expected_ret_fd_type) || + (!is_return && fd_type != expected_fd_type)) { + printf("FAIL: %s, incorrect fd_type %u\n", + __func__, fd_type); + return -1; + } + if (name) { + if (strcmp(name, buf) != 0) { + printf("FAIL: %s, incorrect buf %s\n", __func__, buf); + return -1; + } + if (probe_offset != offset) { + printf("FAIL: %s, incorrect probe_offset 0x%llx\n", + __func__, probe_offset); + return -1; + } + } else { + if (buf_len != 0) { + printf("FAIL: %s, incorrect buf %p\n", + __func__, buf); + return -1; + } + + if (probe_addr != addr) { + printf("FAIL: %s, incorrect probe_addr 0x%llx\n", + __func__, probe_addr); + return -1; + } + } + return 0; +} + +static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) +{ + const char *event_type = "uprobe"; + struct perf_event_attr attr = {}; + char buf[256], event_alias[256]; + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + int err, res, kfd, efd; + ssize_t bytes; + + snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", + event_type); + kfd = open(buf, O_WRONLY | O_APPEND, 0); + CHECK_PERROR_RET(kfd < 0); + + res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid()); + CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias)); + + res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", + is_return ? 'r' : 'p', event_type, event_alias, + binary_path, offset); + CHECK_PERROR_RET(res < 0 || res >= sizeof(buf)); + CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0); + + close(kfd); + kfd = -1; + + snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id", + event_type, event_alias); + efd = open(buf, O_RDONLY, 0); + CHECK_PERROR_RET(efd < 0); + + bytes = read(efd, buf, sizeof(buf)); + CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf)); + close(efd); + buf[bytes] = '\0'; + + attr.config = strtol(buf, NULL, 0); + attr.type = PERF_TYPE_TRACEPOINT; + attr.sample_period = 1; + attr.wakeup_events = 1; + kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + CHECK_PERROR_RET(kfd < 0); + CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); + CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0); + + len = sizeof(buf); + err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len, + &prog_id, &fd_type, &probe_offset, + &probe_addr); + if (err < 0) { + printf("FAIL: %s, binary_path %s\n", __func__, binary_path); + perror(" :"); + return -1; + } + if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) || + (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) { + printf("FAIL: %s, incorrect fd_type %u\n", __func__, + fd_type); + return -1; + } + if (strcmp(binary_path, buf) != 0) { + printf("FAIL: %s, incorrect buf %s\n", __func__, buf); + return -1; + } + if (probe_offset != offset) { + printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__, + probe_offset); + return -1; + } + + close(kfd); + return 0; +} + +int main(int argc, char **argv) +{ + struct rlimit r = {1024*1024, RLIM_INFINITY}; + extern char __executable_start; + char filename[256], buf[256]; + __u64 uprobe_file_offset; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_kallsyms()) { + printf("failed to process /proc/kallsyms\n"); + return 1; + } + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + /* test two functions in the corresponding *_kern.c file */ + CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request", + BPF_FD_TYPE_KPROBE)); + CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion", + BPF_FD_TYPE_KRETPROBE)); + + /* test nondebug fs kprobe */ + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, + false, BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); +#ifdef __x86_64__ + /* set a kprobe on "bpf_check + 0x5", which is x64 specific */ + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0, + false, BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); +#endif + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, + true, BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), false, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), false, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + NULL, 0)); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), true, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), true, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + 0, 0)); + + /* test nondebug fs uprobe */ + /* the calculation of uprobe file offset is based on gcc 7.3.1 on x64 + * and the default linker script, which defines __executable_start as + * the start of the .text section. The calculation could be different + * on different systems with different compilers. The right way is + * to parse the ELF file. We took a shortcut here. + */ + uprobe_file_offset = (__u64)main - (__u64)&__executable_start; + CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], + uprobe_file_offset, 0x0, false, + BPF_FD_TYPE_UPROBE, + BPF_FD_TYPE_URETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], + uprobe_file_offset, 0x0, true, + BPF_FD_TYPE_UPROBE, + BPF_FD_TYPE_URETPROBE, + buf, sizeof(buf))); + + /* test debug fs uprobe */ + CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, + false)); + CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, + true)); + + return 0; +} diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 211db8ded0de..ad10fe700d7d 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -125,6 +125,7 @@ struct datarec { u64 processed; u64 dropped; u64 info; + u64 err; }; #define MAX_CPUS 64 @@ -208,3 +209,51 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) return 0; } + +struct bpf_map_def SEC("maps") devmap_xmit_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct devmap_xmit_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + u32 map_index; // offset:16; size:4; signed:0; + int drops; // offset:20; size:4; signed:1; + int sent; // offset:24; size:4; signed:1; + int from_ifindex; // offset:28; size:4; signed:1; + int to_ifindex; // offset:32; size:4; signed:1; + int err; // offset:36; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_devmap_xmit") +int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->sent; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + rec->info += 1; + + /* Record error cases, where no frame were sent */ + if (ctx->err) + rec->err++; + + /* Catch API error of drv ndo_xdp_xmit sent more than count */ + if (ctx->drops < 0) + rec->err++; + + return 1; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index bf09b5188acd..dd558cbb2309 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -117,6 +117,7 @@ struct datarec { __u64 processed; __u64 dropped; __u64 info; + __u64 err; }; #define MAX_CPUS 64 @@ -141,6 +142,7 @@ struct stats_record { struct record_u64 xdp_exception[XDP_ACTION_MAX]; struct record xdp_cpumap_kthread; struct record xdp_cpumap_enqueue[MAX_CPUS]; + struct record xdp_devmap_xmit; }; static bool map_collect_record(int fd, __u32 key, struct record *rec) @@ -151,6 +153,7 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec) __u64 sum_processed = 0; __u64 sum_dropped = 0; __u64 sum_info = 0; + __u64 sum_err = 0; int i; if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { @@ -169,10 +172,13 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec) sum_dropped += values[i].dropped; rec->cpu[i].info = values[i].info; sum_info += values[i].info; + rec->cpu[i].err = values[i].err; + sum_err += values[i].err; } rec->total.processed = sum_processed; rec->total.dropped = sum_dropped; rec->total.info = sum_info; + rec->total.err = sum_err; return true; } @@ -273,6 +279,18 @@ static double calc_info(struct datarec *r, struct datarec *p, double period) return pps; } +static double calc_err(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->err - p->err; + pps = packets / period; + } + return pps; +} + static void stats_print(struct stats_record *stats_rec, struct stats_record *stats_prev, bool err_only) @@ -397,7 +415,7 @@ static void stats_print(struct stats_record *stats_rec, info = calc_info(r, p, t); if (info > 0) i_str = "sched"; - if (pps > 0) + if (pps > 0 || drop > 0) printf(fmt1, "cpumap-kthread", i, pps, drop, info, i_str); } @@ -409,6 +427,50 @@ static void stats_print(struct stats_record *stats_rec, printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); } + /* devmap ndo_xdp_xmit stats */ + { + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n"; + struct record *rec, *prev; + double drop, info, err; + char *i_str = ""; + char *err_str = ""; + + rec = &stats_rec->xdp_devmap_xmit; + prev = &stats_prev->xdp_devmap_xmit; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + err = calc_err(r, p, t); + if (info > 0) { + i_str = "bulk-average"; + info = (pps+drop) / info; /* calc avg bulk */ + } + if (err > 0) + err_str = "drv-err"; + if (pps > 0 || drop > 0) + printf(fmt1, "devmap-xmit", + i, pps, drop, info, i_str, err_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + err = calc_err(&rec->total, &prev->total, t); + if (info > 0) { + i_str = "bulk-average"; + info = (pps+drop) / info; /* calc avg bulk */ + } + if (err > 0) + err_str = "drv-err"; + printf(fmt2, "devmap-xmit", "total", pps, drop, + info, i_str, err_str); + } + printf("\n"); } @@ -437,6 +499,9 @@ static bool stats_collect(struct stats_record *rec) fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); + fd = map_data[4].fd; /* map4: devmap_xmit_cnt */ + map_collect_record(fd, 0, &rec->xdp_devmap_xmit); + return true; } @@ -480,6 +545,7 @@ static struct stats_record *alloc_stats_record(void) rec_sz = sizeof(struct datarec); rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); + rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz); for (i = 0; i < MAX_CPUS; i++) rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); @@ -498,6 +564,7 @@ static void free_stats_record(struct stats_record *r) free(r->xdp_exception[i].cpu); free(r->xdp_cpumap_kthread.cpu); + free(r->xdp_devmap_xmit.cpu); for (i = 0; i < MAX_CPUS; i++) free(r->xdp_cpumap_enqueue[i].cpu); diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 7fe60f6f7d53..e379eac034ac 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1,15 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 2017 - 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ +/* Copyright(c) 2017 - 2018 Intel Corporation. */ #include <assert.h> #include <errno.h> @@ -89,7 +79,10 @@ struct xdp_umem_uqueue { u32 cached_cons; u32 mask; u32 size; - struct xdp_umem_ring *ring; + u32 *producer; + u32 *consumer; + u32 *ring; + void *map; }; struct xdp_umem { @@ -104,7 +97,10 @@ struct xdp_uqueue { u32 cached_cons; u32 mask; u32 size; - struct xdp_rxtx_ring *ring; + u32 *producer; + u32 *consumer; + struct xdp_desc *ring; + void *map; }; struct xdpsock { @@ -165,7 +161,7 @@ static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb) return free_entries; /* Refresh the local tail pointer */ - q->cached_cons = q->ring->ptrs.consumer; + q->cached_cons = *q->consumer; return q->size - (q->cached_prod - q->cached_cons); } @@ -178,7 +174,7 @@ static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs) return free_entries; /* Refresh the local tail pointer */ - q->cached_cons = q->ring->ptrs.consumer + q->size; + q->cached_cons = *q->consumer + q->size; return q->cached_cons - q->cached_prod; } @@ -187,7 +183,7 @@ static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb) u32 entries = q->cached_prod - q->cached_cons; if (entries == 0) { - q->cached_prod = q->ring->ptrs.producer; + q->cached_prod = *q->producer; entries = q->cached_prod - q->cached_cons; } @@ -199,7 +195,7 @@ static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs) u32 entries = q->cached_prod - q->cached_cons; if (entries == 0) { - q->cached_prod = q->ring->ptrs.producer; + q->cached_prod = *q->producer; entries = q->cached_prod - q->cached_cons; } @@ -218,12 +214,12 @@ static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq, for (i = 0; i < nb; i++) { u32 idx = fq->cached_prod++ & fq->mask; - fq->ring->desc[idx] = d[i].idx; + fq->ring[idx] = d[i].idx; } u_smp_wmb(); - fq->ring->ptrs.producer = fq->cached_prod; + *fq->producer = fq->cached_prod; return 0; } @@ -239,12 +235,12 @@ static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d, for (i = 0; i < nb; i++) { u32 idx = fq->cached_prod++ & fq->mask; - fq->ring->desc[idx] = d[i]; + fq->ring[idx] = d[i]; } u_smp_wmb(); - fq->ring->ptrs.producer = fq->cached_prod; + *fq->producer = fq->cached_prod; return 0; } @@ -258,13 +254,13 @@ static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq, for (i = 0; i < entries; i++) { idx = cq->cached_cons++ & cq->mask; - d[i] = cq->ring->desc[idx]; + d[i] = cq->ring[idx]; } if (entries > 0) { u_smp_wmb(); - cq->ring->ptrs.consumer = cq->cached_cons; + *cq->consumer = cq->cached_cons; } return entries; @@ -280,7 +276,7 @@ static inline int xq_enq(struct xdp_uqueue *uq, const struct xdp_desc *descs, unsigned int ndescs) { - struct xdp_rxtx_ring *r = uq->ring; + struct xdp_desc *r = uq->ring; unsigned int i; if (xq_nb_free(uq, ndescs) < ndescs) @@ -289,21 +285,21 @@ static inline int xq_enq(struct xdp_uqueue *uq, for (i = 0; i < ndescs; i++) { u32 idx = uq->cached_prod++ & uq->mask; - r->desc[idx].idx = descs[i].idx; - r->desc[idx].len = descs[i].len; - r->desc[idx].offset = descs[i].offset; + r[idx].idx = descs[i].idx; + r[idx].len = descs[i].len; + r[idx].offset = descs[i].offset; } u_smp_wmb(); - r->ptrs.producer = uq->cached_prod; + *uq->producer = uq->cached_prod; return 0; } static inline int xq_enq_tx_only(struct xdp_uqueue *uq, __u32 idx, unsigned int ndescs) { - struct xdp_rxtx_ring *q = uq->ring; + struct xdp_desc *r = uq->ring; unsigned int i; if (xq_nb_free(uq, ndescs) < ndescs) @@ -312,14 +308,14 @@ static inline int xq_enq_tx_only(struct xdp_uqueue *uq, for (i = 0; i < ndescs; i++) { u32 idx = uq->cached_prod++ & uq->mask; - q->desc[idx].idx = idx + i; - q->desc[idx].len = sizeof(pkt_data) - 1; - q->desc[idx].offset = 0; + r[idx].idx = idx + i; + r[idx].len = sizeof(pkt_data) - 1; + r[idx].offset = 0; } u_smp_wmb(); - q->ptrs.producer = uq->cached_prod; + *uq->producer = uq->cached_prod; return 0; } @@ -327,7 +323,7 @@ static inline int xq_deq(struct xdp_uqueue *uq, struct xdp_desc *descs, int ndescs) { - struct xdp_rxtx_ring *r = uq->ring; + struct xdp_desc *r = uq->ring; unsigned int idx; int i, entries; @@ -337,13 +333,13 @@ static inline int xq_deq(struct xdp_uqueue *uq, for (i = 0; i < entries; i++) { idx = uq->cached_cons++ & uq->mask; - descs[i] = r->desc[idx]; + descs[i] = r[idx]; } if (entries > 0) { u_smp_wmb(); - r->ptrs.consumer = uq->cached_cons; + *uq->consumer = uq->cached_cons; } return entries; @@ -402,8 +398,10 @@ static size_t gen_eth_frame(char *frame) static struct xdp_umem *xdp_umem_configure(int sfd) { int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS; + struct xdp_mmap_offsets off; struct xdp_umem_reg mr; struct xdp_umem *umem; + socklen_t optlen; void *bufs; umem = calloc(1, sizeof(*umem)); @@ -423,25 +421,35 @@ static struct xdp_umem *xdp_umem_configure(int sfd) lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size, sizeof(int)) == 0); - umem->fq.ring = mmap(0, sizeof(struct xdp_umem_ring) + - FQ_NUM_DESCS * sizeof(u32), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_UMEM_PGOFF_FILL_RING); - lassert(umem->fq.ring != MAP_FAILED); + optlen = sizeof(off); + lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, + &optlen) == 0); + + umem->fq.map = mmap(0, off.fr.desc + + FQ_NUM_DESCS * sizeof(u32), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_UMEM_PGOFF_FILL_RING); + lassert(umem->fq.map != MAP_FAILED); umem->fq.mask = FQ_NUM_DESCS - 1; umem->fq.size = FQ_NUM_DESCS; + umem->fq.producer = umem->fq.map + off.fr.producer; + umem->fq.consumer = umem->fq.map + off.fr.consumer; + umem->fq.ring = umem->fq.map + off.fr.desc; - umem->cq.ring = mmap(0, sizeof(struct xdp_umem_ring) + + umem->cq.map = mmap(0, off.cr.desc + CQ_NUM_DESCS * sizeof(u32), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, sfd, XDP_UMEM_PGOFF_COMPLETION_RING); - lassert(umem->cq.ring != MAP_FAILED); + lassert(umem->cq.map != MAP_FAILED); umem->cq.mask = CQ_NUM_DESCS - 1; umem->cq.size = CQ_NUM_DESCS; + umem->cq.producer = umem->cq.map + off.cr.producer; + umem->cq.consumer = umem->cq.map + off.cr.consumer; + umem->cq.ring = umem->cq.map + off.cr.desc; umem->frames = (char (*)[FRAME_SIZE])bufs; umem->fd = sfd; @@ -459,9 +467,11 @@ static struct xdp_umem *xdp_umem_configure(int sfd) static struct xdpsock *xsk_configure(struct xdp_umem *umem) { struct sockaddr_xdp sxdp = {}; + struct xdp_mmap_offsets off; int sfd, ndescs = NUM_DESCS; struct xdpsock *xsk; bool shared = true; + socklen_t optlen; u32 i; sfd = socket(PF_XDP, SOCK_RAW, 0); @@ -484,15 +494,18 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem) &ndescs, sizeof(int)) == 0); lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING, &ndescs, sizeof(int)) == 0); + optlen = sizeof(off); + lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, + &optlen) == 0); /* Rx */ - xsk->rx.ring = mmap(NULL, - sizeof(struct xdp_ring) + - NUM_DESCS * sizeof(struct xdp_desc), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_PGOFF_RX_RING); - lassert(xsk->rx.ring != MAP_FAILED); + xsk->rx.map = mmap(NULL, + off.rx.desc + + NUM_DESCS * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_PGOFF_RX_RING); + lassert(xsk->rx.map != MAP_FAILED); if (!shared) { for (i = 0; i < NUM_DESCS / 2; i++) @@ -501,19 +514,25 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem) } /* Tx */ - xsk->tx.ring = mmap(NULL, - sizeof(struct xdp_ring) + - NUM_DESCS * sizeof(struct xdp_desc), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_PGOFF_TX_RING); - lassert(xsk->tx.ring != MAP_FAILED); + xsk->tx.map = mmap(NULL, + off.tx.desc + + NUM_DESCS * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_PGOFF_TX_RING); + lassert(xsk->tx.map != MAP_FAILED); xsk->rx.mask = NUM_DESCS - 1; xsk->rx.size = NUM_DESCS; + xsk->rx.producer = xsk->rx.map + off.rx.producer; + xsk->rx.consumer = xsk->rx.map + off.rx.consumer; + xsk->rx.ring = xsk->rx.map + off.rx.desc; xsk->tx.mask = NUM_DESCS - 1; xsk->tx.size = NUM_DESCS; + xsk->tx.producer = xsk->tx.map + off.tx.producer; + xsk->tx.consumer = xsk->tx.map + off.tx.consumer; + xsk->tx.ring = xsk->tx.map + off.tx.desc; sxdp.sxdp_family = PF_XDP; sxdp.sxdp_ifindex = opt_ifindex; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 8f59897fbda1..5010a4d5bfba 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -95,7 +95,7 @@ class HeaderParser(object): return capture.group(1) def parse_desc(self): - p = re.compile(' \* ?(?:\t| {6,8})Description$') + p = re.compile(' \* ?(?:\t| {5,8})Description$') capture = p.match(self.line) if not capture: # Helper can have empty description and we might be parsing another @@ -109,7 +109,7 @@ class HeaderParser(object): if self.line == ' *\n': desc += '\n' else: - p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)') + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') capture = p.match(self.line) if capture: desc += capture.group(1) + '\n' @@ -118,7 +118,7 @@ class HeaderParser(object): return desc def parse_ret(self): - p = re.compile(' \* ?(?:\t| {6,8})Return$') + p = re.compile(' \* ?(?:\t| {5,8})Return$') capture = p.match(self.line) if not capture: # Helper can have empty retval and we might be parsing another @@ -132,7 +132,7 @@ class HeaderParser(object): if self.line == ' *\n': ret += '\n' else: - p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)') + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') capture = p.match(self.line) if capture: ret += capture.group(1) + '\n' diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst new file mode 100644 index 000000000000..e3eb0eab7641 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -0,0 +1,81 @@ +================ +bpftool-perf +================ +------------------------------------------------------------------------------- +tool for inspection of perf related bpf prog attachments +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **perf** *COMMAND* + + *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] } + + *COMMANDS* := + { **show** | **list** | **help** } + +PERF COMMANDS +============= + +| **bpftool** **perf { show | list }** +| **bpftool** **perf help** + +DESCRIPTION +=========== + **bpftool perf { show | list }** + List all raw_tracepoint, tracepoint, kprobe attachment in the system. + + Output will start with process id and file descriptor in that process, + followed by bpf program id, attachment information, and attachment point. + The attachment point for raw_tracepoint/tracepoint is the trace probe name. + The attachment point for k[ret]probe is either symbol name and offset, + or a kernel virtual address. + The attachment point for u[ret]probe is the file name and the file offset. + + **bpftool perf help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -v, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + +EXAMPLES +======== + +| **# bpftool perf** + +:: + + pid 21711 fd 5: prog_id 5 kprobe func __x64_sys_write offset 0 + pid 21765 fd 5: prog_id 7 kretprobe func __x64_sys_nanosleep offset 0 + pid 21767 fd 5: prog_id 8 tracepoint sys_enter_nanosleep + pid 21800 fd 5: prog_id 9 uprobe filename /home/yhs/a.out offset 1159 + +| +| **# bpftool -j perf** + +:: + + [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \ + {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \ + {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \ + {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}] + + +SEE ALSO +======== + **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 564cb0d9692b..b6f5d560460d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -16,7 +16,7 @@ SYNOPSIS **bpftool** **version** - *OBJECT* := { **map** | **program** | **cgroup** } + *OBJECT* := { **map** | **program** | **cgroup** | **perf** } *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } | { **-j** | **--json** } [{ **-p** | **--pretty** }] } @@ -30,6 +30,8 @@ SYNOPSIS *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } + *PERF-COMMANDS* := { **show** | **list** | **help** } + DESCRIPTION =========== *bpftool* allows for inspection and simple modification of BPF objects @@ -56,3 +58,4 @@ OPTIONS SEE ALSO ======== **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) + **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index b301c9b315f1..7bc198d60de2 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -448,6 +448,15 @@ _bpftool() ;; esac ;; + perf) + case $command in + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help \ + show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 1ec852d21d44..eea7f14355f3 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -87,7 +87,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup }\n" + " OBJECT := { prog | map | cgroup | perf }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -216,6 +216,7 @@ static const struct cmd cmds[] = { { "prog", do_prog }, { "map", do_map }, { "cgroup", do_cgroup }, + { "perf", do_perf }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 6173cd997e7a..63fdb310b9a4 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -119,6 +119,7 @@ int do_prog(int argc, char **arg); int do_map(int argc, char **arg); int do_event_pipe(int argc, char **argv); int do_cgroup(int argc, char **arg); +int do_perf(int argc, char **arg); int prog_parse_fd(int *argc, char ***argv); int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len); diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c new file mode 100644 index 000000000000..ac6b1a12c9b7 --- /dev/null +++ b/tools/bpf/bpftool/perf.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook +// Author: Yonghong Song <[email protected]> + +#define _GNU_SOURCE +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <ftw.h> + +#include <bpf.h> + +#include "main.h" + +/* 0: undecided, 1: supported, 2: not supported */ +static int perf_query_supported; +static bool has_perf_query_support(void) +{ + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + char buf[256]; + int fd; + + if (perf_query_supported) + goto out; + + fd = open(bin_name, O_RDONLY); + if (fd < 0) { + p_err("perf_query_support: %s", strerror(errno)); + goto out; + } + + /* the following query will fail as no bpf attachment, + * the expected errno is ENOTSUPP + */ + errno = 0; + len = sizeof(buf); + bpf_task_fd_query(getpid(), fd, 0, buf, &len, &prog_id, + &fd_type, &probe_offset, &probe_addr); + + if (errno == 524 /* ENOTSUPP */) { + perf_query_supported = 1; + goto close_fd; + } + + perf_query_supported = 2; + p_err("perf_query_support: %s", strerror(errno)); + fprintf(stderr, + "HINT: non root or kernel doesn't support TASK_FD_QUERY\n"); + +close_fd: + close(fd); +out: + return perf_query_supported == 1; +} + +static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type, + char *buf, __u64 probe_offset, __u64 probe_addr) +{ + jsonw_start_object(json_wtr); + jsonw_int_field(json_wtr, "pid", pid); + jsonw_int_field(json_wtr, "fd", fd); + jsonw_uint_field(json_wtr, "prog_id", prog_id); + switch (fd_type) { + case BPF_FD_TYPE_RAW_TRACEPOINT: + jsonw_string_field(json_wtr, "fd_type", "raw_tracepoint"); + jsonw_string_field(json_wtr, "tracepoint", buf); + break; + case BPF_FD_TYPE_TRACEPOINT: + jsonw_string_field(json_wtr, "fd_type", "tracepoint"); + jsonw_string_field(json_wtr, "tracepoint", buf); + break; + case BPF_FD_TYPE_KPROBE: + jsonw_string_field(json_wtr, "fd_type", "kprobe"); + if (buf[0] != '\0') { + jsonw_string_field(json_wtr, "func", buf); + jsonw_lluint_field(json_wtr, "offset", probe_offset); + } else { + jsonw_lluint_field(json_wtr, "addr", probe_addr); + } + break; + case BPF_FD_TYPE_KRETPROBE: + jsonw_string_field(json_wtr, "fd_type", "kretprobe"); + if (buf[0] != '\0') { + jsonw_string_field(json_wtr, "func", buf); + jsonw_lluint_field(json_wtr, "offset", probe_offset); + } else { + jsonw_lluint_field(json_wtr, "addr", probe_addr); + } + break; + case BPF_FD_TYPE_UPROBE: + jsonw_string_field(json_wtr, "fd_type", "uprobe"); + jsonw_string_field(json_wtr, "filename", buf); + jsonw_lluint_field(json_wtr, "offset", probe_offset); + break; + case BPF_FD_TYPE_URETPROBE: + jsonw_string_field(json_wtr, "fd_type", "uretprobe"); + jsonw_string_field(json_wtr, "filename", buf); + jsonw_lluint_field(json_wtr, "offset", probe_offset); + break; + } + jsonw_end_object(json_wtr); +} + +static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type, + char *buf, __u64 probe_offset, __u64 probe_addr) +{ + printf("pid %d fd %d: prog_id %u ", pid, fd, prog_id); + switch (fd_type) { + case BPF_FD_TYPE_RAW_TRACEPOINT: + printf("raw_tracepoint %s\n", buf); + break; + case BPF_FD_TYPE_TRACEPOINT: + printf("tracepoint %s\n", buf); + break; + case BPF_FD_TYPE_KPROBE: + if (buf[0] != '\0') + printf("kprobe func %s offset %llu\n", buf, + probe_offset); + else + printf("kprobe addr %llu\n", probe_addr); + break; + case BPF_FD_TYPE_KRETPROBE: + if (buf[0] != '\0') + printf("kretprobe func %s offset %llu\n", buf, + probe_offset); + else + printf("kretprobe addr %llu\n", probe_addr); + break; + case BPF_FD_TYPE_UPROBE: + printf("uprobe filename %s offset %llu\n", buf, probe_offset); + break; + case BPF_FD_TYPE_URETPROBE: + printf("uretprobe filename %s offset %llu\n", buf, + probe_offset); + break; + } +} + +static int show_proc(const char *fpath, const struct stat *sb, + int tflag, struct FTW *ftwbuf) +{ + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + int err, pid = 0, fd = 0; + const char *pch; + char buf[4096]; + + /* prefix always /proc */ + pch = fpath + 5; + if (*pch == '\0') + return 0; + + /* pid should be all numbers */ + pch++; + while (isdigit(*pch)) { + pid = pid * 10 + *pch - '0'; + pch++; + } + if (*pch == '\0') + return 0; + if (*pch != '/') + return FTW_SKIP_SUBTREE; + + /* check /proc/<pid>/fd directory */ + pch++; + if (strncmp(pch, "fd", 2)) + return FTW_SKIP_SUBTREE; + pch += 2; + if (*pch == '\0') + return 0; + if (*pch != '/') + return FTW_SKIP_SUBTREE; + + /* check /proc/<pid>/fd/<fd_num> */ + pch++; + while (isdigit(*pch)) { + fd = fd * 10 + *pch - '0'; + pch++; + } + if (*pch != '\0') + return FTW_SKIP_SUBTREE; + + /* query (pid, fd) for potential perf events */ + len = sizeof(buf); + err = bpf_task_fd_query(pid, fd, 0, buf, &len, &prog_id, &fd_type, + &probe_offset, &probe_addr); + if (err < 0) + return 0; + + if (json_output) + print_perf_json(pid, fd, prog_id, fd_type, buf, probe_offset, + probe_addr); + else + print_perf_plain(pid, fd, prog_id, fd_type, buf, probe_offset, + probe_addr); + + return 0; +} + +static int do_show(int argc, char **argv) +{ + int flags = FTW_ACTIONRETVAL | FTW_PHYS; + int err = 0, nopenfd = 16; + + if (!has_perf_query_support()) + return -1; + + if (json_output) + jsonw_start_array(json_wtr); + if (nftw("/proc", show_proc, nopenfd, flags) == -1) { + p_err("%s", strerror(errno)); + err = -1; + } + if (json_output) + jsonw_end_array(json_wtr); + + return err; +} + +static int do_help(int argc, char **argv) +{ + fprintf(stderr, + "Usage: %s %s { show | list | help }\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { 0 } +}; + +int do_perf(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9bdfdf2d3fbe..39b88e760367 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -420,7 +420,11 @@ static int do_show(int argc, char **argv) static int do_dump(int argc, char **argv) { + unsigned long *func_ksyms = NULL; struct bpf_prog_info info = {}; + unsigned int *func_lens = NULL; + unsigned int nr_func_ksyms; + unsigned int nr_func_lens; struct dump_data dd = {}; __u32 len = sizeof(info); unsigned int buf_size; @@ -496,10 +500,34 @@ static int do_dump(int argc, char **argv) return -1; } + nr_func_ksyms = info.nr_jited_ksyms; + if (nr_func_ksyms) { + func_ksyms = malloc(nr_func_ksyms * sizeof(__u64)); + if (!func_ksyms) { + p_err("mem alloc failed"); + close(fd); + goto err_free; + } + } + + nr_func_lens = info.nr_jited_func_lens; + if (nr_func_lens) { + func_lens = malloc(nr_func_lens * sizeof(__u32)); + if (!func_lens) { + p_err("mem alloc failed"); + close(fd); + goto err_free; + } + } + memset(&info, 0, sizeof(info)); *member_ptr = ptr_to_u64(buf); *member_len = buf_size; + info.jited_ksyms = ptr_to_u64(func_ksyms); + info.nr_jited_ksyms = nr_func_ksyms; + info.jited_func_lens = ptr_to_u64(func_lens); + info.nr_jited_func_lens = nr_func_lens; err = bpf_obj_get_info_by_fd(fd, &info, &len); close(fd); @@ -513,6 +541,16 @@ static int do_dump(int argc, char **argv) goto err_free; } + if (info.nr_jited_ksyms > nr_func_ksyms) { + p_err("too many addresses returned"); + goto err_free; + } + + if (info.nr_jited_func_lens > nr_func_lens) { + p_err("too many values returned"); + goto err_free; + } + if ((member_len == &info.jited_prog_len && info.jited_prog_insns == 0) || (member_len == &info.xlated_prog_len && @@ -550,7 +588,57 @@ static int do_dump(int argc, char **argv) goto err_free; } - disasm_print_insn(buf, *member_len, opcodes, name); + if (info.nr_jited_func_lens && info.jited_func_lens) { + struct kernel_sym *sym = NULL; + char sym_name[SYM_MAX_NAME]; + unsigned char *img = buf; + __u64 *ksyms = NULL; + __u32 *lens; + __u32 i; + + if (info.nr_jited_ksyms) { + kernel_syms_load(&dd); + ksyms = (__u64 *) info.jited_ksyms; + } + + if (json_output) + jsonw_start_array(json_wtr); + + lens = (__u32 *) info.jited_func_lens; + for (i = 0; i < info.nr_jited_func_lens; i++) { + if (ksyms) { + sym = kernel_syms_search(&dd, ksyms[i]); + if (sym) + sprintf(sym_name, "%s", sym->name); + else + sprintf(sym_name, "0x%016llx", ksyms[i]); + } else { + strcpy(sym_name, "unknown"); + } + + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "name"); + jsonw_string(json_wtr, sym_name); + jsonw_name(json_wtr, "insns"); + } else { + printf("%s:\n", sym_name); + } + + disasm_print_insn(img, lens[i], opcodes, name); + img += lens[i]; + + if (json_output) + jsonw_end_object(json_wtr); + else + printf("\n"); + } + + if (json_output) + jsonw_end_array(json_wtr); + } else { + disasm_print_insn(buf, *member_len, opcodes, name); + } } else if (visual) { if (json_output) jsonw_null(json_wtr); @@ -558,6 +646,9 @@ static int do_dump(int argc, char **argv) dump_xlated_cfg(buf, *member_len); } else { kernel_syms_load(&dd); + dd.nr_jited_ksyms = info.nr_jited_ksyms; + dd.jited_ksyms = (__u64 *) info.jited_ksyms; + if (json_output) dump_xlated_json(&dd, buf, *member_len, opcodes); else @@ -566,10 +657,14 @@ static int do_dump(int argc, char **argv) } free(buf); + free(func_ksyms); + free(func_lens); return 0; err_free: free(buf); + free(func_ksyms); + free(func_lens); return -1; } diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 7a3173b76c16..b97f1da60dd1 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -102,8 +102,8 @@ void kernel_syms_destroy(struct dump_data *dd) free(dd->sym_mapping); } -static struct kernel_sym *kernel_syms_search(struct dump_data *dd, - unsigned long key) +struct kernel_sym *kernel_syms_search(struct dump_data *dd, + unsigned long key) { struct kernel_sym sym = { .address = key, @@ -174,7 +174,11 @@ static const char *print_call_pcrel(struct dump_data *dd, unsigned long address, const struct bpf_insn *insn) { - if (sym) + if (!dd->nr_jited_ksyms) + /* Do not show address for interpreted programs */ + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "%+d", insn->off); + else if (sym) snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "%+d#%s", insn->off, sym->name); else @@ -203,6 +207,10 @@ static const char *print_call(void *private_data, unsigned long address = dd->address_call_base + insn->imm; struct kernel_sym *sym; + if (insn->src_reg == BPF_PSEUDO_CALL && + (__u32) insn->imm < dd->nr_jited_ksyms) + address = dd->jited_ksyms[insn->imm]; + sym = kernel_syms_search(dd, address); if (insn->src_reg == BPF_PSEUDO_CALL) return print_call_pcrel(dd, sym, address, insn); diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h index b34affa7ef2d..33d86e2b369b 100644 --- a/tools/bpf/bpftool/xlated_dumper.h +++ b/tools/bpf/bpftool/xlated_dumper.h @@ -49,11 +49,14 @@ struct dump_data { unsigned long address_call_base; struct kernel_sym *sym_mapping; __u32 sym_count; + __u64 *jited_ksyms; + __u32 nr_jited_ksyms; char scratch_buff[SYM_MAX_NAME + 8]; }; void kernel_syms_load(struct dump_data *dd); void kernel_syms_destroy(struct dump_data *dd); +struct kernel_sym *kernel_syms_search(struct dump_data *dd, unsigned long key); void dump_xlated_json(struct dump_data *dd, void *buf, unsigned int len, bool opcodes); void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d94d333a8225..9b8c6e310e9a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -97,6 +97,7 @@ enum bpf_cmd { BPF_RAW_TRACEPOINT_OPEN, BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, }; enum bpf_map_type { @@ -141,6 +142,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, }; enum bpf_attach_type { @@ -284,8 +286,8 @@ union bpf_attr { char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ __u32 btf_fd; /* fd pointing to a BTF type data */ - __u32 btf_key_id; /* BTF type_id of the key */ - __u32 btf_value_id; /* BTF type_id of the value */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -379,6 +381,22 @@ union bpf_attr { __u32 btf_log_size; __u32 btf_log_level; }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF @@ -1902,6 +1920,90 @@ union bpf_attr { * egress otherwise). This is the only flag supported for now. * Return * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of param: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of param: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1976,7 +2078,11 @@ union bpf_attr { FN(fib_lookup), \ FN(sock_hash_update), \ FN(msg_redirect_hash), \ - FN(sk_redirect_hash), + FN(sk_redirect_hash), \ + FN(lwt_push_encap), \ + FN(lwt_seg6_store_bytes), \ + FN(lwt_seg6_adjust_srh), \ + FN(lwt_seg6_action), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2043,6 +2149,12 @@ enum bpf_hdr_start_off { BPF_HDR_START_NET, }; +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2176,6 +2288,14 @@ enum sk_action { struct sk_msg_md { void *data; void *data_end; + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; #define BPF_TAG_SIZE 8 @@ -2197,6 +2317,10 @@ struct bpf_prog_info { __u32 gpl_compatible:1; __u64 netns_dev; __u64 netns_ino; + __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; + __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; } __attribute__((aligned(8))); struct bpf_map_info { @@ -2211,8 +2335,8 @@ struct bpf_map_info { __u64 netns_dev; __u64 netns_ino; __u32 btf_id; - __u32 btf_key_id; - __u32 btf_value_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; } __attribute__((aligned(8))); struct bpf_btf_info { @@ -2450,4 +2574,13 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index bcb56ee47014..0b5ddbe135a4 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -12,42 +12,29 @@ struct btf_header { __u16 magic; __u8 version; __u8 flags; - - __u32 parent_label; - __u32 parent_name; + __u32 hdr_len; /* All offsets are in bytes relative to the end of this header */ - __u32 label_off; /* offset of label section */ - __u32 object_off; /* offset of data object section*/ - __u32 func_off; /* offset of function section */ __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ __u32 str_off; /* offset of string section */ __u32 str_len; /* length of string section */ }; /* Max # of type identifier */ -#define BTF_MAX_TYPE 0x7fffffff +#define BTF_MAX_TYPE 0x0000ffff /* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x7fffffff +#define BTF_MAX_NAME_OFFSET 0x0000ffff /* Max # of struct/union/enum members or func args */ #define BTF_MAX_VLEN 0xffff -/* The type id is referring to a parent BTF */ -#define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) -#define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) - -/* String is in the ELF string section */ -#define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) -#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) - struct btf_type { __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-28: kind (e.g. int, ptr, array...etc) - * bits 29-30: unused - * bits 31: root + * bits 24-27: kind (e.g. int, ptr, array...etc) + * bits 28-31: unused */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. @@ -62,8 +49,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) -#define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_KIND_UNKN 0 /* Unknown */ @@ -88,15 +74,14 @@ struct btf_type { /* BTF_KIND_INT is followed by a u32 and the following * is the 32 bits arrangement: */ -#define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) +#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) #define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) /* Attributes stored in the BTF_INT_ENCODING */ -#define BTF_INT_SIGNED 0x1 -#define BTF_INT_CHAR 0x2 -#define BTF_INT_BOOL 0x4 -#define BTF_INT_VARARGS 0x8 +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) /* BTF_KIND_ENUM is followed by multiple "struct btf_enum". * The exact number of btf_enum is stored in the vlen (of the diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 6a8a00097fd8..9ddc89dae962 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -89,8 +89,8 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) min(name_len, BPF_OBJ_NAME_LEN - 1)); attr.numa_node = create_attr->numa_node; attr.btf_fd = create_attr->btf_fd; - attr.btf_key_id = create_attr->btf_key_id; - attr.btf_value_id = create_attr->btf_value_id; + attr.btf_key_type_id = create_attr->btf_key_type_id; + attr.btf_value_type_id = create_attr->btf_value_type_id; attr.map_ifindex = create_attr->map_ifindex; return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); @@ -643,3 +643,26 @@ retry: return fd; } + +int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, + __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, + __u64 *probe_addr) +{ + union bpf_attr attr = {}; + int err; + + attr.task_fd_query.pid = pid; + attr.task_fd_query.fd = fd; + attr.task_fd_query.flags = flags; + attr.task_fd_query.buf = ptr_to_u64(buf); + attr.task_fd_query.buf_len = *buf_len; + + err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr)); + *buf_len = attr.task_fd_query.buf_len; + *prog_id = attr.task_fd_query.prog_id; + *fd_type = attr.task_fd_query.fd_type; + *probe_offset = attr.task_fd_query.probe_offset; + *probe_addr = attr.task_fd_query.probe_addr; + + return err; +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 15bff7728cf1..0639a30a457d 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -36,8 +36,8 @@ struct bpf_create_map_attr { __u32 max_entries; __u32 numa_node; __u32 btf_fd; - __u32 btf_key_id; - __u32 btf_value_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; __u32 map_ifindex; }; @@ -107,4 +107,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, int bpf_raw_tracepoint_open(const char *name, int prog_fd); int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, bool do_log); +int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, + __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, + __u64 *probe_addr); #endif diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 2bac710e3194..8c54a4b6f187 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -35,9 +35,8 @@ struct btf { static const char *btf_name_by_offset(const struct btf *btf, uint32_t offset) { - if (!BTF_STR_TBL_ELF_ID(offset) && - BTF_STR_OFFSET(offset) < btf->hdr->str_len) - return &btf->strings[BTF_STR_OFFSET(offset)]; + if (offset < btf->hdr->str_len) + return &btf->strings[offset]; else return NULL; } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index cbdf34a6fb93..d20411ebfa2f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -216,8 +216,8 @@ struct bpf_map { size_t offset; int map_ifindex; struct bpf_map_def def; - uint32_t btf_key_id; - uint32_t btf_value_id; + uint32_t btf_key_type_id; + uint32_t btf_value_type_id; void *priv; bpf_map_clear_priv_t clear_priv; }; @@ -1042,8 +1042,8 @@ static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf) } if (def->key_size != key_size) { - pr_warning("map:%s key_type:%s has BTF type_size:%ld != key_size:%u\n", - map->name, name, key_size, def->key_size); + pr_warning("map:%s key_type:%s has BTF type_size:%u != key_size:%u\n", + map->name, name, (unsigned int)key_size, def->key_size); return -EINVAL; } @@ -1069,13 +1069,13 @@ static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf) } if (def->value_size != value_size) { - pr_warning("map:%s value_type:%s has BTF type_size:%ld != value_size:%u\n", - map->name, name, value_size, def->value_size); + pr_warning("map:%s value_type:%s has BTF type_size:%u != value_size:%u\n", + map->name, name, (unsigned int)value_size, def->value_size); return -EINVAL; } - map->btf_key_id = key_id; - map->btf_value_id = value_id; + map->btf_key_type_id = key_id; + map->btf_value_type_id = value_id; return 0; } @@ -1100,24 +1100,24 @@ bpf_object__create_maps(struct bpf_object *obj) create_attr.value_size = def->value_size; create_attr.max_entries = def->max_entries; create_attr.btf_fd = 0; - create_attr.btf_key_id = 0; - create_attr.btf_value_id = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; if (obj->btf && !bpf_map_find_btf_info(map, obj->btf)) { create_attr.btf_fd = btf__fd(obj->btf); - create_attr.btf_key_id = map->btf_key_id; - create_attr.btf_value_id = map->btf_value_id; + create_attr.btf_key_type_id = map->btf_key_type_id; + create_attr.btf_value_type_id = map->btf_value_type_id; } *pfd = bpf_create_map_xattr(&create_attr); - if (*pfd < 0 && create_attr.btf_key_id) { + if (*pfd < 0 && create_attr.btf_key_type_id) { pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", map->name, strerror(errno), errno); create_attr.btf_fd = 0; - create_attr.btf_key_id = 0; - create_attr.btf_value_id = 0; - map->btf_key_id = 0; - map->btf_value_id = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; *pfd = bpf_create_map_xattr(&create_attr); } @@ -1456,6 +1456,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_CGROUP_DEVICE: @@ -2085,14 +2086,14 @@ const char *bpf_map__name(struct bpf_map *map) return map ? map->name : NULL; } -uint32_t bpf_map__btf_key_id(const struct bpf_map *map) +uint32_t bpf_map__btf_key_type_id(const struct bpf_map *map) { - return map ? map->btf_key_id : 0; + return map ? map->btf_key_type_id : 0; } -uint32_t bpf_map__btf_value_id(const struct bpf_map *map) +uint32_t bpf_map__btf_value_type_id(const struct bpf_map *map) { - return map ? map->btf_value_id : 0; + return map ? map->btf_value_type_id : 0; } int bpf_map__set_priv(struct bpf_map *map, void *priv, diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index cd3fd8d782c7..09976531aa74 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -244,8 +244,8 @@ bpf_map__next(struct bpf_map *map, struct bpf_object *obj); int bpf_map__fd(struct bpf_map *map); const struct bpf_map_def *bpf_map__def(struct bpf_map *map); const char *bpf_map__name(struct bpf_map *map); -uint32_t bpf_map__btf_key_id(const struct bpf_map *map); -uint32_t bpf_map__btf_value_id(const struct bpf_map *map); +uint32_t bpf_map__btf_key_type_id(const struct bpf_map *map); +uint32_t bpf_map__btf_value_type_id(const struct bpf_map *map); typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); int bpf_map__set_priv(struct bpf_map *map, void *priv, diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1eb0fa2aba92..85044448bbc7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -33,7 +33,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \ sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \ test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ - test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o + test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ + test_lwt_seg6local.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -42,7 +43,8 @@ TEST_PROGS := test_kmod.sh \ test_xdp_meta.sh \ test_offload.py \ test_sock_addr.sh \ - test_tunnel.sh + test_tunnel.sh \ + test_lwt_seg6local.sh # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr @@ -84,7 +86,17 @@ else CPU ?= generic endif +# Get Clang's default includes on this system, as opposed to those seen by +# '-target bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') + CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \ + $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 8f143dfb3700..334d3e8c5e89 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -114,6 +114,18 @@ static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) BPF_FUNC_fib_lookup; +static int (*bpf_lwt_push_encap)(void *ctx, unsigned int type, void *hdr, + unsigned int len) = + (void *) BPF_FUNC_lwt_push_encap; +static int (*bpf_lwt_seg6_store_bytes)(void *ctx, unsigned int offset, + void *from, unsigned int len) = + (void *) BPF_FUNC_lwt_seg6_store_bytes; +static int (*bpf_lwt_seg6_action)(void *ctx, unsigned int action, void *param, + unsigned int param_len) = + (void *) BPF_FUNC_lwt_seg6_action; +static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, unsigned int offset, + unsigned int len) = + (void *) BPF_FUNC_lwt_seg6_adjust_srh; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index c8bceae7ec02..35064df688c1 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -113,22 +113,25 @@ static char btf_log_buf[BTF_LOG_BUF_SIZE]; static struct btf_header hdr_tmpl = { .magic = BTF_MAGIC, .version = BTF_VERSION, + .hdr_len = sizeof(struct btf_header), }; struct btf_raw_test { const char *descr; const char *str_sec; const char *map_name; + const char *err_str; __u32 raw_types[MAX_NR_RAW_TYPES]; __u32 str_sec_size; enum bpf_map_type map_type; __u32 key_size; __u32 value_size; - __u32 key_id; - __u32 value_id; + __u32 key_type_id; + __u32 value_type_id; __u32 max_entries; bool btf_load_err; bool map_create_err; + int hdr_len_delta; int type_off_delta; int str_off_delta; int str_len_delta; @@ -141,8 +144,8 @@ static struct btf_raw_test raw_tests[] = { * }; * * struct A { - * int m; - * unsigned long long n; + * unsigned long long m; + * int n; * char o; * [3 bytes hole] * int p[8]; @@ -163,8 +166,8 @@ static struct btf_raw_test raw_tests[] = { BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */ /* struct A { */ /* [5] */ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 6), 180), - BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */ - BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* unsigned long long n;*/ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/ + BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */ BTF_MEMBER_ENC(NAME_TBD, 6, 384),/* int q[4][8] */ @@ -172,6 +175,7 @@ static struct btf_raw_test raw_tests[] = { /* } */ /* int[4][8] */ BTF_TYPE_ARRAY_ENC(4, 1, 4), /* [6] */ + /* enum E */ /* [7] */ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), sizeof(int)), BTF_ENUM_ENC(NAME_TBD, 0), BTF_ENUM_ENC(NAME_TBD, 1), @@ -183,8 +187,8 @@ static struct btf_raw_test raw_tests[] = { .map_name = "struct_test1_map", .key_size = sizeof(int), .value_size = 180, - .key_id = 1, - .value_id = 5, + .key_type_id = 1, + .value_type_id = 5, .max_entries = 4, }, @@ -238,8 +242,8 @@ static struct btf_raw_test raw_tests[] = { .map_name = "struct_test2_map", .key_size = sizeof(int), .value_size = 68, - .key_id = 1, - .value_id = 3, + .key_type_id = 1, + .value_type_id = 3, .max_entries = 4, }, @@ -258,7 +262,7 @@ static struct btf_raw_test raw_tests[] = { /* struct A { */ /* [2] */ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2 - 1), BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */ - BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* int n; */ + BTF_MEMBER_ENC(NAME_TBD, 1, 32),/* int n; */ /* } */ BTF_END_RAW, }, @@ -268,10 +272,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "size_check1_map", .key_size = sizeof(int), .value_size = 1, - .key_id = 1, - .value_id = 2, + .key_type_id = 1, + .value_type_id = 2, .max_entries = 4, .btf_load_err = true, + .err_str = "Member exceeds struct_size", }, /* Test member exeeds the size of struct @@ -301,11 +306,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "size_check2_map", .key_size = sizeof(int), .value_size = 1, - .key_id = 1, - .value_id = 3, + .key_type_id = 1, + .value_type_id = 3, .max_entries = 4, .btf_load_err = true, - + .err_str = "Member exceeds struct_size", }, /* Test member exeeds the size of struct @@ -335,10 +340,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "size_check3_map", .key_size = sizeof(int), .value_size = 1, - .key_id = 1, - .value_id = 3, + .key_type_id = 1, + .value_type_id = 3, .max_entries = 4, .btf_load_err = true, + .err_str = "Member exceeds struct_size", }, /* Test member exceeds the size of struct @@ -376,10 +382,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "size_check4_map", .key_size = sizeof(int), .value_size = 1, - .key_id = 1, - .value_id = 3, + .key_type_id = 1, + .value_type_id = 3, .max_entries = 4, .btf_load_err = true, + .err_str = "Member exceeds struct_size", }, /* typedef const void * const_void_ptr; @@ -411,8 +418,8 @@ static struct btf_raw_test raw_tests[] = { .map_name = "void_test1_map", .key_size = sizeof(int), .value_size = sizeof(void *), - .key_id = 1, - .value_id = 4, + .key_type_id = 1, + .value_type_id = 4, .max_entries = 4, }, @@ -440,10 +447,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "void_test2_map", .key_size = sizeof(int), .value_size = sizeof(void *), - .key_id = 1, - .value_id = 3, + .key_type_id = 1, + .value_type_id = 3, .max_entries = 4, .btf_load_err = true, + .err_str = "Invalid member", }, /* typedef const void * const_void_ptr; @@ -458,9 +466,9 @@ static struct btf_raw_test raw_tests[] = { BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0), /* const void* */ /* [3] */ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), - /* typedef const void * const_void_ptr */ + /* typedef const void * const_void_ptr */ /* [4] */ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3), - /* const_void_ptr[4] */ /* [4] */ + /* const_void_ptr[4] */ /* [5] */ BTF_TYPE_ARRAY_ENC(3, 1, 4), BTF_END_RAW, }, @@ -470,8 +478,8 @@ static struct btf_raw_test raw_tests[] = { .map_name = "void_test3_map", .key_size = sizeof(int), .value_size = sizeof(void *) * 4, - .key_id = 1, - .value_id = 4, + .key_type_id = 1, + .value_type_id = 4, .max_entries = 4, }, @@ -493,10 +501,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "void_test4_map", .key_size = sizeof(int), .value_size = sizeof(void *) * 4, - .key_id = 1, - .value_id = 3, + .key_type_id = 1, + .value_type_id = 3, .max_entries = 4, .btf_load_err = true, + .err_str = "Invalid elem", }, /* Array_A <------------------+ @@ -523,10 +532,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "loop_test1_map", .key_size = sizeof(int), .value_size = sizeof(sizeof(int) * 8), - .key_id = 1, - .value_id = 2, + .key_type_id = 1, + .value_type_id = 2, .max_entries = 4, .btf_load_err = true, + .err_str = "Loop detected", }, /* typedef is _before_ the BTF type of Array_A and Array_B @@ -551,7 +561,6 @@ static struct btf_raw_test raw_tests[] = { BTF_TYPE_ARRAY_ENC(2, 1, 8), /* [3] */ /* Array_B */ BTF_TYPE_ARRAY_ENC(3, 1, 8), /* [4] */ - BTF_END_RAW, }, .str_sec = "\0int_array\0", @@ -560,10 +569,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "loop_test2_map", .key_size = sizeof(int), .value_size = sizeof(sizeof(int) * 8), - .key_id = 1, - .value_id = 2, + .key_type_id = 1, + .value_type_id = 2, .max_entries = 4, .btf_load_err = true, + .err_str = "Loop detected", }, /* Array_A <------------------+ @@ -582,7 +592,6 @@ static struct btf_raw_test raw_tests[] = { BTF_TYPE_ARRAY_ENC(3, 1, 8), /* Array_B */ /* [3] */ BTF_TYPE_ARRAY_ENC(2, 1, 8), - BTF_END_RAW, }, .str_sec = "", @@ -591,10 +600,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "loop_test3_map", .key_size = sizeof(int), .value_size = sizeof(sizeof(int) * 8), - .key_id = 1, - .value_id = 2, + .key_type_id = 1, + .value_type_id = 2, .max_entries = 4, .btf_load_err = true, + .err_str = "Loop detected", }, /* typedef is _between_ the BTF type of Array_A and Array_B @@ -627,10 +637,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "loop_test4_map", .key_size = sizeof(int), .value_size = sizeof(sizeof(int) * 8), - .key_id = 1, - .value_id = 2, + .key_type_id = 1, + .value_type_id = 2, .max_entries = 4, .btf_load_err = true, + .err_str = "Loop detected", }, /* typedef struct B Struct_B @@ -668,10 +679,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "loop_test5_map", .key_size = sizeof(int), .value_size = 8, - .key_id = 1, - .value_id = 2, + .key_type_id = 1, + .value_type_id = 2, .max_entries = 4, .btf_load_err = true, + .err_str = "Loop detected", }, /* struct A { @@ -697,10 +709,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "loop_test6_map", .key_size = sizeof(int), .value_size = 8, - .key_id = 1, - .value_id = 2, + .key_type_id = 1, + .value_type_id = 2, .max_entries = 4, .btf_load_err = true, + .err_str = "Loop detected", }, { @@ -724,10 +737,11 @@ static struct btf_raw_test raw_tests[] = { .map_name = "loop_test7_map", .key_size = sizeof(int), .value_size = sizeof(void *), - .key_id = 1, - .value_id = 2, + .key_type_id = 1, + .value_type_id = 2, .max_entries = 4, .btf_load_err = true, + .err_str = "Loop detected", }, { @@ -759,34 +773,73 @@ static struct btf_raw_test raw_tests[] = { .map_name = "loop_test8_map", .key_size = sizeof(int), .value_size = sizeof(void *), - .key_id = 1, - .value_id = 2, + .key_type_id = 1, + .value_type_id = 2, .max_entries = 4, .btf_load_err = true, + .err_str = "Loop detected", }, { - .descr = "type_off == str_off", + .descr = "string section does not end with null", .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), BTF_END_RAW, }, .str_sec = "\0int", + .str_sec_size = sizeof("\0int") - 1, + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "hdr_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid string section", +}, + +{ + .descr = "empty string section", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = 0, + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "hdr_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid string section", +}, + +{ + .descr = "empty type section", + .raw_types = { + BTF_END_RAW, + }, + .str_sec = "\0int", .str_sec_size = sizeof("\0int"), .map_type = BPF_MAP_TYPE_ARRAY, .map_name = "hdr_test_map", .key_size = sizeof(int), .value_size = sizeof(int), - .key_id = 1, - .value_id = 1, + .key_type_id = 1, + .value_type_id = 1, .max_entries = 4, .btf_load_err = true, - .type_off_delta = sizeof(struct btf_type) + sizeof(int) + sizeof("\0int"), + .err_str = "No type found", }, { - .descr = "Unaligned type_off", + .descr = "btf_header test. Longer hdr_len", .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), @@ -798,15 +851,16 @@ static struct btf_raw_test raw_tests[] = { .map_name = "hdr_test_map", .key_size = sizeof(int), .value_size = sizeof(int), - .key_id = 1, - .value_id = 1, + .key_type_id = 1, + .value_type_id = 1, .max_entries = 4, .btf_load_err = true, - .type_off_delta = 1, + .hdr_len_delta = 4, + .err_str = "Unsupported btf_header", }, { - .descr = "str_off beyonds btf size", + .descr = "btf_header test. Gap between hdr and type", .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), @@ -818,15 +872,16 @@ static struct btf_raw_test raw_tests[] = { .map_name = "hdr_test_map", .key_size = sizeof(int), .value_size = sizeof(int), - .key_id = 1, - .value_id = 1, + .key_type_id = 1, + .value_type_id = 1, .max_entries = 4, .btf_load_err = true, - .str_off_delta = sizeof("\0int") + 1, + .type_off_delta = 4, + .err_str = "Unsupported section found", }, { - .descr = "str_len beyonds btf size", + .descr = "btf_header test. Gap between type and str", .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), @@ -838,15 +893,16 @@ static struct btf_raw_test raw_tests[] = { .map_name = "hdr_test_map", .key_size = sizeof(int), .value_size = sizeof(int), - .key_id = 1, - .value_id = 1, + .key_type_id = 1, + .value_type_id = 1, .max_entries = 4, .btf_load_err = true, - .str_len_delta = 1, + .str_off_delta = 4, + .err_str = "Unsupported section found", }, { - .descr = "String section does not end with null", + .descr = "btf_header test. Overlap between type and str", .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), @@ -858,15 +914,16 @@ static struct btf_raw_test raw_tests[] = { .map_name = "hdr_test_map", .key_size = sizeof(int), .value_size = sizeof(int), - .key_id = 1, - .value_id = 1, + .key_type_id = 1, + .value_type_id = 1, .max_entries = 4, .btf_load_err = true, - .str_len_delta = -1, + .str_off_delta = -4, + .err_str = "Section overlap found", }, { - .descr = "Empty string section", + .descr = "btf_header test. Larger BTF size", .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), @@ -878,11 +935,288 @@ static struct btf_raw_test raw_tests[] = { .map_name = "hdr_test_map", .key_size = sizeof(int), .value_size = sizeof(int), - .key_id = 1, - .value_id = 1, + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .str_len_delta = -4, + .err_str = "Unsupported section found", +}, + +{ + .descr = "btf_header test. Smaller BTF size", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + BTF_END_RAW, + }, + .str_sec = "\0int", + .str_sec_size = sizeof("\0int"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "hdr_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .str_len_delta = 4, + .err_str = "Total section length too long", +}, + +{ + .descr = "array test. index_type/elem_type \"int\"", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* int[16] */ /* [2] */ + BTF_TYPE_ARRAY_ENC(1, 1, 16), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, +}, + +{ + .descr = "array test. index_type/elem_type \"const int\"", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* int[16] */ /* [2] */ + BTF_TYPE_ARRAY_ENC(3, 3, 16), + /* CONST type_id=1 */ /* [3] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, +}, + +{ + .descr = "array test. index_type \"const int:31\"", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* int:31 */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 31, 4), + /* int[16] */ /* [3] */ + BTF_TYPE_ARRAY_ENC(1, 4, 16), + /* CONST type_id=2 */ /* [4] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid index", +}, + +{ + .descr = "array test. elem_type \"const int:31\"", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* int:31 */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 31, 4), + /* int[16] */ /* [3] */ + BTF_TYPE_ARRAY_ENC(4, 1, 16), + /* CONST type_id=2 */ /* [4] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid array of int", +}, + +{ + .descr = "array test. index_type \"void\"", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* int[16] */ /* [2] */ + BTF_TYPE_ARRAY_ENC(1, 0, 16), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid index", +}, + +{ + .descr = "array test. index_type \"const void\"", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* int[16] */ /* [2] */ + BTF_TYPE_ARRAY_ENC(1, 3, 16), + /* CONST type_id=0 (void) */ /* [3] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid index", +}, + +{ + .descr = "array test. elem_type \"const void\"", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* int[16] */ /* [2] */ + BTF_TYPE_ARRAY_ENC(3, 1, 16), + /* CONST type_id=0 (void) */ /* [3] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid elem", +}, + +{ + .descr = "array test. elem_type \"const void *\"", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* const void *[16] */ /* [2] */ + BTF_TYPE_ARRAY_ENC(3, 1, 16), + /* CONST type_id=4 */ /* [3] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4), + /* void* */ /* [4] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, +}, + +{ + .descr = "array test. index_type \"const void *\"", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* const void *[16] */ /* [2] */ + BTF_TYPE_ARRAY_ENC(3, 3, 16), + /* CONST type_id=4 */ /* [3] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4), + /* void* */ /* [4] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, .max_entries = 4, .btf_load_err = true, - .str_len_delta = 0 - (int)sizeof("\0int"), + .err_str = "Invalid index", +}, + +{ + .descr = "int test. invalid int_data", + .raw_types = { + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), 4), + 0x10000000, + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid int_data", +}, + +{ + .descr = "invalid BTF_INFO", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + BTF_TYPE_ENC(0, 0x10000000, 4), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_test_map", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid btf_info", }, }; /* struct btf_raw_test raw_tests[] */ @@ -951,6 +1285,7 @@ static void *btf_raw_create(const struct btf_header *hdr, memcpy(raw_btf + offset, str, str_sec_size); ret_hdr = (struct btf_header *)raw_btf; + ret_hdr->type_len = type_sec_size; ret_hdr->str_off = type_sec_size; ret_hdr->str_len = str_sec_size; @@ -981,6 +1316,7 @@ static int do_test_raw(unsigned int test_num) hdr = raw_btf; + hdr->hdr_len = (int)hdr->hdr_len + test->hdr_len_delta; hdr->type_off = (int)hdr->type_off + test->type_off_delta; hdr->str_off = (int)hdr->str_off + test->str_off_delta; hdr->str_len = (int)hdr->str_len + test->str_len_delta; @@ -992,8 +1328,13 @@ static int do_test_raw(unsigned int test_num) free(raw_btf); err = ((btf_fd == -1) != test->btf_load_err); - CHECK(err, "btf_fd:%d test->btf_load_err:%u", - btf_fd, test->btf_load_err); + if (CHECK(err, "btf_fd:%d test->btf_load_err:%u", + btf_fd, test->btf_load_err) || + CHECK(test->err_str && !strstr(btf_log_buf, test->err_str), + "expected err_str:%s", test->err_str)) { + err = -1; + goto done; + } if (err || btf_fd == -1) goto done; @@ -1004,8 +1345,8 @@ static int do_test_raw(unsigned int test_num) create_attr.value_size = test->value_size; create_attr.max_entries = test->max_entries; create_attr.btf_fd = btf_fd; - create_attr.btf_key_id = test->key_id; - create_attr.btf_value_id = test->value_id; + create_attr.btf_key_type_id = test->key_type_id; + create_attr.btf_value_type_id = test->value_type_id; map_fd = bpf_create_map_xattr(&create_attr); @@ -1267,8 +1608,8 @@ static int test_btf_id(unsigned int test_num) create_attr.value_size = sizeof(unsigned int); create_attr.max_entries = 4; create_attr.btf_fd = btf_fd[0]; - create_attr.btf_key_id = 1; - create_attr.btf_value_id = 2; + create_attr.btf_key_type_id = 1; + create_attr.btf_value_type_id = 2; map_fd = bpf_create_map_xattr(&create_attr); if (CHECK(map_fd == -1, "errno:%d", errno)) { @@ -1279,10 +1620,10 @@ static int test_btf_id(unsigned int test_num) info_len = sizeof(map_info); err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len); if (CHECK(err || map_info.btf_id != info[0].id || - map_info.btf_key_id != 1 || map_info.btf_value_id != 2, - "err:%d errno:%d info.id:%u btf_id:%u btf_key_id:%u btf_value_id:%u", - err, errno, info[0].id, map_info.btf_id, map_info.btf_key_id, - map_info.btf_value_id)) { + map_info.btf_key_type_id != 1 || map_info.btf_value_type_id != 2, + "err:%d errno:%d info.id:%u btf_id:%u btf_key_type_id:%u btf_value_type_id:%u", + err, errno, info[0].id, map_info.btf_id, map_info.btf_key_type_id, + map_info.btf_value_type_id)) { err = -1; goto done; } @@ -1542,10 +1883,10 @@ static int do_test_file(unsigned int test_num) goto done; } - err = (bpf_map__btf_key_id(map) == 0 || bpf_map__btf_value_id(map) == 0) + err = (bpf_map__btf_key_type_id(map) == 0 || bpf_map__btf_value_type_id(map) == 0) != test->btf_kv_notfound; - if (CHECK(err, "btf_key_id:%u btf_value_id:%u test->btf_kv_notfound:%u", - bpf_map__btf_key_id(map), bpf_map__btf_value_id(map), + if (CHECK(err, "btf_key_type_id:%u btf_value_type_id:%u test->btf_kv_notfound:%u", + bpf_map__btf_key_type_id(map), bpf_map__btf_value_type_id(map), test->btf_kv_notfound)) goto done; @@ -1615,7 +1956,7 @@ static struct btf_raw_test pprint_test = { /* 28 bits */ /* [7] */ BTF_TYPE_INT_ENC(0, 0, 0, 28, 4), /* uint8_t[8] */ /* [8] */ - BTF_TYPE_ARRAY_ENC(9, 3, 8), + BTF_TYPE_ARRAY_ENC(9, 1, 8), /* typedef unsigned char uint8_t */ /* [9] */ BTF_TYPEDEF_ENC(NAME_TBD, 1), /* typedef unsigned short uint16_t */ /* [10] */ @@ -1654,8 +1995,8 @@ static struct btf_raw_test pprint_test = { .map_name = "pprint_test", .key_size = sizeof(unsigned int), .value_size = sizeof(struct pprint_mapv), - .key_id = 3, /* unsigned int */ - .value_id = 16, /* struct pprint_mapv */ + .key_type_id = 3, /* unsigned int */ + .value_type_id = 16, /* struct pprint_mapv */ .max_entries = 128 * 1024, }; @@ -1712,8 +2053,8 @@ static int test_pprint(void) create_attr.value_size = test->value_size; create_attr.max_entries = test->max_entries; create_attr.btf_fd = btf_fd; - create_attr.btf_key_id = test->key_id; - create_attr.btf_value_id = test->value_id; + create_attr.btf_key_type_id = test->key_type_id; + create_attr.btf_value_type_id = test->value_type_id; map_fd = bpf_create_map_xattr(&create_attr); if (CHECK(map_fd == -1, "errno:%d", errno)) { diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.c b/tools/testing/selftests/bpf/test_lwt_seg6local.c new file mode 100644 index 000000000000..0575751bc1bc --- /dev/null +++ b/tools/testing/selftests/bpf/test_lwt_seg6local.c @@ -0,0 +1,437 @@ +#include <stddef.h> +#include <inttypes.h> +#include <errno.h> +#include <linux/seg6_local.h> +#include <linux/bpf.h> +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +/* Packet parsing state machine helpers. */ +#define cursor_advance(_cursor, _len) \ + ({ void *_tmp = _cursor; _cursor += _len; _tmp; }) + +#define SR6_FLAG_ALERT (1 << 4) + +#define htonll(x) ((bpf_htonl(1)) == 1 ? (x) : ((uint64_t)bpf_htonl((x) & \ + 0xFFFFFFFF) << 32) | bpf_htonl((x) >> 32)) +#define ntohll(x) ((bpf_ntohl(1)) == 1 ? (x) : ((uint64_t)bpf_ntohl((x) & \ + 0xFFFFFFFF) << 32) | bpf_ntohl((x) >> 32)) +#define BPF_PACKET_HEADER __attribute__((packed)) + +struct ip6_t { + unsigned int ver:4; + unsigned int priority:8; + unsigned int flow_label:20; + unsigned short payload_len; + unsigned char next_header; + unsigned char hop_limit; + unsigned long long src_hi; + unsigned long long src_lo; + unsigned long long dst_hi; + unsigned long long dst_lo; +} BPF_PACKET_HEADER; + +struct ip6_addr_t { + unsigned long long hi; + unsigned long long lo; +} BPF_PACKET_HEADER; + +struct ip6_srh_t { + unsigned char nexthdr; + unsigned char hdrlen; + unsigned char type; + unsigned char segments_left; + unsigned char first_segment; + unsigned char flags; + unsigned short tag; + + struct ip6_addr_t segments[0]; +} BPF_PACKET_HEADER; + +struct sr6_tlv_t { + unsigned char type; + unsigned char len; + unsigned char value[0]; +} BPF_PACKET_HEADER; + +__attribute__((always_inline)) struct ip6_srh_t *get_srh(struct __sk_buff *skb) +{ + void *cursor, *data_end; + struct ip6_srh_t *srh; + struct ip6_t *ip; + uint8_t *ipver; + + data_end = (void *)(long)skb->data_end; + cursor = (void *)(long)skb->data; + ipver = (uint8_t *)cursor; + + if ((void *)ipver + sizeof(*ipver) > data_end) + return NULL; + + if ((*ipver >> 4) != 6) + return NULL; + + ip = cursor_advance(cursor, sizeof(*ip)); + if ((void *)ip + sizeof(*ip) > data_end) + return NULL; + + if (ip->next_header != 43) + return NULL; + + srh = cursor_advance(cursor, sizeof(*srh)); + if ((void *)srh + sizeof(*srh) > data_end) + return NULL; + + if (srh->type != 4) + return NULL; + + return srh; +} + +__attribute__((always_inline)) +int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad, + uint32_t old_pad, uint32_t pad_off) +{ + int err; + + if (new_pad != old_pad) { + err = bpf_lwt_seg6_adjust_srh(skb, pad_off, + (int) new_pad - (int) old_pad); + if (err) + return err; + } + + if (new_pad > 0) { + char pad_tlv_buf[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0}; + struct sr6_tlv_t *pad_tlv = (struct sr6_tlv_t *) pad_tlv_buf; + + pad_tlv->type = SR6_TLV_PADDING; + pad_tlv->len = new_pad - 2; + + err = bpf_lwt_seg6_store_bytes(skb, pad_off, + (void *)pad_tlv_buf, new_pad); + if (err) + return err; + } + + return 0; +} + +__attribute__((always_inline)) +int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh, + uint32_t *tlv_off, uint32_t *pad_size, + uint32_t *pad_off) +{ + uint32_t srh_off, cur_off; + int offset_valid = 0; + int err; + + srh_off = (char *)srh - (char *)(long)skb->data; + // cur_off = end of segments, start of possible TLVs + cur_off = srh_off + sizeof(*srh) + + sizeof(struct ip6_addr_t) * (srh->first_segment + 1); + + *pad_off = 0; + + // we can only go as far as ~10 TLVs due to the BPF max stack size + #pragma clang loop unroll(full) + for (int i = 0; i < 10; i++) { + struct sr6_tlv_t tlv; + + if (cur_off == *tlv_off) + offset_valid = 1; + + if (cur_off >= srh_off + ((srh->hdrlen + 1) << 3)) + break; + + err = bpf_skb_load_bytes(skb, cur_off, &tlv, sizeof(tlv)); + if (err) + return err; + + if (tlv.type == SR6_TLV_PADDING) { + *pad_size = tlv.len + sizeof(tlv); + *pad_off = cur_off; + + if (*tlv_off == srh_off) { + *tlv_off = cur_off; + offset_valid = 1; + } + break; + + } else if (tlv.type == SR6_TLV_HMAC) { + break; + } + + cur_off += sizeof(tlv) + tlv.len; + } // we reached the padding or HMAC TLVs, or the end of the SRH + + if (*pad_off == 0) + *pad_off = cur_off; + + if (*tlv_off == -1) + *tlv_off = cur_off; + else if (!offset_valid) + return -EINVAL; + + return 0; +} + +__attribute__((always_inline)) +int add_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off, + struct sr6_tlv_t *itlv, uint8_t tlv_size) +{ + uint32_t srh_off = (char *)srh - (char *)(long)skb->data; + uint8_t len_remaining, new_pad; + uint32_t pad_off = 0; + uint32_t pad_size = 0; + uint32_t partial_srh_len; + int err; + + if (tlv_off != -1) + tlv_off += srh_off; + + if (itlv->type == SR6_TLV_PADDING || itlv->type == SR6_TLV_HMAC) + return -EINVAL; + + err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off); + if (err) + return err; + + err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, sizeof(*itlv) + itlv->len); + if (err) + return err; + + err = bpf_lwt_seg6_store_bytes(skb, tlv_off, (void *)itlv, tlv_size); + if (err) + return err; + + // the following can't be moved inside update_tlv_pad because the + // bpf verifier has some issues with it + pad_off += sizeof(*itlv) + itlv->len; + partial_srh_len = pad_off - srh_off; + len_remaining = partial_srh_len % 8; + new_pad = 8 - len_remaining; + + if (new_pad == 1) // cannot pad for 1 byte only + new_pad = 9; + else if (new_pad == 8) + new_pad = 0; + + return update_tlv_pad(skb, new_pad, pad_size, pad_off); +} + +__attribute__((always_inline)) +int delete_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, + uint32_t tlv_off) +{ + uint32_t srh_off = (char *)srh - (char *)(long)skb->data; + uint8_t len_remaining, new_pad; + uint32_t partial_srh_len; + uint32_t pad_off = 0; + uint32_t pad_size = 0; + struct sr6_tlv_t tlv; + int err; + + tlv_off += srh_off; + + err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off); + if (err) + return err; + + err = bpf_skb_load_bytes(skb, tlv_off, &tlv, sizeof(tlv)); + if (err) + return err; + + err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, -(sizeof(tlv) + tlv.len)); + if (err) + return err; + + pad_off -= sizeof(tlv) + tlv.len; + partial_srh_len = pad_off - srh_off; + len_remaining = partial_srh_len % 8; + new_pad = 8 - len_remaining; + if (new_pad == 1) // cannot pad for 1 byte only + new_pad = 9; + else if (new_pad == 8) + new_pad = 0; + + return update_tlv_pad(skb, new_pad, pad_size, pad_off); +} + +__attribute__((always_inline)) +int has_egr_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh) +{ + int tlv_offset = sizeof(struct ip6_t) + sizeof(struct ip6_srh_t) + + ((srh->first_segment + 1) << 4); + struct sr6_tlv_t tlv; + + if (bpf_skb_load_bytes(skb, tlv_offset, &tlv, sizeof(struct sr6_tlv_t))) + return 0; + + if (tlv.type == SR6_TLV_EGRESS && tlv.len == 18) { + struct ip6_addr_t egr_addr; + + if (bpf_skb_load_bytes(skb, tlv_offset + 4, &egr_addr, 16)) + return 0; + + // check if egress TLV value is correct + if (ntohll(egr_addr.hi) == 0xfd00000000000000 && + ntohll(egr_addr.lo) == 0x4) + return 1; + } + + return 0; +} + +// This function will push a SRH with segments fd00::1, fd00::2, fd00::3, +// fd00::4 +SEC("encap_srh") +int __encap_srh(struct __sk_buff *skb) +{ + unsigned long long hi = 0xfd00000000000000; + struct ip6_addr_t *seg; + struct ip6_srh_t *srh; + char srh_buf[72]; // room for 4 segments + int err; + + srh = (struct ip6_srh_t *)srh_buf; + srh->nexthdr = 0; + srh->hdrlen = 8; + srh->type = 4; + srh->segments_left = 3; + srh->first_segment = 3; + srh->flags = 0; + srh->tag = 0; + + seg = (struct ip6_addr_t *)((char *)srh + sizeof(*srh)); + + #pragma clang loop unroll(full) + for (unsigned long long lo = 0; lo < 4; lo++) { + seg->lo = htonll(4 - lo); + seg->hi = htonll(hi); + seg = (struct ip6_addr_t *)((char *)seg + sizeof(*seg)); + } + + err = bpf_lwt_push_encap(skb, 0, (void *)srh, sizeof(srh_buf)); + if (err) + return BPF_DROP; + + return BPF_REDIRECT; +} + +// Add an Egress TLV fc00::4, add the flag A, +// and apply End.X action to fc42::1 +SEC("add_egr_x") +int __add_egr_x(struct __sk_buff *skb) +{ + unsigned long long hi = 0xfc42000000000000; + unsigned long long lo = 0x1; + struct ip6_srh_t *srh = get_srh(skb); + uint8_t new_flags = SR6_FLAG_ALERT; + struct ip6_addr_t addr; + int err, offset; + + if (srh == NULL) + return BPF_DROP; + + uint8_t tlv[20] = {2, 18, 0, 0, 0xfd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4}; + + err = add_tlv(skb, srh, (srh->hdrlen+1) << 3, + (struct sr6_tlv_t *)&tlv, 20); + if (err) + return BPF_DROP; + + offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags); + err = bpf_lwt_seg6_store_bytes(skb, offset, + (void *)&new_flags, sizeof(new_flags)); + if (err) + return BPF_DROP; + + addr.lo = htonll(lo); + addr.hi = htonll(hi); + err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X, + (void *)&addr, sizeof(addr)); + if (err) + return BPF_DROP; + return BPF_REDIRECT; +} + +// Pop the Egress TLV, reset the flags, change the tag 2442 and finally do a +// simple End action +SEC("pop_egr") +int __pop_egr(struct __sk_buff *skb) +{ + struct ip6_srh_t *srh = get_srh(skb); + uint16_t new_tag = bpf_htons(2442); + uint8_t new_flags = 0; + int err, offset; + + if (srh == NULL) + return BPF_DROP; + + if (srh->flags != SR6_FLAG_ALERT) + return BPF_DROP; + + if (srh->hdrlen != 11) // 4 segments + Egress TLV + Padding TLV + return BPF_DROP; + + if (!has_egr_tlv(skb, srh)) + return BPF_DROP; + + err = delete_tlv(skb, srh, 8 + (srh->first_segment + 1) * 16); + if (err) + return BPF_DROP; + + offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags); + if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_flags, + sizeof(new_flags))) + return BPF_DROP; + + offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, tag); + if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_tag, + sizeof(new_tag))) + return BPF_DROP; + + return BPF_OK; +} + +// Inspect if the Egress TLV and flag have been removed, if the tag is correct, +// then apply a End.T action to reach the last segment +SEC("inspect_t") +int __inspect_t(struct __sk_buff *skb) +{ + struct ip6_srh_t *srh = get_srh(skb); + int table = 117; + int err; + + if (srh == NULL) + return BPF_DROP; + + if (srh->flags != 0) + return BPF_DROP; + + if (srh->tag != bpf_htons(2442)) + return BPF_DROP; + + if (srh->hdrlen != 8) // 4 segments + return BPF_DROP; + + err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_T, + (void *)&table, sizeof(table)); + + if (err) + return BPF_DROP; + + return BPF_REDIRECT; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.sh b/tools/testing/selftests/bpf/test_lwt_seg6local.sh new file mode 100755 index 000000000000..1c77994b5e71 --- /dev/null +++ b/tools/testing/selftests/bpf/test_lwt_seg6local.sh @@ -0,0 +1,140 @@ +#!/bin/bash +# Connects 6 network namespaces through veths. +# Each NS may have different IPv6 global scope addresses : +# NS1 ---- NS2 ---- NS3 ---- NS4 ---- NS5 ---- NS6 +# fb00::1 fd00::1 fd00::2 fd00::3 fb00::6 +# fc42::1 fd00::4 +# +# All IPv6 packets going to fb00::/16 through NS2 will be encapsulated in a +# IPv6 header with a Segment Routing Header, with segments : +# fd00::1 -> fd00::2 -> fd00::3 -> fd00::4 +# +# 3 fd00::/16 IPv6 addresses are binded to seg6local End.BPF actions : +# - fd00::1 : add a TLV, change the flags and apply a End.X action to fc42::1 +# - fd00::2 : remove the TLV, change the flags, add a tag +# - fd00::3 : apply an End.T action to fd00::4, through routing table 117 +# +# fd00::4 is a simple Segment Routing node decapsulating the inner IPv6 packet. +# Each End.BPF action will validate the operations applied on the SRH by the +# previous BPF program in the chain, otherwise the packet is dropped. +# +# An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this +# datagram can be read on NS6 when binding to fb00::6. + +TMP_FILE="/tmp/selftest_lwt_seg6local.txt" + +cleanup() +{ + if [ "$?" = "0" ]; then + echo "selftests: test_lwt_seg6local [PASS]"; + else + echo "selftests: test_lwt_seg6local [FAILED]"; + fi + + set +e + ip netns del ns1 2> /dev/null + ip netns del ns2 2> /dev/null + ip netns del ns3 2> /dev/null + ip netns del ns4 2> /dev/null + ip netns del ns5 2> /dev/null + ip netns del ns6 2> /dev/null + rm -f $TMP_FILE +} + +set -e + +ip netns add ns1 +ip netns add ns2 +ip netns add ns3 +ip netns add ns4 +ip netns add ns5 +ip netns add ns6 + +trap cleanup 0 2 3 6 9 + +ip link add veth1 type veth peer name veth2 +ip link add veth3 type veth peer name veth4 +ip link add veth5 type veth peer name veth6 +ip link add veth7 type veth peer name veth8 +ip link add veth9 type veth peer name veth10 + +ip link set veth1 netns ns1 +ip link set veth2 netns ns2 +ip link set veth3 netns ns2 +ip link set veth4 netns ns3 +ip link set veth5 netns ns3 +ip link set veth6 netns ns4 +ip link set veth7 netns ns4 +ip link set veth8 netns ns5 +ip link set veth9 netns ns5 +ip link set veth10 netns ns6 + +ip netns exec ns1 ip link set dev veth1 up +ip netns exec ns2 ip link set dev veth2 up +ip netns exec ns2 ip link set dev veth3 up +ip netns exec ns3 ip link set dev veth4 up +ip netns exec ns3 ip link set dev veth5 up +ip netns exec ns4 ip link set dev veth6 up +ip netns exec ns4 ip link set dev veth7 up +ip netns exec ns5 ip link set dev veth8 up +ip netns exec ns5 ip link set dev veth9 up +ip netns exec ns6 ip link set dev veth10 up +ip netns exec ns6 ip link set dev lo up + +# All link scope addresses and routes required between veths +ip netns exec ns1 ip -6 addr add fb00::12/16 dev veth1 scope link +ip netns exec ns1 ip -6 route add fb00::21 dev veth1 scope link +ip netns exec ns2 ip -6 addr add fb00::21/16 dev veth2 scope link +ip netns exec ns2 ip -6 addr add fb00::34/16 dev veth3 scope link +ip netns exec ns2 ip -6 route add fb00::43 dev veth3 scope link +ip netns exec ns3 ip -6 route add fb00::65 dev veth5 scope link +ip netns exec ns3 ip -6 addr add fb00::43/16 dev veth4 scope link +ip netns exec ns3 ip -6 addr add fb00::56/16 dev veth5 scope link +ip netns exec ns4 ip -6 addr add fb00::65/16 dev veth6 scope link +ip netns exec ns4 ip -6 addr add fb00::78/16 dev veth7 scope link +ip netns exec ns4 ip -6 route add fb00::87 dev veth7 scope link +ip netns exec ns5 ip -6 addr add fb00::87/16 dev veth8 scope link +ip netns exec ns5 ip -6 addr add fb00::910/16 dev veth9 scope link +ip netns exec ns5 ip -6 route add fb00::109 dev veth9 scope link +ip netns exec ns5 ip -6 route add fb00::109 table 117 dev veth9 scope link +ip netns exec ns6 ip -6 addr add fb00::109/16 dev veth10 scope link + +ip netns exec ns1 ip -6 addr add fb00::1/16 dev lo +ip netns exec ns1 ip -6 route add fb00::6 dev veth1 via fb00::21 + +ip netns exec ns2 ip -6 route add fb00::6 encap bpf in obj test_lwt_seg6local.o sec encap_srh dev veth2 +ip netns exec ns2 ip -6 route add fd00::1 dev veth3 via fb00::43 scope link + +ip netns exec ns3 ip -6 route add fc42::1 dev veth5 via fb00::65 +ip netns exec ns3 ip -6 route add fd00::1 encap seg6local action End.BPF obj test_lwt_seg6local.o sec add_egr_x dev veth4 + +ip netns exec ns4 ip -6 route add fd00::2 encap seg6local action End.BPF obj test_lwt_seg6local.o sec pop_egr dev veth6 +ip netns exec ns4 ip -6 addr add fc42::1 dev lo +ip netns exec ns4 ip -6 route add fd00::3 dev veth7 via fb00::87 + +ip netns exec ns5 ip -6 route add fd00::4 table 117 dev veth9 via fb00::109 +ip netns exec ns5 ip -6 route add fd00::3 encap seg6local action End.BPF obj test_lwt_seg6local.o sec inspect_t dev veth8 + +ip netns exec ns6 ip -6 addr add fb00::6/16 dev lo +ip netns exec ns6 ip -6 addr add fd00::4/16 dev lo + +ip netns exec ns1 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec ns2 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec ns3 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec ns4 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec ns5 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null + +ip netns exec ns6 sysctl net.ipv6.conf.all.seg6_enabled=1 > /dev/null +ip netns exec ns6 sysctl net.ipv6.conf.lo.seg6_enabled=1 > /dev/null +ip netns exec ns6 sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null + +ip netns exec ns6 nc -l -6 -u -d 7330 > $TMP_FILE & +ip netns exec ns1 bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330" +sleep 5 # wait enough time to ensure the UDP datagram arrived to the last segment +kill -INT $! + +if [[ $(< $TMP_FILE) != "foobar" ]]; then + exit 1 +fi + +exit 0 diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 3ecf733330c1..0ef68204c84b 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1542,6 +1542,162 @@ close_prog_noerr: bpf_object__close(obj); } +static void test_task_fd_query_rawtp(void) +{ + const char *file = "./test_get_stack_rawtp.o"; + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + struct bpf_object *obj; + int efd, err, prog_fd; + __u32 duration = 0; + char buf[256]; + + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd); + if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno)) + return; + + efd = bpf_raw_tracepoint_open("sys_enter", prog_fd); + if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno)) + goto close_prog; + + /* query (getpid(), efd) */ + len = sizeof(buf); + err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id, + &fd_type, &probe_offset, &probe_addr); + if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err, + errno)) + goto close_prog; + + err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT && + strcmp(buf, "sys_enter") == 0; + if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n", + fd_type, buf)) + goto close_prog; + + /* test zero len */ + len = 0; + err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id, + &fd_type, &probe_offset, &probe_addr); + if (CHECK(err < 0, "bpf_task_fd_query (len = 0)", "err %d errno %d\n", + err, errno)) + goto close_prog; + err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT && + len == strlen("sys_enter"); + if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len)) + goto close_prog; + + /* test empty buffer */ + len = sizeof(buf); + err = bpf_task_fd_query(getpid(), efd, 0, 0, &len, &prog_id, + &fd_type, &probe_offset, &probe_addr); + if (CHECK(err < 0, "bpf_task_fd_query (buf = 0)", "err %d errno %d\n", + err, errno)) + goto close_prog; + err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT && + len == strlen("sys_enter"); + if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len)) + goto close_prog; + + /* test smaller buffer */ + len = 3; + err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id, + &fd_type, &probe_offset, &probe_addr); + if (CHECK(err >= 0 || errno != ENOSPC, "bpf_task_fd_query (len = 3)", + "err %d errno %d\n", err, errno)) + goto close_prog; + err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT && + len == strlen("sys_enter") && + strcmp(buf, "sy") == 0; + if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len)) + goto close_prog; + + goto close_prog_noerr; +close_prog: + error_cnt++; +close_prog_noerr: + bpf_object__close(obj); +} + +static void test_task_fd_query_tp_core(const char *probe_name, + const char *tp_name) +{ + const char *file = "./test_tracepoint.o"; + int err, bytes, efd, prog_fd, pmu_fd; + struct perf_event_attr attr = {}; + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + struct bpf_object *obj; + __u32 duration = 0; + char buf[256]; + + err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); + if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno)) + goto close_prog; + + snprintf(buf, sizeof(buf), + "/sys/kernel/debug/tracing/events/%s/id", probe_name); + efd = open(buf, O_RDONLY, 0); + if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) + goto close_prog; + bytes = read(efd, buf, sizeof(buf)); + close(efd); + if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read", + "bytes %d errno %d\n", bytes, errno)) + goto close_prog; + + attr.config = strtol(buf, NULL, 0); + attr.type = PERF_TYPE_TRACEPOINT; + attr.sample_type = PERF_SAMPLE_RAW; + attr.sample_period = 1; + attr.wakeup_events = 1; + pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, + 0 /* cpu 0 */, -1 /* group id */, + 0 /* flags */); + if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno)) + goto close_pmu; + + err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); + if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err, + errno)) + goto close_pmu; + + err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); + if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err, + errno)) + goto close_pmu; + + /* query (getpid(), pmu_fd) */ + len = sizeof(buf); + err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, &len, &prog_id, + &fd_type, &probe_offset, &probe_addr); + if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err, + errno)) + goto close_pmu; + + err = (fd_type == BPF_FD_TYPE_TRACEPOINT) && !strcmp(buf, tp_name); + if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n", + fd_type, buf)) + goto close_pmu; + + close(pmu_fd); + goto close_prog_noerr; + +close_pmu: + close(pmu_fd); +close_prog: + error_cnt++; +close_prog_noerr: + bpf_object__close(obj); +} + +static void test_task_fd_query_tp(void) +{ + test_task_fd_query_tp_core("sched/sched_switch", + "sched_switch"); + test_task_fd_query_tp_core("syscalls/sys_enter_read", + "sys_enter_read"); +} + int main(void) { jit_enabled = is_jit_enabled(); @@ -1561,6 +1717,8 @@ int main(void) test_stacktrace_build_id_nmi(); test_stacktrace_map_raw_tp(); test_get_stack_raw_tp(); + test_task_fd_query_rawtp(); + test_task_fd_query_tp(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 94498eaf872e..4b4f015be217 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1686,6 +1686,121 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SK_SKB, }, { + "valid access family in SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, family)), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { + "valid access remote_ip4 in SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, remote_ip4)), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { + "valid access local_ip4 in SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, local_ip4)), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { + "valid access remote_port in SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, remote_port)), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { + "valid access local_port in SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, local_port)), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { + "valid access remote_ip6 in SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, remote_ip6[0])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, remote_ip6[1])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, remote_ip6[2])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, remote_ip6[3])), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_SKB, + }, + { + "valid access local_ip6 in SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, local_ip6[0])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, local_ip6[1])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, local_ip6[2])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct sk_msg_md, local_ip6[3])), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_SKB, + }, + { + "invalid 64B read of family in SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, + offsetof(struct sk_msg_md, family)), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { + "invalid read past end of SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct sk_msg_md, local_port) + 4), + BPF_EXIT_INSN(), + }, + .errstr = "R0 !read_ok", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { + "invalid read offset in SK_MSG", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct sk_msg_md, family) + 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SK_MSG, + }, + { "direct packet read for SK_MSG", .insns = { BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 8fb4fe8686e4..3868dcb63420 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -72,6 +72,18 @@ struct ksym *ksym_search(long key) return &syms[0]; } +long ksym_get_addr(const char *name) +{ + int i; + + for (i = 0; i < sym_cnt; i++) { + if (strcmp(syms[i].name, name) == 0) + return syms[i].addr; + } + + return 0; +} + static int page_size; static int page_cnt = 8; static struct perf_event_mmap_page *header; diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 36d90e3b1ea9..3b4bcf7f5084 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -11,6 +11,7 @@ struct ksym { int load_kallsyms(void); struct ksym *ksym_search(long key); +long ksym_get_addr(const char *name); typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size); diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 91041c49655b..89ba4cdd4392 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -340,6 +340,31 @@ tunnel_destroy() ip link del dev $name } +vlan_create() +{ + local if_name=$1; shift + local vid=$1; shift + local vrf=$1; shift + local ips=("${@}") + local name=$if_name.$vid + + ip link add name $name link $if_name type vlan id $vid + if [ "$vrf" != "" ]; then + ip link set dev $name master $vrf + fi + ip link set dev $name up + __addr_add_del $name add "${ips[@]}" +} + +vlan_destroy() +{ + local if_name=$1; shift + local vid=$1; shift + local name=$if_name.$vid + + ip link del dev $name +} + master_name_get() { local if_name=$1 @@ -423,26 +448,35 @@ tc_offload_check() return 0 } -slow_path_trap_install() +trap_install() { local dev=$1; shift local direction=$1; shift - if [ "${tcflags/skip_hw}" != "$tcflags" ]; then - # For slow-path testing, we need to install a trap to get to - # slow path the packets that would otherwise be switched in HW. - tc filter add dev $dev $direction pref 1 \ - flower skip_sw action trap - fi + # For slow-path testing, we need to install a trap to get to + # slow path the packets that would otherwise be switched in HW. + tc filter add dev $dev $direction pref 1 flower skip_sw action trap } -slow_path_trap_uninstall() +trap_uninstall() { local dev=$1; shift local direction=$1; shift + tc filter del dev $dev $direction pref 1 flower skip_sw +} + +slow_path_trap_install() +{ + if [ "${tcflags/skip_hw}" != "$tcflags" ]; then + trap_install "$@" + fi +} + +slow_path_trap_uninstall() +{ if [ "${tcflags/skip_hw}" != "$tcflags" ]; then - tc filter del dev $dev $direction pref 1 flower skip_sw + trap_uninstall "$@" fi } diff --git a/tools/testing/selftests/net/forwarding/mirror_gre.sh b/tools/testing/selftests/net/forwarding/mirror_gre.sh index c6786d1b2b96..e6fd7a18c655 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre.sh @@ -72,7 +72,6 @@ test_span_gre_mac() RET=0 mirror_install $swp1 $direction $tundev "matchall $tcflags" - tc qdisc add dev $h3 clsact tc filter add dev $h3 ingress pref 77 prot $prot \ flower ip_proto 0x2f src_mac $swp3mac dst_mac $h3mac \ action pass @@ -80,7 +79,6 @@ test_span_gre_mac() mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 10 tc filter del dev $h3 ingress pref 77 - tc qdisc del dev $h3 clsact mirror_uninstall $swp1 $direction log_test "$direction $what: envelope MAC ($tcflags)" diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh new file mode 100755 index 000000000000..3d47afcf7fb9 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test uses standard topology for testing gretap. See +# mirror_gre_topo_lib.sh for more details. +# +# Test for "tc action mirred egress mirror" when the underlay route points at a +# bridge device without vlan filtering (802.1d). The device attached to that +# bridge is a VLAN. + +ALL_TESTS=" + test_gretap + test_ip6gretap +" + +NUM_NETIFS=6 +source lib.sh +source mirror_lib.sh +source mirror_gre_lib.sh +source mirror_gre_topo_lib.sh + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + vrf_prepare + mirror_gre_topo_create + + ip link add name br2 type bridge vlan_filtering 0 + ip link set dev br2 up + + vlan_create $swp3 555 + + ip link set dev $swp3.555 master br2 + ip route add 192.0.2.130/32 dev br2 + ip -6 route add 2001:db8:2::2/128 dev br2 + + ip address add dev br2 192.0.2.129/32 + ip address add dev br2 2001:db8:2::1/128 + + vlan_create $h3 555 v$h3 192.0.2.130/28 2001:db8:2::2/64 +} + +cleanup() +{ + pre_cleanup + + vlan_destroy $h3 555 + ip link del dev br2 + vlan_destroy $swp3 555 + + mirror_gre_topo_destroy + vrf_cleanup +} + +test_vlan_match() +{ + local tundev=$1; shift + local vlan_match=$1; shift + local what=$1; shift + + full_test_span_gre_dir_vlan $tundev ingress "$vlan_match" 8 0 "$what" + full_test_span_gre_dir_vlan $tundev egress "$vlan_match" 0 8 "$what" +} + +test_gretap() +{ + test_vlan_match gt4 'vlan_id 555 vlan_ethtype ip' "mirror to gretap" +} + +test_ip6gretap() +{ + test_vlan_match gt6 'vlan_id 555 vlan_ethtype ipv6' "mirror to ip6gretap" +} + +test_all() +{ + slow_path_trap_install $swp1 ingress + slow_path_trap_install $swp1 egress + + tests_run + + slow_path_trap_uninstall $swp1 egress + slow_path_trap_uninstall $swp1 ingress +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tcflags="skip_hw" +test_all + +if ! tc_offload_check; then + echo "WARN: Could not test offloaded functionality" +else + tcflags="skip_sw" + test_all +fi + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh index 50ab3462af0c..aa29d46186a8 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh @@ -12,6 +12,8 @@ ALL_TESTS=" test_tun_up test_egress_up test_remote_ip + test_tun_del + test_route_del " NUM_NETIFS=6 @@ -71,7 +73,6 @@ test_span_gre_ttl() RET=0 mirror_install $swp1 ingress $tundev "matchall $tcflags" - tc qdisc add dev $h3 clsact tc filter add dev $h3 ingress pref 77 prot $prot \ flower ip_ttl 50 action pass @@ -82,7 +83,6 @@ test_span_gre_ttl() ip link set dev $tundev type $type ttl 100 tc filter del dev $h3 ingress pref 77 - tc qdisc del dev $h3 clsact mirror_uninstall $swp1 ingress log_test "$what: TTL change ($tcflags)" @@ -159,6 +159,58 @@ test_span_gre_remote_ip() log_test "$what: remote address change ($tcflags)" } +test_span_gre_tun_del() +{ + local tundev=$1; shift + local type=$1; shift + local flags=$1; shift + local local_ip=$1; shift + local remote_ip=$1; shift + local what=$1; shift + + RET=0 + + mirror_install $swp1 ingress $tundev "matchall $tcflags" + quick_test_span_gre_dir $tundev ingress + ip link del dev $tundev + fail_test_span_gre_dir $tundev ingress + + tunnel_create $tundev $type $local_ip $remote_ip \ + ttl 100 tos inherit $flags + + # Recreating the tunnel doesn't reestablish mirroring, so reinstall it + # and verify it works for the follow-up tests. + mirror_uninstall $swp1 ingress + mirror_install $swp1 ingress $tundev "matchall $tcflags" + quick_test_span_gre_dir $tundev ingress + mirror_uninstall $swp1 ingress + + log_test "$what: tunnel deleted ($tcflags)" +} + +test_span_gre_route_del() +{ + local tundev=$1; shift + local edev=$1; shift + local route=$1; shift + local what=$1; shift + + RET=0 + + mirror_install $swp1 ingress $tundev "matchall $tcflags" + quick_test_span_gre_dir $tundev ingress + + ip route del $route dev $edev + fail_test_span_gre_dir $tundev ingress + + ip route add $route dev $edev + quick_test_span_gre_dir $tundev ingress + + mirror_uninstall $swp1 ingress + + log_test "$what: underlay route removal ($tcflags)" +} + test_ttl() { test_span_gre_ttl gt4 gretap ip "mirror to gretap" @@ -183,6 +235,20 @@ test_remote_ip() test_span_gre_remote_ip gt6 ip6gretap 2001:db8:2::2 2001:db8:2::4 "mirror to ip6gretap" } +test_tun_del() +{ + test_span_gre_tun_del gt4 gretap "" \ + 192.0.2.129 192.0.2.130 "mirror to gretap" + test_span_gre_tun_del gt6 ip6gretap allow-localremote \ + 2001:db8:2::1 2001:db8:2::2 "mirror to ip6gretap" +} + +test_route_del() +{ + test_span_gre_route_del gt4 $swp3 192.0.2.128/28 "mirror to gretap" + test_span_gre_route_del gt6 $swp3 2001:db8:2::/64 "mirror to ip6gretap" +} + test_all() { slow_path_trap_install $swp1 ingress diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh index 2e54407d8954..12914f40612d 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh @@ -67,6 +67,11 @@ test_span_gre_dir_acl() test_span_gre_dir_ips "$@" 192.0.2.3 192.0.2.4 } +fail_test_span_gre_dir_acl() +{ + fail_test_span_gre_dir_ips "$@" 192.0.2.3 192.0.2.4 +} + full_test_span_gre_dir_acl() { local tundev=$1; shift @@ -83,6 +88,9 @@ full_test_span_gre_dir_acl() "$forward_type" "$backward_type" mirror_uninstall $swp1 $direction + # Test lack of mirroring after ACL mirror is uninstalled. + fail_test_span_gre_dir_acl "$tundev" "$direction" + log_test "$direction $what ($tcflags)" } diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh index 207ffd167dba..92ef6dde98bd 100644 --- a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh @@ -1,53 +1,53 @@ # SPDX-License-Identifier: GPL-2.0 -do_test_span_gre_dir_ips() +source mirror_lib.sh + +quick_test_span_gre_dir_ips() { - local expect=$1; shift local tundev=$1; shift - local direction=$1; shift - local ip1=$1; shift - local ip2=$1; shift - icmp_capture_install h3-$tundev - mirror_test v$h1 $ip1 $ip2 h3-$tundev 100 $expect - mirror_test v$h2 $ip2 $ip1 h3-$tundev 100 $expect - icmp_capture_uninstall h3-$tundev + do_test_span_dir_ips 10 h3-$tundev "$@" } -quick_test_span_gre_dir_ips() +fail_test_span_gre_dir_ips() { - do_test_span_gre_dir_ips 10 "$@" + local tundev=$1; shift + + do_test_span_dir_ips 0 h3-$tundev "$@" } -fail_test_span_gre_dir_ips() +test_span_gre_dir_ips() { - do_test_span_gre_dir_ips 0 "$@" + local tundev=$1; shift + + test_span_dir_ips h3-$tundev "$@" } -test_span_gre_dir_ips() +full_test_span_gre_dir_ips() { local tundev=$1; shift local direction=$1; shift local forward_type=$1; shift local backward_type=$1; shift + local what=$1; shift local ip1=$1; shift local ip2=$1; shift - quick_test_span_gre_dir_ips "$tundev" "$direction" "$ip1" "$ip2" + RET=0 - icmp_capture_install h3-$tundev "type $forward_type" - mirror_test v$h1 $ip1 $ip2 h3-$tundev 100 10 - icmp_capture_uninstall h3-$tundev + mirror_install $swp1 $direction $tundev "matchall $tcflags" + test_span_dir_ips "h3-$tundev" "$direction" "$forward_type" \ + "$backward_type" "$ip1" "$ip2" + mirror_uninstall $swp1 $direction - icmp_capture_install h3-$tundev "type $backward_type" - mirror_test v$h2 $ip2 $ip1 h3-$tundev 100 10 - icmp_capture_uninstall h3-$tundev + log_test "$direction $what ($tcflags)" } -full_test_span_gre_dir_ips() +full_test_span_gre_dir_vlan_ips() { local tundev=$1; shift local direction=$1; shift + local vlan_match=$1; shift local forward_type=$1; shift local backward_type=$1; shift local what=$1; shift @@ -57,8 +57,16 @@ full_test_span_gre_dir_ips() RET=0 mirror_install $swp1 $direction $tundev "matchall $tcflags" - test_span_gre_dir_ips "$tundev" "$direction" "$forward_type" \ - "$backward_type" "$ip1" "$ip2" + + test_span_dir_ips "h3-$tundev" "$direction" "$forward_type" \ + "$backward_type" "$ip1" "$ip2" + + tc filter add dev $h3 ingress pref 77 prot 802.1q \ + flower $vlan_match ip_proto 0x2f \ + action pass + mirror_test v$h1 $ip1 $ip2 $h3 77 10 + tc filter del dev $h3 ingress pref 77 + mirror_uninstall $swp1 $direction log_test "$direction $what ($tcflags)" @@ -83,3 +91,8 @@ full_test_span_gre_dir() { full_test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2 } + +full_test_span_gre_dir_vlan() +{ + full_test_span_gre_dir_vlan_ips "$@" 192.0.2.1 192.0.2.2 +} diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh index b3ceda2b4197..253419564708 100644 --- a/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh @@ -33,29 +33,11 @@ # | | # +-------------------------------------------------------------------------+ -mirror_gre_topo_h1_create() -{ - simple_if_init $h1 192.0.2.1/28 -} - -mirror_gre_topo_h1_destroy() -{ - simple_if_fini $h1 192.0.2.1/28 -} - -mirror_gre_topo_h2_create() -{ - simple_if_init $h2 192.0.2.2/28 -} - -mirror_gre_topo_h2_destroy() -{ - simple_if_fini $h2 192.0.2.2/28 -} +source mirror_topo_lib.sh mirror_gre_topo_h3_create() { - simple_if_init $h3 + mirror_topo_h3_create tunnel_create h3-gt4 gretap 192.0.2.130 192.0.2.129 ip link set h3-gt4 vrf v$h3 @@ -71,49 +53,32 @@ mirror_gre_topo_h3_destroy() tunnel_destroy h3-gt6 tunnel_destroy h3-gt4 - simple_if_fini $h3 + mirror_topo_h3_destroy } mirror_gre_topo_switch_create() { - ip link set dev $swp3 up - - ip link add name br1 type bridge vlan_filtering 1 - ip link set dev br1 up - - ip link set dev $swp1 master br1 - ip link set dev $swp1 up - - ip link set dev $swp2 master br1 - ip link set dev $swp2 up + mirror_topo_switch_create tunnel_create gt4 gretap 192.0.2.129 192.0.2.130 \ ttl 100 tos inherit tunnel_create gt6 ip6gretap 2001:db8:2::1 2001:db8:2::2 \ ttl 100 tos inherit allow-localremote - - tc qdisc add dev $swp1 clsact } mirror_gre_topo_switch_destroy() { - tc qdisc del dev $swp1 clsact - tunnel_destroy gt6 tunnel_destroy gt4 - ip link set dev $swp1 down - ip link set dev $swp2 down - ip link del dev br1 - - ip link set dev $swp3 down + mirror_topo_switch_destroy } mirror_gre_topo_create() { - mirror_gre_topo_h1_create - mirror_gre_topo_h2_create + mirror_topo_h1_create + mirror_topo_h2_create mirror_gre_topo_h3_create mirror_gre_topo_switch_create @@ -124,6 +89,6 @@ mirror_gre_topo_destroy() mirror_gre_topo_switch_destroy mirror_gre_topo_h3_destroy - mirror_gre_topo_h2_destroy - mirror_gre_topo_h1_destroy + mirror_topo_h2_destroy + mirror_topo_h1_destroy } diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh new file mode 100755 index 000000000000..88cecdb9a861 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test uses standard topology for testing gretap. See +# mirror_gre_topo_lib.sh for more details. +# +# Test for "tc action mirred egress mirror" that mirrors to a gretap netdevice +# whose underlay route points at a vlan device. + +ALL_TESTS=" + test_gretap +" + +NUM_NETIFS=6 +source lib.sh +source mirror_lib.sh +source mirror_gre_lib.sh +source mirror_gre_topo_lib.sh + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + vrf_prepare + mirror_gre_topo_create + + ip link add name $swp3.555 link $swp3 type vlan id 555 + ip address add dev $swp3.555 192.0.2.129/32 + ip address add dev $swp3.555 2001:db8:2::1/128 + ip link set dev $swp3.555 up + + ip route add 192.0.2.130/32 dev $swp3.555 + ip -6 route add 2001:db8:2::2/128 dev $swp3.555 + + ip link add name $h3.555 link $h3 type vlan id 555 + ip link set dev $h3.555 master v$h3 + ip address add dev $h3.555 192.0.2.130/28 + ip address add dev $h3.555 2001:db8:2::2/64 + ip link set dev $h3.555 up +} + +cleanup() +{ + pre_cleanup + + ip link del dev $h3.555 + ip link del dev $swp3.555 + + mirror_gre_topo_destroy + vrf_cleanup +} + +test_gretap() +{ + full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap" + full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap" +} + +test_all() +{ + slow_path_trap_install $swp1 ingress + slow_path_trap_install $swp1 egress + + tests_run + + slow_path_trap_uninstall $swp1 egress + slow_path_trap_uninstall $swp1 ingress +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tcflags="skip_hw" +test_all + +if ! tc_offload_check; then + echo "WARN: Could not test offloaded functionality" +else + tcflags="skip_sw" + test_all +fi + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh new file mode 100755 index 000000000000..01ec28ac2e4a --- /dev/null +++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh @@ -0,0 +1,140 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test uses standard topology for testing gretap. See +# mirror_gre_topo_lib.sh for more details. +# +# Test for "tc action mirred egress mirror" when the underlay route points at a +# vlan device on top of a bridge device with vlan filtering (802.1q). + +ALL_TESTS=" + test_gretap + test_ip6gretap + test_gretap_forbidden + test_ip6gretap_forbidden +" + +NUM_NETIFS=6 +source lib.sh +source mirror_lib.sh +source mirror_gre_lib.sh +source mirror_gre_topo_lib.sh + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + vrf_prepare + mirror_gre_topo_create + + vlan_create br1 555 "" 192.0.2.129/32 2001:db8:2::1/128 + bridge vlan add dev br1 vid 555 self + ip route rep 192.0.2.130/32 dev br1.555 + ip -6 route rep 2001:db8:2::2/128 dev br1.555 + + vlan_create $h3 555 v$h3 192.0.2.130/28 2001:db8:2::2/64 + + ip link set dev $swp3 master br1 + bridge vlan add dev $swp3 vid 555 +} + +cleanup() +{ + pre_cleanup + + ip link set dev $swp3 nomaster + vlan_destroy $h3 555 + vlan_destroy br1 555 + + mirror_gre_topo_destroy + vrf_cleanup +} + +test_vlan_match() +{ + local tundev=$1; shift + local vlan_match=$1; shift + local what=$1; shift + + full_test_span_gre_dir_vlan $tundev ingress "$vlan_match" 8 0 "$what" + full_test_span_gre_dir_vlan $tundev egress "$vlan_match" 0 8 "$what" +} + +test_gretap() +{ + test_vlan_match gt4 'vlan_id 555 vlan_ethtype ip' "mirror to gretap" +} + +test_ip6gretap() +{ + test_vlan_match gt6 'vlan_id 555 vlan_ethtype ipv6' "mirror to ip6gretap" +} + +test_span_gre_forbidden() +{ + local tundev=$1; shift + local what=$1; shift + + RET=0 + + # Run the pass-test first, to prime neighbor table. + mirror_install $swp1 ingress $tundev "matchall $tcflags" + quick_test_span_gre_dir $tundev ingress + + # Now forbid the VLAN at the bridge and see it fail. + bridge vlan del dev br1 vid 555 self + sleep 1 + + fail_test_span_gre_dir $tundev ingress + mirror_uninstall $swp1 ingress + + bridge vlan add dev br1 vid 555 self + sleep 1 + + log_test "$what: vlan forbidden at a bridge ($tcflags)" +} + +test_gretap_forbidden() +{ + test_span_gre_forbidden gt4 "mirror to gretap" +} + +test_ip6gretap_forbidden() +{ + test_span_gre_forbidden gt4 "mirror to ip6gretap" +} + +test_all() +{ + slow_path_trap_install $swp1 ingress + slow_path_trap_install $swp1 egress + + tests_run + + slow_path_trap_uninstall $swp1 egress + slow_path_trap_uninstall $swp1 ingress +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tcflags="skip_hw" +test_all + +if ! tc_offload_check; then + echo "WARN: Could not test offloaded functionality" +else + tcflags="skip_sw" + test_all +fi + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh index e5028a5725e3..04cbc38e71a2 100644 --- a/tools/testing/selftests/net/forwarding/mirror_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh @@ -38,3 +38,57 @@ mirror_test() ((expect <= delta && delta <= expect + 2)) check_err $? "Expected to capture $expect packets, got $delta." } + +do_test_span_dir_ips() +{ + local expect=$1; shift + local dev=$1; shift + local direction=$1; shift + local ip1=$1; shift + local ip2=$1; shift + + icmp_capture_install $dev + mirror_test v$h1 $ip1 $ip2 $dev 100 $expect + mirror_test v$h2 $ip2 $ip1 $dev 100 $expect + icmp_capture_uninstall $dev +} + +quick_test_span_dir_ips() +{ + do_test_span_dir_ips 10 "$@" +} + +fail_test_span_dir_ips() +{ + do_test_span_dir_ips 0 "$@" +} + +test_span_dir_ips() +{ + local dev=$1; shift + local direction=$1; shift + local forward_type=$1; shift + local backward_type=$1; shift + local ip1=$1; shift + local ip2=$1; shift + + quick_test_span_dir_ips "$dev" "$direction" "$ip1" "$ip2" + + icmp_capture_install $dev "type $forward_type" + mirror_test v$h1 $ip1 $ip2 $dev 100 10 + icmp_capture_uninstall $dev + + icmp_capture_install $dev "type $backward_type" + mirror_test v$h2 $ip2 $ip1 $dev 100 10 + icmp_capture_uninstall $dev +} + +fail_test_span_dir() +{ + fail_test_span_dir_ips "$@" 192.0.2.1 192.0.2.2 +} + +test_span_dir() +{ + test_span_dir_ips "$@" 192.0.2.1 192.0.2.2 +} diff --git a/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh b/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh new file mode 100644 index 000000000000..04979e5962e7 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: GPL-2.0 + +# This is the standard topology for testing mirroring. The tests that use it +# tweak it in one way or another--typically add more devices to the topology. +# +# +---------------------+ +---------------------+ +# | H1 | | H2 | +# | + $h1 | | $h2 + | +# | | 192.0.2.1/28 | | 192.0.2.2/28 | | +# +-----|---------------+ +---------------|-----+ +# | | +# +-----|-------------------------------------------------------------|-----+ +# | SW o--> mirror | | +# | +---|-------------------------------------------------------------|---+ | +# | | + $swp1 BR $swp2 + | | +# | +---------------------------------------------------------------------+ | +# | | +# | + $swp3 | +# +-----|-------------------------------------------------------------------+ +# | +# +-----|-------------------------------------------------------------------+ +# | H3 + $h3 | +# | | +# +-------------------------------------------------------------------------+ + +mirror_topo_h1_create() +{ + simple_if_init $h1 192.0.2.1/28 +} + +mirror_topo_h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/28 +} + +mirror_topo_h2_create() +{ + simple_if_init $h2 192.0.2.2/28 +} + +mirror_topo_h2_destroy() +{ + simple_if_fini $h2 192.0.2.2/28 +} + +mirror_topo_h3_create() +{ + simple_if_init $h3 + tc qdisc add dev $h3 clsact +} + +mirror_topo_h3_destroy() +{ + tc qdisc del dev $h3 clsact + simple_if_fini $h3 +} + +mirror_topo_switch_create() +{ + ip link set dev $swp3 up + + ip link add name br1 type bridge vlan_filtering 1 + ip link set dev br1 up + + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + + ip link set dev $swp2 master br1 + ip link set dev $swp2 up + + tc qdisc add dev $swp1 clsact +} + +mirror_topo_switch_destroy() +{ + tc qdisc del dev $swp1 clsact + + ip link set dev $swp1 down + ip link set dev $swp2 down + ip link del dev br1 + + ip link set dev $swp3 down +} + +mirror_topo_create() +{ + mirror_topo_h1_create + mirror_topo_h2_create + mirror_topo_h3_create + + mirror_topo_switch_create +} + +mirror_topo_destroy() +{ + mirror_topo_switch_destroy + + mirror_topo_h3_destroy + mirror_topo_h2_destroy + mirror_topo_h1_destroy +} diff --git a/tools/testing/selftests/net/forwarding/mirror_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_vlan.sh new file mode 100755 index 000000000000..1e10520dccf4 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/mirror_vlan.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test uses standard topology for testing mirroring. See mirror_topo_lib.sh +# for more details. +# +# Test for "tc action mirred egress mirror" that mirrors to a vlan device. + +ALL_TESTS=" + test_vlan + test_tagged_vlan +" + +NUM_NETIFS=6 +source lib.sh +source mirror_lib.sh +source mirror_topo_lib.sh + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + vrf_prepare + mirror_topo_create + + vlan_create $swp3 555 + + vlan_create $h3 555 v$h3 + matchall_sink_create $h3.555 + + vlan_create $h1 111 v$h1 192.0.2.17/28 + bridge vlan add dev $swp1 vid 111 + + vlan_create $h2 111 v$h2 192.0.2.18/28 + bridge vlan add dev $swp2 vid 111 +} + +cleanup() +{ + pre_cleanup + + vlan_destroy $h2 111 + vlan_destroy $h1 111 + vlan_destroy $h3 555 + vlan_destroy $swp3 555 + + mirror_topo_destroy + vrf_cleanup +} + +test_vlan_dir() +{ + local direction=$1; shift + local forward_type=$1; shift + local backward_type=$1; shift + + RET=0 + + mirror_install $swp1 $direction $swp3.555 "matchall $tcflags" + test_span_dir "$h3.555" "$direction" "$forward_type" "$backward_type" + mirror_uninstall $swp1 $direction + + log_test "$direction mirror to vlan ($tcflags)" +} + +test_vlan() +{ + test_vlan_dir ingress 8 0 + test_vlan_dir egress 0 8 +} + +vlan_capture_add_del() +{ + local add_del=$1; shift + local pref=$1; shift + local dev=$1; shift + local filter=$1; shift + + tc filter $add_del dev "$dev" ingress \ + proto 802.1q pref $pref \ + flower $filter \ + action pass +} + +vlan_capture_install() +{ + vlan_capture_add_del add 100 "$@" +} + +vlan_capture_uninstall() +{ + vlan_capture_add_del del 100 "$@" +} + +do_test_span_vlan_dir_ips() +{ + local expect=$1; shift + local dev=$1; shift + local vid=$1; shift + local direction=$1; shift + local ip1=$1; shift + local ip2=$1; shift + + vlan_capture_install $dev "vlan_id $vid" + mirror_test v$h1 $ip1 $ip2 $dev 100 $expect + mirror_test v$h2 $ip2 $ip1 $dev 100 $expect + vlan_capture_uninstall $dev +} + +test_tagged_vlan_dir() +{ + local direction=$1; shift + local forward_type=$1; shift + local backward_type=$1; shift + + RET=0 + + mirror_install $swp1 $direction $swp3.555 "matchall $tcflags" + do_test_span_vlan_dir_ips 10 "$h3.555" 111 "$direction" \ + 192.0.2.17 192.0.2.18 + do_test_span_vlan_dir_ips 0 "$h3.555" 555 "$direction" \ + 192.0.2.17 192.0.2.18 + mirror_uninstall $swp1 $direction + + log_test "$direction mirror to vlan ($tcflags)" +} + +test_tagged_vlan() +{ + test_tagged_vlan_dir ingress 8 0 + test_tagged_vlan_dir egress 0 8 +} + +test_all() +{ + slow_path_trap_install $swp1 ingress + slow_path_trap_install $swp1 egress + trap_install $h3 ingress + + tests_run + + trap_install $h3 ingress + slow_path_trap_uninstall $swp1 egress + slow_path_trap_uninstall $swp1 ingress +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tcflags="skip_hw" +test_all + +if ! tc_offload_check; then + echo "WARN: Could not test offloaded functionality" +else + tcflags="skip_sw" + test_all +fi + +exit $EXIT_STATUS |