diff options
45 files changed, 3164 insertions, 973 deletions
diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index b89bc82eed46..16a924c486bf 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -27,6 +27,8 @@ Contents of this document: (*) AF_RXRPC kernel interface. + (*) Configurable parameters. + ======== OVERVIEW @@ -864,3 +866,82 @@ The kernel interface functions are as follows: This is used to allocate a null RxRPC key that can be used to indicate anonymous security for a particular domain. + + +======================= +CONFIGURABLE PARAMETERS +======================= + +The RxRPC protocol driver has a number of configurable parameters that can be +adjusted through sysctls in /proc/net/rxrpc/: + + (*) req_ack_delay + + The amount of time in milliseconds after receiving a packet with the + request-ack flag set before we honour the flag and actually send the + requested ack. + + Usually the other side won't stop sending packets until the advertised + reception window is full (to a maximum of 255 packets), so delaying the + ACK permits several packets to be ACK'd in one go. + + (*) soft_ack_delay + + The amount of time in milliseconds after receiving a new packet before we + generate a soft-ACK to tell the sender that it doesn't need to resend. + + (*) idle_ack_delay + + The amount of time in milliseconds after all the packets currently in the + received queue have been consumed before we generate a hard-ACK to tell + the sender it can free its buffers, assuming no other reason occurs that + we would send an ACK. + + (*) resend_timeout + + The amount of time in milliseconds after transmitting a packet before we + transmit it again, assuming no ACK is received from the receiver telling + us they got it. + + (*) max_call_lifetime + + The maximum amount of time in seconds that a call may be in progress + before we preemptively kill it. + + (*) dead_call_expiry + + The amount of time in seconds before we remove a dead call from the call + list. 
Dead calls are kept around for a little while for the purpose of + repeating ACK and ABORT packets. + + (*) connection_expiry + + The amount of time in seconds after a connection was last used before we + remove it from the connection list. Whilst a connection is in existence, + it serves as a placeholder for negotiated security; when it is deleted, + the security must be renegotiated. + + (*) transport_expiry + + The amount of time in seconds after a transport was last used before we + remove it from the transport list. Whilst a transport is in existence, it + serves to anchor the peer data and keeps the connection ID counter. + + (*) rxrpc_rx_window_size + + The size of the receive window in packets. This is the maximum number of + unconsumed received packets we're willing to hold in memory for any + particular call. + + (*) rxrpc_rx_mtu + + The maximum packet MTU size that we're willing to receive in bytes. This + indicates to the peer whether we're willing to accept jumbo packets. + + (*) rxrpc_rx_jumbo_max + + The maximum number of packets that we're willing to accept in a jumbo + packet. Non-terminal packets in a jumbo packet must contain a four byte + header plus exactly 1412 bytes of data. The terminal packet must contain + a four byte header plus any amount of data. In any event, a jumbo packet + may not exceed rxrpc_rx_mtu in size. 
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index fc6d25e7d053..22800bde9752 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -21,55 +21,55 @@ #include "bonding.h" static int bond_option_active_slave_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_miimon_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_updelay_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_downdelay_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_use_carrier_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_arp_interval_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_arp_ip_target_add(struct bonding *bond, __be32 target); static int bond_option_arp_ip_target_rem(struct bonding *bond, __be32 target); static int bond_option_arp_ip_targets_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_arp_validate_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_arp_all_targets_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_primary_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_primary_reselect_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_fail_over_mac_set(struct bonding *bond, - struct bond_opt_value *newval); + const 
struct bond_opt_value *newval); static int bond_option_xmit_hash_policy_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_resend_igmp_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_num_peer_notif_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_all_slaves_active_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_min_links_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_lp_interval_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_pps_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_lacp_rate_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_ad_select_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_queue_id_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_mode_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static int bond_option_slaves_set(struct bonding *bond, - struct bond_opt_value *newval); + const struct bond_opt_value *newval); static const struct bond_opt_value bond_mode_tbl[] = { @@ -504,7 +504,7 @@ static int bond_opt_check_deps(struct bonding *bond, static void bond_opt_dep_print(struct bonding *bond, const struct bond_option *opt) { - struct bond_opt_value *modeval; + const struct bond_opt_value *modeval; struct bond_params *params; params = &bond->params; @@ -517,9 +517,9 @@ 
static void bond_opt_dep_print(struct bonding *bond, static void bond_opt_error_interpret(struct bonding *bond, const struct bond_option *opt, - int error, struct bond_opt_value *val) + int error, const struct bond_opt_value *val) { - struct bond_opt_value *minval, *maxval; + const struct bond_opt_value *minval, *maxval; char *p; switch (error) { @@ -574,7 +574,7 @@ static void bond_opt_error_interpret(struct bonding *bond, int __bond_opt_set(struct bonding *bond, unsigned int option, struct bond_opt_value *val) { - struct bond_opt_value *retval = NULL; + const struct bond_opt_value *retval = NULL; const struct bond_option *opt; int ret = -ENOENT; @@ -637,7 +637,7 @@ const struct bond_option *bond_opt_get(unsigned int option) return &bond_opts[option]; } -int bond_option_mode_set(struct bonding *bond, struct bond_opt_value *newval) +int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval) { if (BOND_NO_USES_ARP(newval->value) && bond->params.arp_interval) { pr_info("%s: %s mode is incompatible with arp monitoring, start mii monitoring\n", @@ -676,7 +676,7 @@ struct net_device *bond_option_active_slave_get(struct bonding *bond) } static int bond_option_active_slave_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { char ifname[IFNAMSIZ] = { 0, }; struct net_device *slave_dev; @@ -745,7 +745,7 @@ static int bond_option_active_slave_set(struct bonding *bond, } static int bond_option_miimon_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting MII monitoring interval to %llu\n", bond->dev->name, newval->value); @@ -783,7 +783,7 @@ static int bond_option_miimon_set(struct bonding *bond, } static int bond_option_updelay_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { int value = newval->value; @@ -807,7 +807,7 @@ static int bond_option_updelay_set(struct bonding *bond, } static 
int bond_option_downdelay_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { int value = newval->value; @@ -831,7 +831,7 @@ static int bond_option_downdelay_set(struct bonding *bond, } static int bond_option_use_carrier_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting use_carrier to %llu\n", bond->dev->name, newval->value); @@ -841,7 +841,7 @@ static int bond_option_use_carrier_set(struct bonding *bond, } static int bond_option_arp_interval_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting ARP monitoring interval to %llu\n", bond->dev->name, newval->value); @@ -991,7 +991,7 @@ void bond_option_arp_ip_targets_clear(struct bonding *bond) } static int bond_option_arp_ip_targets_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { int ret = -EPERM; __be32 target; @@ -1018,7 +1018,7 @@ static int bond_option_arp_ip_targets_set(struct bonding *bond, } static int bond_option_arp_validate_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting arp_validate to %s (%llu)\n", bond->dev->name, newval->string, newval->value); @@ -1035,7 +1035,7 @@ static int bond_option_arp_validate_set(struct bonding *bond, } static int bond_option_arp_all_targets_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting arp_all_targets to %s (%llu)\n", bond->dev->name, newval->string, newval->value); @@ -1045,7 +1045,7 @@ static int bond_option_arp_all_targets_set(struct bonding *bond, } static int bond_option_primary_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { char *p, *primary = newval->string; struct list_head *iter; @@ -1098,7 +1098,7 @@ out: } static int 
bond_option_primary_reselect_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting primary_reselect to %s (%llu)\n", bond->dev->name, newval->string, newval->value); @@ -1114,7 +1114,7 @@ static int bond_option_primary_reselect_set(struct bonding *bond, } static int bond_option_fail_over_mac_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting fail_over_mac to %s (%llu)\n", bond->dev->name, newval->string, newval->value); @@ -1124,7 +1124,7 @@ static int bond_option_fail_over_mac_set(struct bonding *bond, } static int bond_option_xmit_hash_policy_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting xmit hash policy to %s (%llu)\n", bond->dev->name, newval->string, newval->value); @@ -1134,7 +1134,7 @@ static int bond_option_xmit_hash_policy_set(struct bonding *bond, } static int bond_option_resend_igmp_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting resend_igmp to %llu\n", bond->dev->name, newval->value); @@ -1144,7 +1144,7 @@ static int bond_option_resend_igmp_set(struct bonding *bond, } static int bond_option_num_peer_notif_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { bond->params.num_peer_notif = newval->value; @@ -1152,7 +1152,7 @@ static int bond_option_num_peer_notif_set(struct bonding *bond, } static int bond_option_all_slaves_active_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { struct list_head *iter; struct slave *slave; @@ -1173,7 +1173,7 @@ static int bond_option_all_slaves_active_set(struct bonding *bond, } static int bond_option_min_links_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting min links value 
to %llu\n", bond->dev->name, newval->value); @@ -1183,7 +1183,7 @@ static int bond_option_min_links_set(struct bonding *bond, } static int bond_option_lp_interval_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { bond->params.lp_interval = newval->value; @@ -1191,7 +1191,7 @@ static int bond_option_lp_interval_set(struct bonding *bond, } static int bond_option_pps_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { bond->params.packets_per_slave = newval->value; if (newval->value > 0) { @@ -1209,7 +1209,7 @@ static int bond_option_pps_set(struct bonding *bond, } static int bond_option_lacp_rate_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting LACP rate to %s (%llu)\n", bond->dev->name, newval->string, newval->value); @@ -1220,7 +1220,7 @@ static int bond_option_lacp_rate_set(struct bonding *bond, } static int bond_option_ad_select_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { pr_info("%s: Setting ad_select to %s (%llu)\n", bond->dev->name, newval->string, newval->value); @@ -1230,7 +1230,7 @@ static int bond_option_ad_select_set(struct bonding *bond, } static int bond_option_queue_id_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { struct slave *slave, *update_slave; struct net_device *sdev; @@ -1291,7 +1291,7 @@ err_no_cmd: } static int bond_option_slaves_set(struct bonding *bond, - struct bond_opt_value *newval) + const struct bond_opt_value *newval) { char command[IFNAMSIZ + 1] = { 0, }; struct net_device *dev; diff --git a/drivers/net/bonding/bond_options.h b/drivers/net/bonding/bond_options.h index 6c5ba0ffc31c..12be9e1bfb0c 100644 --- a/drivers/net/bonding/bond_options.h +++ b/drivers/net/bonding/bond_options.h @@ -94,14 +94,15 @@ struct bond_option { */ const struct bond_opt_value 
*values; - int (*set)(struct bonding *bond, struct bond_opt_value *val); + int (*set)(struct bonding *bond, const struct bond_opt_value *val); }; int __bond_opt_set(struct bonding *bond, unsigned int option, struct bond_opt_value *val); int bond_opt_tryset_rtnl(struct bonding *bond, unsigned int option, char *buf); + const struct bond_opt_value *bond_opt_parse(const struct bond_option *opt, - struct bond_opt_value *val); + struct bond_opt_value *val); const struct bond_option *bond_opt_get(unsigned int option); const struct bond_opt_value *bond_opt_get_val(unsigned int option, u64 val); diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 588cf39d832c..013fdd0f45e9 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -65,7 +65,7 @@ static void bond_info_seq_stop(struct seq_file *seq, void *v) static void bond_info_show_master(struct seq_file *seq) { struct bonding *bond = seq->private; - struct bond_opt_value *optval; + const struct bond_opt_value *optval; struct slave *curr; int i; diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index cf09d8faca84..66759b6ce373 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -802,16 +802,18 @@ be_self_test(struct net_device *netdev, struct ethtool_test *test, u64 *data) if (test->flags & ETH_TEST_FL_OFFLINE) { if (be_loopback_test(adapter, BE_MAC_LOOPBACK, - &data[0]) != 0) { + &data[0]) != 0) test->flags |= ETH_TEST_FL_FAILED; - } + if (be_loopback_test(adapter, BE_PHY_LOOPBACK, - &data[1]) != 0) { - test->flags |= ETH_TEST_FL_FAILED; - } - if (be_loopback_test(adapter, BE_ONE_PORT_EXT_LOOPBACK, - &data[2]) != 0) { + &data[1]) != 0) test->flags |= ETH_TEST_FL_FAILED; + + if (test->flags & ETH_TEST_FL_EXTERNAL_LB) { + if (be_loopback_test(adapter, BE_ONE_PORT_EXT_LOOPBACK, + &data[2]) != 0) + test->flags |= ETH_TEST_FL_FAILED; + 
test->flags |= ETH_TEST_FL_EXTERNAL_LB_DONE; } } diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 72dae4d97b43..838b69b74edf 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -86,12 +86,12 @@ #define I40E_NVM_VERSION_LO_SHIFT 0 #define I40E_NVM_VERSION_LO_MASK (0xff << I40E_NVM_VERSION_LO_SHIFT) -#define I40E_NVM_VERSION_HI_SHIFT 8 -#define I40E_NVM_VERSION_HI_MASK (0xff << I40E_NVM_VERSION_HI_SHIFT) +#define I40E_NVM_VERSION_HI_SHIFT 12 +#define I40E_NVM_VERSION_HI_MASK (0xf << I40E_NVM_VERSION_HI_SHIFT) /* The values in here are decimal coded as hex as is the case in the NVM map*/ #define I40E_CURRENT_NVM_VERSION_HI 0x2 -#define I40E_CURRENT_NVM_VERSION_LO 0x30 +#define I40E_CURRENT_NVM_VERSION_LO 0x40 /* magic for getting defines into strings */ #define STRINGIFY(foo) #foo @@ -152,8 +152,18 @@ struct i40e_lump_tracking { }; #define I40E_DEFAULT_ATR_SAMPLE_RATE 20 -#define I40E_FDIR_MAX_RAW_PACKET_LOOKUP 512 -struct i40e_fdir_data { +#define I40E_FDIR_MAX_RAW_PACKET_SIZE 512 +struct i40e_fdir_filter { + struct hlist_node fdir_node; + /* filter input set */ + u8 flow_type; + u8 ip4_proto; + __be32 dst_ip[4]; + __be32 src_ip[4]; + __be16 src_port; + __be16 dst_port; + __be32 sctp_v_tag; + /* filter control */ u16 q_index; u8 flex_off; u8 pctype; @@ -162,7 +172,6 @@ struct i40e_fdir_data { u8 fd_status; u16 cnt_index; u32 fd_id; - u8 *raw_packet; }; #define I40E_ETH_P_LLDP 0x88cc @@ -210,6 +219,9 @@ struct i40e_pf { u8 atr_sample_rate; bool wol_en; + struct hlist_head fdir_filter_list; + u16 fdir_pf_active_filters; + #ifdef CONFIG_I40E_VXLAN __be16 vxlan_ports[I40E_MAX_PF_UDP_OFFLOAD_PORTS]; u16 pending_vxlan_bitmap; @@ -477,10 +489,10 @@ static inline char *i40e_fw_version_str(struct i40e_hw *hw) "f%d.%d a%d.%d n%02x.%02x e%08x", hw->aq.fw_maj_ver, hw->aq.fw_min_ver, hw->aq.api_maj_ver, hw->aq.api_min_ver, - (hw->nvm.version & I40E_NVM_VERSION_HI_MASK) - >> 
I40E_NVM_VERSION_HI_SHIFT, - (hw->nvm.version & I40E_NVM_VERSION_LO_MASK) - >> I40E_NVM_VERSION_LO_SHIFT, + (hw->nvm.version & I40E_NVM_VERSION_HI_MASK) >> + I40E_NVM_VERSION_HI_SHIFT, + (hw->nvm.version & I40E_NVM_VERSION_LO_MASK) >> + I40E_NVM_VERSION_LO_SHIFT, hw->nvm.eetrack); return buf; @@ -534,9 +546,10 @@ struct rtnl_link_stats64 *i40e_get_vsi_stats_struct(struct i40e_vsi *vsi); int i40e_fetch_switch_configuration(struct i40e_pf *pf, bool printconfig); -int i40e_program_fdir_filter(struct i40e_fdir_data *fdir_data, +int i40e_program_fdir_filter(struct i40e_fdir_filter *fdir_data, u8 *raw_packet, struct i40e_pf *pf, bool add); - +int i40e_add_del_fdir(struct i40e_vsi *vsi, + struct i40e_fdir_filter *input, bool add); void i40e_set_ethtool_ops(struct net_device *netdev); struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi, u8 *macaddr, s16 vlan, diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index e7f38b57834d..bb948dd92474 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -162,6 +162,372 @@ i40e_status i40e_aq_queue_shutdown(struct i40e_hw *hw, return status; } +/* The i40e_ptype_lookup table is used to convert from the 8-bit ptype in the + * hardware to a bit-field that can be used by SW to more easily determine the + * packet type. + * + * Macros are used to shorten the table lines and make this table human + * readable. + * + * We store the PTYPE in the top byte of the bit field - this is just so that + * we can check that the table doesn't have a row missing, as the index into + * the table should be the PTYPE. 
+ * + * Typical work flow: + * + * IF NOT i40e_ptype_lookup[ptype].known + * THEN + * Packet is unknown + * ELSE IF i40e_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP + * Use the rest of the fields to look at the tunnels, inner protocols, etc + * ELSE + * Use the enum i40e_rx_l2_ptype to decode the packet type + * ENDIF + */ + +/* macro to make the table lines short */ +#define I40E_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ + { PTYPE, \ + 1, \ + I40E_RX_PTYPE_OUTER_##OUTER_IP, \ + I40E_RX_PTYPE_OUTER_##OUTER_IP_VER, \ + I40E_RX_PTYPE_##OUTER_FRAG, \ + I40E_RX_PTYPE_TUNNEL_##T, \ + I40E_RX_PTYPE_TUNNEL_END_##TE, \ + I40E_RX_PTYPE_##TEF, \ + I40E_RX_PTYPE_INNER_PROT_##I, \ + I40E_RX_PTYPE_PAYLOAD_LAYER_##PL } + +#define I40E_PTT_UNUSED_ENTRY(PTYPE) \ + { PTYPE, 0, 0, 0, 0, 0, 0, 0, 0, 0 } + +/* shorter macros make the table fit but are terse */ +#define I40E_RX_PTYPE_NOF I40E_RX_PTYPE_NOT_FRAG +#define I40E_RX_PTYPE_FRG I40E_RX_PTYPE_FRAG +#define I40E_RX_PTYPE_INNER_PROT_TS I40E_RX_PTYPE_INNER_PROT_TIMESYNC + +/* Lookup table mapping the HW PTYPE to the bit field for decoding */ +struct i40e_rx_ptype_decoded i40e_ptype_lookup[] = { + /* L2 Packet types */ + I40E_PTT_UNUSED_ENTRY(0), + I40E_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, TS, PAY2), + I40E_PTT(3, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT_UNUSED_ENTRY(4), + I40E_PTT_UNUSED_ENTRY(5), + I40E_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT_UNUSED_ENTRY(8), + I40E_PTT_UNUSED_ENTRY(9), + I40E_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + I40E_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, 
PAY3), + I40E_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + + /* Non Tunneled IPv4 */ + I40E_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(25), + I40E_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), + I40E_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), + I40E_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv4 --> IPv4 */ + I40E_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + I40E_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + I40E_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(32), + I40E_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + I40E_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + I40E_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> IPv6 */ + I40E_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + I40E_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + I40E_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(39), + I40E_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + I40E_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + I40E_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT */ + I40E_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> IPv4 */ + I40E_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + I40E_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + I40E_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(47), + I40E_PTT(48, IP, IPV4, NOF, 
IP_GRENAT, IPV4, NOF, TCP, PAY4), + I40E_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + I40E_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> IPv6 */ + I40E_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + I40E_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + I40E_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(54), + I40E_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + I40E_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + I40E_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC */ + I40E_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ + I40E_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + I40E_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + I40E_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(62), + I40E_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + I40E_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + I40E_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ + I40E_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + I40E_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + I40E_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(69), + I40E_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + I40E_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + I40E_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC/VLAN */ + I40E_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ + I40E_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + I40E_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, 
PAY3), + I40E_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(77), + I40E_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + I40E_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + I40E_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ + I40E_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + I40E_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + I40E_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(84), + I40E_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + I40E_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + I40E_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* Non Tunneled IPv6 */ + I40E_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY3), + I40E_PTT_UNUSED_ENTRY(91), + I40E_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), + I40E_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), + I40E_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv6 --> IPv4 */ + I40E_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + I40E_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + I40E_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(98), + I40E_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + I40E_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + I40E_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> IPv6 */ + I40E_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + I40E_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + I40E_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(105), + I40E_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + I40E_PTT(107, 
IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + I40E_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT */ + I40E_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> IPv4 */ + I40E_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + I40E_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + I40E_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(113), + I40E_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), + I40E_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + I40E_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> IPv6 */ + I40E_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + I40E_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + I40E_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(120), + I40E_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + I40E_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + I40E_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC */ + I40E_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ + I40E_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + I40E_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + I40E_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(128), + I40E_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + I40E_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + I40E_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ + I40E_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + I40E_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + I40E_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(135), + 
I40E_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + I40E_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + I40E_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN */ + I40E_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ + I40E_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + I40E_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), + I40E_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(143), + I40E_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + I40E_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + I40E_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ + I40E_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + I40E_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + I40E_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(150), + I40E_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + I40E_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + I40E_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* unused entries */ + I40E_PTT_UNUSED_ENTRY(154), + I40E_PTT_UNUSED_ENTRY(155), + I40E_PTT_UNUSED_ENTRY(156), + I40E_PTT_UNUSED_ENTRY(157), + I40E_PTT_UNUSED_ENTRY(158), + I40E_PTT_UNUSED_ENTRY(159), + + I40E_PTT_UNUSED_ENTRY(160), + I40E_PTT_UNUSED_ENTRY(161), + I40E_PTT_UNUSED_ENTRY(162), + I40E_PTT_UNUSED_ENTRY(163), + I40E_PTT_UNUSED_ENTRY(164), + I40E_PTT_UNUSED_ENTRY(165), + I40E_PTT_UNUSED_ENTRY(166), + I40E_PTT_UNUSED_ENTRY(167), + I40E_PTT_UNUSED_ENTRY(168), + I40E_PTT_UNUSED_ENTRY(169), + + I40E_PTT_UNUSED_ENTRY(170), + I40E_PTT_UNUSED_ENTRY(171), + I40E_PTT_UNUSED_ENTRY(172), + 
I40E_PTT_UNUSED_ENTRY(173), + I40E_PTT_UNUSED_ENTRY(174), + I40E_PTT_UNUSED_ENTRY(175), + I40E_PTT_UNUSED_ENTRY(176), + I40E_PTT_UNUSED_ENTRY(177), + I40E_PTT_UNUSED_ENTRY(178), + I40E_PTT_UNUSED_ENTRY(179), + + I40E_PTT_UNUSED_ENTRY(180), + I40E_PTT_UNUSED_ENTRY(181), + I40E_PTT_UNUSED_ENTRY(182), + I40E_PTT_UNUSED_ENTRY(183), + I40E_PTT_UNUSED_ENTRY(184), + I40E_PTT_UNUSED_ENTRY(185), + I40E_PTT_UNUSED_ENTRY(186), + I40E_PTT_UNUSED_ENTRY(187), + I40E_PTT_UNUSED_ENTRY(188), + I40E_PTT_UNUSED_ENTRY(189), + + I40E_PTT_UNUSED_ENTRY(190), + I40E_PTT_UNUSED_ENTRY(191), + I40E_PTT_UNUSED_ENTRY(192), + I40E_PTT_UNUSED_ENTRY(193), + I40E_PTT_UNUSED_ENTRY(194), + I40E_PTT_UNUSED_ENTRY(195), + I40E_PTT_UNUSED_ENTRY(196), + I40E_PTT_UNUSED_ENTRY(197), + I40E_PTT_UNUSED_ENTRY(198), + I40E_PTT_UNUSED_ENTRY(199), + + I40E_PTT_UNUSED_ENTRY(200), + I40E_PTT_UNUSED_ENTRY(201), + I40E_PTT_UNUSED_ENTRY(202), + I40E_PTT_UNUSED_ENTRY(203), + I40E_PTT_UNUSED_ENTRY(204), + I40E_PTT_UNUSED_ENTRY(205), + I40E_PTT_UNUSED_ENTRY(206), + I40E_PTT_UNUSED_ENTRY(207), + I40E_PTT_UNUSED_ENTRY(208), + I40E_PTT_UNUSED_ENTRY(209), + + I40E_PTT_UNUSED_ENTRY(210), + I40E_PTT_UNUSED_ENTRY(211), + I40E_PTT_UNUSED_ENTRY(212), + I40E_PTT_UNUSED_ENTRY(213), + I40E_PTT_UNUSED_ENTRY(214), + I40E_PTT_UNUSED_ENTRY(215), + I40E_PTT_UNUSED_ENTRY(216), + I40E_PTT_UNUSED_ENTRY(217), + I40E_PTT_UNUSED_ENTRY(218), + I40E_PTT_UNUSED_ENTRY(219), + + I40E_PTT_UNUSED_ENTRY(220), + I40E_PTT_UNUSED_ENTRY(221), + I40E_PTT_UNUSED_ENTRY(222), + I40E_PTT_UNUSED_ENTRY(223), + I40E_PTT_UNUSED_ENTRY(224), + I40E_PTT_UNUSED_ENTRY(225), + I40E_PTT_UNUSED_ENTRY(226), + I40E_PTT_UNUSED_ENTRY(227), + I40E_PTT_UNUSED_ENTRY(228), + I40E_PTT_UNUSED_ENTRY(229), + + I40E_PTT_UNUSED_ENTRY(230), + I40E_PTT_UNUSED_ENTRY(231), + I40E_PTT_UNUSED_ENTRY(232), + I40E_PTT_UNUSED_ENTRY(233), + I40E_PTT_UNUSED_ENTRY(234), + I40E_PTT_UNUSED_ENTRY(235), + I40E_PTT_UNUSED_ENTRY(236), + I40E_PTT_UNUSED_ENTRY(237), + I40E_PTT_UNUSED_ENTRY(238), + 
I40E_PTT_UNUSED_ENTRY(239), + + I40E_PTT_UNUSED_ENTRY(240), + I40E_PTT_UNUSED_ENTRY(241), + I40E_PTT_UNUSED_ENTRY(242), + I40E_PTT_UNUSED_ENTRY(243), + I40E_PTT_UNUSED_ENTRY(244), + I40E_PTT_UNUSED_ENTRY(245), + I40E_PTT_UNUSED_ENTRY(246), + I40E_PTT_UNUSED_ENTRY(247), + I40E_PTT_UNUSED_ENTRY(248), + I40E_PTT_UNUSED_ENTRY(249), + + I40E_PTT_UNUSED_ENTRY(250), + I40E_PTT_UNUSED_ENTRY(251), + I40E_PTT_UNUSED_ENTRY(252), + I40E_PTT_UNUSED_ENTRY(253), + I40E_PTT_UNUSED_ENTRY(254), + I40E_PTT_UNUSED_ENTRY(255) +}; + + /** * i40e_init_shared_code - Initialize the shared code * @hw: pointer to hardware structure diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c index 50730141bb7b..036570d76176 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c @@ -332,6 +332,7 @@ i40e_status i40e_lldp_to_dcb_config(u8 *lldpmib, u16 type; u16 length; u16 typelength; + u16 offset = 0; if (!lldpmib || !dcbcfg) return I40E_ERR_PARAM; @@ -339,15 +340,17 @@ i40e_status i40e_lldp_to_dcb_config(u8 *lldpmib, /* set to the start of LLDPDU */ lldpmib += ETH_HLEN; tlv = (struct i40e_lldp_org_tlv *)lldpmib; - while (tlv) { + while (1) { typelength = ntohs(tlv->typelength); type = (u16)((typelength & I40E_LLDP_TLV_TYPE_MASK) >> I40E_LLDP_TLV_TYPE_SHIFT); length = (u16)((typelength & I40E_LLDP_TLV_LEN_MASK) >> I40E_LLDP_TLV_LEN_SHIFT); + offset += sizeof(typelength) + length; - if (type == I40E_TLV_TYPE_END) - break;/* END TLV break out */ + /* END TLV or beyond LLDPDU size */ + if ((type == I40E_TLV_TYPE_END) || (offset > I40E_LLDPDU_SIZE)) + break; switch (type) { case I40E_TLV_TYPE_ORG: diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index da22c3fa2c00..57fc86496f30 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -1663,21 +1663,22 @@ static ssize_t 
i40e_dbg_command_write(struct file *filp, desc = NULL; } else if ((strncmp(cmd_buf, "add fd_filter", 13) == 0) || (strncmp(cmd_buf, "rem fd_filter", 13) == 0)) { - struct i40e_fdir_data fd_data; + struct i40e_fdir_filter fd_data; u16 packet_len, i, j = 0; char *asc_packet; + u8 *raw_packet; bool add = false; int ret; - asc_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_LOOKUP, + asc_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL); if (!asc_packet) goto command_write_done; - fd_data.raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_LOOKUP, - GFP_KERNEL); + raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, + GFP_KERNEL); - if (!fd_data.raw_packet) { + if (!raw_packet) { kfree(asc_packet); asc_packet = NULL; goto command_write_done; @@ -1698,36 +1699,36 @@ static ssize_t i40e_dbg_command_write(struct file *filp, cnt); kfree(asc_packet); asc_packet = NULL; - kfree(fd_data.raw_packet); + kfree(raw_packet); goto command_write_done; } /* fix packet length if user entered 0 */ if (packet_len == 0) - packet_len = I40E_FDIR_MAX_RAW_PACKET_LOOKUP; + packet_len = I40E_FDIR_MAX_RAW_PACKET_SIZE; /* make sure to check the max as well */ packet_len = min_t(u16, - packet_len, I40E_FDIR_MAX_RAW_PACKET_LOOKUP); + packet_len, I40E_FDIR_MAX_RAW_PACKET_SIZE); for (i = 0; i < packet_len; i++) { sscanf(&asc_packet[j], "%2hhx ", - &fd_data.raw_packet[i]); + &raw_packet[i]); j += 3; } dev_info(&pf->pdev->dev, "FD raw packet dump\n"); print_hex_dump(KERN_INFO, "FD raw packet: ", DUMP_PREFIX_OFFSET, 16, 1, - fd_data.raw_packet, packet_len, true); - ret = i40e_program_fdir_filter(&fd_data, pf, add); + raw_packet, packet_len, true); + ret = i40e_program_fdir_filter(&fd_data, raw_packet, pf, add); if (!ret) { dev_info(&pf->pdev->dev, "Filter command send Status : Success\n"); } else { dev_info(&pf->pdev->dev, "Filter command send failed %d\n", ret); } - kfree(fd_data.raw_packet); - fd_data.raw_packet = NULL; + kfree(raw_packet); + raw_packet = NULL; kfree(asc_packet); asc_packet = NULL; } 
else if (strncmp(cmd_buf, "fd-atr off", 10) == 0) { diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index b1d7d8c5cb9b..d34ff31fddd8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -62,6 +62,9 @@ static const struct i40e_stats i40e_gstrings_net_stats[] = { I40E_NETDEV_STAT(rx_crc_errors), }; +static int i40e_add_del_fdir_ethtool(struct i40e_vsi *vsi, + struct ethtool_rxnfc *cmd, bool add); + /* These PF_STATs might look like duplicates of some NETDEV_STATs, * but they are separate. This device supports Virtualization, and * as such might have several netdevs supporting VMDq and FCoE going @@ -84,6 +87,7 @@ static struct i40e_stats i40e_gstrings_stats[] = { I40E_PF_STAT("illegal_bytes", stats.illegal_bytes), I40E_PF_STAT("mac_local_faults", stats.mac_local_faults), I40E_PF_STAT("mac_remote_faults", stats.mac_remote_faults), + I40E_PF_STAT("tx_timeout", tx_timeout_count), I40E_PF_STAT("rx_length_errors", stats.rx_length_errors), I40E_PF_STAT("link_xon_rx", stats.link_xon_rx), I40E_PF_STAT("link_xoff_rx", stats.link_xoff_rx), @@ -1112,6 +1116,84 @@ static int i40e_get_rss_hash_opts(struct i40e_pf *pf, struct ethtool_rxnfc *cmd) } /** + * i40e_get_ethtool_fdir_all - Populates the rule count of a command + * @pf: Pointer to the physical function struct + * @cmd: The command to get or set Rx flow classification rules + * @rule_locs: Array of used rule locations + * + * This function populates both the total and actual rule count of + * the ethtool flow classification command + * + * Returns 0 on success or -EMSGSIZE if entry not found + **/ +static int i40e_get_ethtool_fdir_all(struct i40e_pf *pf, + struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct i40e_fdir_filter *rule; + struct hlist_node *node2; + int cnt = 0; + + /* report total rule count */ + cmd->data = pf->hw.fdir_shared_filter_count + + pf->fdir_pf_filter_count; + + 
hlist_for_each_entry_safe(rule, node2, + &pf->fdir_filter_list, fdir_node) { + if (cnt == cmd->rule_cnt) + return -EMSGSIZE; + + rule_locs[cnt] = rule->fd_id; + cnt++; + } + + cmd->rule_cnt = cnt; + + return 0; +} + +/** + * i40e_get_ethtool_fdir_entry - Look up a filter based on Rx flow + * @pf: Pointer to the physical function struct + * @cmd: The command to get or set Rx flow classification rules + * + * This function looks up a filter based on the Rx flow classification + * command and fills the flow spec info for it if found + * + * Returns 0 on success or -EINVAL if filter not found + **/ +static int i40e_get_ethtool_fdir_entry(struct i40e_pf *pf, + struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fsp = + (struct ethtool_rx_flow_spec *)&cmd->fs; + struct i40e_fdir_filter *rule = NULL; + struct hlist_node *node2; + + /* report total rule count */ + cmd->data = pf->hw.fdir_shared_filter_count + + pf->fdir_pf_filter_count; + + hlist_for_each_entry_safe(rule, node2, + &pf->fdir_filter_list, fdir_node) { + if (fsp->location <= rule->fd_id) + break; + } + + if (!rule || fsp->location != rule->fd_id) + return -EINVAL; + + fsp->flow_type = rule->flow_type; + fsp->h_u.tcp_ip4_spec.psrc = rule->src_port; + fsp->h_u.tcp_ip4_spec.pdst = rule->dst_port; + fsp->h_u.tcp_ip4_spec.ip4src = rule->src_ip[0]; + fsp->h_u.tcp_ip4_spec.ip4dst = rule->dst_ip[0]; + fsp->ring_cookie = rule->q_index; + + return 0; +} + +/** * i40e_get_rxnfc - command to get RX flow classification rules * @netdev: network interface device structure * @cmd: ethtool rxnfc command @@ -1135,15 +1217,15 @@ static int i40e_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, ret = i40e_get_rss_hash_opts(pf, cmd); break; case ETHTOOL_GRXCLSRLCNT: - cmd->rule_cnt = 10; + cmd->rule_cnt = pf->fdir_pf_active_filters; ret = 0; break; case ETHTOOL_GRXCLSRULE: - ret = 0; + ret = i40e_get_ethtool_fdir_entry(pf, cmd); break; case ETHTOOL_GRXCLSRLALL: - cmd->data = 500; - ret = 0; + ret = 
i40e_get_ethtool_fdir_all(pf, cmd, rule_locs); + break; default: break; } @@ -1274,289 +1356,158 @@ static int i40e_set_rss_hash_opt(struct i40e_pf *pf, struct ethtool_rxnfc *nfc) return 0; } -#define IP_HEADER_OFFSET 14 -#define I40E_UDPIP_DUMMY_PACKET_LEN 42 /** - * i40e_add_del_fdir_udpv4 - Add/Remove UDPv4 Flow Director filters for - * a specific flow spec - * @vsi: pointer to the targeted VSI - * @fd_data: the flow director data required from the FDir descriptor - * @ethtool_rx_flow_spec: the flow spec - * @add: true adds a filter, false removes it + * i40e_update_ethtool_fdir_entry - Updates the fdir filter entry + * @vsi: Pointer to the targeted VSI + * @input: The filter to update or NULL to indicate deletion + * @sw_idx: Software index to the filter + * @cmd: The command to get or set Rx flow classification rules * - * Returns 0 if the filters were successfully added or removed + * This function updates (or deletes) a Flow Director entry from + * the hlist of the corresponding PF + * + * Returns 0 on success **/ -static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi, - struct i40e_fdir_data *fd_data, - struct ethtool_rx_flow_spec *fsp, bool add) +static int i40e_update_ethtool_fdir_entry(struct i40e_vsi *vsi, + struct i40e_fdir_filter *input, + u16 sw_idx, + struct ethtool_rxnfc *cmd) { + struct i40e_fdir_filter *rule, *parent; struct i40e_pf *pf = vsi->back; - struct udphdr *udp; - struct iphdr *ip; - bool err = false; - int ret; - int i; - char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, - 0x45, 0, 0, 0x1c, 0, 0, 0x40, 0, 0x40, 0x11, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - - memcpy(fd_data->raw_packet, packet, I40E_UDPIP_DUMMY_PACKET_LEN); - - ip = (struct iphdr *)(fd_data->raw_packet + IP_HEADER_OFFSET); - udp = (struct udphdr *)(fd_data->raw_packet + IP_HEADER_OFFSET - + sizeof(struct iphdr)); + struct hlist_node *node2; + int err = -EINVAL; - ip->saddr = fsp->h_u.tcp_ip4_spec.ip4src; - ip->daddr = 
fsp->h_u.tcp_ip4_spec.ip4dst; - udp->source = fsp->h_u.tcp_ip4_spec.psrc; - udp->dest = fsp->h_u.tcp_ip4_spec.pdst; + parent = NULL; + rule = NULL; - for (i = I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP; - i <= I40E_FILTER_PCTYPE_NONF_IPV4_UDP; i++) { - fd_data->pctype = i; - ret = i40e_program_fdir_filter(fd_data, pf, add); - - if (ret) { - dev_info(&pf->pdev->dev, - "Filter command send failed for PCTYPE %d (ret = %d)\n", - fd_data->pctype, ret); - err = true; - } else { - dev_info(&pf->pdev->dev, - "Filter OK for PCTYPE %d (ret = %d)\n", - fd_data->pctype, ret); - } + hlist_for_each_entry_safe(rule, node2, + &pf->fdir_filter_list, fdir_node) { + /* hash found, or no matching entry */ + if (rule->fd_id >= sw_idx) + break; + parent = rule; } - return err ? -EOPNOTSUPP : 0; -} - -#define I40E_TCPIP_DUMMY_PACKET_LEN 54 -/** - * i40e_add_del_fdir_tcpv4 - Add/Remove TCPv4 Flow Director filters for - * a specific flow spec - * @vsi: pointer to the targeted VSI - * @fd_data: the flow director data required from the FDir descriptor - * @ethtool_rx_flow_spec: the flow spec - * @add: true adds a filter, false removes it - * - * Returns 0 if the filters were successfully added or removed - **/ -static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi, - struct i40e_fdir_data *fd_data, - struct ethtool_rx_flow_spec *fsp, bool add) -{ - struct i40e_pf *pf = vsi->back; - struct tcphdr *tcp; - struct iphdr *ip; - bool err = false; - int ret; - /* Dummy packet */ - char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, - 0x45, 0, 0, 0x28, 0, 0, 0x40, 0, 0x40, 0x6, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0x80, 0x11, 0x0, 0x72, 0, 0, 0, 0}; - - memcpy(fd_data->raw_packet, packet, I40E_TCPIP_DUMMY_PACKET_LEN); - - ip = (struct iphdr *)(fd_data->raw_packet + IP_HEADER_OFFSET); - tcp = (struct tcphdr *)(fd_data->raw_packet + IP_HEADER_OFFSET - + sizeof(struct iphdr)); - - ip->daddr = fsp->h_u.tcp_ip4_spec.ip4dst; - tcp->dest = 
fsp->h_u.tcp_ip4_spec.pdst; - ip->saddr = fsp->h_u.tcp_ip4_spec.ip4src; - tcp->source = fsp->h_u.tcp_ip4_spec.psrc; - - if (add) { - if (pf->flags & I40E_FLAG_FD_ATR_ENABLED) { - dev_info(&pf->pdev->dev, "Forcing ATR off, sideband rules for TCP/IPv4 flow being applied\n"); - pf->flags &= ~I40E_FLAG_FD_ATR_ENABLED; + /* if there is an old rule occupying our place remove it */ + if (rule && (rule->fd_id == sw_idx)) { + if (!input || (rule->fd_id != input->fd_id)) { + cmd->fs.flow_type = rule->flow_type; + err = i40e_add_del_fdir_ethtool(vsi, cmd, false); } + + hlist_del(&rule->fdir_node); + kfree(rule); + pf->fdir_pf_active_filters--; } - fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN; - ret = i40e_program_fdir_filter(fd_data, pf, add); + /* If no input this was a delete, err should be 0 if a rule was + * successfully found and removed from the list else -EINVAL + */ + if (!input) + return err; - if (ret) { - dev_info(&pf->pdev->dev, - "Filter command send failed for PCTYPE %d (ret = %d)\n", - fd_data->pctype, ret); - err = true; - } else { - dev_info(&pf->pdev->dev, "Filter OK for PCTYPE %d (ret = %d)\n", - fd_data->pctype, ret); - } + /* initialize node and set software index */ + INIT_HLIST_NODE(&input->fdir_node); - fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP; + /* add filter to the list */ + if (parent) + hlist_add_after(&parent->fdir_node, &input->fdir_node); + else + hlist_add_head(&input->fdir_node, + &pf->fdir_filter_list); - ret = i40e_program_fdir_filter(fd_data, pf, add); - if (ret) { - dev_info(&pf->pdev->dev, - "Filter command send failed for PCTYPE %d (ret = %d)\n", - fd_data->pctype, ret); - err = true; - } else { - dev_info(&pf->pdev->dev, "Filter OK for PCTYPE %d (ret = %d)\n", - fd_data->pctype, ret); - } + /* update counts */ + pf->fdir_pf_active_filters++; - return err ? 
-EOPNOTSUPP : 0; + return 0; } /** - * i40e_add_del_fdir_sctpv4 - Add/Remove SCTPv4 Flow Director filters for - * a specific flow spec - * @vsi: pointer to the targeted VSI - * @fd_data: the flow director data required from the FDir descriptor - * @ethtool_rx_flow_spec: the flow spec - * @add: true adds a filter, false removes it + * i40e_del_fdir_entry - Deletes a Flow Director filter entry + * @vsi: Pointer to the targeted VSI + * @cmd: The command to get or set Rx flow classification rules * - * Returns 0 if the filters were successfully added or removed - **/ -static int i40e_add_del_fdir_sctpv4(struct i40e_vsi *vsi, - struct i40e_fdir_data *fd_data, - struct ethtool_rx_flow_spec *fsp, bool add) -{ - return -EOPNOTSUPP; -} - -#define I40E_IP_DUMMY_PACKET_LEN 34 -/** - * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow Director filters for - * a specific flow spec - * @vsi: pointer to the targeted VSI - * @fd_data: the flow director data required for the FDir descriptor - * @fsp: the ethtool flow spec - * @add: true adds a filter, false removes it + * The function removes a Flow Director filter entry from the + * hlist of the corresponding PF * - * Returns 0 if the filters were successfully added or removed - **/ -static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi, - struct i40e_fdir_data *fd_data, - struct ethtool_rx_flow_spec *fsp, bool add) + * Returns 0 on success + */ +static int i40e_del_fdir_entry(struct i40e_vsi *vsi, + struct ethtool_rxnfc *cmd) { + struct ethtool_rx_flow_spec *fsp = + (struct ethtool_rx_flow_spec *)&cmd->fs; struct i40e_pf *pf = vsi->back; - struct iphdr *ip; - bool err = false; - int ret; - int i; - char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, - 0x45, 0, 0, 0x14, 0, 0, 0x40, 0, 0x40, 0x10, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int ret = 0; - memcpy(fd_data->raw_packet, packet, I40E_IP_DUMMY_PACKET_LEN); - ip = (struct iphdr *)(fd_data->raw_packet + IP_HEADER_OFFSET); - - ip->saddr = fsp->h_u.usr_ip4_spec.ip4src; - 
ip->daddr = fsp->h_u.usr_ip4_spec.ip4dst; - ip->protocol = fsp->h_u.usr_ip4_spec.proto; - - for (i = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER; - i <= I40E_FILTER_PCTYPE_FRAG_IPV4; i++) { - fd_data->pctype = i; - ret = i40e_program_fdir_filter(fd_data, pf, add); - - if (ret) { - dev_info(&pf->pdev->dev, - "Filter command send failed for PCTYPE %d (ret = %d)\n", - fd_data->pctype, ret); - err = true; - } else { - dev_info(&pf->pdev->dev, - "Filter OK for PCTYPE %d (ret = %d)\n", - fd_data->pctype, ret); - } - } + ret = i40e_update_ethtool_fdir_entry(vsi, NULL, fsp->location, cmd); - return err ? -EOPNOTSUPP : 0; + return ret; } /** - * i40e_add_del_fdir_ethtool - Add/Remove Flow Director filters for - * a specific flow spec based on their protocol + * i40e_add_del_fdir_ethtool - Add/Remove Flow Director filters * @vsi: pointer to the targeted VSI * @cmd: command to get or set RX flow classification rules * @add: true adds a filter, false removes it * - * Returns 0 if the filters were successfully added or removed + * Add/Remove Flow Director filters for a specific flow spec based on their + * protocol. Returns 0 if the filters were successfully added or removed. 
**/ static int i40e_add_del_fdir_ethtool(struct i40e_vsi *vsi, - struct ethtool_rxnfc *cmd, bool add) + struct ethtool_rxnfc *cmd, bool add) { - struct i40e_fdir_data fd_data; - int ret = -EINVAL; + struct ethtool_rx_flow_spec *fsp; + struct i40e_fdir_filter *input; struct i40e_pf *pf; - struct ethtool_rx_flow_spec *fsp = - (struct ethtool_rx_flow_spec *)&cmd->fs; + int ret = -EINVAL; if (!vsi) return -EINVAL; + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; pf = vsi->back; - if ((fsp->ring_cookie != RX_CLS_FLOW_DISC) && - (fsp->ring_cookie >= vsi->num_queue_pairs)) + if (fsp->location >= (pf->hw.func_caps.fd_filters_best_effort + + pf->hw.func_caps.fd_filters_guaranteed)) { return -EINVAL; + } - /* Populate the Flow Director that we have at the moment - * and allocate the raw packet buffer for the calling functions - */ - fd_data.raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_LOOKUP, - GFP_KERNEL); + if ((fsp->ring_cookie >= vsi->num_queue_pairs) && add) + return -EINVAL; - if (!fd_data.raw_packet) { - dev_info(&pf->pdev->dev, "Could not allocate memory\n"); - return -ENOMEM; - } + input = kzalloc(sizeof(*input), GFP_KERNEL); - fd_data.q_index = fsp->ring_cookie; - fd_data.flex_off = 0; - fd_data.pctype = 0; - fd_data.dest_vsi = vsi->id; - fd_data.dest_ctl = I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX; - fd_data.fd_status = I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID; - fd_data.cnt_index = 0; - fd_data.fd_id = 0; + if (!input) + return -ENOMEM; - switch (fsp->flow_type & ~FLOW_EXT) { - case TCP_V4_FLOW: - ret = i40e_add_del_fdir_tcpv4(vsi, &fd_data, fsp, add); - break; - case UDP_V4_FLOW: - ret = i40e_add_del_fdir_udpv4(vsi, &fd_data, fsp, add); - break; - case SCTP_V4_FLOW: - ret = i40e_add_del_fdir_sctpv4(vsi, &fd_data, fsp, add); - break; - case IPV4_FLOW: - ret = i40e_add_del_fdir_ipv4(vsi, &fd_data, fsp, add); - break; - case IP_USER_FLOW: - switch (fsp->h_u.usr_ip4_spec.proto) { - case IPPROTO_TCP: - ret = i40e_add_del_fdir_tcpv4(vsi, &fd_data, fsp, add); 
- break; - case IPPROTO_UDP: - ret = i40e_add_del_fdir_udpv4(vsi, &fd_data, fsp, add); - break; - case IPPROTO_SCTP: - ret = i40e_add_del_fdir_sctpv4(vsi, &fd_data, fsp, add); - break; - default: - ret = i40e_add_del_fdir_ipv4(vsi, &fd_data, fsp, add); - break; - } - break; - default: - dev_info(&pf->pdev->dev, "Could not specify spec type\n"); - ret = -EINVAL; + input->fd_id = fsp->location; + + input->q_index = fsp->ring_cookie; + input->flex_off = 0; + input->pctype = 0; + input->dest_vsi = vsi->id; + input->dest_ctl = I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX; + input->fd_status = I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID; + input->cnt_index = 0; + input->flow_type = fsp->flow_type; + input->ip4_proto = fsp->h_u.usr_ip4_spec.proto; + input->src_port = fsp->h_u.tcp_ip4_spec.psrc; + input->dst_port = fsp->h_u.tcp_ip4_spec.pdst; + input->src_ip[0] = fsp->h_u.tcp_ip4_spec.ip4src; + input->dst_ip[0] = fsp->h_u.tcp_ip4_spec.ip4dst; + + ret = i40e_add_del_fdir(vsi, input, add); + if (ret) { + kfree(input); + return ret; } - kfree(fd_data.raw_packet); - fd_data.raw_packet = NULL; + if (!ret && add) + i40e_update_ethtool_fdir_entry(vsi, input, fsp->location, NULL); + else + kfree(input); return ret; } @@ -1583,7 +1534,7 @@ static int i40e_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) ret = i40e_add_del_fdir_ethtool(vsi, cmd, true); break; case ETHTOOL_SRXCLSRLDEL: - ret = i40e_add_del_fdir_ethtool(vsi, cmd, false); + ret = i40e_del_fdir_entry(vsi, cmd); break; default: break; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 53f3ed2df796..43d391bb65c4 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -38,7 +38,7 @@ static const char i40e_driver_string[] = #define DRV_VERSION_MAJOR 0 #define DRV_VERSION_MINOR 3 -#define DRV_VERSION_BUILD 32 +#define DRV_VERSION_BUILD 34 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." 
\ __stringify(DRV_VERSION_MINOR) "." \ __stringify(DRV_VERSION_BUILD) DRV_KERN @@ -1965,11 +1965,14 @@ static int i40e_vlan_rx_add_vid(struct net_device *netdev, netdev_info(netdev, "adding %pM vid=%d\n", netdev->dev_addr, vid); - /* If the network stack called us with vid = 0, we should - * indicate to i40e_vsi_add_vlan() that we want to receive - * any traffic (i.e. with any vlan tag, or untagged) + /* If the network stack called us with vid = 0 then + * it is asking to receive priority tagged packets with + * vlan id 0. Our HW receives them by default when configured + * to receive untagged packets so there is no need to add an + * extra filter for vlan 0 tagged packets. */ - ret = i40e_vsi_add_vlan(vsi, vid ? vid : I40E_VLAN_ANY); + if (vid) + ret = i40e_vsi_add_vlan(vsi, vid); if (!ret && (vid < VLAN_N_VID)) set_bit(vid, vsi->active_vlans); @@ -2421,6 +2424,25 @@ static void i40e_set_vsi_rx_mode(struct i40e_vsi *vsi) } /** + * i40e_fdir_filter_restore - Restore the Sideband Flow Director filters + * @vsi: Pointer to the targeted VSI + * + * This function replays the hlist on the hw where all the SB Flow Director + * filters were saved. 
+ **/ +static void i40e_fdir_filter_restore(struct i40e_vsi *vsi) +{ + struct i40e_fdir_filter *filter; + struct i40e_pf *pf = vsi->back; + struct hlist_node *node; + + hlist_for_each_entry_safe(filter, node, + &pf->fdir_filter_list, fdir_node) { + i40e_add_del_fdir(vsi, filter, true); + } +} + +/** * i40e_vsi_configure - Set up the VSI for action * @vsi: the VSI being configured **/ @@ -2431,6 +2453,8 @@ static int i40e_vsi_configure(struct i40e_vsi *vsi) i40e_set_vsi_rx_mode(vsi); i40e_restore_vlan(vsi); i40e_vsi_config_dcb_rings(vsi); + if (vsi->type == I40E_VSI_FDIR) + i40e_fdir_filter_restore(vsi); err = i40e_vsi_configure_tx(vsi); if (!err) err = i40e_vsi_configure_rx(vsi); @@ -4268,6 +4292,26 @@ err_setup_tx: } /** + * i40e_fdir_filter_exit - Cleans up the Flow Director accounting + * @pf: Pointer to pf + * + * This function destroys the hlist where all the Flow Director + * filters were saved. + **/ +static void i40e_fdir_filter_exit(struct i40e_pf *pf) +{ + struct i40e_fdir_filter *filter; + struct hlist_node *node2; + + hlist_for_each_entry_safe(filter, node2, + &pf->fdir_filter_list, fdir_node) { + hlist_del(&filter->fdir_node); + kfree(filter); + } + pf->fdir_pf_active_filters = 0; +} + +/** * i40e_close - Disables a network interface * @netdev: network interface device structure * @@ -5052,6 +5096,12 @@ static int i40e_get_capabilities(struct i40e_pf *pf) /* increment MSI-X count because current FW skips one */ pf->hw.func_caps.num_msix_vectors++; + if (((pf->hw.aq.fw_maj_ver == 2) && (pf->hw.aq.fw_min_ver < 22)) || + (pf->hw.aq.fw_maj_ver < 2)) { + pf->hw.func_caps.num_msix_vectors++; + pf->hw.func_caps.num_msix_vectors_vf++; + } + if (pf->hw.debug_mask & I40E_DEBUG_USER) dev_info(&pf->pdev->dev, "pf=%d, num_vfs=%d, msix_pf=%d, msix_vf=%d, fd_g=%d, fd_b=%d, pf_max_q=%d num_vsi=%d\n", @@ -5131,9 +5181,9 @@ static void i40e_fdir_sb_setup(struct i40e_pf *pf) err = i40e_up_complete(vsi); if (err) goto err_up_complete; + clear_bit(__I40E_NEEDS_RESTART, 
&vsi->state); } - clear_bit(__I40E_NEEDS_RESTART, &vsi->state); return; err_up_complete: @@ -5156,6 +5206,7 @@ static void i40e_fdir_teardown(struct i40e_pf *pf) { int i; + i40e_fdir_filter_exit(pf); for (i = 0; i < pf->hw.func_caps.num_vsis; i++) { if (pf->vsi[i] && pf->vsi[i]->type == I40E_VSI_FDIR) { i40e_vsi_release(pf->vsi[i]); @@ -7930,13 +7981,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = i40e_init_adminq(hw); dev_info(&pdev->dev, "%s\n", i40e_fw_version_str(hw)); - if (((hw->nvm.version & I40E_NVM_VERSION_HI_MASK) - >> I40E_NVM_VERSION_HI_SHIFT) != I40E_CURRENT_NVM_VERSION_HI) { - dev_info(&pdev->dev, - "warning: NVM version not supported, supported version: %02x.%02x\n", - I40E_CURRENT_NVM_VERSION_HI, - I40E_CURRENT_NVM_VERSION_LO); - } if (err) { dev_info(&pdev->dev, "init_adminq failed: %d expecting API %02x.%02x\n", diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c index 73f95b081927..262bdf11d221 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c +++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c @@ -27,14 +27,14 @@ #include "i40e_prototype.h" /** - * i40e_init_nvm_ops - Initialize NVM function pointers. - * @hw: pointer to the HW structure. + * i40e_init_nvm_ops - Initialize NVM function pointers + * @hw: pointer to the HW structure * - * Setups the function pointers and the NVM info structure. Should be called - * once per NVM initialization, e.g. inside the i40e_init_shared_code(). - * Please notice that the NVM term is used here (& in all methods covered - * in this file) as an equivalent of the FLASH part mapped into the SR. - * We are accessing FLASH always thru the Shadow RAM. + * Setup the function pointers and the NVM info structure. Should be called + * once per NVM initialization, e.g. inside the i40e_init_shared_code(). 
+ * Please notice that the NVM term is used here (& in all methods covered + * in this file) as an equivalent of the FLASH part mapped into the SR. + * We are accessing FLASH always thru the Shadow RAM. **/ i40e_status i40e_init_nvm(struct i40e_hw *hw) { @@ -49,16 +49,16 @@ i40e_status i40e_init_nvm(struct i40e_hw *hw) gens = rd32(hw, I40E_GLNVM_GENS); sr_size = ((gens & I40E_GLNVM_GENS_SR_SIZE_MASK) >> I40E_GLNVM_GENS_SR_SIZE_SHIFT); - /* Switching to words (sr_size contains power of 2KB). */ + /* Switching to words (sr_size contains power of 2KB) */ nvm->sr_size = (1 << sr_size) * I40E_SR_WORDS_IN_1KB; - /* Check if we are in the normal or blank NVM programming mode. */ + /* Check if we are in the normal or blank NVM programming mode */ fla = rd32(hw, I40E_GLNVM_FLA); - if (fla & I40E_GLNVM_FLA_LOCKED_MASK) { /* Normal programming mode. */ - /* Max NVM timeout. */ + if (fla & I40E_GLNVM_FLA_LOCKED_MASK) { /* Normal programming mode */ + /* Max NVM timeout */ nvm->timeout = I40E_MAX_NVM_TIMEOUT; nvm->blank_nvm_mode = false; - } else { /* Blank programming mode. */ + } else { /* Blank programming mode */ nvm->blank_nvm_mode = true; ret_code = I40E_ERR_NVM_BLANK_MODE; hw_dbg(hw, "NVM init error: unsupported blank mode.\n"); @@ -68,12 +68,12 @@ i40e_status i40e_init_nvm(struct i40e_hw *hw) } /** - * i40e_acquire_nvm - Generic request for acquiring the NVM ownership. - * @hw: pointer to the HW structure. - * @access: NVM access type (read or write). + * i40e_acquire_nvm - Generic request for acquiring the NVM ownership + * @hw: pointer to the HW structure + * @access: NVM access type (read or write) * - * This function will request NVM ownership for reading - * via the proper Admin Command. + * This function will request NVM ownership for reading + * via the proper Admin Command. 
**/ i40e_status i40e_acquire_nvm(struct i40e_hw *hw, enum i40e_aq_resource_access_type access) @@ -87,20 +87,20 @@ i40e_status i40e_acquire_nvm(struct i40e_hw *hw, ret_code = i40e_aq_request_resource(hw, I40E_NVM_RESOURCE_ID, access, 0, &time, NULL); - /* Reading the Global Device Timer. */ + /* Reading the Global Device Timer */ gtime = rd32(hw, I40E_GLVFGEN_TIMER); - /* Store the timeout. */ + /* Store the timeout */ hw->nvm.hw_semaphore_timeout = I40E_MS_TO_GTIME(time) + gtime; if (ret_code) { - /* Set the polling timeout. */ + /* Set the polling timeout */ if (time > I40E_MAX_NVM_TIMEOUT) timeout = I40E_MS_TO_GTIME(I40E_MAX_NVM_TIMEOUT) + gtime; else timeout = hw->nvm.hw_semaphore_timeout; - /* Poll until the current NVM owner timeouts. */ + /* Poll until the current NVM owner timeouts */ while (gtime < timeout) { usleep_range(10000, 20000); ret_code = i40e_aq_request_resource(hw, @@ -128,10 +128,10 @@ i40e_i40e_acquire_nvm_exit: } /** - * i40e_release_nvm - Generic request for releasing the NVM ownership. - * @hw: pointer to the HW structure. + * i40e_release_nvm - Generic request for releasing the NVM ownership + * @hw: pointer to the HW structure * - * This function will release NVM resource via the proper Admin Command. + * This function will release NVM resource via the proper Admin Command. **/ void i40e_release_nvm(struct i40e_hw *hw) { @@ -140,17 +140,17 @@ void i40e_release_nvm(struct i40e_hw *hw) } /** - * i40e_poll_sr_srctl_done_bit - Polls the GLNVM_SRCTL done bit. - * @hw: pointer to the HW structure. + * i40e_poll_sr_srctl_done_bit - Polls the GLNVM_SRCTL done bit + * @hw: pointer to the HW structure * - * Polls the SRCTL Shadow RAM register done bit. + * Polls the SRCTL Shadow RAM register done bit. **/ static i40e_status i40e_poll_sr_srctl_done_bit(struct i40e_hw *hw) { i40e_status ret_code = I40E_ERR_TIMEOUT; u32 srctl, wait_cnt; - /* Poll the I40E_GLNVM_SRCTL until the done bit is set. 
*/ + /* Poll the I40E_GLNVM_SRCTL until the done bit is set */ for (wait_cnt = 0; wait_cnt < I40E_SRRD_SRCTL_ATTEMPTS; wait_cnt++) { srctl = rd32(hw, I40E_GLNVM_SRCTL); if (srctl & I40E_GLNVM_SRCTL_DONE_MASK) { @@ -165,12 +165,12 @@ static i40e_status i40e_poll_sr_srctl_done_bit(struct i40e_hw *hw) } /** - * i40e_read_nvm_word - Reads Shadow RAM - * @hw: pointer to the HW structure. - * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF). - * @data: word read from the Shadow RAM. + * i40e_read_nvm_word - Reads Shadow RAM + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM * - * Reads 16 bit word from the Shadow RAM using the GLNVM_SRCTL register. + * Reads one 16 bit word from the Shadow RAM using the GLNVM_SRCTL register. **/ i40e_status i40e_read_nvm_word(struct i40e_hw *hw, u16 offset, u16 *data) @@ -184,15 +184,15 @@ i40e_status i40e_read_nvm_word(struct i40e_hw *hw, u16 offset, goto read_nvm_exit; } - /* Poll the done bit first. */ + /* Poll the done bit first */ ret_code = i40e_poll_sr_srctl_done_bit(hw); if (!ret_code) { - /* Write the address and start reading. */ + /* Write the address and start reading */ sr_reg = (u32)(offset << I40E_GLNVM_SRCTL_ADDR_SHIFT) | (1 << I40E_GLNVM_SRCTL_START_SHIFT); wr32(hw, I40E_GLNVM_SRCTL, sr_reg); - /* Poll I40E_GLNVM_SRCTL until the done bit is set. */ + /* Poll I40E_GLNVM_SRCTL until the done bit is set */ ret_code = i40e_poll_sr_srctl_done_bit(hw); if (!ret_code) { sr_reg = rd32(hw, I40E_GLNVM_SRDATA); @@ -210,16 +210,15 @@ read_nvm_exit: } /** - * i40e_read_nvm_buffer - Reads Shadow RAM buffer. - * @hw: pointer to the HW structure. - * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF). - * @words: number of words to read (in) & - * number of words read before the NVM ownership timeout (out). - * @data: words read from the Shadow RAM. 
+ * i40e_read_nvm_buffer - Reads Shadow RAM buffer + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @words: (in) number of words to read; (out) number of words actually read + * @data: words read from the Shadow RAM * - * Reads 16 bit words (data buffer) from the SR using the i40e_read_nvm_srrd() - * method. The buffer read is preceded by the NVM ownership take - * and followed by the release. + * Reads 16 bit words (data buffer) from the SR using the i40e_read_nvm_word() + * method. The buffer read is preceded by the NVM ownership take + * and followed by the release. **/ i40e_status i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset, u16 *words, u16 *data) @@ -227,7 +226,7 @@ i40e_status i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset, i40e_status ret_code = 0; u16 index, word; - /* Loop thru the selected region. */ + /* Loop thru the selected region */ for (word = 0; word < *words; word++) { index = offset + word; ret_code = i40e_read_nvm_word(hw, index, &data[word]); @@ -235,21 +234,21 @@ i40e_status i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset, break; } - /* Update the number of words read from the Shadow RAM. */ + /* Update the number of words read from the Shadow RAM */ *words = word; return ret_code; } /** - * i40e_calc_nvm_checksum - Calculates and returns the checksum - * @hw: pointer to hardware structure - * @checksum: pointer to the checksum + * i40e_calc_nvm_checksum - Calculates and returns the checksum + * @hw: pointer to hardware structure + * @checksum: pointer to the checksum * - * This function calculate SW Checksum that covers the whole 64kB shadow RAM - * except the VPD and PCIe ALT Auto-load modules. The structure and size of VPD - * is customer specific and unknown. Therefore, this function skips all maximum - * possible size of VPD (1kB).
+ * This function calculates SW Checksum that covers the whole 64kB shadow RAM + * except the VPD and PCIe ALT Auto-load modules. The structure and size of VPD + * is customer specific and unknown. Therefore, this function skips all maximum + * possible size of VPD (1kB). **/ static i40e_status i40e_calc_nvm_checksum(struct i40e_hw *hw, u16 *checksum) @@ -311,12 +310,12 @@ i40e_calc_nvm_checksum_exit: } /** - * i40e_validate_nvm_checksum - Validate EEPROM checksum - * @hw: pointer to hardware structure - * @checksum: calculated checksum + * i40e_validate_nvm_checksum - Validate EEPROM checksum + * @hw: pointer to hardware structure + * @checksum: calculated checksum * - * Performs checksum calculation and validates the NVM SW checksum. If the - * caller does not need checksum, the value can be NULL. + * Performs checksum calculation and validates the NVM SW checksum. If the + * caller does not need checksum, the value can be NULL. **/ i40e_status i40e_validate_nvm_checksum(struct i40e_hw *hw, u16 *checksum) diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index ed91f93ede2b..9cd57e617959 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -231,6 +231,13 @@ i40e_status i40e_validate_nvm_checksum(struct i40e_hw *hw, u16 *checksum); void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status); +extern struct i40e_rx_ptype_decoded i40e_ptype_lookup[]; + +static inline struct i40e_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype) +{ + return i40e_ptype_lookup[ptype]; +} + /* prototype for functions used for SW locks */ /* i40e_common for VF drivers*/ diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 19af4ce0a4fe..2081bdb214e5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -25,6 +25,7 @@ 
******************************************************************************/ #include "i40e.h" +#include "i40e_prototype.h" static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, u32 td_tag) @@ -39,11 +40,12 @@ static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, #define I40E_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS) /** * i40e_program_fdir_filter - Program a Flow Director filter - * @fdir_input: Packet data that will be filter parameters + * @fdir_data: Packet data that will be filter parameters + * @raw_packet: the pre-allocated packet buffer for FDir * @pf: The pf pointer * @add: True for add/update, False for remove **/ -int i40e_program_fdir_filter(struct i40e_fdir_data *fdir_data, +int i40e_program_fdir_filter(struct i40e_fdir_filter *fdir_data, u8 *raw_packet, struct i40e_pf *pf, bool add) { struct i40e_filter_program_desc *fdir_desc; @@ -68,8 +70,8 @@ int i40e_program_fdir_filter(struct i40e_fdir_data *fdir_data, tx_ring = vsi->tx_rings[0]; dev = tx_ring->dev; - dma = dma_map_single(dev, fdir_data->raw_packet, - I40E_FDIR_MAX_RAW_PACKET_LOOKUP, DMA_TO_DEVICE); + dma = dma_map_single(dev, raw_packet, + I40E_FDIR_MAX_RAW_PACKET_SIZE, DMA_TO_DEVICE); if (dma_mapping_error(dev, dma)) goto dma_fail; @@ -132,14 +134,14 @@ int i40e_program_fdir_filter(struct i40e_fdir_data *fdir_data, tx_ring->next_to_use = (i + 1 < tx_ring->count) ? 
i + 1 : 0; /* record length, and DMA address */ - dma_unmap_len_set(tx_buf, len, I40E_FDIR_MAX_RAW_PACKET_LOOKUP); + dma_unmap_len_set(tx_buf, len, I40E_FDIR_MAX_RAW_PACKET_SIZE); dma_unmap_addr_set(tx_buf, dma, dma); tx_desc->buffer_addr = cpu_to_le64(dma); td_cmd = I40E_TXD_CMD | I40E_TX_DESC_CMD_DUMMY; tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, 0, I40E_FDIR_MAX_RAW_PACKET_LOOKUP, 0); + build_ctob(td_cmd, 0, I40E_FDIR_MAX_RAW_PACKET_SIZE, 0); /* set the timestamp */ tx_buf->time_stamp = jiffies; @@ -161,6 +163,270 @@ dma_fail: return -1; } +#define IP_HEADER_OFFSET 14 +#define I40E_UDPIP_DUMMY_PACKET_LEN 42 +/** + * i40e_add_del_fdir_udpv4 - Add/Remove UDPv4 filters + * @vsi: pointer to the targeted VSI + * @fd_data: the flow director data required for the FDir descriptor + * @raw_packet: the pre-allocated packet buffer for FDir + * @add: true adds a filter, false removes it + * + * Returns 0 if the filters were successfully added or removed + **/ +static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi, + struct i40e_fdir_filter *fd_data, + u8 *raw_packet, bool add) +{ + struct i40e_pf *pf = vsi->back; + struct udphdr *udp; + struct iphdr *ip; + bool err = false; + int ret; + int i; + static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, + 0x45, 0, 0, 0x1c, 0, 0, 0x40, 0, 0x40, 0x11, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + memcpy(raw_packet, packet, I40E_UDPIP_DUMMY_PACKET_LEN); + + ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET); + udp = (struct udphdr *)(raw_packet + IP_HEADER_OFFSET + + sizeof(struct iphdr)); + + ip->daddr = fd_data->dst_ip[0]; + udp->dest = fd_data->dst_port; + ip->saddr = fd_data->src_ip[0]; + udp->source = fd_data->src_port; + + for (i = I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP; + i <= I40E_FILTER_PCTYPE_NONF_IPV4_UDP; i++) { + fd_data->pctype = i; + ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add); + + if (ret) { + dev_info(&pf->pdev->dev, + "Filter command send failed for 
PCTYPE %d (ret = %d)\n", + fd_data->pctype, ret); + err = true; + } else { + dev_info(&pf->pdev->dev, + "Filter OK for PCTYPE %d (ret = %d)\n", + fd_data->pctype, ret); + } + } + + return err ? -EOPNOTSUPP : 0; +} + +#define I40E_TCPIP_DUMMY_PACKET_LEN 54 +/** + * i40e_add_del_fdir_tcpv4 - Add/Remove TCPv4 filters + * @vsi: pointer to the targeted VSI + * @fd_data: the flow director data required for the FDir descriptor + * @raw_packet: the pre-allocated packet buffer for FDir + * @add: true adds a filter, false removes it + * + * Returns 0 if the filters were successfully added or removed + **/ +static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi, + struct i40e_fdir_filter *fd_data, + u8 *raw_packet, bool add) +{ + struct i40e_pf *pf = vsi->back; + struct tcphdr *tcp; + struct iphdr *ip; + bool err = false; + int ret; + /* Dummy packet */ + static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, + 0x45, 0, 0, 0x28, 0, 0, 0x40, 0, 0x40, 0x6, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80, 0x11, + 0x0, 0x72, 0, 0, 0, 0}; + + memcpy(raw_packet, packet, I40E_TCPIP_DUMMY_PACKET_LEN); + + ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET); + tcp = (struct tcphdr *)(raw_packet + IP_HEADER_OFFSET + + sizeof(struct iphdr)); + + ip->daddr = fd_data->dst_ip[0]; + tcp->dest = fd_data->dst_port; + ip->saddr = fd_data->src_ip[0]; + tcp->source = fd_data->src_port; + + if (add) { + if (pf->flags & I40E_FLAG_FD_ATR_ENABLED) { + dev_info(&pf->pdev->dev, "Forcing ATR off, sideband rules for TCP/IPv4 flow being applied\n"); + pf->flags &= ~I40E_FLAG_FD_ATR_ENABLED; + } + } + + fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN; + ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add); + + if (ret) { + dev_info(&pf->pdev->dev, + "Filter command send failed for PCTYPE %d (ret = %d)\n", + fd_data->pctype, ret); + err = true; + } else { + dev_info(&pf->pdev->dev, "Filter OK for PCTYPE %d (ret = %d)\n", + fd_data->pctype, ret); + } + 
+ fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP; + + ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add); + if (ret) { + dev_info(&pf->pdev->dev, + "Filter command send failed for PCTYPE %d (ret = %d)\n", + fd_data->pctype, ret); + err = true; + } else { + dev_info(&pf->pdev->dev, "Filter OK for PCTYPE %d (ret = %d)\n", + fd_data->pctype, ret); + } + + return err ? -EOPNOTSUPP : 0; +} + +/** + * i40e_add_del_fdir_sctpv4 - Add/Remove SCTPv4 Flow Director filters for + * a specific flow spec + * @vsi: pointer to the targeted VSI + * @fd_data: the flow director data required for the FDir descriptor + * @raw_packet: the pre-allocated packet buffer for FDir + * @add: true adds a filter, false removes it + * + * Returns 0 if the filters were successfully added or removed + **/ +static int i40e_add_del_fdir_sctpv4(struct i40e_vsi *vsi, + struct i40e_fdir_filter *fd_data, + u8 *raw_packet, bool add) +{ + return -EOPNOTSUPP; +} + +#define I40E_IP_DUMMY_PACKET_LEN 34 +/** + * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow Director filters for + * a specific flow spec + * @vsi: pointer to the targeted VSI + * @fd_data: the flow director data required for the FDir descriptor + * @raw_packet: the pre-allocated packet buffer for FDir + * @add: true adds a filter, false removes it + * + * Returns 0 if the filters were successfully added or removed + **/ +static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi, + struct i40e_fdir_filter *fd_data, + u8 *raw_packet, bool add) +{ + struct i40e_pf *pf = vsi->back; + struct iphdr *ip; + bool err = false; + int ret; + int i; + static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, + 0x45, 0, 0, 0x14, 0, 0, 0x40, 0, 0x40, 0x10, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0}; + + memcpy(raw_packet, packet, I40E_IP_DUMMY_PACKET_LEN); + ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET); + + ip->saddr = fd_data->src_ip[0]; + ip->daddr = fd_data->dst_ip[0]; + ip->protocol = 0; + + for (i = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER; + i 
<= I40E_FILTER_PCTYPE_FRAG_IPV4; i++) { + fd_data->pctype = i; + ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add); + + if (ret) { + dev_info(&pf->pdev->dev, + "Filter command send failed for PCTYPE %d (ret = %d)\n", + fd_data->pctype, ret); + err = true; + } else { + dev_info(&pf->pdev->dev, + "Filter OK for PCTYPE %d (ret = %d)\n", + fd_data->pctype, ret); + } + } + + return err ? -EOPNOTSUPP : 0; +} + +/** + * i40e_add_del_fdir - Build raw packets to add/del fdir filter + * @vsi: pointer to the targeted VSI + * @input: the flow director data required for the FDir descriptor + * @add: true adds a filter, false removes it + * Returns 0 if the filters were successfully added or removed + **/ +int i40e_add_del_fdir(struct i40e_vsi *vsi, + struct i40e_fdir_filter *input, bool add) +{ + struct i40e_pf *pf = vsi->back; + u8 *raw_packet; + int ret; + + /* Populate the Flow Director that we have at the moment + * and allocate the raw packet buffer for the calling functions + */ + raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL); + if (!raw_packet) + return -ENOMEM; + + switch (input->flow_type & ~FLOW_EXT) { + case TCP_V4_FLOW: + ret = i40e_add_del_fdir_tcpv4(vsi, input, raw_packet, + add); + break; + case UDP_V4_FLOW: + ret = i40e_add_del_fdir_udpv4(vsi, input, raw_packet, + add); + break; + case SCTP_V4_FLOW: + ret = i40e_add_del_fdir_sctpv4(vsi, input, raw_packet, + add); + break; + case IPV4_FLOW: + ret = i40e_add_del_fdir_ipv4(vsi, input, raw_packet, + add); + break; + case IP_USER_FLOW: + switch (input->ip4_proto) { + case IPPROTO_TCP: + ret = i40e_add_del_fdir_tcpv4(vsi, input, + raw_packet, add); + break; + case IPPROTO_UDP: + ret = i40e_add_del_fdir_udpv4(vsi, input, + raw_packet, add); + break; + case IPPROTO_SCTP: + ret = i40e_add_del_fdir_sctpv4(vsi, input, + raw_packet, add); + break; + default: + ret = i40e_add_del_fdir_ipv4(vsi, input, + raw_packet, add); + break; + } + break; + default: + dev_info(&pf->pdev->dev, "Could not specify spec type %d\n", + input->flow_type); + ret = -EINVAL; + } + +
kfree(raw_packet); + return ret; +} + /** * i40e_fd_handle_status - check the Programming Status for FD * @rx_ring: the Rx ring for this descriptor @@ -956,6 +1222,29 @@ static inline u32 i40e_rx_hash(struct i40e_ring *ring, } /** + * i40e_ptype_to_hash - get a hash type + * @ptype: the ptype value from the descriptor + * + * Returns a hash type to be used by skb_set_hash + **/ +static inline enum pkt_hash_types i40e_ptype_to_hash(u8 ptype) +{ + struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); + + if (!decoded.known) + return PKT_HASH_TYPE_NONE; + + if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && + decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4) + return PKT_HASH_TYPE_L4; + else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && + decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3) + return PKT_HASH_TYPE_L3; + else + return PKT_HASH_TYPE_L2; +} + +/** * i40e_clean_rx_irq - Reclaim resources after receive completes * @rx_ring: rx ring to clean * @budget: how many cleans we're allowed @@ -972,8 +1261,8 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) u16 i = rx_ring->next_to_clean; union i40e_rx_desc *rx_desc; u32 rx_error, rx_status; + u8 rx_ptype; u64 qword; - u16 rx_ptype; rx_desc = I40E_RX_DESC(rx_ring, i); qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); @@ -1087,7 +1376,8 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) goto next_desc; } - skb->rxhash = i40e_rx_hash(rx_ring, rx_desc); + skb_set_hash(skb, i40e_rx_hash(rx_ring, rx_desc), + i40e_ptype_to_hash(rx_ptype)); if (unlikely(rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK)) { i40e_ptp_rx_hwtstamp(vsi->back, skb, (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >> @@ -1812,7 +2102,7 @@ static int i40e_xmit_descriptor_count(struct sk_buff *skb, /* need: 1 descriptor per page * PAGE_SIZE/I40E_MAX_DATA_PER_TXD, * + 1 desc for skb_head_len/I40E_MAX_DATA_PER_TXD, - * + 2 desc gap to keep tail from touching head, + * + 4 
desc gap to avoid the cache line where head is, * + 1 desc for context descriptor, * otherwise try next time */ @@ -1823,7 +2113,7 @@ static int i40e_xmit_descriptor_count(struct sk_buff *skb, count += skb_shinfo(skb)->nr_frags; #endif count += TXD_USE_COUNT(skb_headlen(skb)); - if (i40e_maybe_stop_tx(tx_ring, count + 3)) { + if (i40e_maybe_stop_tx(tx_ring, count + 4 + 1)) { tx_ring->tx_stats.tx_busy++; return 0; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 189e250198dd..42cc6ba88005 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -858,7 +858,7 @@ int i40e_alloc_vfs(struct i40e_pf *pf, u16 num_alloc_vfs) } } /* allocate memory */ - vfs = kzalloc(num_alloc_vfs * sizeof(struct i40e_vf), GFP_KERNEL); + vfs = kcalloc(num_alloc_vfs, sizeof(struct i40e_vf), GFP_KERNEL); if (!vfs) { ret = -ENOMEM; goto err_alloc; diff --git a/drivers/net/ethernet/intel/i40evf/i40e_common.c b/drivers/net/ethernet/intel/i40evf/i40e_common.c index 7b13953b28c4..78618af271cf 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_common.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_common.c @@ -160,6 +160,372 @@ i40e_status i40evf_aq_queue_shutdown(struct i40e_hw *hw, } +/* The i40e_ptype_lookup table is used to convert from the 8-bit ptype in the + * hardware to a bit-field that can be used by SW to more easily determine the + * packet type. + * + * Macros are used to shorten the table lines and make this table human + * readable. + * + * We store the PTYPE in the top byte of the bit field - this is just so that + * we can check that the table doesn't have a row missing, as the index into + * the table should be the PTYPE. 
+ * + * Typical work flow: + * + * IF NOT i40e_ptype_lookup[ptype].known + * THEN + * Packet is unknown + * ELSE IF i40e_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP + * Use the rest of the fields to look at the tunnels, inner protocols, etc + * ELSE + * Use the enum i40e_rx_l2_ptype to decode the packet type + * ENDIF + */ + +/* macro to make the table lines short */ +#define I40E_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ + { PTYPE, \ + 1, \ + I40E_RX_PTYPE_OUTER_##OUTER_IP, \ + I40E_RX_PTYPE_OUTER_##OUTER_IP_VER, \ + I40E_RX_PTYPE_##OUTER_FRAG, \ + I40E_RX_PTYPE_TUNNEL_##T, \ + I40E_RX_PTYPE_TUNNEL_END_##TE, \ + I40E_RX_PTYPE_##TEF, \ + I40E_RX_PTYPE_INNER_PROT_##I, \ + I40E_RX_PTYPE_PAYLOAD_LAYER_##PL } + +#define I40E_PTT_UNUSED_ENTRY(PTYPE) \ + { PTYPE, 0, 0, 0, 0, 0, 0, 0, 0, 0 } + +/* shorter macros makes the table fit but are terse */ +#define I40E_RX_PTYPE_NOF I40E_RX_PTYPE_NOT_FRAG +#define I40E_RX_PTYPE_FRG I40E_RX_PTYPE_FRAG +#define I40E_RX_PTYPE_INNER_PROT_TS I40E_RX_PTYPE_INNER_PROT_TIMESYNC + +/* Lookup table mapping the HW PTYPE to the bit field for decoding */ +struct i40e_rx_ptype_decoded i40e_ptype_lookup[] = { + /* L2 Packet types */ + I40E_PTT_UNUSED_ENTRY(0), + I40E_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, TS, PAY2), + I40E_PTT(3, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT_UNUSED_ENTRY(4), + I40E_PTT_UNUSED_ENTRY(5), + I40E_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT_UNUSED_ENTRY(8), + I40E_PTT_UNUSED_ENTRY(9), + I40E_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + I40E_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, 
PAY3), + I40E_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + + /* Non Tunneled IPv4 */ + I40E_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(25), + I40E_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), + I40E_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), + I40E_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv4 --> IPv4 */ + I40E_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + I40E_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + I40E_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(32), + I40E_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + I40E_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + I40E_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> IPv6 */ + I40E_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + I40E_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + I40E_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(39), + I40E_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + I40E_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + I40E_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT */ + I40E_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> IPv4 */ + I40E_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + I40E_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + I40E_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(47), + I40E_PTT(48, IP, IPV4, NOF, 
IP_GRENAT, IPV4, NOF, TCP, PAY4), + I40E_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + I40E_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> IPv6 */ + I40E_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + I40E_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + I40E_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(54), + I40E_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + I40E_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + I40E_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC */ + I40E_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ + I40E_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + I40E_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + I40E_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(62), + I40E_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + I40E_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + I40E_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ + I40E_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + I40E_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + I40E_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(69), + I40E_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + I40E_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + I40E_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC/VLAN */ + I40E_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ + I40E_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + I40E_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, 
PAY3), + I40E_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(77), + I40E_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + I40E_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + I40E_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ + I40E_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + I40E_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + I40E_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(84), + I40E_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + I40E_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + I40E_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* Non Tunneled IPv6 */ + I40E_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(91), + I40E_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), + I40E_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), + I40E_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv6 --> IPv4 */ + I40E_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + I40E_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + I40E_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(98), + I40E_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + I40E_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + I40E_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> IPv6 */ + I40E_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + I40E_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + I40E_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(105), + I40E_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + I40E_PTT(107, 
IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + I40E_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT */ + I40E_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> IPv4 */ + I40E_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + I40E_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + I40E_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(113), + I40E_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), + I40E_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + I40E_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> IPv6 */ + I40E_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + I40E_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + I40E_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(120), + I40E_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + I40E_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + I40E_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC */ + I40E_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ + I40E_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + I40E_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + I40E_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(128), + I40E_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + I40E_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + I40E_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ + I40E_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + I40E_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + I40E_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(135), + 
I40E_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + I40E_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + I40E_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN */ + I40E_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ + I40E_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + I40E_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), + I40E_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(143), + I40E_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + I40E_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + I40E_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ + I40E_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + I40E_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + I40E_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(150), + I40E_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + I40E_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + I40E_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* unused entries */ + I40E_PTT_UNUSED_ENTRY(154), + I40E_PTT_UNUSED_ENTRY(155), + I40E_PTT_UNUSED_ENTRY(156), + I40E_PTT_UNUSED_ENTRY(157), + I40E_PTT_UNUSED_ENTRY(158), + I40E_PTT_UNUSED_ENTRY(159), + + I40E_PTT_UNUSED_ENTRY(160), + I40E_PTT_UNUSED_ENTRY(161), + I40E_PTT_UNUSED_ENTRY(162), + I40E_PTT_UNUSED_ENTRY(163), + I40E_PTT_UNUSED_ENTRY(164), + I40E_PTT_UNUSED_ENTRY(165), + I40E_PTT_UNUSED_ENTRY(166), + I40E_PTT_UNUSED_ENTRY(167), + I40E_PTT_UNUSED_ENTRY(168), + I40E_PTT_UNUSED_ENTRY(169), + + I40E_PTT_UNUSED_ENTRY(170), + I40E_PTT_UNUSED_ENTRY(171), + I40E_PTT_UNUSED_ENTRY(172), + 
I40E_PTT_UNUSED_ENTRY(173), + I40E_PTT_UNUSED_ENTRY(174), + I40E_PTT_UNUSED_ENTRY(175), + I40E_PTT_UNUSED_ENTRY(176), + I40E_PTT_UNUSED_ENTRY(177), + I40E_PTT_UNUSED_ENTRY(178), + I40E_PTT_UNUSED_ENTRY(179), + + I40E_PTT_UNUSED_ENTRY(180), + I40E_PTT_UNUSED_ENTRY(181), + I40E_PTT_UNUSED_ENTRY(182), + I40E_PTT_UNUSED_ENTRY(183), + I40E_PTT_UNUSED_ENTRY(184), + I40E_PTT_UNUSED_ENTRY(185), + I40E_PTT_UNUSED_ENTRY(186), + I40E_PTT_UNUSED_ENTRY(187), + I40E_PTT_UNUSED_ENTRY(188), + I40E_PTT_UNUSED_ENTRY(189), + + I40E_PTT_UNUSED_ENTRY(190), + I40E_PTT_UNUSED_ENTRY(191), + I40E_PTT_UNUSED_ENTRY(192), + I40E_PTT_UNUSED_ENTRY(193), + I40E_PTT_UNUSED_ENTRY(194), + I40E_PTT_UNUSED_ENTRY(195), + I40E_PTT_UNUSED_ENTRY(196), + I40E_PTT_UNUSED_ENTRY(197), + I40E_PTT_UNUSED_ENTRY(198), + I40E_PTT_UNUSED_ENTRY(199), + + I40E_PTT_UNUSED_ENTRY(200), + I40E_PTT_UNUSED_ENTRY(201), + I40E_PTT_UNUSED_ENTRY(202), + I40E_PTT_UNUSED_ENTRY(203), + I40E_PTT_UNUSED_ENTRY(204), + I40E_PTT_UNUSED_ENTRY(205), + I40E_PTT_UNUSED_ENTRY(206), + I40E_PTT_UNUSED_ENTRY(207), + I40E_PTT_UNUSED_ENTRY(208), + I40E_PTT_UNUSED_ENTRY(209), + + I40E_PTT_UNUSED_ENTRY(210), + I40E_PTT_UNUSED_ENTRY(211), + I40E_PTT_UNUSED_ENTRY(212), + I40E_PTT_UNUSED_ENTRY(213), + I40E_PTT_UNUSED_ENTRY(214), + I40E_PTT_UNUSED_ENTRY(215), + I40E_PTT_UNUSED_ENTRY(216), + I40E_PTT_UNUSED_ENTRY(217), + I40E_PTT_UNUSED_ENTRY(218), + I40E_PTT_UNUSED_ENTRY(219), + + I40E_PTT_UNUSED_ENTRY(220), + I40E_PTT_UNUSED_ENTRY(221), + I40E_PTT_UNUSED_ENTRY(222), + I40E_PTT_UNUSED_ENTRY(223), + I40E_PTT_UNUSED_ENTRY(224), + I40E_PTT_UNUSED_ENTRY(225), + I40E_PTT_UNUSED_ENTRY(226), + I40E_PTT_UNUSED_ENTRY(227), + I40E_PTT_UNUSED_ENTRY(228), + I40E_PTT_UNUSED_ENTRY(229), + + I40E_PTT_UNUSED_ENTRY(230), + I40E_PTT_UNUSED_ENTRY(231), + I40E_PTT_UNUSED_ENTRY(232), + I40E_PTT_UNUSED_ENTRY(233), + I40E_PTT_UNUSED_ENTRY(234), + I40E_PTT_UNUSED_ENTRY(235), + I40E_PTT_UNUSED_ENTRY(236), + I40E_PTT_UNUSED_ENTRY(237), + I40E_PTT_UNUSED_ENTRY(238), + 
I40E_PTT_UNUSED_ENTRY(239), + + I40E_PTT_UNUSED_ENTRY(240), + I40E_PTT_UNUSED_ENTRY(241), + I40E_PTT_UNUSED_ENTRY(242), + I40E_PTT_UNUSED_ENTRY(243), + I40E_PTT_UNUSED_ENTRY(244), + I40E_PTT_UNUSED_ENTRY(245), + I40E_PTT_UNUSED_ENTRY(246), + I40E_PTT_UNUSED_ENTRY(247), + I40E_PTT_UNUSED_ENTRY(248), + I40E_PTT_UNUSED_ENTRY(249), + + I40E_PTT_UNUSED_ENTRY(250), + I40E_PTT_UNUSED_ENTRY(251), + I40E_PTT_UNUSED_ENTRY(252), + I40E_PTT_UNUSED_ENTRY(253), + I40E_PTT_UNUSED_ENTRY(254), + I40E_PTT_UNUSED_ENTRY(255) +}; + + /** * i40e_aq_send_msg_to_pf * @hw: pointer to the hardware structure diff --git a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h index 7841573a58c9..33c99051cc96 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h @@ -63,6 +63,13 @@ i40e_status i40evf_aq_queue_shutdown(struct i40e_hw *hw, i40e_status i40e_set_mac_type(struct i40e_hw *hw); +extern struct i40e_rx_ptype_decoded i40e_ptype_lookup[]; + +static inline struct i40e_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype) +{ + return i40e_ptype_lookup[ptype]; +} + /* prototype for functions used for SW locks */ /* i40e_common for VF drivers*/ diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 827bb5fa4af9..b1d87c6a5c35 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -24,6 +24,7 @@ #include <linux/prefetch.h> #include "i40evf.h" +#include "i40e_prototype.h" static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, u32 td_tag) @@ -786,6 +787,29 @@ static inline u32 i40e_rx_hash(struct i40e_ring *ring, } /** + * i40e_ptype_to_hash - get a hash type + * @ptype: the ptype value from the descriptor + * + * Returns a hash type to be used by skb_set_hash + **/ +static inline enum pkt_hash_types i40e_ptype_to_hash(u8 ptype) +{ + struct 
i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); + + if (!decoded.known) + return PKT_HASH_TYPE_NONE; + + if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && + decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4) + return PKT_HASH_TYPE_L4; + else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && + decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3) + return PKT_HASH_TYPE_L3; + else + return PKT_HASH_TYPE_L2; +} + +/** * i40e_clean_rx_irq - Reclaim resources after receive completes * @rx_ring: rx ring to clean * @budget: how many cleans we're allowed @@ -802,8 +826,8 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) u16 i = rx_ring->next_to_clean; union i40e_rx_desc *rx_desc; u32 rx_error, rx_status; + u8 rx_ptype; u64 qword; - u16 rx_ptype; rx_desc = I40E_RX_DESC(rx_ring, i); qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); @@ -912,7 +936,8 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) goto next_desc; } - skb->rxhash = i40e_rx_hash(rx_ring, rx_desc); + skb_set_hash(skb, i40e_rx_hash(rx_ring, rx_desc), + i40e_ptype_to_hash(rx_ptype)); /* probably a little skewed due to removing CRC */ total_rx_bytes += skb->len; total_rx_packets++; @@ -1457,7 +1482,7 @@ static int i40e_xmit_descriptor_count(struct sk_buff *skb, /* need: 1 descriptor per page * PAGE_SIZE/I40E_MAX_DATA_PER_TXD, * + 1 desc for skb_head_len/I40E_MAX_DATA_PER_TXD, - * + 2 desc gap to keep tail from touching head, + * + 4 desc gap to avoid the cache line where head is, * + 1 desc for context descriptor, * otherwise try next time */ @@ -1468,7 +1493,7 @@ static int i40e_xmit_descriptor_count(struct sk_buff *skb, count += skb_shinfo(skb)->nr_frags; #endif count += TXD_USE_COUNT(skb_headlen(skb)); - if (i40e_maybe_stop_tx(tx_ring, count + 3)) { + if (i40e_maybe_stop_tx(tx_ring, count + 4 + 1)) { tx_ring->tx_stats.tx_busy++; return 0; } diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index b2c03bca7929..11d0b61510b0 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -31,7 +31,7 @@ char i40evf_driver_name[] = "i40evf"; static const char i40evf_driver_string[] = "Intel(R) XL710 X710 Virtual Function Network Driver"; -#define DRV_VERSION "0.9.13" +#define DRV_VERSION "0.9.14" const char i40evf_driver_version[] = DRV_VERSION; static const char i40evf_copyright[] = "Copyright (c) 2013 - 2014 Intel Corporation."; @@ -2036,6 +2036,7 @@ static void i40evf_init_task(struct work_struct *work) NETIF_F_IPV6_CSUM | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_RXCSUM | NETIF_F_GRO; if (adapter->vf_res->vf_offload_flags @@ -2046,6 +2047,10 @@ static void i40evf_init_task(struct work_struct *work) NETIF_F_HW_VLAN_CTAG_FILTER; } + /* copy netdev features into list of user selectable features */ + netdev->hw_features |= netdev->features; + netdev->hw_features &= ~NETIF_F_RXCSUM; + if (!is_valid_ether_addr(adapter->hw.mac.addr)) { dev_info(&pdev->dev, "Invalid MAC address %pMAC, using random\n", adapter->hw.mac.addr); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c index 3454437fcd95..0c59d4fe7e3a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c @@ -332,7 +332,7 @@ static struct mlx4_interface mlx4_en_interface = { .protocol = MLX4_PROT_ETH, }; -void mlx4_en_verify_params(void) +static void mlx4_en_verify_params(void) { if (pfctx > MAX_PFC_TX) { pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n", diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index b8eee365e15d..c7ef30dee1b9 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -21,9 +21,10 @@ #include <linux/list.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include 
<net/ip6_checksum.h> /* Version Information */ -#define DRIVER_VERSION "v1.05.0 (2014/02/18)" +#define DRIVER_VERSION "v1.06.0 (2014/03/03)" #define DRIVER_AUTHOR "Realtek linux nic maintainers <nic_swsd@realtek.com>" #define DRIVER_DESC "Realtek RTL8152/RTL8153 Based USB Ethernet Adapters" #define MODULENAME "r8152" @@ -447,6 +448,7 @@ enum rtl8152_flags { RTL8152_LINK_CHG, SELECTIVE_SUSPEND, PHY_RESET, + SCHEDULE_TASKLET, }; /* Define these values to match your device */ @@ -466,8 +468,18 @@ enum rtl8152_flags { struct rx_desc { __le32 opts1; #define RX_LEN_MASK 0x7fff + __le32 opts2; +#define RD_UDP_CS (1 << 23) +#define RD_TCP_CS (1 << 22) +#define RD_IPV6_CS (1 << 20) +#define RD_IPV4_CS (1 << 19) + __le32 opts3; +#define IPF (1 << 23) /* IP checksum fail */ +#define UDPF (1 << 22) /* UDP checksum fail */ +#define TCPF (1 << 21) /* TCP checksum fail */ + __le32 opts4; __le32 opts5; __le32 opts6; @@ -477,13 +489,21 @@ struct tx_desc { __le32 opts1; #define TX_FS (1 << 31) /* First segment of a packet */ #define TX_LS (1 << 30) /* Final segment of a packet */ -#define TX_LEN_MASK 0x3ffff +#define GTSENDV4 (1 << 28) +#define GTSENDV6 (1 << 27) +#define GTTCPHO_SHIFT 18 +#define GTTCPHO_MAX 0x7fU +#define TX_LEN_MAX 0x3ffffU __le32 opts2; #define UDP_CS (1 << 31) /* Calculate UDP/IP checksum */ #define TCP_CS (1 << 30) /* Calculate TCP/IP checksum */ #define IPV4_CS (1 << 29) /* Calculate IPv4 checksum */ #define IPV6_CS (1 << 28) /* Calculate IPv6 checksum */ +#define MSS_SHIFT 17 +#define MSS_MAX 0x7ffU +#define TCPHO_SHIFT 17 +#define TCPHO_MAX 0x7ffU }; struct r8152; @@ -550,12 +570,21 @@ enum rtl_version { RTL_VER_MAX }; +enum tx_csum_stat { + TX_CSUM_SUCCESS = 0, + TX_CSUM_TSO, + TX_CSUM_NONE +}; + /* Maximum number of multicast addresses to filter (vs. Rx-all-multicast). * The RTL chips use a 64 element hash table based on the Ethernet CRC. 
*/ static const int multicast_filter_limit = 32; static unsigned int rx_buf_sz = 16384; +#define RTL_LIMITED_TSO_SIZE (rx_buf_sz - sizeof(struct tx_desc) - \ + VLAN_ETH_HLEN - VLAN_HLEN) + static int get_registers(struct r8152 *tp, u16 value, u16 index, u16 size, void *data) { @@ -963,7 +992,6 @@ static int rtl8152_set_mac_address(struct net_device *netdev, void *p) static void read_bulk_callback(struct urb *urb) { struct net_device *netdev; - unsigned long flags; int status = urb->status; struct rx_agg *agg; struct r8152 *tp; @@ -997,9 +1025,9 @@ static void read_bulk_callback(struct urb *urb) if (urb->actual_length < ETH_ZLEN) break; - spin_lock_irqsave(&tp->rx_lock, flags); + spin_lock(&tp->rx_lock); list_add_tail(&agg->list, &tp->rx_done); - spin_unlock_irqrestore(&tp->rx_lock, flags); + spin_unlock(&tp->rx_lock); tasklet_schedule(&tp->tl); return; case -ESHUTDOWN: @@ -1022,9 +1050,9 @@ static void read_bulk_callback(struct urb *urb) if (result == -ENODEV) { netif_device_detach(tp->netdev); } else if (result) { - spin_lock_irqsave(&tp->rx_lock, flags); + spin_lock(&tp->rx_lock); list_add_tail(&agg->list, &tp->rx_done); - spin_unlock_irqrestore(&tp->rx_lock, flags); + spin_unlock(&tp->rx_lock); tasklet_schedule(&tp->tl); } } @@ -1033,7 +1061,6 @@ static void write_bulk_callback(struct urb *urb) { struct net_device_stats *stats; struct net_device *netdev; - unsigned long flags; struct tx_agg *agg; struct r8152 *tp; int status = urb->status; @@ -1057,9 +1084,9 @@ static void write_bulk_callback(struct urb *urb) stats->tx_bytes += agg->skb_len; } - spin_lock_irqsave(&tp->tx_lock, flags); + spin_lock(&tp->tx_lock); list_add_tail(&agg->list, &tp->tx_free); - spin_unlock_irqrestore(&tp->tx_lock, flags); + spin_unlock(&tp->tx_lock); usb_autopm_put_interface_async(tp->intf); @@ -1073,7 +1100,7 @@ static void write_bulk_callback(struct urb *urb) return; if (!skb_queue_empty(&tp->tx_queue)) - schedule_delayed_work(&tp->schedule, 0); + tasklet_schedule(&tp->tl); } static 
void intr_callback(struct urb *urb) @@ -1268,6 +1295,9 @@ static struct tx_agg *r8152_get_tx_agg(struct r8152 *tp) struct tx_agg *agg = NULL; unsigned long flags; + if (list_empty(&tp->tx_free)) + return NULL; + spin_lock_irqsave(&tp->tx_lock, flags); if (!list_empty(&tp->tx_free)) { struct list_head *cursor; @@ -1281,24 +1311,130 @@ static struct tx_agg *r8152_get_tx_agg(struct r8152 *tp) return agg; } -static void -r8152_tx_csum(struct r8152 *tp, struct tx_desc *desc, struct sk_buff *skb) +static inline __be16 get_protocol(struct sk_buff *skb) { - memset(desc, 0, sizeof(*desc)); + __be16 protocol; - desc->opts1 = cpu_to_le32((skb->len & TX_LEN_MASK) | TX_FS | TX_LS); + if (skb->protocol == htons(ETH_P_8021Q)) + protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; + else + protocol = skb->protocol; + + return protocol; +} + +/* + * r8152_csum_workaround() + * The hw limites the value the transport offset. When the offset is out of the + * range, calculate the checksum by sw. + */ +static void r8152_csum_workaround(struct r8152 *tp, struct sk_buff *skb, + struct sk_buff_head *list) +{ + if (skb_shinfo(skb)->gso_size) { + netdev_features_t features = tp->netdev->features; + struct sk_buff_head seg_list; + struct sk_buff *segs, *nskb; + + features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO); + segs = skb_gso_segment(skb, features); + if (IS_ERR(segs) || !segs) + goto drop; + + __skb_queue_head_init(&seg_list); + + do { + nskb = segs; + segs = segs->next; + nskb->next = NULL; + __skb_queue_tail(&seg_list, nskb); + } while (segs); + + skb_queue_splice(&seg_list, list); + dev_kfree_skb(skb); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb_checksum_help(skb) < 0) + goto drop; + + __skb_queue_head(list, skb); + } else { + struct net_device_stats *stats; + +drop: + stats = &tp->netdev->stats; + stats->tx_dropped++; + dev_kfree_skb(skb); + } +} + +/* + * msdn_giant_send_check() + * According to the document of microsoft, the TCP Pseudo Header excludes 
the + * packet length for IPv6 TCP large packets. + */ +static int msdn_giant_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + struct tcphdr *th; + + ipv6h = ipv6_hdr(skb); + th = tcp_hdr(skb); + + th->check = 0; + th->check = ~tcp_v6_check(0, &ipv6h->saddr, &ipv6h->daddr, 0); + + return 0; +} + +static int r8152_tx_csum(struct r8152 *tp, struct tx_desc *desc, + struct sk_buff *skb, u32 len, u32 transport_offset) +{ + u32 mss = skb_shinfo(skb)->gso_size; + u32 opts1, opts2 = 0; + int ret = TX_CSUM_SUCCESS; + + WARN_ON_ONCE(len > TX_LEN_MAX); + + opts1 = len | TX_FS | TX_LS; + + if (mss) { + if (transport_offset > GTTCPHO_MAX) { + netif_warn(tp, tx_err, tp->netdev, + "Invalid transport offset 0x%x for TSO\n", + transport_offset); + ret = TX_CSUM_TSO; + goto unavailable; + } + + switch (get_protocol(skb)) { + case htons(ETH_P_IP): + opts1 |= GTSENDV4; + break; + + case htons(ETH_P_IPV6): + opts1 |= GTSENDV6; + msdn_giant_send_check(skb); + break; - if (skb->ip_summed == CHECKSUM_PARTIAL) { - __be16 protocol; + default: + WARN_ON_ONCE(1); + break; + } + + opts1 |= transport_offset << GTTCPHO_SHIFT; + opts2 |= min(mss, MSS_MAX) << MSS_SHIFT; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { u8 ip_protocol; - u32 opts2 = 0; - if (skb->protocol == htons(ETH_P_8021Q)) - protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; - else - protocol = skb->protocol; + if (transport_offset > TCPHO_MAX) { + netif_warn(tp, tx_err, tp->netdev, + "Invalid transport offset 0x%x\n", + transport_offset); + ret = TX_CSUM_NONE; + goto unavailable; + } - switch (protocol) { + switch (get_protocol(skb)) { case htons(ETH_P_IP): opts2 |= IPV4_CS; ip_protocol = ip_hdr(skb)->protocol; @@ -1314,30 +1450,33 @@ r8152_tx_csum(struct r8152 *tp, struct tx_desc *desc, struct sk_buff *skb) break; } - if (ip_protocol == IPPROTO_TCP) { + if (ip_protocol == IPPROTO_TCP) opts2 |= TCP_CS; - opts2 |= (skb_transport_offset(skb) & 0x7fff) << 17; - } else if (ip_protocol == IPPROTO_UDP) { + 
else if (ip_protocol == IPPROTO_UDP) opts2 |= UDP_CS; - } else { + else WARN_ON_ONCE(1); - } - desc->opts2 = cpu_to_le32(opts2); + opts2 |= transport_offset << TCPHO_SHIFT; } + + desc->opts2 = cpu_to_le32(opts2); + desc->opts1 = cpu_to_le32(opts1); + +unavailable: + return ret; } static int r8152_tx_agg_fill(struct r8152 *tp, struct tx_agg *agg) { struct sk_buff_head skb_head, *tx_queue = &tp->tx_queue; - unsigned long flags; int remain, ret; u8 *tx_data; __skb_queue_head_init(&skb_head); - spin_lock_irqsave(&tx_queue->lock, flags); + spin_lock(&tx_queue->lock); skb_queue_splice_init(tx_queue, &skb_head); - spin_unlock_irqrestore(&tx_queue->lock, flags); + spin_unlock(&tx_queue->lock); tx_data = agg->head; agg->skb_num = agg->skb_len = 0; @@ -1347,47 +1486,65 @@ static int r8152_tx_agg_fill(struct r8152 *tp, struct tx_agg *agg) struct tx_desc *tx_desc; struct sk_buff *skb; unsigned int len; + u32 offset; skb = __skb_dequeue(&skb_head); if (!skb) break; - remain -= sizeof(*tx_desc); - len = skb->len; - if (remain < len) { + len = skb->len + sizeof(*tx_desc); + + if (len > remain) { __skb_queue_head(&skb_head, skb); break; } tx_data = tx_agg_align(tx_data); tx_desc = (struct tx_desc *)tx_data; + + offset = (u32)skb_transport_offset(skb); + + if (r8152_tx_csum(tp, tx_desc, skb, skb->len, offset)) { + r8152_csum_workaround(tp, skb, &skb_head); + continue; + } + tx_data += sizeof(*tx_desc); - r8152_tx_csum(tp, tx_desc, skb); - memcpy(tx_data, skb->data, len); - agg->skb_num++; + len = skb->len; + if (skb_copy_bits(skb, 0, tx_data, len) < 0) { + struct net_device_stats *stats = &tp->netdev->stats; + + stats->tx_dropped++; + dev_kfree_skb_any(skb); + tx_data -= sizeof(*tx_desc); + continue; + } + + tx_data += len; agg->skb_len += len; + agg->skb_num++; + dev_kfree_skb_any(skb); - tx_data += len; remain = rx_buf_sz - (int)(tx_agg_align(tx_data) - agg->head); } if (!skb_queue_empty(&skb_head)) { - spin_lock_irqsave(&tx_queue->lock, flags); + spin_lock(&tx_queue->lock); 
skb_queue_splice(&skb_head, tx_queue); - spin_unlock_irqrestore(&tx_queue->lock, flags); + spin_unlock(&tx_queue->lock); } - netif_tx_lock_bh(tp->netdev); + netif_tx_lock(tp->netdev); if (netif_queue_stopped(tp->netdev) && skb_queue_len(&tp->tx_queue) < tp->tx_qlen) netif_wake_queue(tp->netdev); - netif_tx_unlock_bh(tp->netdev); + netif_tx_unlock(tp->netdev); - ret = usb_autopm_get_interface(tp->intf); + ret = usb_autopm_get_interface_async(tp->intf); if (ret < 0) goto out_tx_fill; @@ -1395,14 +1552,45 @@ static int r8152_tx_agg_fill(struct r8152 *tp, struct tx_agg *agg) agg->head, (int)(tx_data - (u8 *)agg->head), (usb_complete_t)write_bulk_callback, agg); - ret = usb_submit_urb(agg->urb, GFP_KERNEL); + ret = usb_submit_urb(agg->urb, GFP_ATOMIC); if (ret < 0) - usb_autopm_put_interface(tp->intf); + usb_autopm_put_interface_async(tp->intf); out_tx_fill: return ret; } +static u8 r8152_rx_csum(struct r8152 *tp, struct rx_desc *rx_desc) +{ + u8 checksum = CHECKSUM_NONE; + u32 opts2, opts3; + + if (tp->version == RTL_VER_01) + goto return_result; + + opts2 = le32_to_cpu(rx_desc->opts2); + opts3 = le32_to_cpu(rx_desc->opts3); + + if (opts2 & RD_IPV4_CS) { + if (opts3 & IPF) + checksum = CHECKSUM_NONE; + else if ((opts2 & RD_UDP_CS) && (opts3 & UDPF)) + checksum = CHECKSUM_NONE; + else if ((opts2 & RD_TCP_CS) && (opts3 & TCPF)) + checksum = CHECKSUM_NONE; + else + checksum = CHECKSUM_UNNECESSARY; + } else if (RD_IPV6_CS) { + if ((opts2 & RD_UDP_CS) && !(opts3 & UDPF)) + checksum = CHECKSUM_UNNECESSARY; + else if ((opts2 & RD_TCP_CS) && !(opts3 & TCPF)) + checksum = CHECKSUM_UNNECESSARY; + } + +return_result: + return checksum; +} + static void rx_bottom(struct r8152 *tp) { unsigned long flags; @@ -1455,8 +1643,10 @@ static void rx_bottom(struct r8152 *tp) skb = netdev_alloc_skb_ip_align(netdev, pkt_len); if (!skb) { stats->rx_dropped++; - break; + goto find_next_rx; } + + skb->ip_summed = r8152_rx_csum(tp, rx_desc); memcpy(skb->data, rx_data, pkt_len); skb_put(skb, 
pkt_len); skb->protocol = eth_type_trans(skb, netdev); @@ -1464,6 +1654,7 @@ static void rx_bottom(struct r8152 *tp) stats->rx_packets++; stats->rx_bytes += pkt_len; +find_next_rx: rx_data = rx_agg_align(rx_data + pkt_len + CRC_SIZE); rx_desc = (struct rx_desc *)rx_data; len_used = (int)(rx_data - (u8 *)agg->head); @@ -1535,6 +1726,7 @@ static void bottom_half(unsigned long data) return; rx_bottom(tp); + tx_bottom(tp); } static @@ -1551,16 +1743,15 @@ static void rtl_drop_queued_tx(struct r8152 *tp) { struct net_device_stats *stats = &tp->netdev->stats; struct sk_buff_head skb_head, *tx_queue = &tp->tx_queue; - unsigned long flags; struct sk_buff *skb; if (skb_queue_empty(tx_queue)) return; __skb_queue_head_init(&skb_head); - spin_lock_irqsave(&tx_queue->lock, flags); + spin_lock_bh(&tx_queue->lock); skb_queue_splice_init(tx_queue, &skb_head); - spin_unlock_irqrestore(&tx_queue->lock, flags); + spin_unlock_bh(&tx_queue->lock); while ((skb = __skb_dequeue(&skb_head))) { dev_kfree_skb(skb); @@ -1631,7 +1822,7 @@ static void _rtl8152_set_rx_mode(struct net_device *netdev) } static netdev_tx_t rtl8152_start_xmit(struct sk_buff *skb, - struct net_device *netdev) + struct net_device *netdev) { struct r8152 *tp = netdev_priv(netdev); @@ -1639,13 +1830,17 @@ static netdev_tx_t rtl8152_start_xmit(struct sk_buff *skb, skb_queue_tail(&tp->tx_queue, skb); - if (list_empty(&tp->tx_free) && - skb_queue_len(&tp->tx_queue) > tp->tx_qlen) + if (!list_empty(&tp->tx_free)) { + if (test_bit(SELECTIVE_SUSPEND, &tp->flags)) { + set_bit(SCHEDULE_TASKLET, &tp->flags); + schedule_delayed_work(&tp->schedule, 0); + } else { + usb_mark_last_busy(tp->udev); + tasklet_schedule(&tp->tl); + } + } else if (skb_queue_len(&tp->tx_queue) > tp->tx_qlen) netif_stop_queue(netdev); - if (!list_empty(&tp->tx_free)) - schedule_delayed_work(&tp->schedule, 0); - return NETDEV_TX_OK; } @@ -2524,8 +2719,11 @@ static void rtl_work_func_t(struct work_struct *work) if (test_bit(RTL8152_SET_RX_MODE, &tp->flags)) 
_rtl8152_set_rx_mode(tp->netdev); - if (tp->speed & LINK_STATUS) - tx_bottom(tp); + if (test_bit(SCHEDULE_TASKLET, &tp->flags) && + (tp->speed & LINK_STATUS)) { + clear_bit(SCHEDULE_TASKLET, &tp->flags); + tasklet_schedule(&tp->tl); + } if (test_bit(PHY_RESET, &tp->flags)) rtl_phy_reset(tp); @@ -3094,10 +3292,15 @@ static int rtl8152_probe(struct usb_interface *intf, netdev->netdev_ops = &rtl8152_netdev_ops; netdev->watchdog_timeo = RTL8152_TX_TIMEOUT; - netdev->features |= NETIF_F_IP_CSUM; - netdev->hw_features = NETIF_F_IP_CSUM; + netdev->features |= NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_SG | + NETIF_F_TSO | NETIF_F_FRAGLIST | NETIF_F_IPV6_CSUM | + NETIF_F_TSO6; + netdev->hw_features = NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_SG | + NETIF_F_TSO | NETIF_F_FRAGLIST | + NETIF_F_IPV6_CSUM | NETIF_F_TSO6; SET_ETHTOOL_OPS(netdev, &ops); + netif_set_gso_max_size(netdev, RTL_LIMITED_TSO_SIZE); tp->mii.dev = netdev; tp->mii.mdio_read = read_mii_word; diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index ae413a2cbee7..bef37be402b8 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -48,37 +48,19 @@ typedef unsigned int pending_ring_idx_t; #define INVALID_PENDING_RING_IDX (~0U) -/* For the head field in pending_tx_info: it is used to indicate - * whether this tx info is the head of one or more coalesced requests. - * - * When head != INVALID_PENDING_RING_IDX, it means the start of a new - * tx requests queue and the end of previous queue. - * - * An example sequence of head fields (I = INVALID_PENDING_RING_IDX): - * - * ...|0 I I I|5 I|9 I I I|... - * -->|<-INUSE---------------- - * - * After consuming the first slot(s) we have: - * - * ...|V V V V|5 I|9 I I I|... - * -----FREE->|<-INUSE-------- - * - * where V stands for "valid pending ring index". Any number other - * than INVALID_PENDING_RING_IDX is OK. 
These entries are considered - * free and can contain any number other than - * INVALID_PENDING_RING_IDX. In practice we use 0. - * - * The in use non-INVALID_PENDING_RING_IDX (say 0, 5 and 9 in the - * above example) number is the index into pending_tx_info and - * mmap_pages arrays. - */ struct pending_tx_info { - struct xen_netif_tx_request req; /* coalesced tx request */ - pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX - * if it is head of one or more tx - * reqs - */ + struct xen_netif_tx_request req; /* tx request */ + /* Callback data for released SKBs. The callback is always + * xenvif_zerocopy_callback, desc contains the pending_idx, which is + * also an index in pending_tx_info array. It is initialized in + * xenvif_alloc and it never changes. + * skb_shinfo(skb)->destructor_arg points to the first mapped slot's + * callback_struct in this array of struct pending_tx_info's, then ctx + * to the next, or NULL if there is no more slot for this skb. + * ubuf_to_vif is a helper which finds the struct xenvif from a pointer + * to this field. + */ + struct ubuf_info callback_struct; }; #define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) @@ -108,6 +90,15 @@ struct xenvif_rx_meta { */ #define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE) +#define NETBACK_INVALID_HANDLE -1 + +/* To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating + * the maximum slots a valid packet can use. Now this value is defined + * to be XEN_NETIF_NR_SLOTS_MIN, which is supposed to be supported by + * all backend. + */ +#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN + struct xenvif { /* Unique identifier for this interface. 
*/ domid_t domid; @@ -126,13 +117,28 @@ struct xenvif { pending_ring_idx_t pending_cons; u16 pending_ring[MAX_PENDING_REQS]; struct pending_tx_info pending_tx_info[MAX_PENDING_REQS]; - - /* Coalescing tx requests before copying makes number of grant - * copy ops greater or equal to number of slots required. In - * worst case a tx request consumes 2 gnttab_copy. + grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; + + struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS]; + struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS]; + /* passed to gnttab_[un]map_refs with pages under (un)mapping */ + struct page *pages_to_map[MAX_PENDING_REQS]; + struct page *pages_to_unmap[MAX_PENDING_REQS]; + + /* This prevents zerocopy callbacks to race over dealloc_ring */ + spinlock_t callback_lock; + /* This prevents dealloc thread and NAPI instance to race over response + * creation and pending_ring in xenvif_idx_release. In xenvif_tx_err + * it only protect response creation */ - struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS]; - + spinlock_t response_lock; + pending_ring_idx_t dealloc_prod; + pending_ring_idx_t dealloc_cons; + u16 dealloc_ring[MAX_PENDING_REQS]; + struct task_struct *dealloc_task; + wait_queue_head_t dealloc_wq; + struct timer_list dealloc_delay; + bool dealloc_delay_timed_out; /* Use kthread for guest RX */ struct task_struct *task; @@ -144,6 +150,9 @@ struct xenvif { struct xen_netif_rx_back_ring rx; struct sk_buff_head rx_queue; RING_IDX rx_last_skb_slots; + bool rx_queue_purge; + + struct timer_list wake_queue; /* This array is allocated seperately as it is large */ struct gnttab_copy *grant_copy_op; @@ -175,6 +184,10 @@ struct xenvif { /* Statistics */ unsigned long rx_gso_checksum_fixup; + unsigned long tx_zerocopy_sent; + unsigned long tx_zerocopy_success; + unsigned long tx_zerocopy_fail; + unsigned long tx_frag_overflow; /* Miscellaneous private stuff. 
*/ struct net_device *dev; @@ -216,9 +229,11 @@ void xenvif_carrier_off(struct xenvif *vif); int xenvif_tx_action(struct xenvif *vif, int budget); -int xenvif_kthread(void *data); +int xenvif_kthread_guest_rx(void *data); void xenvif_kick_thread(struct xenvif *vif); +int xenvif_dealloc_kthread(void *data); + /* Determine whether the needed number of slots (req) are available, * and set req_event if not. */ @@ -226,6 +241,30 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed); void xenvif_stop_queue(struct xenvif *vif); +/* Callback from stack when TX packet can be released */ +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success); + +/* Unmap a pending page and release it back to the guest */ +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx); + +static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif) +{ + return MAX_PENDING_REQS - + vif->pending_prod + vif->pending_cons; +} + +static inline bool xenvif_tx_pending_slots_available(struct xenvif *vif) +{ + return nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX + < MAX_PENDING_REQS; +} + +/* Callback from stack when TX packet can be released */ +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success); + extern bool separate_tx_rx_irq; +extern unsigned int rx_drain_timeout_msecs; +extern unsigned int rx_drain_timeout_jiffies; + #endif /* __XEN_NETBACK__COMMON_H__ */ diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 7669d49a67e2..83a71ac5b93a 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -38,6 +38,7 @@ #include <xen/events.h> #include <asm/xen/hypercall.h> +#include <xen/balloon.h> #define XENVIF_QUEUE_LENGTH 32 #define XENVIF_NAPI_WEIGHT 64 @@ -87,7 +88,8 @@ static int xenvif_poll(struct napi_struct *napi, int budget) local_irq_save(flags); RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, more_to_do); - if (!more_to_do) + if (!(more_to_do && + 
xenvif_tx_pending_slots_available(vif))) __napi_complete(napi); local_irq_restore(flags); @@ -113,6 +115,18 @@ static irqreturn_t xenvif_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static void xenvif_wake_queue(unsigned long data) +{ + struct xenvif *vif = (struct xenvif *)data; + + if (netif_queue_stopped(vif->dev)) { + netdev_err(vif->dev, "draining TX queue\n"); + vif->rx_queue_purge = true; + xenvif_kick_thread(vif); + netif_wake_queue(vif->dev); + } +} + static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct xenvif *vif = netdev_priv(dev); @@ -121,7 +135,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) BUG_ON(skb->dev != dev); /* Drop the packet if vif is not ready */ - if (vif->task == NULL || !xenvif_schedulable(vif)) + if (vif->task == NULL || + vif->dealloc_task == NULL || + !xenvif_schedulable(vif)) goto drop; /* At best we'll need one slot for the header and one for each @@ -140,8 +156,13 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) * then turn off the queue to give the ring a chance to * drain. */ - if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) + if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) { + vif->wake_queue.function = xenvif_wake_queue; + vif->wake_queue.data = (unsigned long)vif; xenvif_stop_queue(vif); + mod_timer(&vif->wake_queue, + jiffies + rx_drain_timeout_jiffies); + } skb_queue_tail(&vif->rx_queue, skb); xenvif_kick_thread(vif); @@ -234,6 +255,28 @@ static const struct xenvif_stat { "rx_gso_checksum_fixup", offsetof(struct xenvif, rx_gso_checksum_fixup) }, + /* If (sent != success + fail), there are probably packets never + * freed up properly! 
+ */ + { + "tx_zerocopy_sent", + offsetof(struct xenvif, tx_zerocopy_sent), + }, + { + "tx_zerocopy_success", + offsetof(struct xenvif, tx_zerocopy_success), + }, + { + "tx_zerocopy_fail", + offsetof(struct xenvif, tx_zerocopy_fail) + }, + /* Number of packets exceeding MAX_SKB_FRAG slots. You should use + * a guest with the same MAX_SKB_FRAG + */ + { + "tx_frag_overflow", + offsetof(struct xenvif, tx_frag_overflow) + }, }; static int xenvif_get_sset_count(struct net_device *dev, int string_set) @@ -327,6 +370,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, init_timer(&vif->credit_timeout); vif->credit_window_start = get_jiffies_64(); + init_timer(&vif->wake_queue); + dev->netdev_ops = &xenvif_netdev_ops; dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | @@ -343,8 +388,27 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, vif->pending_prod = MAX_PENDING_REQS; for (i = 0; i < MAX_PENDING_REQS; i++) vif->pending_ring[i] = i; - for (i = 0; i < MAX_PENDING_REQS; i++) - vif->mmap_pages[i] = NULL; + spin_lock_init(&vif->callback_lock); + spin_lock_init(&vif->response_lock); + /* If ballooning is disabled, this will consume real memory, so you + * better enable it. The long term solution would be to use just a + * bunch of valid page descriptors, without dependency on ballooning + */ + err = alloc_xenballooned_pages(MAX_PENDING_REQS, + vif->mmap_pages, + false); + if (err) { + netdev_err(dev, "Could not reserve mmap_pages\n"); + return ERR_PTR(-ENOMEM); + } + for (i = 0; i < MAX_PENDING_REQS; i++) { + vif->pending_tx_info[i].callback_struct = (struct ubuf_info) + { .callback = xenvif_zerocopy_callback, + .ctx = NULL, + .desc = i }; + vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE; + } + init_timer(&vif->dealloc_delay); /* * Initialise a dummy MAC address. 
We choose the numerically @@ -382,12 +446,14 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, BUG_ON(vif->tx_irq); BUG_ON(vif->task); + BUG_ON(vif->dealloc_task); err = xenvif_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref); if (err < 0) goto err; init_waitqueue_head(&vif->wq); + init_waitqueue_head(&vif->dealloc_wq); if (tx_evtchn == rx_evtchn) { /* feature-split-event-channels == 0 */ @@ -421,8 +487,8 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, disable_irq(vif->rx_irq); } - task = kthread_create(xenvif_kthread, - (void *)vif, "%s", vif->dev->name); + task = kthread_create(xenvif_kthread_guest_rx, + (void *)vif, "%s-guest-rx", vif->dev->name); if (IS_ERR(task)) { pr_warn("Could not allocate kthread for %s\n", vif->dev->name); err = PTR_ERR(task); @@ -431,6 +497,16 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, vif->task = task; + task = kthread_create(xenvif_dealloc_kthread, + (void *)vif, "%s-dealloc", vif->dev->name); + if (IS_ERR(task)) { + pr_warn("Could not allocate kthread for %s\n", vif->dev->name); + err = PTR_ERR(task); + goto err_rx_unbind; + } + + vif->dealloc_task = task; + rtnl_lock(); if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN) dev_set_mtu(vif->dev, ETH_DATA_LEN); @@ -441,6 +517,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, rtnl_unlock(); wake_up_process(vif->task); + wake_up_process(vif->dealloc_task); return 0; @@ -474,10 +551,17 @@ void xenvif_disconnect(struct xenvif *vif) xenvif_carrier_off(vif); if (vif->task) { + del_timer_sync(&vif->wake_queue); kthread_stop(vif->task); vif->task = NULL; } + if (vif->dealloc_task) { + del_timer_sync(&vif->dealloc_delay); + kthread_stop(vif->dealloc_task); + vif->dealloc_task = NULL; + } + if (vif->tx_irq) { if (vif->tx_irq == vif->rx_irq) unbind_from_irqhandler(vif->tx_irq, vif); @@ -493,6 +577,36 @@ void xenvif_disconnect(struct xenvif *vif) void xenvif_free(struct xenvif *vif) { + int i, unmap_timeout = 0; + 
/* Here we want to avoid timeout messages if an skb can be legitimatly + * stucked somewhere else. Realisticly this could be an another vif's + * internal or QDisc queue. That another vif also has this + * rx_drain_timeout_msecs timeout, but the timer only ditches the + * internal queue. After that, the QDisc queue can put in worst case + * XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS skbs into that another vif's + * internal queue, so we need several rounds of such timeouts until we + * can be sure that no another vif should have skb's from us. We are + * not sending more skb's, so newly stucked packets are not interesting + * for us here. + */ + unsigned int worst_case_skb_lifetime = (rx_drain_timeout_msecs/1000) * + DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, (XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS)); + + for (i = 0; i < MAX_PENDING_REQS; ++i) { + if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) { + unmap_timeout++; + schedule_timeout(msecs_to_jiffies(1000)); + if (unmap_timeout > worst_case_skb_lifetime && + net_ratelimit()) + netdev_err(vif->dev, + "Page still granted! 
Index: %x\n", + i); + i = -1; + } + } + + free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages); + netif_napi_del(&vif->napi); unregister_netdev(vif->dev); diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index e5284bca2d90..bc943205a691 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -37,6 +37,7 @@ #include <linux/kthread.h> #include <linux/if_vlan.h> #include <linux/udp.h> +#include <linux/highmem.h> #include <net/tcp.h> @@ -54,6 +55,13 @@ bool separate_tx_rx_irq = 1; module_param(separate_tx_rx_irq, bool, 0644); +/* When guest ring is filled up, qdisc queues the packets for us, but we have + * to timeout them, otherwise other guests' packets can get stucked there + */ +unsigned int rx_drain_timeout_msecs = 10000; +module_param(rx_drain_timeout_msecs, uint, 0444); +unsigned int rx_drain_timeout_jiffies; + /* * This is the maximum slots a skb can have. If a guest sends a skb * which exceeds this limit it is considered malicious. @@ -62,24 +70,6 @@ module_param(separate_tx_rx_irq, bool, 0644); static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT; module_param(fatal_skb_slots, uint, 0444); -/* - * To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating - * the maximum slots a valid packet can use. Now this value is defined - * to be XEN_NETIF_NR_SLOTS_MIN, which is supposed to be supported by - * all backend. - */ -#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN - -/* - * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of - * one or more merged tx requests, otherwise it is the continuation of - * previous tx request. 
- */ -static inline int pending_tx_is_head(struct xenvif *vif, RING_IDX idx) -{ - return vif->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX; -} - static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx, u8 status); @@ -109,6 +99,18 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif, return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx)); } +/* Find the containing VIF's structure from a pointer in pending_tx_info array + */ +static inline struct xenvif* ubuf_to_vif(struct ubuf_info *ubuf) +{ + u16 pending_idx = ubuf->desc; + struct pending_tx_info *temp = + container_of(ubuf, struct pending_tx_info, callback_struct); + return container_of(temp - pending_idx, + struct xenvif, + pending_tx_info[0]); +} + /* This is a miniumum size for the linear area to avoid lots of * calls to __pskb_pull_tail() as we set up checksum offsets. The * value 128 was chosen as it covers all IPv4 and most likely @@ -131,10 +133,9 @@ static inline pending_ring_idx_t pending_index(unsigned i) return i & (MAX_PENDING_REQS-1); } -static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif) +static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring) { - return MAX_PENDING_REQS - - vif->pending_prod + vif->pending_cons; + return ring->nr_ents - (ring->sring->req_prod - ring->rsp_prod_pvt); } bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed) @@ -235,7 +236,9 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif, static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, struct netrx_pending_operations *npo, struct page *page, unsigned long size, - unsigned long offset, int *head) + unsigned long offset, int *head, + struct xenvif *foreign_vif, + grant_ref_t foreign_gref) { struct gnttab_copy *copy_gop; struct xenvif_rx_meta *meta; @@ -277,8 +280,15 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, copy_gop->flags = GNTCOPY_dest_gref; copy_gop->len = bytes; 
- copy_gop->source.domid = DOMID_SELF; - copy_gop->source.u.gmfn = virt_to_mfn(page_address(page)); + if (foreign_vif) { + copy_gop->source.domid = foreign_vif->domid; + copy_gop->source.u.ref = foreign_gref; + copy_gop->flags |= GNTCOPY_source_gref; + } else { + copy_gop->source.domid = DOMID_SELF; + copy_gop->source.u.gmfn = + virt_to_mfn(page_address(page)); + } copy_gop->source.offset = offset; copy_gop->dest.domid = vif->domid; @@ -339,6 +349,9 @@ static int xenvif_gop_skb(struct sk_buff *skb, int old_meta_prod; int gso_type; int gso_size; + struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg; + grant_ref_t foreign_grefs[MAX_SKB_FRAGS]; + struct xenvif *foreign_vif = NULL; old_meta_prod = npo->meta_prod; @@ -379,6 +392,19 @@ static int xenvif_gop_skb(struct sk_buff *skb, npo->copy_off = 0; npo->copy_gref = req->gref; + if ((skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) && + (ubuf->callback == &xenvif_zerocopy_callback)) { + int i = 0; + foreign_vif = ubuf_to_vif(ubuf); + + do { + u16 pending_idx = ubuf->desc; + foreign_grefs[i++] = + foreign_vif->pending_tx_info[pending_idx].req.gref; + ubuf = (struct ubuf_info *) ubuf->ctx; + } while (ubuf); + } + data = skb->data; while (data < skb_tail_pointer(skb)) { unsigned int offset = offset_in_page(data); @@ -388,7 +414,9 @@ static int xenvif_gop_skb(struct sk_buff *skb, len = skb_tail_pointer(skb) - data; xenvif_gop_frag_copy(vif, skb, npo, - virt_to_page(data), len, offset, &head); + virt_to_page(data), len, offset, &head, + NULL, + 0); data += len; } @@ -397,7 +425,9 @@ static int xenvif_gop_skb(struct sk_buff *skb, skb_frag_page(&skb_shinfo(skb)->frags[i]), skb_frag_size(&skb_shinfo(skb)->frags[i]), skb_shinfo(skb)->frags[i].page_offset, - &head); + &head, + foreign_vif, + foreign_grefs[i]); } return npo->meta_prod - old_meta_prod; @@ -455,10 +485,12 @@ static void xenvif_add_frag_responses(struct xenvif *vif, int status, } } -struct skb_cb_overlay { +struct xenvif_rx_cb { int meta_slots_used; }; +#define 
XENVIF_RX_CB(skb) ((struct xenvif_rx_cb *)(skb)->cb) + void xenvif_kick_thread(struct xenvif *vif) { wake_up(&vif->wq); @@ -474,7 +506,6 @@ static void xenvif_rx_action(struct xenvif *vif) LIST_HEAD(notify); int ret; unsigned long offset; - struct skb_cb_overlay *sco; bool need_to_notify = false; struct netrx_pending_operations npo = { @@ -513,9 +544,8 @@ static void xenvif_rx_action(struct xenvif *vif) } else vif->rx_last_skb_slots = 0; - sco = (struct skb_cb_overlay *)skb->cb; - sco->meta_slots_used = xenvif_gop_skb(skb, &npo); - BUG_ON(sco->meta_slots_used > max_slots_needed); + XENVIF_RX_CB(skb)->meta_slots_used = xenvif_gop_skb(skb, &npo); + BUG_ON(XENVIF_RX_CB(skb)->meta_slots_used > max_slots_needed); __skb_queue_tail(&rxq, skb); } @@ -529,7 +559,6 @@ static void xenvif_rx_action(struct xenvif *vif) gnttab_batch_copy(vif->grant_copy_op, npo.copy_prod); while ((skb = __skb_dequeue(&rxq)) != NULL) { - sco = (struct skb_cb_overlay *)skb->cb; if ((1 << vif->meta[npo.meta_cons].gso_type) & vif->gso_prefix_mask) { @@ -540,19 +569,21 @@ static void xenvif_rx_action(struct xenvif *vif) resp->offset = vif->meta[npo.meta_cons].gso_size; resp->id = vif->meta[npo.meta_cons].id; - resp->status = sco->meta_slots_used; + resp->status = XENVIF_RX_CB(skb)->meta_slots_used; npo.meta_cons++; - sco->meta_slots_used--; + XENVIF_RX_CB(skb)->meta_slots_used--; } vif->dev->stats.tx_bytes += skb->len; vif->dev->stats.tx_packets++; - status = xenvif_check_gop(vif, sco->meta_slots_used, &npo); + status = xenvif_check_gop(vif, + XENVIF_RX_CB(skb)->meta_slots_used, + &npo); - if (sco->meta_slots_used == 1) + if (XENVIF_RX_CB(skb)->meta_slots_used == 1) flags = 0; else flags = XEN_NETRXF_more_data; @@ -589,13 +620,13 @@ static void xenvif_rx_action(struct xenvif *vif) xenvif_add_frag_responses(vif, status, vif->meta + npo.meta_cons + 1, - sco->meta_slots_used); + XENVIF_RX_CB(skb)->meta_slots_used); RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->rx, ret); need_to_notify |= !!ret; - 
npo.meta_cons += sco->meta_slots_used; + npo.meta_cons += XENVIF_RX_CB(skb)->meta_slots_used; dev_kfree_skb(skb); } @@ -645,9 +676,12 @@ static void xenvif_tx_err(struct xenvif *vif, struct xen_netif_tx_request *txp, RING_IDX end) { RING_IDX cons = vif->tx.req_cons; + unsigned long flags; do { + spin_lock_irqsave(&vif->response_lock, flags); make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR); + spin_unlock_irqrestore(&vif->response_lock, flags); if (cons == end) break; txp = RING_GET_REQUEST(&vif->tx, cons++); @@ -759,180 +793,168 @@ static int xenvif_count_requests(struct xenvif *vif, return slots; } -static struct page *xenvif_alloc_page(struct xenvif *vif, - u16 pending_idx) + +struct xenvif_tx_cb { + u16 pending_idx; +}; + +#define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb) + +static inline void xenvif_tx_create_gop(struct xenvif *vif, + u16 pending_idx, + struct xen_netif_tx_request *txp, + struct gnttab_map_grant_ref *gop) { - struct page *page; + vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx]; + gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx), + GNTMAP_host_map | GNTMAP_readonly, + txp->gref, vif->domid); + + memcpy(&vif->pending_tx_info[pending_idx].req, txp, + sizeof(*txp)); +} - page = alloc_page(GFP_ATOMIC|__GFP_COLD); - if (!page) +static inline struct sk_buff *xenvif_alloc_skb(unsigned int size) +{ + struct sk_buff *skb = + alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(skb == NULL)) return NULL; - vif->mmap_pages[pending_idx] = page; - return page; + /* Packets passed to netif_rx() must have some headroom. 
*/ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + + /* Initialize it here to avoid later surprises */ + skb_shinfo(skb)->destructor_arg = NULL; + + return skb; } -static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif, - struct sk_buff *skb, - struct xen_netif_tx_request *txp, - struct gnttab_copy *gop) +static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif, + struct sk_buff *skb, + struct xen_netif_tx_request *txp, + struct gnttab_map_grant_ref *gop) { struct skb_shared_info *shinfo = skb_shinfo(skb); skb_frag_t *frags = shinfo->frags; - u16 pending_idx = *((u16 *)skb->data); - u16 head_idx = 0; - int slot, start; - struct page *page; - pending_ring_idx_t index, start_idx = 0; - uint16_t dst_offset; - unsigned int nr_slots; - struct pending_tx_info *first = NULL; + u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx; + int start; + pending_ring_idx_t index; + unsigned int nr_slots, frag_overflow = 0; /* At this point shinfo->nr_frags is in fact the number of * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX. */ + if (shinfo->nr_frags > MAX_SKB_FRAGS) { + frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS; + BUG_ON(frag_overflow > MAX_SKB_FRAGS); + shinfo->nr_frags = MAX_SKB_FRAGS; + } nr_slots = shinfo->nr_frags; /* Skip first skb fragment if it is on same page as header fragment. */ start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); - /* Coalesce tx requests, at this point the packet passed in - * should be <= 64K. Any packets larger than 64K have been - * handled in xenvif_count_requests(). 
- */ - for (shinfo->nr_frags = slot = start; slot < nr_slots; - shinfo->nr_frags++) { - struct pending_tx_info *pending_tx_info = - vif->pending_tx_info; + for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots; + shinfo->nr_frags++, txp++, gop++) { + index = pending_index(vif->pending_cons++); + pending_idx = vif->pending_ring[index]; + xenvif_tx_create_gop(vif, pending_idx, txp, gop); + frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx); + } - page = alloc_page(GFP_ATOMIC|__GFP_COLD); - if (!page) - goto err; - - dst_offset = 0; - first = NULL; - while (dst_offset < PAGE_SIZE && slot < nr_slots) { - gop->flags = GNTCOPY_source_gref; - - gop->source.u.ref = txp->gref; - gop->source.domid = vif->domid; - gop->source.offset = txp->offset; - - gop->dest.domid = DOMID_SELF; - - gop->dest.offset = dst_offset; - gop->dest.u.gmfn = virt_to_mfn(page_address(page)); - - if (dst_offset + txp->size > PAGE_SIZE) { - /* This page can only merge a portion - * of tx request. Do not increment any - * pointer / counter here. The txp - * will be dealt with in future - * rounds, eventually hitting the - * `else` branch. - */ - gop->len = PAGE_SIZE - dst_offset; - txp->offset += gop->len; - txp->size -= gop->len; - dst_offset += gop->len; /* quit loop */ - } else { - /* This tx request can be merged in the page */ - gop->len = txp->size; - dst_offset += gop->len; - - index = pending_index(vif->pending_cons++); - - pending_idx = vif->pending_ring[index]; - - memcpy(&pending_tx_info[pending_idx].req, txp, - sizeof(*txp)); - - /* Poison these fields, corresponding - * fields for head tx req will be set - * to correct values after the loop. 
- */ - vif->mmap_pages[pending_idx] = (void *)(~0UL); - pending_tx_info[pending_idx].head = - INVALID_PENDING_RING_IDX; - - if (!first) { - first = &pending_tx_info[pending_idx]; - start_idx = index; - head_idx = pending_idx; - } - - txp++; - slot++; - } + if (frag_overflow) { + struct sk_buff *nskb = xenvif_alloc_skb(0); + if (unlikely(nskb == NULL)) { + if (net_ratelimit()) + netdev_err(vif->dev, + "Can't allocate the frag_list skb.\n"); + return NULL; + } + + shinfo = skb_shinfo(nskb); + frags = shinfo->frags; - gop++; + for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow; + shinfo->nr_frags++, txp++, gop++) { + index = pending_index(vif->pending_cons++); + pending_idx = vif->pending_ring[index]; + xenvif_tx_create_gop(vif, pending_idx, txp, gop); + frag_set_pending_idx(&frags[shinfo->nr_frags], + pending_idx); } - first->req.offset = 0; - first->req.size = dst_offset; - first->head = start_idx; - vif->mmap_pages[head_idx] = page; - frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx); + skb_shinfo(skb)->frag_list = nskb; } - BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS); - return gop; -err: - /* Unwind, freeing all pages and sending error responses. */ - while (shinfo->nr_frags-- > start) { - xenvif_idx_release(vif, - frag_get_pending_idx(&frags[shinfo->nr_frags]), - XEN_NETIF_RSP_ERROR); +} + +static inline void xenvif_grant_handle_set(struct xenvif *vif, + u16 pending_idx, + grant_handle_t handle) +{ + if (unlikely(vif->grant_tx_handle[pending_idx] != + NETBACK_INVALID_HANDLE)) { + netdev_err(vif->dev, + "Trying to overwrite active handle! pending_idx: %x\n", + pending_idx); + BUG(); } - /* The head too, if necessary. 
*/ - if (start) - xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); + vif->grant_tx_handle[pending_idx] = handle; +} - return NULL; +static inline void xenvif_grant_handle_reset(struct xenvif *vif, + u16 pending_idx) +{ + if (unlikely(vif->grant_tx_handle[pending_idx] == + NETBACK_INVALID_HANDLE)) { + netdev_err(vif->dev, + "Trying to unmap invalid handle! pending_idx: %x\n", + pending_idx); + BUG(); + } + vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE; } static int xenvif_tx_check_gop(struct xenvif *vif, struct sk_buff *skb, - struct gnttab_copy **gopp) + struct gnttab_map_grant_ref **gopp) { - struct gnttab_copy *gop = *gopp; - u16 pending_idx = *((u16 *)skb->data); + struct gnttab_map_grant_ref *gop = *gopp; + u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx; struct skb_shared_info *shinfo = skb_shinfo(skb); struct pending_tx_info *tx_info; int nr_frags = shinfo->nr_frags; int i, err, start; - u16 peek; /* peek into next tx request */ + struct sk_buff *first_skb = NULL; /* Check status of header. */ err = gop->status; if (unlikely(err)) xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); + else + xenvif_grant_handle_set(vif, pending_idx , gop->handle); /* Skip first skb fragment if it is on same page as header fragment. */ start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); +check_frags: for (i = start; i < nr_frags; i++) { int j, newerr; - pending_ring_idx_t head; pending_idx = frag_get_pending_idx(&shinfo->frags[i]); tx_info = &vif->pending_tx_info[pending_idx]; - head = tx_info->head; /* Check error status: if okay then remember grant handle. */ - do { - newerr = (++gop)->status; - if (newerr) - break; - peek = vif->pending_ring[pending_index(++head)]; - } while (!pending_tx_is_head(vif, peek)); + newerr = (++gop)->status; if (likely(!newerr)) { + xenvif_grant_handle_set(vif, pending_idx , gop->handle); /* Had a previous error? Invalidate this fragment. 
*/ if (unlikely(err)) - xenvif_idx_release(vif, pending_idx, - XEN_NETIF_RSP_OKAY); + xenvif_idx_unmap(vif, pending_idx); continue; } @@ -942,20 +964,45 @@ static int xenvif_tx_check_gop(struct xenvif *vif, /* Not the first error? Preceding frags already invalidated. */ if (err) continue; - /* First error: invalidate header and preceding fragments. */ - pending_idx = *((u16 *)skb->data); - xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY); + if (!first_skb) + pending_idx = XENVIF_TX_CB(skb)->pending_idx; + else + pending_idx = XENVIF_TX_CB(skb)->pending_idx; + xenvif_idx_unmap(vif, pending_idx); for (j = start; j < i; j++) { pending_idx = frag_get_pending_idx(&shinfo->frags[j]); - xenvif_idx_release(vif, pending_idx, - XEN_NETIF_RSP_OKAY); + xenvif_idx_unmap(vif, pending_idx); } /* Remember the error: invalidate all subsequent fragments. */ err = newerr; } + if (skb_has_frag_list(skb)) { + first_skb = skb; + skb = shinfo->frag_list; + shinfo = skb_shinfo(skb); + nr_frags = shinfo->nr_frags; + start = 0; + + goto check_frags; + } + + /* There was a mapping error in the frag_list skb. 
We have to unmap + * the first skb's frags + */ + if (first_skb && err) { + int j; + shinfo = skb_shinfo(first_skb); + pending_idx = XENVIF_TX_CB(skb)->pending_idx; + start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); + for (j = start; j < shinfo->nr_frags; j++) { + pending_idx = frag_get_pending_idx(&shinfo->frags[j]); + xenvif_idx_unmap(vif, pending_idx); + } + } + *gopp = gop + 1; return err; } @@ -965,6 +1012,10 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb) struct skb_shared_info *shinfo = skb_shinfo(skb); int nr_frags = shinfo->nr_frags; int i; + u16 prev_pending_idx = INVALID_PENDING_IDX; + + if (skb_shinfo(skb)->destructor_arg) + prev_pending_idx = XENVIF_TX_CB(skb)->pending_idx; for (i = 0; i < nr_frags; i++) { skb_frag_t *frag = shinfo->frags + i; @@ -974,6 +1025,17 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb) pending_idx = frag_get_pending_idx(frag); + /* If this is not the first frag, chain it to the previous*/ + if (unlikely(prev_pending_idx == INVALID_PENDING_IDX)) + skb_shinfo(skb)->destructor_arg = + &vif->pending_tx_info[pending_idx].callback_struct; + else if (likely(pending_idx != prev_pending_idx)) + vif->pending_tx_info[prev_pending_idx].callback_struct.ctx = + &(vif->pending_tx_info[pending_idx].callback_struct); + + vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL; + prev_pending_idx = pending_idx; + txp = &vif->pending_tx_info[pending_idx].req; page = virt_to_page(idx_to_kaddr(vif, pending_idx)); __skb_fill_page_desc(skb, i, page, txp->offset, txp->size); @@ -981,10 +1043,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb) skb->data_len += txp->size; skb->truesize += txp->size; - /* Take an extra reference to offset xenvif_idx_release */ + /* Take an extra reference to offset network stack's put_page */ get_page(vif->mmap_pages[pending_idx]); - xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY); } + /* FIXME: __skb_fill_page_desc 
set this to true because page->pfmemalloc + * overlaps with "index", and "mapping" is not set. I think mapping + * should be set. If delivered to local stack, it would drop this + * skb in sk_filter unless the socket has the right to use it. + */ + skb->pfmemalloc = false; } static int xenvif_get_extras(struct xenvif *vif, @@ -1104,16 +1171,14 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) { - struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop; + struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop; struct sk_buff *skb; int ret; - while ((nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX - < MAX_PENDING_REQS) && + while (xenvif_tx_pending_slots_available(vif) && (skb_queue_len(&vif->tx_queue) < budget)) { struct xen_netif_tx_request txreq; struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX]; - struct page *page; struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1]; u16 pending_idx; RING_IDX idx; @@ -1189,8 +1254,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) ret < XEN_NETBK_LEGACY_SLOTS_MAX) ? PKT_PROT_LEN : txreq.size; - skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN, - GFP_ATOMIC | __GFP_NOWARN); + skb = xenvif_alloc_skb(data_len); if (unlikely(skb == NULL)) { netdev_dbg(vif->dev, "Can't allocate a skb in start_xmit.\n"); @@ -1198,9 +1262,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) break; } - /* Packets passed to netif_rx() must have some headroom. 
*/ - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { struct xen_netif_extra_info *gso; gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; @@ -1212,31 +1273,11 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) } } - /* XXX could copy straight to head */ - page = xenvif_alloc_page(vif, pending_idx); - if (!page) { - kfree_skb(skb); - xenvif_tx_err(vif, &txreq, idx); - break; - } - - gop->source.u.ref = txreq.gref; - gop->source.domid = vif->domid; - gop->source.offset = txreq.offset; - - gop->dest.u.gmfn = virt_to_mfn(page_address(page)); - gop->dest.domid = DOMID_SELF; - gop->dest.offset = txreq.offset; - - gop->len = txreq.size; - gop->flags = GNTCOPY_source_gref; + xenvif_tx_create_gop(vif, pending_idx, &txreq, gop); gop++; - memcpy(&vif->pending_tx_info[pending_idx].req, - &txreq, sizeof(txreq)); - vif->pending_tx_info[pending_idx].head = index; - *((u16 *)skb->data) = pending_idx; + XENVIF_TX_CB(skb)->pending_idx = pending_idx; __skb_put(skb, data_len); @@ -1264,17 +1305,82 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) vif->tx.req_cons = idx; - if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops)) + if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops)) break; } - return gop - vif->tx_copy_ops; + return gop - vif->tx_map_ops; } +/* Consolidate skb with a frag_list into a brand new one with local pages on + * frags. Returns 0 or -ENOMEM if can't allocate new pages. 
+ */ +static int xenvif_handle_frag_list(struct xenvif *vif, struct sk_buff *skb) +{ + unsigned int offset = skb_headlen(skb); + skb_frag_t frags[MAX_SKB_FRAGS]; + int i; + struct ubuf_info *uarg; + struct sk_buff *nskb = skb_shinfo(skb)->frag_list; + + vif->tx_zerocopy_sent += 2; + vif->tx_frag_overflow++; + + xenvif_fill_frags(vif, nskb); + /* Subtract frags size, we will correct it later */ + skb->truesize -= skb->data_len; + skb->len += nskb->len; + skb->data_len += nskb->len; + + /* create a brand new frags array and coalesce there */ + for (i = 0; offset < skb->len; i++) { + struct page *page; + unsigned int len; + + BUG_ON(i >= MAX_SKB_FRAGS); + page = alloc_page(GFP_ATOMIC|__GFP_COLD); + if (!page) { + int j; + skb->truesize += skb->data_len; + for (j = 0; j < i; j++) + put_page(frags[j].page.p); + return -ENOMEM; + } + + if (offset + PAGE_SIZE < skb->len) + len = PAGE_SIZE; + else + len = skb->len - offset; + if (skb_copy_bits(skb, offset, page_address(page), len)) + BUG(); + + offset += len; + frags[i].page.p = page; + frags[i].page_offset = 0; + skb_frag_size_set(&frags[i], len); + } + /* swap out with old one */ + memcpy(skb_shinfo(skb)->frags, + frags, + i * sizeof(skb_frag_t)); + skb_shinfo(skb)->nr_frags = i; + skb->truesize += i * PAGE_SIZE; + + /* remove traces of mapped pages and frag_list */ + skb_frag_list_init(skb); + uarg = skb_shinfo(skb)->destructor_arg; + uarg->callback(uarg, true); + skb_shinfo(skb)->destructor_arg = NULL; + + skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + kfree_skb(nskb); + + return 0; +} static int xenvif_tx_submit(struct xenvif *vif) { - struct gnttab_copy *gop = vif->tx_copy_ops; + struct gnttab_map_grant_ref *gop = vif->tx_map_ops; struct sk_buff *skb; int work_done = 0; @@ -1283,7 +1389,7 @@ static int xenvif_tx_submit(struct xenvif *vif) u16 pending_idx; unsigned data_len; - pending_idx = *((u16 *)skb->data); + pending_idx = XENVIF_TX_CB(skb)->pending_idx; txp = &vif->pending_tx_info[pending_idx].req; /* Check 
the remap error code. */ @@ -1298,14 +1404,16 @@ static int xenvif_tx_submit(struct xenvif *vif) memcpy(skb->data, (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset), data_len); + vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL; if (data_len < txp->size) { /* Append the packet payload as a fragment. */ txp->offset += data_len; txp->size -= data_len; + skb_shinfo(skb)->destructor_arg = + &vif->pending_tx_info[pending_idx].callback_struct; } else { /* Schedule a response immediately. */ - xenvif_idx_release(vif, pending_idx, - XEN_NETIF_RSP_OKAY); + xenvif_idx_unmap(vif, pending_idx); } if (txp->flags & XEN_NETTXF_csum_blank) @@ -1315,6 +1423,17 @@ static int xenvif_tx_submit(struct xenvif *vif) xenvif_fill_frags(vif, skb); + if (unlikely(skb_has_frag_list(skb))) { + if (xenvif_handle_frag_list(vif, skb)) { + if (net_ratelimit()) + netdev_err(vif->dev, + "Not enough memory to consolidate frag_list!\n"); + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + kfree_skb(skb); + continue; + } + } + if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) { int target = min_t(int, skb->len, PKT_PROT_LEN); __pskb_pull_tail(skb, target - skb_headlen(skb)); @@ -1327,6 +1446,9 @@ static int xenvif_tx_submit(struct xenvif *vif) if (checksum_setup(vif, skb)) { netdev_dbg(vif->dev, "Can't setup checksum in net_tx_action\n"); + /* We have to set this flag to trigger the callback */ + if (skb_shinfo(skb)->destructor_arg) + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; kfree_skb(skb); continue; } @@ -1352,17 +1474,134 @@ static int xenvif_tx_submit(struct xenvif *vif) work_done++; + /* Set this flag right before netif_receive_skb, otherwise + * someone might think this packet already left netback, and + * do a skb_copy_ubufs while we are still in control of the + * skb. E.g. the __pskb_pull_tail earlier can do such thing. 
+ */ + if (skb_shinfo(skb)->destructor_arg) { + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + vif->tx_zerocopy_sent++; + } + netif_receive_skb(skb); } return work_done; } +void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success) +{ + unsigned long flags; + pending_ring_idx_t index; + struct xenvif *vif = ubuf_to_vif(ubuf); + + /* This is the only place where we grab this lock, to protect callbacks + * from each other. + */ + spin_lock_irqsave(&vif->callback_lock, flags); + do { + u16 pending_idx = ubuf->desc; + ubuf = (struct ubuf_info *) ubuf->ctx; + BUG_ON(vif->dealloc_prod - vif->dealloc_cons >= + MAX_PENDING_REQS); + index = pending_index(vif->dealloc_prod); + vif->dealloc_ring[index] = pending_idx; + /* Sync with xenvif_tx_dealloc_action: + * insert idx then incr producer. + */ + smp_wmb(); + vif->dealloc_prod++; + } while (ubuf); + wake_up(&vif->dealloc_wq); + spin_unlock_irqrestore(&vif->callback_lock, flags); + + if (RING_HAS_UNCONSUMED_REQUESTS(&vif->tx) && + xenvif_tx_pending_slots_available(vif)) { + local_bh_disable(); + napi_schedule(&vif->napi); + local_bh_enable(); + } + + if (likely(zerocopy_success)) + vif->tx_zerocopy_success++; + else + vif->tx_zerocopy_fail++; +} + +static inline void xenvif_tx_dealloc_action(struct xenvif *vif) +{ + struct gnttab_unmap_grant_ref *gop; + pending_ring_idx_t dc, dp; + u16 pending_idx, pending_idx_release[MAX_PENDING_REQS]; + unsigned int i = 0; + + dc = vif->dealloc_cons; + gop = vif->tx_unmap_ops; + + /* Free up any grants we have finished using */ + do { + dp = vif->dealloc_prod; + + /* Ensure we see all indices enqueued by all + * xenvif_zerocopy_callback(). 
+ */ + smp_rmb(); + + while (dc != dp) { + BUG_ON(gop - vif->tx_unmap_ops > MAX_PENDING_REQS); + pending_idx = + vif->dealloc_ring[pending_index(dc++)]; + + pending_idx_release[gop-vif->tx_unmap_ops] = + pending_idx; + vif->pages_to_unmap[gop-vif->tx_unmap_ops] = + vif->mmap_pages[pending_idx]; + gnttab_set_unmap_op(gop, + idx_to_kaddr(vif, pending_idx), + GNTMAP_host_map, + vif->grant_tx_handle[pending_idx]); + /* Btw. already unmapped? */ + xenvif_grant_handle_reset(vif, pending_idx); + ++gop; + } + + } while (dp != vif->dealloc_prod); + + vif->dealloc_cons = dc; + + if (gop - vif->tx_unmap_ops > 0) { + int ret; + ret = gnttab_unmap_refs(vif->tx_unmap_ops, + NULL, + vif->pages_to_unmap, + gop - vif->tx_unmap_ops); + if (ret) { + netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n", + gop - vif->tx_unmap_ops, ret); + for (i = 0; i < gop - vif->tx_unmap_ops; ++i) { + if (gop[i].status != GNTST_okay) + netdev_err(vif->dev, + " host_addr: %llx handle: %x status: %d\n", + gop[i].host_addr, + gop[i].handle, + gop[i].status); + } + BUG(); + } + } + + for (i = 0; i < gop - vif->tx_unmap_ops; ++i) + xenvif_idx_release(vif, pending_idx_release[i], + XEN_NETIF_RSP_OKAY); +} + + /* Called after netfront has transmitted */ int xenvif_tx_action(struct xenvif *vif, int budget) { unsigned nr_gops; - int work_done; + int work_done, ret; if (unlikely(!tx_work_todo(vif))) return 0; @@ -1372,7 +1611,11 @@ int xenvif_tx_action(struct xenvif *vif, int budget) if (nr_gops == 0) return 0; - gnttab_batch_copy(vif->tx_copy_ops, nr_gops); + ret = gnttab_map_refs(vif->tx_map_ops, + NULL, + vif->pages_to_map, + nr_gops); + BUG_ON(ret); work_done = xenvif_tx_submit(vif); @@ -1383,45 +1626,18 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx, u8 status) { struct pending_tx_info *pending_tx_info; - pending_ring_idx_t head; - u16 peek; /* peek into next tx request */ - - BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL)); - - /* Already complete? 
*/ - if (vif->mmap_pages[pending_idx] == NULL) - return; + pending_ring_idx_t index; + unsigned long flags; pending_tx_info = &vif->pending_tx_info[pending_idx]; - - head = pending_tx_info->head; - - BUG_ON(!pending_tx_is_head(vif, head)); - BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx); - - do { - pending_ring_idx_t index; - pending_ring_idx_t idx = pending_index(head); - u16 info_idx = vif->pending_ring[idx]; - - pending_tx_info = &vif->pending_tx_info[info_idx]; - make_tx_response(vif, &pending_tx_info->req, status); - - /* Setting any number other than - * INVALID_PENDING_RING_IDX indicates this slot is - * starting a new packet / ending a previous packet. - */ - pending_tx_info->head = 0; - - index = pending_index(vif->pending_prod++); - vif->pending_ring[index] = vif->pending_ring[info_idx]; - - peek = vif->pending_ring[pending_index(++head)]; - - } while (!pending_tx_is_head(vif, peek)); - - put_page(vif->mmap_pages[pending_idx]); - vif->mmap_pages[pending_idx] = NULL; + spin_lock_irqsave(&vif->response_lock, flags); + make_tx_response(vif, &pending_tx_info->req, status); + index = pending_index(vif->pending_prod); + vif->pending_ring[index] = pending_idx; + /* TX shouldn't use the index before we give it back here */ + mb(); + vif->pending_prod++; + spin_unlock_irqrestore(&vif->response_lock, flags); } @@ -1469,23 +1685,74 @@ static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif, return resp; } +void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx) +{ + int ret; + struct gnttab_unmap_grant_ref tx_unmap_op; + + gnttab_set_unmap_op(&tx_unmap_op, + idx_to_kaddr(vif, pending_idx), + GNTMAP_host_map, + vif->grant_tx_handle[pending_idx]); + /* Btw. already unmapped? 
*/ + xenvif_grant_handle_reset(vif, pending_idx); + + ret = gnttab_unmap_refs(&tx_unmap_op, NULL, + &vif->mmap_pages[pending_idx], 1); + BUG_ON(ret); + + xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY); +} + static inline int rx_work_todo(struct xenvif *vif) { - return !skb_queue_empty(&vif->rx_queue) && - xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots); + return (!skb_queue_empty(&vif->rx_queue) && + xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots)) || + vif->rx_queue_purge; } static inline int tx_work_todo(struct xenvif *vif) { if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) && - (nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX - < MAX_PENDING_REQS)) + xenvif_tx_pending_slots_available(vif)) return 1; return 0; } +static void xenvif_dealloc_delay(unsigned long data) +{ + struct xenvif *vif = (struct xenvif *)data; + + vif->dealloc_delay_timed_out = true; + wake_up(&vif->dealloc_wq); +} + +static inline bool tx_dealloc_work_todo(struct xenvif *vif) +{ + if (vif->dealloc_cons != vif->dealloc_prod) { + if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) && + (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) && + !vif->dealloc_delay_timed_out) { + if (!timer_pending(&vif->dealloc_delay)) { + vif->dealloc_delay.function = + xenvif_dealloc_delay; + vif->dealloc_delay.data = (unsigned long)vif; + mod_timer(&vif->dealloc_delay, + jiffies + msecs_to_jiffies(1)); + + } + return false; + } + del_timer_sync(&vif->dealloc_delay); + vif->dealloc_delay_timed_out = false; + return true; + } + + return false; +} + void xenvif_unmap_frontend_rings(struct xenvif *vif) { if (vif->tx.sring) @@ -1543,7 +1810,7 @@ static void xenvif_start_queue(struct xenvif *vif) netif_wake_queue(vif->dev); } -int xenvif_kthread(void *data) +int xenvif_kthread_guest_rx(void *data) { struct xenvif *vif = data; struct sk_buff *skb; @@ -1555,12 +1822,19 @@ int xenvif_kthread(void *data) if (kthread_should_stop()) break; + if 
(vif->rx_queue_purge) { + skb_queue_purge(&vif->rx_queue); + vif->rx_queue_purge = false; + } + if (!skb_queue_empty(&vif->rx_queue)) xenvif_rx_action(vif); if (skb_queue_empty(&vif->rx_queue) && - netif_queue_stopped(vif->dev)) + netif_queue_stopped(vif->dev)) { + del_timer_sync(&vif->wake_queue); xenvif_start_queue(vif); + } cond_resched(); } @@ -1572,6 +1846,28 @@ int xenvif_kthread(void *data) return 0; } +int xenvif_dealloc_kthread(void *data) +{ + struct xenvif *vif = data; + + while (!kthread_should_stop()) { + wait_event_interruptible(vif->dealloc_wq, + tx_dealloc_work_todo(vif) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + xenvif_tx_dealloc_action(vif); + cond_resched(); + } + + /* Unmap anything remaining*/ + if (tx_dealloc_work_todo(vif)) + xenvif_tx_dealloc_action(vif); + + return 0; +} + static int __init netback_init(void) { int rc = 0; @@ -1589,6 +1885,8 @@ static int __init netback_init(void) if (rc) goto failed_init; + rx_drain_timeout_jiffies = msecs_to_jiffies(rx_drain_timeout_msecs); + return 0; failed_init: diff --git a/net/ieee802154/6lowpan.h b/include/net/6lowpan.h index 0dccf62434d5..f7d372b7d4ff 100644 --- a/net/ieee802154/6lowpan.h +++ b/include/net/6lowpan.h @@ -53,6 +53,8 @@ #ifndef __6LOWPAN_H__ #define __6LOWPAN_H__ +#include <net/ipv6.h> + #define UIP_802154_SHORTADDR_LEN 2 /* compressed ipv6 address length */ #define UIP_IPH_LEN 40 /* ipv6 fixed header size */ #define UIP_PROTO_UDP 17 /* ipv6 next header value for UDP */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index adb3ea04adaa..73492b91105a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -27,7 +27,7 @@ #include "6lowpan.h" -#include "../ieee802154/6lowpan.h" /* for the compression support */ +#include <net/6lowpan.h> /* for the compression support */ #define IFACE_NAME_TEMPLATE "bt%d" #define EUI64_ADDR_LEN 8 diff --git a/net/ieee802154/6lowpan_iphc.c b/net/ieee802154/6lowpan_iphc.c index 
860aa2d445ba..211b5686d719 100644 --- a/net/ieee802154/6lowpan_iphc.c +++ b/net/ieee802154/6lowpan_iphc.c @@ -54,11 +54,10 @@ #include <linux/if_arp.h> #include <linux/module.h> #include <linux/netdevice.h> +#include <net/6lowpan.h> #include <net/ipv6.h> #include <net/af_ieee802154.h> -#include "6lowpan.h" - /* * Uncompress address function for source and * destination address(non-multicast). diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c index e4726180fc36..1bbab8952f77 100644 --- a/net/ieee802154/6lowpan_rtnl.c +++ b/net/ieee802154/6lowpan_rtnl.c @@ -52,10 +52,10 @@ #include <net/af_ieee802154.h> #include <net/ieee802154.h> #include <net/ieee802154_netdev.h> +#include <net/6lowpan.h> #include <net/ipv6.h> #include "reassembly.h" -#include "6lowpan.h" static LIST_HEAD(lowpan_devices); diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c index 4511fc22ef16..1cc2336eb52c 100644 --- a/net/ieee802154/reassembly.c +++ b/net/ieee802154/reassembly.c @@ -24,10 +24,10 @@ #include <linux/export.h> #include <net/ieee802154_netdev.h> +#include <net/6lowpan.h> #include <net/ipv6.h> #include <net/inet_frag.h> -#include "6lowpan.h" #include "reassembly.h" static struct inet_frags lowpan_frags; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a02c884d4321..bc0fb0fc7552 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -882,6 +882,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; } inet = inet_sk(sk); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index e5dc42f0e527..9958c31c2c54 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1108,6 +1108,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, struct flowi *fl, size_t data_len) { struct l2tp_tunnel *tunnel = session->tunnel; + 
struct sock *sk = tunnel->sock; unsigned int len = skb->len; int error; @@ -1131,7 +1132,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, /* Queue the packet to IP for output */ skb->local_df = 1; #if IS_ENABLED(CONFIG_IPV6) - if (skb->sk->sk_family == PF_INET6 && !tunnel->v4mapped) + if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) error = inet6_csk_xmit(skb, NULL); else #endif @@ -1151,23 +1152,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, return 0; } -/* Automatically called when the skb is freed. - */ -static void l2tp_sock_wfree(struct sk_buff *skb) -{ - sock_put(skb->sk); -} - -/* For data skbs that we transmit, we associate with the tunnel socket - * but don't do accounting. - */ -static inline void l2tp_skb_set_owner_w(struct sk_buff *skb, struct sock *sk) -{ - sock_hold(sk); - skb->sk = sk; - skb->destructor = l2tp_sock_wfree; -} - #if IS_ENABLED(CONFIG_IPV6) static void l2tp_xmit_ipv6_csum(struct sock *sk, struct sk_buff *skb, int udp_len) @@ -1221,7 +1205,6 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len return NET_XMIT_DROP; } - skb_orphan(skb); /* Setup L2TP header */ session->build_header(session, __skb_push(skb, hdr_len)); @@ -1287,8 +1270,6 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len break; } - l2tp_skb_set_owner_w(skb, sk); - l2tp_xmit_core(session, skb, fl, data_len); out_unlock: bh_unlock_sock(sk); diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index d1c3429b69ed..ec126f91276b 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -20,9 +20,8 @@ af-rxrpc-y := \ ar-skbuff.o \ ar-transport.o -ifeq ($(CONFIG_PROC_FS),y) -af-rxrpc-y += ar-proc.o -endif +af-rxrpc-$(CONFIG_PROC_FS) += ar-proc.o +af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index e61aa6001c65..7b1670489638 100644 --- a/net/rxrpc/af_rxrpc.c +++ 
b/net/rxrpc/af_rxrpc.c @@ -838,6 +838,12 @@ static int __init af_rxrpc_init(void) goto error_key_type_s; } + ret = rxrpc_sysctl_init(); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register sysctls\n"); + goto error_sysctls; + } + #ifdef CONFIG_PROC_FS proc_create("rxrpc_calls", 0, init_net.proc_net, &rxrpc_call_seq_fops); proc_create("rxrpc_conns", 0, init_net.proc_net, @@ -845,6 +851,8 @@ static int __init af_rxrpc_init(void) #endif return 0; +error_sysctls: + unregister_key_type(&key_type_rxrpc_s); error_key_type_s: unregister_key_type(&key_type_rxrpc); error_key_type: @@ -865,6 +873,7 @@ error_call_jar: static void __exit af_rxrpc_exit(void) { _enter(""); + rxrpc_sysctl_exit(); unregister_key_type(&key_type_rxrpc_s); unregister_key_type(&key_type_rxrpc); sock_unregister(PF_RXRPC); diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index cd97a0ce48d8..c6be17a959a6 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -19,7 +19,49 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -static unsigned int rxrpc_ack_defer = 1; +/* + * How long to wait before scheduling ACK generation after seeing a + * packet with RXRPC_REQUEST_ACK set (in jiffies). + */ +unsigned rxrpc_requested_ack_delay = 1; + +/* + * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). + * + * We use this when we've received new data packets. If those packets aren't + * all consumed within this time we will send a DELAY ACK if an ACK was not + * requested to let the sender know it doesn't need to resend. + */ +unsigned rxrpc_soft_ack_delay = 1 * HZ; + +/* + * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). + * + * We use this when we've consumed some previously soft-ACK'd packets when + * further packets aren't immediately received to decide when to send an IDLE + * ACK to let the other end know that it can free up its Tx buffer space. + */ +unsigned rxrpc_idle_ack_delay = 0.5 * HZ; + +/* + * Receive window size in packets.
This indicates the maximum number of + * unconsumed received packets we're willing to retain in memory. Once this + * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further + * packets. + */ +unsigned rxrpc_rx_window_size = 32; + +/* + * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet + * made by gluing normal packets together that we're willing to handle. + */ +unsigned rxrpc_rx_mtu = 5692; + +/* + * The maximum number of fragments in a received jumbo packet that we tell the + * sender that we're willing to handle. + */ +unsigned rxrpc_rx_jumbo_max = 4; static const char *rxrpc_acks(u8 reason) { @@ -82,24 +124,23 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, switch (ack_reason) { case RXRPC_ACK_DELAY: _debug("run delay timer"); - call->ack_timer.expires = jiffies + rxrpc_ack_timeout * HZ; - add_timer(&call->ack_timer); - return; + expiry = rxrpc_soft_ack_delay; + goto run_timer; case RXRPC_ACK_IDLE: if (!immediate) { _debug("run defer timer"); - expiry = 1; + expiry = rxrpc_idle_ack_delay; goto run_timer; } goto cancel_timer; case RXRPC_ACK_REQUESTED: - if (!rxrpc_ack_defer) + expiry = rxrpc_requested_ack_delay; + if (!expiry) goto cancel_timer; if (!immediate || serial == cpu_to_be32(1)) { _debug("run defer timer"); - expiry = rxrpc_ack_defer; goto run_timer; } @@ -1174,11 +1215,11 @@ send_ACK: mtu = call->conn->trans->peer->if_mtu; mtu -= call->conn->trans->peer->hdrsize; ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(32); + ackinfo.rwind = htonl(rxrpc_rx_window_size); /* permit the peer to send us jumbo packets if it wants to */ - ackinfo.rxMTU = htonl(5692); - ackinfo.jumbo_max = htonl(4); + ackinfo.rxMTU = htonl(rxrpc_rx_mtu); + ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c index a3bbb360a3f9..a9e05db0f5d5 
100644 --- a/net/rxrpc/ar-call.c +++ b/net/rxrpc/ar-call.c @@ -12,10 +12,22 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/circ_buf.h> +#include <linux/hashtable.h> +#include <linux/spinlock_types.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" +/* + * Maximum lifetime of a call (in jiffies). + */ +unsigned rxrpc_max_call_lifetime = 60 * HZ; + +/* + * Time till dead call expires after last use (in jiffies). + */ +unsigned rxrpc_dead_call_expiry = 2 * HZ; + const char *const rxrpc_call_states[] = { [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", @@ -38,8 +50,6 @@ const char *const rxrpc_call_states[] = { struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); -static unsigned int rxrpc_call_max_lifetime = 60; -static unsigned int rxrpc_dead_call_timeout = 2; static void rxrpc_destroy_call(struct work_struct *work); static void rxrpc_call_life_expired(unsigned long _call); @@ -47,6 +57,145 @@ static void rxrpc_dead_call_expired(unsigned long _call); static void rxrpc_ack_time_expired(unsigned long _call); static void rxrpc_resend_time_expired(unsigned long _call); +static DEFINE_SPINLOCK(rxrpc_call_hash_lock); +static DEFINE_HASHTABLE(rxrpc_call_hash, 10); + +/* + * Hash function for rxrpc_call_hash + */ +static unsigned long rxrpc_call_hashfunc( + u8 clientflag, + __be32 cid, + __be32 call_id, + __be32 epoch, + __be16 service_id, + sa_family_t proto, + void *localptr, + unsigned int addr_size, + const u8 *peer_addr) +{ + const u16 *p; + unsigned int i; + unsigned long key; + u32 hcid = ntohl(cid); + + _enter(""); + + key = (unsigned long)localptr; + /* We just want to add up the __be32 values, so forcing the + * cast should be okay. 
+ */ + key += (__force u32)epoch; + key += (__force u16)service_id; + key += (__force u32)call_id; + key += (hcid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT; + key += hcid & RXRPC_CHANNELMASK; + key += clientflag; + key += proto; + /* Step through the peer address in 16-bit portions for speed */ + for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++) + key += *p; + _leave(" key = 0x%lx", key); + return key; +} + +/* + * Add a call to the hashtable + */ +static void rxrpc_call_hash_add(struct rxrpc_call *call) +{ + unsigned long key; + unsigned int addr_size = 0; + + _enter(""); + switch (call->proto) { + case AF_INET: + addr_size = sizeof(call->peer_ip.ipv4_addr); + break; + case AF_INET6: + addr_size = sizeof(call->peer_ip.ipv6_addr); + break; + default: + break; + } + key = rxrpc_call_hashfunc(call->in_clientflag, call->cid, + call->call_id, call->epoch, + call->service_id, call->proto, + call->conn->trans->local, addr_size, + call->peer_ip.ipv6_addr); + /* Store the full key in the call */ + call->hash_key = key; + spin_lock(&rxrpc_call_hash_lock); + hash_add_rcu(rxrpc_call_hash, &call->hash_node, key); + spin_unlock(&rxrpc_call_hash_lock); + _leave(""); +} + +/* + * Remove a call from the hashtable + */ +static void rxrpc_call_hash_del(struct rxrpc_call *call) +{ + _enter(""); + spin_lock(&rxrpc_call_hash_lock); + hash_del_rcu(&call->hash_node); + spin_unlock(&rxrpc_call_hash_lock); + _leave(""); +} + +/* + * Find a call in the hashtable and return it, or NULL if it + * isn't there. 
+ */ +struct rxrpc_call *rxrpc_find_call_hash( + u8 clientflag, + __be32 cid, + __be32 call_id, + __be32 epoch, + __be16 service_id, + void *localptr, + sa_family_t proto, + const u8 *peer_addr) +{ + unsigned long key; + unsigned int addr_size = 0; + struct rxrpc_call *call = NULL; + struct rxrpc_call *ret = NULL; + + _enter(""); + switch (proto) { + case AF_INET: + addr_size = sizeof(call->peer_ip.ipv4_addr); + break; + case AF_INET6: + addr_size = sizeof(call->peer_ip.ipv6_addr); + break; + default: + break; + } + + key = rxrpc_call_hashfunc(clientflag, cid, call_id, epoch, + service_id, proto, localptr, addr_size, + peer_addr); + hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) { + if (call->hash_key == key && + call->call_id == call_id && + call->cid == cid && + call->in_clientflag == clientflag && + call->service_id == service_id && + call->proto == proto && + call->local == localptr && + memcmp(call->peer_ip.ipv6_addr, peer_addr, + addr_size) == 0 && + call->epoch == epoch) { + ret = call; + break; + } + } + _leave(" = %p", ret); + return ret; +} + /* * allocate a new call */ @@ -91,7 +240,7 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) call->rx_data_expect = 1; call->rx_data_eaten = 0; call->rx_first_oos = 0; - call->ackr_win_top = call->rx_data_eaten + 1 + RXRPC_MAXACKS; + call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; call->creation_jif = jiffies; return call; } @@ -128,11 +277,31 @@ static struct rxrpc_call *rxrpc_alloc_client_call( return ERR_PTR(ret); } + /* Record copies of information for hashtable lookup */ + call->proto = rx->proto; + call->local = trans->local; + switch (call->proto) { + case AF_INET: + call->peer_ip.ipv4_addr = + trans->peer->srx.transport.sin.sin_addr.s_addr; + break; + case AF_INET6: + memcpy(call->peer_ip.ipv6_addr, + trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, + sizeof(call->peer_ip.ipv6_addr)); + break; + } + call->epoch = call->conn->epoch; + call->service_id = 
call->conn->service_id; + call->in_clientflag = call->conn->in_clientflag; + /* Add the new call to the hashtable */ + rxrpc_call_hash_add(call); + spin_lock(&call->conn->trans->peer->lock); list_add(&call->error_link, &call->conn->trans->peer->error_targets); spin_unlock(&call->conn->trans->peer->lock); - call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ; + call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; add_timer(&call->lifetimer); _leave(" = %p", call); @@ -320,9 +489,12 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, parent = *p; call = rb_entry(parent, struct rxrpc_call, conn_node); - if (call_id < call->call_id) + /* The tree is sorted in order of the __be32 value without + * turning it into host order. + */ + if ((__force u32)call_id < (__force u32)call->call_id) p = &(*p)->rb_left; - else if (call_id > call->call_id) + else if ((__force u32)call_id > (__force u32)call->call_id) p = &(*p)->rb_right; else goto old_call; @@ -347,9 +519,31 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, list_add_tail(&call->link, &rxrpc_calls); write_unlock_bh(&rxrpc_call_lock); + /* Record copies of information for hashtable lookup */ + call->proto = rx->proto; + call->local = conn->trans->local; + switch (call->proto) { + case AF_INET: + call->peer_ip.ipv4_addr = + conn->trans->peer->srx.transport.sin.sin_addr.s_addr; + break; + case AF_INET6: + memcpy(call->peer_ip.ipv6_addr, + conn->trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8, + sizeof(call->peer_ip.ipv6_addr)); + break; + default: + break; + } + call->epoch = conn->epoch; + call->service_id = conn->service_id; + call->in_clientflag = conn->in_clientflag; + /* Add the new call to the hashtable */ + rxrpc_call_hash_add(call); + _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); - call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ; + call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; 
add_timer(&call->lifetimer); _leave(" = %p {%d} [new]", call, call->debug_id); return call; @@ -533,7 +727,7 @@ void rxrpc_release_call(struct rxrpc_call *call) del_timer_sync(&call->resend_timer); del_timer_sync(&call->ack_timer); del_timer_sync(&call->lifetimer); - call->deadspan.expires = jiffies + rxrpc_dead_call_timeout * HZ; + call->deadspan.expires = jiffies + rxrpc_dead_call_expiry; add_timer(&call->deadspan); _leave(""); @@ -665,6 +859,9 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call) rxrpc_put_connection(call->conn); } + /* Remove the call from the hash */ + rxrpc_call_hash_del(call); + if (call->acks_window) { _debug("kill Tx window %d", CIRC_CNT(call->acks_head, call->acks_tail, diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 7bf5b5b9e8b9..6631f4f1e39b 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -18,11 +18,15 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" +/* + * Time till a connection expires after last use (in seconds). 
+ */ +unsigned rxrpc_connection_expiry = 10 * 60; + static void rxrpc_connection_reaper(struct work_struct *work); LIST_HEAD(rxrpc_connections); DEFINE_RWLOCK(rxrpc_connection_lock); -static unsigned long rxrpc_connection_timeout = 10 * 60; static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); /* @@ -862,7 +866,7 @@ static void rxrpc_connection_reaper(struct work_struct *work) spin_lock(&conn->trans->client_lock); write_lock(&conn->trans->conn_lock); - reap_time = conn->put_time + rxrpc_connection_timeout; + reap_time = conn->put_time + rxrpc_connection_expiry; if (atomic_read(&conn->usage) > 0) { ; @@ -916,7 +920,7 @@ void __exit rxrpc_destroy_all_connections(void) { _enter(""); - rxrpc_connection_timeout = 0; + rxrpc_connection_expiry = 0; cancel_delayed_work(&rxrpc_connection_reap); rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index a9206087b4d7..db57458c824c 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -83,6 +83,7 @@ void rxrpc_UDP_error_report(struct sock *sk) if (mtu == 0) { /* they didn't give us a size, estimate one */ + mtu = peer->if_mtu; if (mtu > 1500) { mtu >>= 1; if (mtu < 1500) diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 529572f18d1f..73742647c135 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -25,8 +25,6 @@ #include <net/net_namespace.h> #include "ar-internal.h" -unsigned long rxrpc_ack_timeout = 1; - const char *rxrpc_pkts[] = { "?00", "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", @@ -349,8 +347,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) * it */ if (sp->hdr.flags & RXRPC_REQUEST_ACK) { _proto("ACK Requested on %%%u", serial); - rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, - !(sp->hdr.flags & RXRPC_MORE_PACKETS)); + rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false); } switch (sp->hdr.type) { @@ -526,36 +523,38 @@ 
protocol_error: * post an incoming packet to the appropriate call/socket to deal with * - must get rid of the sk_buff, either by freeing it or by queuing it */ -static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, +static void rxrpc_post_packet_to_call(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp; - struct rxrpc_call *call; - struct rb_node *p; - __be32 call_id; - - _enter("%p,%p", conn, skb); - read_lock_bh(&conn->lock); + _enter("%p,%p", call, skb); sp = rxrpc_skb(skb); - /* look at extant calls by channel number first */ - call = conn->channels[ntohl(sp->hdr.cid) & RXRPC_CHANNELMASK]; - if (!call || call->call_id != sp->hdr.callNumber) - goto call_not_extant; - _debug("extant call [%d]", call->state); - ASSERTCMP(call->conn, ==, conn); read_lock(&call->state_lock); switch (call->state) { case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) { rxrpc_queue_call(call); + goto free_unlock; + } case RXRPC_CALL_REMOTELY_ABORTED: case RXRPC_CALL_NETWORK_ERROR: case RXRPC_CALL_DEAD: + goto dead_call; + case RXRPC_CALL_COMPLETE: + case RXRPC_CALL_CLIENT_FINAL_ACK: + /* complete server call */ + if (call->conn->in_clientflag) + goto dead_call; + /* resend last packet of a completed call */ + _debug("final ack again"); + rxrpc_get_call(call); + set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + rxrpc_queue_call(call); goto free_unlock; default: break; @@ -563,7 +562,6 @@ static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, read_unlock(&call->state_lock); rxrpc_get_call(call); - read_unlock_bh(&conn->lock); if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.flags & RXRPC_JUMBO_PACKET) @@ -574,78 +572,16 @@ static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, rxrpc_put_call(call); goto done; -call_not_extant: - /* search the completed calls in case what we're dealing with is - * there */ - _debug("call not 
extant"); - - call_id = sp->hdr.callNumber; - p = conn->calls.rb_node; - while (p) { - call = rb_entry(p, struct rxrpc_call, conn_node); - - if (call_id < call->call_id) - p = p->rb_left; - else if (call_id > call->call_id) - p = p->rb_right; - else - goto found_completed_call; - } - dead_call: - /* it's a either a really old call that we no longer remember or its a - * new incoming call */ - read_unlock_bh(&conn->lock); - - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && - sp->hdr.seq == cpu_to_be32(1)) { - _debug("incoming call"); - skb_queue_tail(&conn->trans->local->accept_queue, skb); - rxrpc_queue_work(&conn->trans->local->acceptor); - goto done; - } - - _debug("dead call"); - skb->priority = RX_CALL_DEAD; - rxrpc_reject_packet(conn->trans->local, skb); - goto done; - - /* resend last packet of a completed call - * - client calls may have been aborted or ACK'd - * - server calls may have been aborted - */ -found_completed_call: - _debug("completed call"); - - if (atomic_read(&call->usage) == 0) - goto dead_call; - - /* synchronise any state changes */ - read_lock(&call->state_lock); - ASSERTIFCMP(call->state != RXRPC_CALL_CLIENT_FINAL_ACK, - call->state, >=, RXRPC_CALL_COMPLETE); - - if (call->state == RXRPC_CALL_LOCALLY_ABORTED || - call->state == RXRPC_CALL_REMOTELY_ABORTED || - call->state == RXRPC_CALL_DEAD) { - read_unlock(&call->state_lock); - goto dead_call; - } - - if (call->conn->in_clientflag) { - read_unlock(&call->state_lock); - goto dead_call; /* complete server call */ + if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { + skb->priority = RX_CALL_DEAD; + rxrpc_reject_packet(call->conn->trans->local, skb); + goto unlock; } - - _debug("final ack again"); - rxrpc_get_call(call); - set_bit(RXRPC_CALL_ACK_FINAL, &call->events); - rxrpc_queue_call(call); - free_unlock: - read_unlock(&call->state_lock); - read_unlock_bh(&conn->lock); rxrpc_free_skb(skb); +unlock: + read_unlock(&call->state_lock); done: _leave(""); } @@ -664,17 +600,42 @@ static void 
rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, rxrpc_queue_conn(conn); } +static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, + struct sk_buff *skb, + struct rxrpc_skb_priv *sp) +{ + struct rxrpc_peer *peer; + struct rxrpc_transport *trans; + struct rxrpc_connection *conn; + + peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, + udp_hdr(skb)->source); + if (IS_ERR(peer)) + goto cant_find_conn; + + trans = rxrpc_find_transport(local, peer); + rxrpc_put_peer(peer); + if (!trans) + goto cant_find_conn; + + conn = rxrpc_find_connection(trans, &sp->hdr); + rxrpc_put_transport(trans); + if (!conn) + goto cant_find_conn; + + return conn; +cant_find_conn: + return NULL; +} + /* * handle data received on the local endpoint * - may be called in interrupt context */ void rxrpc_data_ready(struct sock *sk, int count) { - struct rxrpc_connection *conn; - struct rxrpc_transport *trans; struct rxrpc_skb_priv *sp; struct rxrpc_local *local; - struct rxrpc_peer *peer; struct sk_buff *skb; int ret; @@ -749,27 +710,34 @@ void rxrpc_data_ready(struct sock *sk, int count) (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) goto bad_message; - peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, udp_hdr(skb)->source); - if (IS_ERR(peer)) - goto cant_route_call; + if (sp->hdr.callNumber == 0) { + /* This is a connection-level packet. 
These should be + * fairly rare, so the extra overhead of looking them up the + * old-fashioned way doesn't really hurt */ + struct rxrpc_connection *conn; - trans = rxrpc_find_transport(local, peer); - rxrpc_put_peer(peer); - if (!trans) - goto cant_route_call; + conn = rxrpc_conn_from_local(local, skb, sp); + if (!conn) + goto cant_route_call; - conn = rxrpc_find_connection(trans, &sp->hdr); - rxrpc_put_transport(trans); - if (!conn) - goto cant_route_call; - - _debug("CONN %p {%d}", conn, conn->debug_id); - - if (sp->hdr.callNumber == 0) + _debug("CONN %p {%d}", conn, conn->debug_id); rxrpc_post_packet_to_conn(conn, skb); - else - rxrpc_post_packet_to_call(conn, skb); - rxrpc_put_connection(conn); + rxrpc_put_connection(conn); + } else { + struct rxrpc_call *call; + u8 in_clientflag = 0; + + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + in_clientflag = RXRPC_CLIENT_INITIATED; + call = rxrpc_find_call_hash(in_clientflag, sp->hdr.cid, + sp->hdr.callNumber, sp->hdr.epoch, + sp->hdr.serviceId, local, AF_INET, + (u8 *)&ip_hdr(skb)->saddr); + if (call) + rxrpc_post_packet_to_call(call, skb); + else + goto cant_route_call; + } rxrpc_put_local(local); return; @@ -790,8 +758,10 @@ cant_route_call: skb->priority = RX_CALL_DEAD; } - _debug("reject"); - rxrpc_reject_packet(local, skb); + if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { + _debug("reject type %d",sp->hdr.type); + rxrpc_reject_packet(local, skb); + } rxrpc_put_local(local); _leave(" [no call]"); return; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5f43675ee1df..c831d44b0841 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -396,9 +396,20 @@ struct rxrpc_call { #define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; + struct hlist_node hash_node; + unsigned long hash_key; /* Full hash key */ + u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */ + struct rxrpc_local *local; /* Local 
endpoint. Used for hashing. */ + sa_family_t proto; /* Frame protocol */ /* the following should all be in net order */ __be32 cid; /* connection ID + channel index */ __be32 call_id; /* call ID on connection */ + __be32 epoch; /* epoch of this connection */ + __be16 service_id; /* service ID */ + union { /* Peer IP address for hashing */ + __be32 ipv4_addr; + __u8 ipv6_addr[16]; /* Anticipates eventual IPv6 support */ + } peer_ip; }; /* @@ -433,6 +444,13 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * ar-ack.c */ +extern unsigned rxrpc_requested_ack_delay; +extern unsigned rxrpc_soft_ack_delay; +extern unsigned rxrpc_idle_ack_delay; +extern unsigned rxrpc_rx_window_size; +extern unsigned rxrpc_rx_mtu; +extern unsigned rxrpc_rx_jumbo_max; + void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); void rxrpc_process_call(struct work_struct *); @@ -440,10 +458,14 @@ void rxrpc_process_call(struct work_struct *); /* * ar-call.c */ +extern unsigned rxrpc_max_call_lifetime; +extern unsigned rxrpc_dead_call_expiry; extern struct kmem_cache *rxrpc_call_jar; extern struct list_head rxrpc_calls; extern rwlock_t rxrpc_call_lock; +struct rxrpc_call *rxrpc_find_call_hash(u8, __be32, __be32, __be32, + __be16, void *, sa_family_t, const u8 *); struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *, struct rxrpc_transport *, struct rxrpc_conn_bundle *, @@ -460,6 +482,7 @@ void __exit rxrpc_destroy_all_calls(void); /* * ar-connection.c */ +extern unsigned rxrpc_connection_expiry; extern struct list_head rxrpc_connections; extern rwlock_t rxrpc_connection_lock; @@ -493,7 +516,6 @@ void rxrpc_UDP_error_handler(struct work_struct *); /* * ar-input.c */ -extern unsigned long rxrpc_ack_timeout; extern const char *rxrpc_pkts[]; void rxrpc_data_ready(struct sock *, int); @@ -504,6 +526,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); * ar-local.c */ extern rwlock_t 
rxrpc_local_lock; + struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *); void rxrpc_put_local(struct rxrpc_local *); void __exit rxrpc_destroy_all_locals(void); @@ -522,7 +545,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, /* * ar-output.c */ -extern int rxrpc_resend_timeout; +extern unsigned rxrpc_resend_timeout; int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); int rxrpc_client_sendmsg(struct kiocb *, struct rxrpc_sock *, @@ -572,6 +595,8 @@ void rxrpc_packet_destructor(struct sk_buff *); /* * ar-transport.c */ +extern unsigned rxrpc_transport_expiry; + struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, struct rxrpc_peer *, gfp_t); void rxrpc_put_transport(struct rxrpc_transport *); @@ -580,6 +605,17 @@ struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, struct rxrpc_peer *); /* + * sysctl.c + */ +#ifdef CONFIG_SYSCTL +extern int __init rxrpc_sysctl_init(void); +extern void rxrpc_sysctl_exit(void); +#else +static inline int __init rxrpc_sysctl_init(void) { return 0; } +static inline void rxrpc_sysctl_exit(void) {} +#endif + +/* * debug tracing */ extern unsigned int rxrpc_debug; diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index d0e8f1c1898a..0b4b9a79f5ab 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -18,7 +18,10 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -int rxrpc_resend_timeout = 4; +/* + * Time till packet resend (in jiffies). 
+ */ +unsigned rxrpc_resend_timeout = 4 * HZ; static int rxrpc_send_data(struct kiocb *iocb, struct rxrpc_sock *rx, @@ -487,7 +490,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); sp->need_resend = false; - sp->resend_at = jiffies + rxrpc_resend_timeout * HZ; + sp->resend_at = jiffies + rxrpc_resend_timeout; if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { _debug("run timer"); call->resend_timer.expires = sp->resend_at; @@ -666,6 +669,7 @@ static int rxrpc_send_data(struct kiocb *iocb, /* add the packet to the send queue if it's now full */ if (sp->remain <= 0 || (segment == 0 && !more)) { struct rxrpc_connection *conn = call->conn; + uint32_t seq; size_t pad; /* pad out if we're using security */ @@ -678,11 +682,12 @@ static int rxrpc_send_data(struct kiocb *iocb, memset(skb_put(skb, pad), 0, pad); } + seq = atomic_inc_return(&call->sequence); + sp->hdr.epoch = conn->epoch; sp->hdr.cid = call->cid; sp->hdr.callNumber = call->call_id; - sp->hdr.seq = - htonl(atomic_inc_return(&call->sequence)); + sp->hdr.seq = htonl(seq); sp->hdr.serial = htonl(atomic_inc_return(&conn->serial)); sp->hdr.type = RXRPC_PACKET_TYPE_DATA; @@ -697,6 +702,8 @@ static int rxrpc_send_data(struct kiocb *iocb, else if (CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz) > 1) sp->hdr.flags |= RXRPC_MORE_PACKETS; + if (more && seq & 1) + sp->hdr.flags |= RXRPC_REQUEST_ACK; ret = rxrpc_secure_packet( call, skb, skb->mark, diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 34b5490dde65..e9aaa65c0778 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -180,16 +180,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (copy > len - copied) copy = len - copied; - if (skb->ip_summed == CHECKSUM_UNNECESSARY || - skb->ip_summed == CHECKSUM_PARTIAL) { - ret = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, copy); - } else { - ret = 
skb_copy_and_csum_datagram_iovec(skb, offset, - msg->msg_iov); - if (ret == -EINVAL) - goto csum_copy_error; - } + ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy); if (ret < 0) goto copy_error; @@ -348,20 +339,6 @@ copy_error: _leave(" = %d", ret); return ret; -csum_copy_error: - _debug("csum error"); - release_sock(&rx->sk); - if (continue_call) - rxrpc_put_call(continue_call); - rxrpc_kill_skb(skb); - if (!(flags & MSG_PEEK)) { - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - } - skb_kill_datagram(&rx->sk, skb, flags); - rxrpc_put_call(call); - return -EAGAIN; - wait_interrupted: ret = sock_intr_errno(timeo); wait_error: diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c index de755e04d29c..4cfab49e329d 100644 --- a/net/rxrpc/ar-skbuff.c +++ b/net/rxrpc/ar-skbuff.c @@ -83,9 +83,14 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call, rxrpc_request_final_ACK(call); } else if (atomic_dec_and_test(&call->ackr_not_idle) && test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { + /* We previously soft-ACK'd some received packets that have now + * been consumed, so send a hard-ACK if no more packets are + * immediately forthcoming to allow the transmitter to free up + * its Tx bufferage. + */ _debug("send Rx idle ACK"); __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial, - true); + false); } spin_unlock_bh(&call->lock); diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c index 92df566930b9..1976dec84f29 100644 --- a/net/rxrpc/ar-transport.c +++ b/net/rxrpc/ar-transport.c @@ -17,11 +17,15 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" +/* + * Time after last use at which transport record is cleaned up. 
+ */ +unsigned rxrpc_transport_expiry = 3600 * 24; + static void rxrpc_transport_reaper(struct work_struct *work); static LIST_HEAD(rxrpc_transports); static DEFINE_RWLOCK(rxrpc_transport_lock); -static unsigned long rxrpc_transport_timeout = 3600 * 24; static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper); /* @@ -235,7 +239,7 @@ static void rxrpc_transport_reaper(struct work_struct *work) if (likely(atomic_read(&trans->usage) > 0)) continue; - reap_time = trans->put_time + rxrpc_transport_timeout; + reap_time = trans->put_time + rxrpc_transport_expiry; if (reap_time <= now) list_move_tail(&trans->link, &graveyard); else if (reap_time < earliest) @@ -271,7 +275,7 @@ void __exit rxrpc_destroy_all_transports(void) { _enter(""); - rxrpc_transport_timeout = 0; + rxrpc_transport_expiry = 0; cancel_delayed_work(&rxrpc_transport_reap); rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c new file mode 100644 index 000000000000..50a98a910eb1 --- /dev/null +++ b/net/rxrpc/sysctl.c @@ -0,0 +1,146 @@ +/* sysctls for configuring RxRPC operating parameters + * + * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/sysctl.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +static struct ctl_table_header *rxrpc_sysctl_reg_table; +static const unsigned zero = 0; +static const unsigned one = 1; +static const unsigned four = 4; +static const unsigned n_65535 = 65535; +static const unsigned n_max_acks = RXRPC_MAXACKS; + +/* + * RxRPC operating parameters. 
+ * + * See Documentation/networking/rxrpc.txt and the variable definitions for more + * information on the individual parameters. + */ +static struct ctl_table rxrpc_sysctl_table[] = { + /* Values measured in milliseconds */ + { + .procname = "req_ack_delay", + .data = &rxrpc_requested_ack_delay, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&zero, + }, + { + .procname = "soft_ack_delay", + .data = &rxrpc_soft_ack_delay, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&one, + }, + { + .procname = "idle_ack_delay", + .data = &rxrpc_idle_ack_delay, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&one, + }, + { + .procname = "resend_timeout", + .data = &rxrpc_resend_timeout, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&one, + }, + + /* Values measured in seconds but used in jiffies */ + { + .procname = "max_call_lifetime", + .data = &rxrpc_max_call_lifetime, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + .extra1 = (void *)&one, + }, + { + .procname = "dead_call_expiry", + .data = &rxrpc_dead_call_expiry, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + .extra1 = (void *)&one, + }, + + /* Values measured in seconds */ + { + .procname = "connection_expiry", + .data = &rxrpc_connection_expiry, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + }, + { + .procname = "transport_expiry", + .data = &rxrpc_transport_expiry, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + }, + + /* Non-time values */ + { + .procname = "rx_window_size", + .data = &rxrpc_rx_window_size, + .maxlen = sizeof(unsigned), + .mode = 0644, + 
.proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + .extra2 = (void *)&n_max_acks, + }, + { + .procname = "rx_mtu", + .data = &rxrpc_rx_mtu, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + .extra2 = (void *)&n_65535, + }, + { + .procname = "rx_jumbo_max", + .data = &rxrpc_rx_jumbo_max, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + .extra2 = (void *)&four, + }, + + { } +}; + +int __init rxrpc_sysctl_init(void) +{ + rxrpc_sysctl_reg_table = register_net_sysctl(&init_net, "net/rxrpc", + rxrpc_sysctl_table); + if (!rxrpc_sysctl_reg_table) + return -ENOMEM; + return 0; +} + +void rxrpc_sysctl_exit(void) +{ + if (rxrpc_sysctl_reg_table) + unregister_net_sysctl_table(rxrpc_sysctl_reg_table); +} diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 722e137df244..9f949abcacef 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1062,12 +1062,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) { - spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); struct htb_sched *q = qdisc_priv(sch); struct nlattr *nest; struct tc_htb_glob gopt; - spin_lock_bh(root_lock); + /* It's safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the qdisc parameters. 
+ */ gopt.direct_pkts = q->direct_pkts; gopt.version = HTB_VER; @@ -1081,13 +1082,10 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen)) goto nla_put_failure; - nla_nest_end(skb, nest); - spin_unlock_bh(root_lock); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: - spin_unlock_bh(root_lock); nla_nest_cancel(skb, nest); return -1; } @@ -1096,11 +1094,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct htb_class *cl = (struct htb_class *)arg; - spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); struct nlattr *nest; struct tc_htb_opt opt; - spin_lock_bh(root_lock); + /* It's safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the class parameters. + */ tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT; tcm->tcm_handle = cl->common.classid; if (!cl->level && cl->un.leaf.q) @@ -1128,12 +1127,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) goto nla_put_failure; - nla_nest_end(skb, nest); - spin_unlock_bh(root_lock); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: - spin_unlock_bh(root_lock); nla_nest_cancel(skb, nest); return -1; } |