78 files changed, 2901 insertions, 1809 deletions
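Note: the following is an illustrative sketch, not part of the diff below. A recurring theme of the hunks that follow is replacing open-coded kernel_setsockopt() calls (as in the removed drbd_tcp_*() wrappers and the siw connection-manager code) with typed in-kernel helpers such as sock_set_reuseaddr(), tcp_sock_set_nodelay(), tcp_sock_set_cork() and tcp_sock_set_quickack(), whose call signatures are visible in the diff. The sketch assumes only those signatures; the function name my_setup_data_socket() is hypothetical.

#include <linux/net.h>
#include <linux/tcp.h>
#include <net/sock.h>

/* Hypothetical in-kernel user configuring a freshly created TCP socket
 * with the typed helpers instead of kernel_setsockopt() option buffers.
 */
static void my_setup_data_socket(struct socket *sock)
{
	/* Allow the local port to be rebound quickly (SO_REUSEADDR). */
	sock_set_reuseaddr(sock->sk);

	/* Latency-critical path: disable Nagle (TCP_NODELAY). */
	tcp_sock_set_nodelay(sock->sk);

	/* Batch a burst of small sends, then release them (TCP_CORK),
	 * mirroring the cork/uncork pattern drbd uses around ack batches.
	 */
	tcp_sock_set_cork(sock->sk, true);
	/* ... queue several sendmsg() calls here ... */
	tcp_sock_set_cork(sock->sk, false);

	/* Ack received data promptly (TCP_QUICKACK), as drbd's receiver does. */
	tcp_sock_set_quickack(sock->sk, 2);
}
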
diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index 5ad35113d0f4..68552b92dc44 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -477,7 +477,7 @@ AF_RXRPC sockets support a few socket options at the SOL_RXRPC level: Encrypted checksum plus packet padded and first eight bytes of packet encrypted - which includes the actual packet length. - (c) RXRPC_SECURITY_ENCRYPTED + (c) RXRPC_SECURITY_ENCRYPT Encrypted checksum plus entire packet padded and encrypted, including actual packet length. @@ -578,7 +578,7 @@ A client would issue an operation by: This issues a request_key() to get the key representing the security context. The minimum security level can be set:: - unsigned int sec = RXRPC_SECURITY_ENCRYPTED; + unsigned int sec = RXRPC_SECURITY_ENCRYPT; setsockopt(client, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, &sec, sizeof(sec)); @@ -1090,6 +1090,15 @@ The kernel interface functions are as follows: jiffies). In the event of the timeout occurring, the call will be aborted and -ETIME or -ETIMEDOUT will be returned. + (#) Apply the RXRPC_MIN_SECURITY_LEVEL sockopt to a socket from within in the + kernel:: + + int rxrpc_sock_set_min_security_level(struct sock *sk, + unsigned int val); + + This specifies the minimum security level required for calls on this + socket. + Configurable Parameters ======================= diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index aae99a2d7bd4..14345a87c7cc 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1570,34 +1570,6 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -static inline void drbd_tcp_cork(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_uncork(struct socket *sock) -{ - int val = 0; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_nodelay(struct socket *sock) -{ - int val = 1; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char*)&val, sizeof(val)); -} - -static inline void drbd_tcp_quickack(struct socket *sock) -{ - int val = 2; - (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, - (char*)&val, sizeof(val)); -} - /* sets the number of 512 byte sectors of our virtual device */ void drbd_set_my_capacity(struct drbd_device *device, sector_t size); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c094c3c2c5d4..45fbd526c453 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -660,7 +660,7 @@ static int __send_command(struct drbd_connection *connection, int vnr, /* DRBD protocol "pings" are latency critical. * This is supposed to trigger tcp_push_pending_frames() */ if (!err && (cmd == P_PING || cmd == P_PING_ACK)) - drbd_tcp_nodelay(sock->socket); + tcp_sock_set_nodelay(sock->socket->sk); return err; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c15e7083b13a..3a3f2b6a821f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1051,8 +1051,8 @@ randomize: /* we don't want delays. 
* we use TCP_CORK where appropriate, though */ - drbd_tcp_nodelay(sock.socket); - drbd_tcp_nodelay(msock.socket); + tcp_sock_set_nodelay(sock.socket->sk); + tcp_sock_set_nodelay(msock.socket->sk); connection->data.socket = sock.socket; connection->meta.socket = msock.socket; @@ -1223,7 +1223,7 @@ static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, str * quickly as possible, and let remote TCP know what we have * received so far. */ if (err == -EAGAIN) { - drbd_tcp_quickack(connection->data.socket); + tcp_sock_set_quickack(connection->data.socket->sk, 2); drbd_unplug_all_devices(connection); } if (err > 0) { @@ -4959,8 +4959,7 @@ static int receive_UnplugRemote(struct drbd_connection *connection, struct packe { /* Make sure we've acked all the TCP data associated * with the data requests being unplugged */ - drbd_tcp_quickack(connection->data.socket); - + tcp_sock_set_quickack(connection->data.socket->sk, 2); return 0; } @@ -6162,7 +6161,7 @@ void drbd_send_acks_wf(struct work_struct *ws) rcu_read_unlock(); if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, true); err = drbd_finish_peer_reqs(device); kref_put(&device->kref, drbd_destroy_device); @@ -6175,7 +6174,7 @@ void drbd_send_acks_wf(struct work_struct *ws) } if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, false); return; } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0dc019da1f8d..2b89c9f2ca70 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -2098,7 +2098,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * if (uncork) { mutex_lock(&connection->data.mutex); if (connection->data.socket) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); mutex_unlock(&connection->data.mutex); } @@ -2153,9 +2153,9 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * mutex_lock(&connection->data.mutex); if (connection->data.socket) { if (cork) - drbd_tcp_cork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, true); else if (!uncork) - drbd_tcp_uncork(connection->data.socket); + tcp_sock_set_cork(connection->data.socket->sk, false); } mutex_unlock(&connection->data.mutex); } diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 559e5fd3bad8..1662216be66d 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -947,16 +947,8 @@ static void siw_accept_newconn(struct siw_cep *cep) siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rv) { - siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); @@ -1312,17 +1304,14 @@ static void siw_cm_llp_state_change(struct sock *sk) static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr) { - int rv, flags = 0, s_val = 1; + int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. 
*/ - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv < 0) - return rv; + sock_set_reuseaddr(s->sk); rv = s->ops->bind(s, laddr, size); if (rv < 0) @@ -1389,16 +1378,8 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } - if (siw_tcp_nagle == false) { - int val = 1; - - rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val, - sizeof(val)); - if (rv) { - siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv); - goto error; - } - } + if (siw_tcp_nagle == false) + tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; @@ -1781,7 +1762,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct siw_cep *cep = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; - int rv = 0, s_val; + int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; @@ -1793,13 +1774,8 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) /* * Allow binding local port when still in TIME_WAIT from last close. */ - s_val = 1; - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv) { - siw_dbg(id->device, "setsockopt error: %d\n", rv); - goto error; - } + sock_set_reuseaddr(s->sk); + if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 198d2a7d7f95..cb3c81a49fbc 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -84,6 +84,7 @@ struct sja1105_info { * the egress timestamps. */ int ptpegr_ts_bytes; + int num_cbs_shapers; const struct sja1105_dynamic_table_ops *dyn_ops; const struct sja1105_table_ops *static_ops; const struct sja1105_regs *regs; @@ -218,6 +219,7 @@ struct sja1105_private { struct mutex mgmt_lock; bool expect_dsa_8021q; enum sja1105_vlan_state vlan_state; + struct sja1105_cbs_entry *cbs; struct sja1105_tagger_data tagger_data; struct sja1105_ptp_data ptp_data; struct sja1105_tas_data tas_data; diff --git a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c index 2a8fbd7fdedc..7516f2ffdd4e 100644 --- a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c +++ b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c @@ -136,6 +136,12 @@ #define SJA1105_SIZE_RETAGGING_DYN_CMD \ (SJA1105_SIZE_DYN_CMD + SJA1105_SIZE_RETAGGING_ENTRY) +#define SJA1105ET_SIZE_CBS_DYN_CMD \ + (SJA1105_SIZE_DYN_CMD + SJA1105ET_SIZE_CBS_ENTRY) + +#define SJA1105PQRS_SIZE_CBS_DYN_CMD \ + (SJA1105_SIZE_DYN_CMD + SJA1105PQRS_SIZE_CBS_ENTRY) + #define SJA1105_MAX_DYN_CMD_SIZE \ SJA1105PQRS_SIZE_MAC_CONFIG_DYN_CMD @@ -542,6 +548,60 @@ sja1105_retagging_cmd_packing(void *buf, struct sja1105_dyn_cmd *cmd, sja1105_packing(p, &cmd->index, 5, 0, size, op); } +static void sja1105et_cbs_cmd_packing(void *buf, struct sja1105_dyn_cmd *cmd, + enum packing_op op) +{ + u8 *p = buf + SJA1105ET_SIZE_CBS_ENTRY; + const int size = SJA1105_SIZE_DYN_CMD; + + sja1105_packing(p, &cmd->valid, 31, 31, size, op); + sja1105_packing(p, &cmd->index, 19, 16, size, op); +} + +static size_t sja1105et_cbs_entry_packing(void *buf, void *entry_ptr, + enum packing_op op) +{ + const size_t size = SJA1105ET_SIZE_CBS_ENTRY; + struct sja1105_cbs_entry *entry = entry_ptr; + u8 *cmd = buf + size; + u32 *p = buf; + + sja1105_packing(cmd, &entry->port, 5, 3, SJA1105_SIZE_DYN_CMD, op); + 
sja1105_packing(cmd, &entry->prio, 2, 0, SJA1105_SIZE_DYN_CMD, op); + sja1105_packing(p + 3, &entry->credit_lo, 31, 0, size, op); + sja1105_packing(p + 2, &entry->credit_hi, 31, 0, size, op); + sja1105_packing(p + 1, &entry->send_slope, 31, 0, size, op); + sja1105_packing(p + 0, &entry->idle_slope, 31, 0, size, op); + return size; +} + +static void sja1105pqrs_cbs_cmd_packing(void *buf, struct sja1105_dyn_cmd *cmd, + enum packing_op op) +{ + u8 *p = buf + SJA1105PQRS_SIZE_CBS_ENTRY; + const int size = SJA1105_SIZE_DYN_CMD; + + sja1105_packing(p, &cmd->valid, 31, 31, size, op); + sja1105_packing(p, &cmd->rdwrset, 30, 30, size, op); + sja1105_packing(p, &cmd->errors, 29, 29, size, op); + sja1105_packing(p, &cmd->index, 3, 0, size, op); +} + +static size_t sja1105pqrs_cbs_entry_packing(void *buf, void *entry_ptr, + enum packing_op op) +{ + const size_t size = SJA1105PQRS_SIZE_CBS_ENTRY; + struct sja1105_cbs_entry *entry = entry_ptr; + + sja1105_packing(buf, &entry->port, 159, 157, size, op); + sja1105_packing(buf, &entry->prio, 156, 154, size, op); + sja1105_packing(buf, &entry->credit_lo, 153, 122, size, op); + sja1105_packing(buf, &entry->credit_hi, 121, 90, size, op); + sja1105_packing(buf, &entry->send_slope, 89, 58, size, op); + sja1105_packing(buf, &entry->idle_slope, 57, 26, size, op); + return size; +} + #define OP_READ BIT(0) #define OP_WRITE BIT(1) #define OP_DEL BIT(2) @@ -631,6 +691,14 @@ struct sja1105_dynamic_table_ops sja1105et_dyn_ops[BLK_IDX_MAX_DYN] = { .packed_size = SJA1105_SIZE_RETAGGING_DYN_CMD, .addr = 0x31, }, + [BLK_IDX_CBS] = { + .entry_packing = sja1105et_cbs_entry_packing, + .cmd_packing = sja1105et_cbs_cmd_packing, + .max_entry_count = SJA1105ET_MAX_CBS_COUNT, + .access = OP_WRITE, + .packed_size = SJA1105ET_SIZE_CBS_DYN_CMD, + .addr = 0x2c, + }, [BLK_IDX_XMII_PARAMS] = {0}, }; @@ -725,6 +793,14 @@ struct sja1105_dynamic_table_ops sja1105pqrs_dyn_ops[BLK_IDX_MAX_DYN] = { .packed_size = SJA1105_SIZE_RETAGGING_DYN_CMD, .addr = 0x38, }, + [BLK_IDX_CBS] = { + .entry_packing = sja1105pqrs_cbs_entry_packing, + .cmd_packing = sja1105pqrs_cbs_cmd_packing, + .max_entry_count = SJA1105PQRS_MAX_CBS_COUNT, + .access = OP_WRITE, + .packed_size = SJA1105PQRS_SIZE_CBS_DYN_CMD, + .addr = 0x32, + }, [BLK_IDX_XMII_PARAMS] = {0}, }; diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 44ce7882dfb1..36ab527449e6 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1640,6 +1640,92 @@ static void sja1105_bridge_leave(struct dsa_switch *ds, int port, sja1105_bridge_member(ds, port, br, false); } +#define BYTES_PER_KBIT (1000LL / 8) + +static int sja1105_find_unused_cbs_shaper(struct sja1105_private *priv) +{ + int i; + + for (i = 0; i < priv->info->num_cbs_shapers; i++) + if (!priv->cbs[i].idle_slope && !priv->cbs[i].send_slope) + return i; + + return -1; +} + +static int sja1105_delete_cbs_shaper(struct sja1105_private *priv, int port, + int prio) +{ + int i; + + for (i = 0; i < priv->info->num_cbs_shapers; i++) { + struct sja1105_cbs_entry *cbs = &priv->cbs[i]; + + if (cbs->port == port && cbs->prio == prio) { + memset(cbs, 0, sizeof(*cbs)); + return sja1105_dynamic_config_write(priv, BLK_IDX_CBS, + i, cbs, true); + } + } + + return 0; +} + +static int sja1105_setup_tc_cbs(struct dsa_switch *ds, int port, + struct tc_cbs_qopt_offload *offload) +{ + struct sja1105_private *priv = ds->priv; + struct sja1105_cbs_entry *cbs; + int index; + + if (!offload->enable) + return 
sja1105_delete_cbs_shaper(priv, port, offload->queue); + + index = sja1105_find_unused_cbs_shaper(priv); + if (index < 0) + return -ENOSPC; + + cbs = &priv->cbs[index]; + cbs->port = port; + cbs->prio = offload->queue; + /* locredit and sendslope are negative by definition. In hardware, + * positive values must be provided, and the negative sign is implicit. + */ + cbs->credit_hi = offload->hicredit; + cbs->credit_lo = abs(offload->locredit); + /* User space is in kbits/sec, hardware in bytes/sec */ + cbs->idle_slope = offload->idleslope * BYTES_PER_KBIT; + cbs->send_slope = abs(offload->sendslope * BYTES_PER_KBIT); + /* Convert the negative values from 64-bit 2's complement + * to 32-bit 2's complement (for the case of 0x80000000 whose + * negative is still negative). + */ + cbs->credit_lo &= GENMASK_ULL(31, 0); + cbs->send_slope &= GENMASK_ULL(31, 0); + + return sja1105_dynamic_config_write(priv, BLK_IDX_CBS, index, cbs, + true); +} + +static int sja1105_reload_cbs(struct sja1105_private *priv) +{ + int rc = 0, i; + + for (i = 0; i < priv->info->num_cbs_shapers; i++) { + struct sja1105_cbs_entry *cbs = &priv->cbs[i]; + + if (!cbs->idle_slope && !cbs->send_slope) + continue; + + rc = sja1105_dynamic_config_write(priv, BLK_IDX_CBS, i, cbs, + true); + if (rc) + break; + } + + return rc; +} + static const char * const sja1105_reset_reasons[] = { [SJA1105_VLAN_FILTERING] = "VLAN filtering", [SJA1105_RX_HWTSTAMPING] = "RX timestamping", @@ -1754,6 +1840,10 @@ out_unlock_ptp: sja1105_sgmii_pcs_force_speed(priv, speed); } } + + rc = sja1105_reload_cbs(priv); + if (rc < 0) + goto out; out: mutex_unlock(&priv->mgmt_lock); @@ -3131,6 +3221,8 @@ static int sja1105_port_setup_tc(struct dsa_switch *ds, int port, switch (type) { case TC_SETUP_QDISC_TAPRIO: return sja1105_setup_tc_taprio(ds, port, type_data); + case TC_SETUP_QDISC_CBS: + return sja1105_setup_tc_cbs(ds, port, type_data); default: return -EOPNOTSUPP; } @@ -3408,6 +3500,14 @@ static int sja1105_probe(struct spi_device *spi) if (rc) return rc; + if (IS_ENABLED(CONFIG_NET_SCH_CBS)) { + priv->cbs = devm_kcalloc(dev, priv->info->num_cbs_shapers, + sizeof(struct sja1105_cbs_entry), + GFP_KERNEL); + if (!priv->cbs) + return -ENOMEM; + } + /* Connections between dsa_port and sja1105_port */ for (port = 0; port < SJA1105_NUM_PORTS; port++) { struct sja1105_port *sp = &priv->ports[port]; diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index a0dacae803cc..bb52b9c841b2 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -515,6 +515,7 @@ struct sja1105_info sja1105e_info = { .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, + .num_cbs_shapers = SJA1105ET_MAX_CBS_COUNT, .reset_cmd = sja1105et_reset_cmd, .fdb_add_cmd = sja1105et_fdb_add, .fdb_del_cmd = sja1105et_fdb_del, @@ -530,6 +531,7 @@ struct sja1105_info sja1105t_info = { .qinq_tpid = ETH_P_8021Q, .ptp_ts_bits = 24, .ptpegr_ts_bytes = 4, + .num_cbs_shapers = SJA1105ET_MAX_CBS_COUNT, .reset_cmd = sja1105et_reset_cmd, .fdb_add_cmd = sja1105et_fdb_add, .fdb_del_cmd = sja1105et_fdb_del, @@ -545,6 +547,7 @@ struct sja1105_info sja1105p_info = { .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, + .num_cbs_shapers = SJA1105PQRS_MAX_CBS_COUNT, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, .reset_cmd = sja1105pqrs_reset_cmd, .fdb_add_cmd = sja1105pqrs_fdb_add, @@ -561,6 +564,7 @@ struct sja1105_info sja1105q_info = { .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes 
= 8, + .num_cbs_shapers = SJA1105PQRS_MAX_CBS_COUNT, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, .reset_cmd = sja1105pqrs_reset_cmd, .fdb_add_cmd = sja1105pqrs_fdb_add, @@ -577,6 +581,7 @@ struct sja1105_info sja1105r_info = { .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, + .num_cbs_shapers = SJA1105PQRS_MAX_CBS_COUNT, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, .reset_cmd = sja1105pqrs_reset_cmd, .fdb_add_cmd = sja1105pqrs_fdb_add, @@ -594,6 +599,7 @@ struct sja1105_info sja1105s_info = { .qinq_tpid = ETH_P_8021AD, .ptp_ts_bits = 32, .ptpegr_ts_bytes = 8, + .num_cbs_shapers = SJA1105PQRS_MAX_CBS_COUNT, .setup_rgmii_delay = sja1105pqrs_setup_rgmii_delay, .reset_cmd = sja1105pqrs_reset_cmd, .fdb_add_cmd = sja1105pqrs_fdb_add, diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.h b/drivers/net/dsa/sja1105/sja1105_static_config.h index 5946847bb5b9..9b62b9b5549d 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.h +++ b/drivers/net/dsa/sja1105/sja1105_static_config.h @@ -30,11 +30,13 @@ #define SJA1105ET_SIZE_L2_LOOKUP_PARAMS_ENTRY 4 #define SJA1105ET_SIZE_GENERAL_PARAMS_ENTRY 40 #define SJA1105ET_SIZE_AVB_PARAMS_ENTRY 12 +#define SJA1105ET_SIZE_CBS_ENTRY 16 #define SJA1105PQRS_SIZE_L2_LOOKUP_ENTRY 20 #define SJA1105PQRS_SIZE_MAC_CONFIG_ENTRY 32 #define SJA1105PQRS_SIZE_L2_LOOKUP_PARAMS_ENTRY 16 #define SJA1105PQRS_SIZE_GENERAL_PARAMS_ENTRY 44 #define SJA1105PQRS_SIZE_AVB_PARAMS_ENTRY 16 +#define SJA1105PQRS_SIZE_CBS_ENTRY 20 /* UM10944.pdf Page 11, Table 2. Configuration Blocks */ enum { @@ -56,6 +58,7 @@ enum { BLKID_AVB_PARAMS = 0x10, BLKID_GENERAL_PARAMS = 0x11, BLKID_RETAGGING = 0x12, + BLKID_CBS = 0x13, BLKID_XMII_PARAMS = 0x4E, }; @@ -78,6 +81,7 @@ enum sja1105_blk_idx { BLK_IDX_AVB_PARAMS, BLK_IDX_GENERAL_PARAMS, BLK_IDX_RETAGGING, + BLK_IDX_CBS, BLK_IDX_XMII_PARAMS, BLK_IDX_MAX, /* Fake block indices that are only valid for dynamic access */ @@ -105,6 +109,8 @@ enum sja1105_blk_idx { #define SJA1105_MAX_RETAGGING_COUNT 32 #define SJA1105_MAX_XMII_PARAMS_COUNT 1 #define SJA1105_MAX_AVB_PARAMS_COUNT 1 +#define SJA1105ET_MAX_CBS_COUNT 10 +#define SJA1105PQRS_MAX_CBS_COUNT 16 #define SJA1105_MAX_FRAME_MEMORY 929 #define SJA1105_MAX_FRAME_MEMORY_RETAGGING 910 @@ -289,6 +295,15 @@ struct sja1105_retagging_entry { u64 destports; }; +struct sja1105_cbs_entry { + u64 port; + u64 prio; + u64 credit_hi; + u64 credit_lo; + u64 send_slope; + u64 idle_slope; +}; + struct sja1105_xmii_params_entry { u64 phy_mac[5]; u64 xmii_mode[5]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index e5ee9103fefb..b61e47bc16e8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -34,7 +34,8 @@ mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o -mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o lib/geneve.o lib/port_tun.o lag_mp.o +mlx5_core-$(CONFIG_MLX5_ESWITCH) += lag_mp.o lib/geneve.o lib/port_tun.o \ + en_rep.o en/rep/bond.o mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \ en/mapping.o esw/chains.o en/tc_tun.o \ en/tc_tun_vxlan.o en/tc_tun_gre.o en/tc_tun_geneve.o \ @@ -46,6 +47,10 @@ mlx5_core-$(CONFIG_MLX5_TC_CT) += en/tc_ct.o # mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \ ecpf.o 
rdma.o +mlx5_core-$(CONFIG_MLX5_ESWITCH) += esw/acl/helper.o \ + esw/acl/egress_lgcy.o esw/acl/egress_ofld.o \ + esw/acl/ingress_lgcy.o esw/acl/ingress_ofld.o + mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c index 8ecac81a385d..a700f3c86899 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c @@ -76,58 +76,59 @@ static void print_lyr_2_4_hdrs(struct trace_seq *p, .v = MLX5_GET(fte_match_set_lyr_2_4, value, dmac_47_16) << 16 | MLX5_GET(fte_match_set_lyr_2_4, value, dmac_15_0)}; MASK_VAL_L2(u16, ethertype, ethertype); + MASK_VAL_L2(u8, ip_version, ip_version); PRINT_MASKED_VALP(smac, u8 *, p, "%pM"); PRINT_MASKED_VALP(dmac, u8 *, p, "%pM"); PRINT_MASKED_VAL(ethertype, p, "%04x"); - if (ethertype.m == 0xffff) { - if (ethertype.v == ETH_P_IP) { + if ((ethertype.m == 0xffff && ethertype.v == ETH_P_IP) || + (ip_version.m == 0xf && ip_version.v == 4)) { #define MASK_VAL_L2_BE(type, name, fld) \ MASK_VAL_BE(type, fte_match_set_lyr_2_4, name, mask, value, fld) - MASK_VAL_L2_BE(u32, src_ipv4, - src_ipv4_src_ipv6.ipv4_layout.ipv4); - MASK_VAL_L2_BE(u32, dst_ipv4, - dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + MASK_VAL_L2_BE(u32, src_ipv4, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + MASK_VAL_L2_BE(u32, dst_ipv4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); - PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p, - "%pI4"); - PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p, - "%pI4"); - } else if (ethertype.v == ETH_P_IPV6) { - static const struct in6_addr full_ones = { - .in6_u.u6_addr32 = {__constant_htonl(0xffffffff), - __constant_htonl(0xffffffff), - __constant_htonl(0xffffffff), - __constant_htonl(0xffffffff)}, - }; - DECLARE_MASK_VAL(struct in6_addr, src_ipv6); - DECLARE_MASK_VAL(struct in6_addr, dst_ipv6); + PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p, + "%pI4"); + PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p, + "%pI4"); + } else if ((ethertype.m == 0xffff && ethertype.v == ETH_P_IPV6) || + (ip_version.m == 0xf && ip_version.v == 6)) { + static const struct in6_addr full_ones = { + .in6_u.u6_addr32 = {__constant_htonl(0xffffffff), + __constant_htonl(0xffffffff), + __constant_htonl(0xffffffff), + __constant_htonl(0xffffffff)}, + }; + DECLARE_MASK_VAL(struct in6_addr, src_ipv6); + DECLARE_MASK_VAL(struct in6_addr, dst_ipv6); - memcpy(src_ipv6.m.in6_u.u6_addr8, - MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - sizeof(src_ipv6.m)); - memcpy(dst_ipv6.m.in6_u.u6_addr8, - MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - sizeof(dst_ipv6.m)); - memcpy(src_ipv6.v.in6_u.u6_addr8, - MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, - src_ipv4_src_ipv6.ipv6_layout.ipv6), - sizeof(src_ipv6.v)); - memcpy(dst_ipv6.v.in6_u.u6_addr8, - MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, - dst_ipv4_dst_ipv6.ipv6_layout.ipv6), - sizeof(dst_ipv6.v)); + memcpy(src_ipv6.m.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(src_ipv6.m)); + memcpy(dst_ipv6.m.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(dst_ipv6.m)); + memcpy(src_ipv6.v.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + 
sizeof(src_ipv6.v)); + memcpy(dst_ipv6.v.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(dst_ipv6.v)); - if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones))) - trace_seq_printf(p, "src_ipv6=%pI6 ", - &src_ipv6.v); - if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones))) - trace_seq_printf(p, "dst_ipv6=%pI6 ", - &dst_ipv6.v); - } + if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones))) + trace_seq_printf(p, "src_ipv6=%pI6 ", + &src_ipv6.v); + if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones))) + trace_seq_printf(p, "dst_ipv6=%pI6 ", + &dst_ipv6.v); } #define PRINT_MASKED_VAL_L2(type, name, fld, p, format) {\ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c new file mode 100644 index 000000000000..bdb71332cbf2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include <linux/netdevice.h> +#include <linux/list.h> +#include <net/lag.h> + +#include "mlx5_core.h" +#include "eswitch.h" +#include "esw/acl/ofld.h" +#include "en_rep.h" + +struct mlx5e_rep_bond { + struct notifier_block nb; + struct netdev_net_notifier nn; + struct list_head metadata_list; +}; + +struct mlx5e_rep_bond_slave_entry { + struct list_head list; + struct net_device *netdev; +}; + +struct mlx5e_rep_bond_metadata { + struct list_head list; /* link to global list of rep_bond_metadata */ + struct mlx5_eswitch *esw; + /* private of uplink holding rep bond metadata list */ + struct net_device *lag_dev; + u32 metadata_reg_c_0; + + struct list_head slaves_list; /* slaves list */ + int slaves; +}; + +static struct mlx5e_rep_bond_metadata * +mlx5e_lookup_rep_bond_metadata(struct mlx5_rep_uplink_priv *uplink_priv, + const struct net_device *lag_dev) +{ + struct mlx5e_rep_bond_metadata *found = NULL; + struct mlx5e_rep_bond_metadata *cur; + + list_for_each_entry(cur, &uplink_priv->bond->metadata_list, list) { + if (cur->lag_dev == lag_dev) { + found = cur; + break; + } + } + + return found; +} + +static struct mlx5e_rep_bond_slave_entry * +mlx5e_lookup_rep_bond_slave_entry(struct mlx5e_rep_bond_metadata *mdata, + const struct net_device *netdev) +{ + struct mlx5e_rep_bond_slave_entry *found = NULL; + struct mlx5e_rep_bond_slave_entry *cur; + + list_for_each_entry(cur, &mdata->slaves_list, list) { + if (cur->netdev == netdev) { + found = cur; + break; + } + } + + return found; +} + +static void mlx5e_rep_bond_metadata_release(struct mlx5e_rep_bond_metadata *mdata) +{ + netdev_dbg(mdata->lag_dev, "destroy rep_bond_metadata(%d)\n", + mdata->metadata_reg_c_0); + list_del(&mdata->list); + mlx5_esw_match_metadata_free(mdata->esw, mdata->metadata_reg_c_0); + WARN_ON(!list_empty(&mdata->slaves_list)); + kfree(mdata); +} + +/* This must be called under rtnl_lock */ +int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, + struct net_device *lag_dev) +{ + struct mlx5e_rep_bond_slave_entry *s_entry; + struct mlx5e_rep_bond_metadata *mdata; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *priv; + int err; + + ASSERT_RTNL(); + + rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + mdata = mlx5e_lookup_rep_bond_metadata(&rpriv->uplink_priv, lag_dev); + if (!mdata) { + /* First netdev becomes slave, no metadata presents the lag_dev. 
Create one */ + mdata = kzalloc(sizeof(*mdata), GFP_KERNEL); + if (!mdata) + return -ENOMEM; + + mdata->lag_dev = lag_dev; + mdata->esw = esw; + INIT_LIST_HEAD(&mdata->slaves_list); + mdata->metadata_reg_c_0 = mlx5_esw_match_metadata_alloc(esw); + if (!mdata->metadata_reg_c_0) { + kfree(mdata); + return -ENOSPC; + } + list_add(&mdata->list, &rpriv->uplink_priv.bond->metadata_list); + + netdev_dbg(lag_dev, "create rep_bond_metadata(%d)\n", + mdata->metadata_reg_c_0); + } + + s_entry = kzalloc(sizeof(*s_entry), GFP_KERNEL); + if (!s_entry) { + err = -ENOMEM; + goto entry_alloc_err; + } + + s_entry->netdev = netdev; + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + + err = mlx5_esw_acl_ingress_vport_bond_update(esw, rpriv->rep->vport, + mdata->metadata_reg_c_0); + if (err) + goto ingress_err; + + mdata->slaves++; + list_add_tail(&s_entry->list, &mdata->slaves_list); + netdev_dbg(netdev, "enslave rep vport(%d) lag_dev(%s) metadata(0x%x)\n", + rpriv->rep->vport, lag_dev->name, mdata->metadata_reg_c_0); + + return 0; + +ingress_err: + kfree(s_entry); +entry_alloc_err: + if (!mdata->slaves) + mlx5e_rep_bond_metadata_release(mdata); + return err; +} + +/* This must be called under rtnl_lock */ +void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, + const struct net_device *netdev, + const struct net_device *lag_dev) +{ + struct mlx5e_rep_bond_slave_entry *s_entry; + struct mlx5e_rep_bond_metadata *mdata; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *priv; + + ASSERT_RTNL(); + + rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + mdata = mlx5e_lookup_rep_bond_metadata(&rpriv->uplink_priv, lag_dev); + if (!mdata) + return; + + s_entry = mlx5e_lookup_rep_bond_slave_entry(mdata, netdev); + if (!s_entry) + return; + + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + + /* Reset bond_metadata to zero first then reset all ingress/egress + * acls and rx rules of unslave representor's vport + */ + mlx5_esw_acl_ingress_vport_bond_update(esw, rpriv->rep->vport, 0); + mlx5_esw_acl_egress_vport_unbond(esw, rpriv->rep->vport); + mlx5e_rep_bond_update(priv, false); + + list_del(&s_entry->list); + + netdev_dbg(netdev, "unslave rep vport(%d) lag_dev(%s) metadata(0x%x)\n", + rpriv->rep->vport, lag_dev->name, mdata->metadata_reg_c_0); + + if (--mdata->slaves == 0) + mlx5e_rep_bond_metadata_release(mdata); + kfree(s_entry); +} + +static bool mlx5e_rep_is_lag_netdev(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + /* A given netdev is not a representor or not a slave of LAG configuration */ + if (!mlx5e_eswitch_rep(netdev) || !bond_slave_get_rtnl(netdev)) + return false; + + /* Egress acl forward to vport is supported only non-uplink representor */ + return rpriv->rep->vport != MLX5_VPORT_UPLINK; +} + +static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *ptr) +{ + struct netdev_notifier_changelowerstate_info *info; + struct netdev_lag_lower_state_info *lag_info; + struct mlx5e_rep_priv *rpriv; + struct net_device *lag_dev; + struct mlx5e_priv *priv; + struct list_head *iter; + struct net_device *dev; + u16 acl_vport_num; + u16 fwd_vport_num; + int err; + + if (!mlx5e_rep_is_lag_netdev(netdev)) + return; + + info = ptr; + lag_info = info->lower_state_info; + /* This is not an event of a representor becoming active slave */ + if (!lag_info->tx_enabled) + return; + + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + fwd_vport_num = rpriv->rep->vport; + lag_dev = 
netdev_master_upper_dev_get(netdev); + + netdev_dbg(netdev, "lag_dev(%s)'s slave vport(%d) is txable(%d)\n", + lag_dev->name, fwd_vport_num, net_lag_port_dev_txable(netdev)); + + /* Point everyone's egress acl to the vport of the active representor */ + netdev_for_each_lower_dev(lag_dev, dev, iter) { + priv = netdev_priv(dev); + rpriv = priv->ppriv; + acl_vport_num = rpriv->rep->vport; + if (acl_vport_num != fwd_vport_num) { + /* Only single rx_rule for unique bond_metadata should be + * present, delete it if it's saved as passive vport's + * rx_rule with destination as passive vport's root_ft + */ + mlx5e_rep_bond_update(priv, true); + err = mlx5_esw_acl_egress_vport_bond(priv->mdev->priv.eswitch, + fwd_vport_num, + acl_vport_num); + if (err) + netdev_warn(dev, + "configure slave vport(%d) egress fwd, err(%d)", + acl_vport_num, err); + } + } + + /* Insert new rx_rule for unique bond_metadata, save it as active vport's + * rx_rule with new destination as active vport's root_ft + */ + err = mlx5e_rep_bond_update(netdev_priv(netdev), false); + if (err) + netdev_warn(netdev, "configure active slave vport(%d) rx_rule, err(%d)", + fwd_vport_num, err); +} + +static void mlx5e_rep_changeupper_event(struct net_device *netdev, void *ptr) +{ + struct netdev_notifier_changeupper_info *info = ptr; + struct mlx5e_rep_priv *rpriv; + struct net_device *lag_dev; + struct mlx5e_priv *priv; + + if (!mlx5e_rep_is_lag_netdev(netdev)) + return; + + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + lag_dev = info->upper_dev; + + netdev_dbg(netdev, "%sslave vport(%d) lag(%s)\n", + info->linking ? "en" : "un", rpriv->rep->vport, lag_dev->name); + + if (info->linking) + mlx5e_rep_bond_enslave(priv->mdev->priv.eswitch, netdev, lag_dev); + else + mlx5e_rep_bond_unslave(priv->mdev->priv.eswitch, netdev, lag_dev); +} + +/* Bond device of representors and netdev events are used here in specific way + * to support eswitch vports bonding and to perform failover of eswitch vport + * by modifying the vport's egress acl of lower dev representors. Thus this + * also change the traditional behavior of lower dev under bond device. + * All non-representor netdevs or representors of other vendors as lower dev + * of bond device are not supported. 
+ */ +static int mlx5e_rep_esw_bond_netevent(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_CHANGELOWERSTATE: + mlx5e_rep_changelowerstate_event(netdev, ptr); + break; + case NETDEV_CHANGEUPPER: + mlx5e_rep_changeupper_event(netdev, ptr); + break; + } + return NOTIFY_DONE; +} + +/* If HW support eswitch vports bonding, register a specific notifier to + * handle it when two or more representors are bonded + */ +int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv; + int ret = 0; + + priv = netdev_priv(netdev); + if (!mlx5_esw_acl_egress_fwd2vport_supported(priv->mdev->priv.eswitch)) + goto out; + + uplink_priv->bond = kvzalloc(sizeof(*uplink_priv->bond), GFP_KERNEL); + if (!uplink_priv->bond) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&uplink_priv->bond->metadata_list); + uplink_priv->bond->nb.notifier_call = mlx5e_rep_esw_bond_netevent; + ret = register_netdevice_notifier_dev_net(netdev, + &uplink_priv->bond->nb, + &uplink_priv->bond->nn); + if (ret) { + netdev_err(netdev, "register bonding netevent notifier, err(%d)\n", ret); + kvfree(uplink_priv->bond); + uplink_priv->bond = NULL; + } + +out: + return ret; +} + +void mlx5e_rep_bond_cleanup(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + + if (!mlx5_esw_acl_egress_fwd2vport_supported(priv->mdev->priv.eswitch) || + !rpriv->uplink_priv.bond) + return; + + unregister_netdevice_notifier_dev_net(rpriv->netdev, + &rpriv->uplink_priv.bond->nb, + &rpriv->uplink_priv.bond->nn); + kvfree(rpriv->uplink_priv.bond); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 995b2ef1fb3b..afc19dca1f5f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -119,7 +119,7 @@ mlx5_tc_ct_get_ct_priv(struct mlx5e_priv *priv) } static int -mlx5_tc_ct_set_tuple_match(struct mlx5_flow_spec *spec, +mlx5_tc_ct_set_tuple_match(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec, struct flow_rule *rule) { void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, @@ -134,10 +134,8 @@ mlx5_tc_ct_set_tuple_match(struct mlx5_flow_spec *spec, flow_rule_match_basic(rule, &match); - MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, - ntohs(match.mask->n_proto)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, - ntohs(match.key->n_proto)); + mlx5e_tc_set_ethertype(priv->mdev, &match, true, headers_c, + headers_v); MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, match.mask->ip_proto); MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, @@ -533,7 +531,7 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv, attr->counter = entry->counter; attr->flags |= MLX5_ESW_ATTR_FLAG_NO_IN_PORT; - mlx5_tc_ct_set_tuple_match(spec, flow_rule); + mlx5_tc_ct_set_tuple_match(netdev_priv(ct_priv->netdev), spec, flow_rule); mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, entry->zone & MLX5_CT_ZONE_MASK, MLX5_CT_ZONE_MASK); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index e99382f58807..7cce85faa16f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -512,6 +512,13 @@ 
int mlx5e_tc_tun_parse(struct net_device *filter_dev, } if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + struct flow_dissector_key_basic key_basic = {}; + struct flow_dissector_key_basic mask_basic = { + .n_proto = htons(0xFFFF), + }; + struct flow_match_basic match_basic = { + .key = &key_basic, .mask = &mask_basic, + }; struct flow_match_control match; u16 addr_type; @@ -537,10 +544,9 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev, dst_ipv4_dst_ipv6.ipv4_layout.ipv4, ntohl(match.key->dst)); - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, - ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, - ETH_P_IP); + key_basic.n_proto = htons(ETH_P_IP); + mlx5e_tc_set_ethertype(priv->mdev, &match_basic, true, + headers_c, headers_v); } else if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { struct flow_match_ipv6_addrs match; @@ -563,10 +569,9 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev, &match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); - MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, - ethertype); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, - ETH_P_IPV6); + key_basic.n_proto = htons(ETH_P_IPV6); + mlx5e_tc_set_ethertype(priv->mdev, &match_basic, true, + headers_c, headers_v); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 4e13e37a9ecd..af89a4803c7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -854,6 +854,24 @@ static int mlx5e_create_rep_vport_rx_rule(struct mlx5e_priv *priv) return 0; } +static void rep_vport_rx_rule_destroy(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + if (!rpriv->vport_rx_rule) + return; + + mlx5_del_flow_rules(rpriv->vport_rx_rule); + rpriv->vport_rx_rule = NULL; +} + +int mlx5e_rep_bond_update(struct mlx5e_priv *priv, bool cleanup) +{ + rep_vport_rx_rule_destroy(priv); + + return cleanup ? 
0 : mlx5e_create_rep_vport_rx_rule(priv); +} + static int mlx5e_init_rep_rx(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; @@ -918,9 +936,7 @@ err_close_drop_rq: static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv) { - struct mlx5e_rep_priv *rpriv = priv->ppriv; - - mlx5_del_flow_rules(rpriv->vport_rx_rule); + rep_vport_rx_rule_destroy(priv); mlx5e_destroy_rep_root_ft(priv); mlx5e_destroy_ttc_table(priv, &priv->fs.ttc); mlx5e_destroy_direct_tirs(priv, priv->direct_tir); @@ -959,16 +975,18 @@ static int mlx5e_init_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev); + mlx5e_rep_bond_init(rpriv); err = mlx5e_rep_tc_netdevice_event_register(rpriv); if (err) { mlx5_core_err(priv->mdev, "Failed to register netdev notifier, err: %d\n", err); - goto tc_rep_cleanup; + goto err_event_reg; } return 0; -tc_rep_cleanup: +err_event_reg: + mlx5e_rep_bond_cleanup(rpriv); mlx5e_rep_tc_cleanup(rpriv); return err; } @@ -1001,7 +1019,7 @@ static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) { mlx5e_rep_tc_netdevice_event_unregister(rpriv); mlx5e_rep_indr_clean_block_privs(rpriv); - + mlx5e_rep_bond_cleanup(rpriv); mlx5e_rep_tc_cleanup(rpriv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 1c4af8522467..da9f1686d525 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -56,6 +56,7 @@ struct mlx5e_neigh_update_table { }; struct mlx5_tc_ct_priv; +struct mlx5e_rep_bond; struct mlx5_rep_uplink_priv { /* Filters DB - instantiated by the uplink representor and shared by * the uplink's VFs @@ -89,6 +90,9 @@ struct mlx5_rep_uplink_priv { struct mapping_ctx *tunnel_enc_opts_mapping; struct mlx5_tc_ct_priv *ct_priv; + + /* support eswitch vports bonding */ + struct mlx5e_rep_bond *bond; }; struct mlx5e_rep_priv { @@ -211,6 +215,15 @@ struct mlx5e_rep_sq { void mlx5e_rep_register_vport_reps(struct mlx5_core_dev *mdev); void mlx5e_rep_unregister_vport_reps(struct mlx5_core_dev *mdev); +int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv); +void mlx5e_rep_bond_cleanup(struct mlx5e_rep_priv *rpriv); +int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, + struct net_device *lag_dev); +void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, + const struct net_device *netdev, + const struct net_device *lag_dev); +int mlx5e_rep_bond_update(struct mlx5e_priv *priv, bool cleanup); + bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv); int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv); void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 571da14809fe..0f119c08b835 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -50,6 +50,7 @@ #include <net/arp.h> #include <net/ipv6_stubs.h> #include <net/bareudp.h> +#include <net/bonding.h> #include "en.h" #include "en_rep.h" #include "en/rep/tc.h" @@ -145,6 +146,7 @@ struct mlx5e_tc_flow { struct list_head hairpin; /* flows sharing the same hairpin */ struct list_head peer; /* flows with peer flow */ struct list_head unready; /* flows not ready to be offloaded (e.g due to missing route) */ + struct net_device *orig_dev; /* netdev adding flow first */ int tmp_efi_index; struct list_head tmp_list; /* temporary flow list used by neigh update */ 
refcount_t refcnt; @@ -2018,6 +2020,32 @@ u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow) return flow->tunnel_id; } +void mlx5e_tc_set_ethertype(struct mlx5_core_dev *mdev, + struct flow_match_basic *match, bool outer, + void *headers_c, void *headers_v) +{ + bool ip_version_cap; + + ip_version_cap = outer ? + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version) : + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version); + + if (ip_version_cap && match->mask->n_proto == htons(0xFFFF) && + (match->key->n_proto == htons(ETH_P_IP) || + match->key->n_proto == htons(ETH_P_IPV6))) { + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_version); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, + match->key->n_proto == htons(ETH_P_IP) ? 4 : 6); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, + ntohs(match->mask->n_proto)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + ntohs(match->key->n_proto)); + } +} + static int parse_tunnel_attr(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec, @@ -2239,10 +2267,9 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, struct flow_match_basic match; flow_rule_match_basic(rule, &match); - MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, - ntohs(match.mask->n_proto)); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, - ntohs(match.key->n_proto)); + mlx5e_tc_set_ethertype(priv->mdev, &match, + match_level == outer_match_level, + headers_c, headers_v); if (match.mask->n_proto) *match_level = MLX5_MATCH_L2; @@ -3118,16 +3145,19 @@ static bool modify_header_match_supported(struct mlx5_flow_spec *spec, { const struct flow_action_entry *act; bool modify_ip_header; + void *headers_c; void *headers_v; u16 ethertype; u8 ip_proto; int i, err; + headers_c = get_match_headers_criteria(actions, spec); headers_v = get_match_headers_value(actions, spec); ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype); /* for non-IP we only re-write MACs, so we're okay */ - if (ethertype != ETH_P_IP && ethertype != ETH_P_IPV6) + if (MLX5_GET(fte_match_set_lyr_2_4, headers_c, ip_version) == 0 && + ethertype != ETH_P_IP && ethertype != ETH_P_IPV6) goto out_ok; modify_ip_header = false; @@ -3758,6 +3788,28 @@ static int parse_tc_vlan_action(struct mlx5e_priv *priv, return 0; } +static struct net_device *get_fdb_out_dev(struct net_device *uplink_dev, + struct net_device *out_dev) +{ + struct net_device *fdb_out_dev = out_dev; + struct net_device *uplink_upper; + + rcu_read_lock(); + uplink_upper = netdev_master_upper_dev_get_rcu(uplink_dev); + if (uplink_upper && netif_is_lag_master(uplink_upper) && + uplink_upper == out_dev) { + fdb_out_dev = uplink_dev; + } else if (netif_is_lag_master(out_dev)) { + fdb_out_dev = bond_option_active_slave_get_rcu(netdev_priv(out_dev)); + if (fdb_out_dev && + (!mlx5e_eswitch_rep(fdb_out_dev) || + !netdev_port_same_parent_id(fdb_out_dev, uplink_dev))) + fdb_out_dev = NULL; + } + rcu_read_unlock(); + return fdb_out_dev; +} + static int add_vlan_push_action(struct mlx5e_priv *priv, struct mlx5_esw_flow_attr *attr, struct net_device **out_dev, @@ -4073,7 +4125,6 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, } else if (netdev_port_same_parent_id(priv->netdev, out_dev)) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct net_device *uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH); - struct net_device *uplink_upper; if (is_duplicated_output_device(priv->netdev, out_dev, @@ -4085,14 +4136,9 @@ 
static int parse_tc_fdb_actions(struct mlx5e_priv *priv, ifindexes[if_count] = out_dev->ifindex; if_count++; - rcu_read_lock(); - uplink_upper = - netdev_master_upper_dev_get_rcu(uplink_dev); - if (uplink_upper && - netif_is_lag_master(uplink_upper) && - uplink_upper == out_dev) - out_dev = uplink_dev; - rcu_read_unlock(); + out_dev = get_fdb_out_dev(uplink_dev, out_dev); + if (!out_dev) + return -ENODEV; if (is_vlan_dev(out_dev)) { err = add_vlan_push_action(priv, attr, @@ -4624,11 +4670,21 @@ mlx5e_tc_add_flow(struct mlx5e_priv *priv, return err; } +static bool is_flow_rule_duplicate_allowed(struct net_device *dev, + struct mlx5e_rep_priv *rpriv) +{ + /* Offloaded flow rule is allowed to duplicate on non-uplink representor + * sharing tc block with other slaves of a lag device. + */ + return netif_is_lag_port(dev) && rpriv->rep->vport != MLX5_VPORT_UPLINK; +} + int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, struct flow_cls_offload *f, unsigned long flags) { struct netlink_ext_ack *extack = f->common.extack; struct rhashtable *tc_ht = get_tc_ht(priv, flags); + struct mlx5e_rep_priv *rpriv = priv->ppriv; struct mlx5e_tc_flow *flow; int err = 0; @@ -4636,6 +4692,12 @@ int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, flow = rhashtable_lookup(tc_ht, &f->cookie, tc_ht_params); rcu_read_unlock(); if (flow) { + /* Same flow rule offloaded to non-uplink representor sharing tc block, + * just return 0. + */ + if (is_flow_rule_duplicate_allowed(dev, rpriv) && flow->orig_dev != dev) + goto out; + NL_SET_ERR_MSG_MOD(extack, "flow cookie already exists, ignoring"); netdev_warn_once(priv->netdev, @@ -4650,6 +4712,12 @@ int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, if (err) goto out; + /* Flow rule offloaded to non-uplink representor sharing tc block, + * set the flow's owner dev. + */ + if (is_flow_rule_duplicate_allowed(dev, rpriv)) + flow->orig_dev = dev; + err = rhashtable_lookup_insert_fast(tc_ht, &flow->node, tc_ht_params); if (err) goto err_free; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index 037aa73bf9ab..5c330b0cae21 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -170,6 +170,10 @@ void dealloc_mod_hdr_actions(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts); struct mlx5e_tc_flow; u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow); +void mlx5e_tc_set_ethertype(struct mlx5_core_dev *mdev, + struct flow_match_basic *match, bool outer, + void *headers_c, void *headers_v); + #if IS_ENABLED(CONFIG_MLX5_CLS_ACT) int mlx5e_tc_nic_init(struct mlx5e_priv *priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c new file mode 100644 index 000000000000..d46f8b225ebe --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. 
*/ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "lgcy.h" + +static void esw_acl_egress_lgcy_rules_destroy(struct mlx5_vport *vport) +{ + esw_acl_egress_vlan_destroy(vport); + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_rule)) { + mlx5_del_flow_rules(vport->egress.legacy.drop_rule); + vport->egress.legacy.drop_rule = NULL; + } +} + +static int esw_acl_egress_lgcy_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_group *drop_grp; + u32 *flow_group_in; + int err = 0; + + err = esw_acl_egress_vlan_grp_create(esw, vport); + if (err) + return err; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) { + err = -ENOMEM; + goto alloc_err; + } + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + drop_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(drop_grp)) { + err = PTR_ERR(drop_grp); + esw_warn(dev, "Failed to create E-Switch vport[%d] egress drop flow group, err(%d)\n", + vport->vport, err); + goto drop_grp_err; + } + + vport->egress.legacy.drop_grp = drop_grp; + kvfree(flow_group_in); + return 0; + +drop_grp_err: + kvfree(flow_group_in); +alloc_err: + esw_acl_egress_vlan_grp_destroy(vport); + return err; +} + +static void esw_acl_egress_lgcy_groups_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_grp)) { + mlx5_destroy_flow_group(vport->egress.legacy.drop_grp); + vport->egress.legacy.drop_grp = NULL; + } + esw_acl_egress_vlan_grp_destroy(vport); +} + +int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_flow_destination drop_ctr_dst = {}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_fc *drop_counter = NULL; + struct mlx5_flow_act flow_act = {}; + /* The egress acl table contains 2 rules: + * 1)Allow traffic with vlan_tag=vst_vlan_id + * 2)Drop all other traffic. 
+ */ + int table_size = 2; + int dest_num = 0; + int err = 0; + + if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, flow_counter)) { + drop_counter = mlx5_fc_create(esw->dev, false); + if (IS_ERR(drop_counter)) + esw_warn(esw->dev, + "vport[%d] configure egress drop rule counter err(%ld)\n", + vport->vport, PTR_ERR(drop_counter)); + vport->egress.legacy.drop_counter = drop_counter; + } + + esw_acl_egress_lgcy_rules_destroy(vport); + + if (!vport->info.vlan && !vport->info.qos) { + esw_acl_egress_lgcy_cleanup(esw, vport); + return 0; + } + + if (!IS_ERR_OR_NULL(vport->egress.acl)) + return 0; + + vport->egress.acl = esw_acl_table_create(esw, vport->vport, + MLX5_FLOW_NAMESPACE_ESW_EGRESS, + table_size); + if (IS_ERR_OR_NULL(vport->egress.acl)) { + err = PTR_ERR(vport->egress.acl); + vport->egress.acl = NULL; + goto out; + } + + err = esw_acl_egress_lgcy_groups_create(esw, vport); + if (err) + goto out; + + esw_debug(esw->dev, + "vport[%d] configure egress rules, vlan(%d) qos(%d)\n", + vport->vport, vport->info.vlan, vport->info.qos); + + /* Allowed vlan rule */ + err = esw_egress_acl_vlan_create(esw, vport, NULL, vport->info.vlan, + MLX5_FLOW_CONTEXT_ACTION_ALLOW); + if (err) + goto out; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* Attach egress drop flow counter */ + if (!IS_ERR_OR_NULL(drop_counter)) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter_id = mlx5_fc_id(drop_counter); + dst = &drop_ctr_dst; + dest_num++; + } + vport->egress.legacy.drop_rule = + mlx5_add_flow_rules(vport->egress.acl, NULL, + &flow_act, dst, dest_num); + if (IS_ERR(vport->egress.legacy.drop_rule)) { + err = PTR_ERR(vport->egress.legacy.drop_rule); + esw_warn(esw->dev, + "vport[%d] configure egress drop rule failed, err(%d)\n", + vport->vport, err); + vport->egress.legacy.drop_rule = NULL; + goto out; + } + + return err; + +out: + esw_acl_egress_lgcy_cleanup(esw, vport); + return err; +} + +void esw_acl_egress_lgcy_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->egress.acl)) + goto clean_drop_counter; + + esw_debug(esw->dev, "Destroy vport[%d] E-Switch egress ACL\n", vport->vport); + + esw_acl_egress_lgcy_rules_destroy(vport); + esw_acl_egress_lgcy_groups_destroy(vport); + esw_acl_egress_table_destroy(vport); + +clean_drop_counter: + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_counter)) { + mlx5_fc_destroy(esw->dev, vport->egress.legacy.drop_counter); + vport->egress.legacy.drop_counter = NULL; + } +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c new file mode 100644 index 000000000000..07b2acd7e6b3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. 
*/ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "ofld.h" + +static void esw_acl_egress_ofld_fwd2vport_destroy(struct mlx5_vport *vport) +{ + if (!vport->egress.offloads.fwd_rule) + return; + + mlx5_del_flow_rules(vport->egress.offloads.fwd_rule); + vport->egress.offloads.fwd_rule = NULL; +} + +static int esw_acl_egress_ofld_fwd2vport_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest) +{ + struct mlx5_flow_act flow_act = {}; + int err = 0; + + esw_debug(esw->dev, "vport(%d) configure egress acl rule fwd2vport(%d)\n", + vport->vport, fwd_dest->vport.num); + + /* Delete the old egress forward-to-vport rule if any */ + esw_acl_egress_ofld_fwd2vport_destroy(vport); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + vport->egress.offloads.fwd_rule = + mlx5_add_flow_rules(vport->egress.acl, NULL, + &flow_act, fwd_dest, 1); + if (IS_ERR(vport->egress.offloads.fwd_rule)) { + err = PTR_ERR(vport->egress.offloads.fwd_rule); + esw_warn(esw->dev, + "vport(%d) failed to add fwd2vport acl rule err(%d)\n", + vport->vport, err); + vport->egress.offloads.fwd_rule = NULL; + } + + return err; +} + +static int esw_acl_egress_ofld_rules_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest) +{ + int err = 0; + int action; + + if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) { + /* For prio tag mode, there is only 1 FTEs: + * 1) prio tag packets - pop the prio tag VLAN, allow + * Unmatched traffic is allowed by default + */ + esw_debug(esw->dev, + "vport[%d] configure prio tag egress rules\n", vport->vport); + + action = MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; + action |= fwd_dest ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + /* prio tag vlan rule - pop it so vport receives untagged packets */ + err = esw_egress_acl_vlan_create(esw, vport, fwd_dest, 0, action); + if (err) + goto prio_err; + } + + if (fwd_dest) { + err = esw_acl_egress_ofld_fwd2vport_create(esw, vport, fwd_dest); + if (err) + goto fwd_err; + } + + return 0; + +fwd_err: + esw_acl_egress_vlan_destroy(vport); +prio_err: + return err; +} + +static void esw_acl_egress_ofld_rules_destroy(struct mlx5_vport *vport) +{ + esw_acl_egress_vlan_destroy(vport); + esw_acl_egress_ofld_fwd2vport_destroy(vport); +} + +static int esw_acl_egress_ofld_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fwd_grp; + u32 *flow_group_in; + u32 flow_index = 0; + int ret = 0; + + if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) { + ret = esw_acl_egress_vlan_grp_create(esw, vport); + if (ret) + return ret; + + flow_index++; + } + + if (!mlx5_esw_acl_egress_fwd2vport_supported(esw)) + goto out; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) { + ret = -ENOMEM; + goto fwd_grp_err; + } + + /* This group holds 1 FTE to forward all packets to other vport + * when bond vports is supported. 
+ */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + fwd_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(fwd_grp)) { + ret = PTR_ERR(fwd_grp); + esw_warn(esw->dev, + "Failed to create vport[%d] egress fwd2vport flow group, err(%d)\n", + vport->vport, ret); + kvfree(flow_group_in); + goto fwd_grp_err; + } + vport->egress.offloads.fwd_grp = fwd_grp; + kvfree(flow_group_in); + return 0; + +fwd_grp_err: + esw_acl_egress_vlan_grp_destroy(vport); +out: + return ret; +} + +static void esw_acl_egress_ofld_groups_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.offloads.fwd_grp)) { + mlx5_destroy_flow_group(vport->egress.offloads.fwd_grp); + vport->egress.offloads.fwd_grp = NULL; + } + esw_acl_egress_vlan_grp_destroy(vport); +} + +int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + int table_size = 0; + int err; + + if (!mlx5_esw_acl_egress_fwd2vport_supported(esw) && + !MLX5_CAP_GEN(esw->dev, prio_tag_required)) + return 0; + + esw_acl_egress_ofld_rules_destroy(vport); + + if (mlx5_esw_acl_egress_fwd2vport_supported(esw)) + table_size++; + if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) + table_size++; + vport->egress.acl = esw_acl_table_create(esw, vport->vport, + MLX5_FLOW_NAMESPACE_ESW_EGRESS, table_size); + if (IS_ERR_OR_NULL(vport->egress.acl)) { + err = PTR_ERR(vport->egress.acl); + vport->egress.acl = NULL; + return err; + } + + err = esw_acl_egress_ofld_groups_create(esw, vport); + if (err) + goto group_err; + + esw_debug(esw->dev, "vport[%d] configure egress rules\n", vport->vport); + + err = esw_acl_egress_ofld_rules_create(esw, vport, NULL); + if (err) + goto rules_err; + + return 0; + +rules_err: + esw_acl_egress_ofld_groups_destroy(vport); +group_err: + esw_acl_egress_table_destroy(vport); + return err; +} + +void esw_acl_egress_ofld_cleanup(struct mlx5_vport *vport) +{ + esw_acl_egress_ofld_rules_destroy(vport); + esw_acl_egress_ofld_groups_destroy(vport); + esw_acl_egress_table_destroy(vport); +} + +int mlx5_esw_acl_egress_vport_bond(struct mlx5_eswitch *esw, u16 active_vport_num, + u16 passive_vport_num) +{ + struct mlx5_vport *passive_vport = mlx5_eswitch_get_vport(esw, passive_vport_num); + struct mlx5_vport *active_vport = mlx5_eswitch_get_vport(esw, active_vport_num); + struct mlx5_flow_destination fwd_dest = {}; + + if (IS_ERR(active_vport)) + return PTR_ERR(active_vport); + if (IS_ERR(passive_vport)) + return PTR_ERR(passive_vport); + + /* Cleanup and recreate rules WITHOUT fwd2vport of active vport */ + esw_acl_egress_ofld_rules_destroy(active_vport); + esw_acl_egress_ofld_rules_create(esw, active_vport, NULL); + + /* Cleanup and recreate all rules + fwd2vport rule of passive vport to forward */ + esw_acl_egress_ofld_rules_destroy(passive_vport); + fwd_dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + fwd_dest.vport.num = active_vport_num; + fwd_dest.vport.vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); + fwd_dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + + return esw_acl_egress_ofld_rules_create(esw, passive_vport, &fwd_dest); +} + +int mlx5_esw_acl_egress_vport_unbond(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + if (IS_ERR(vport)) + return PTR_ERR(vport); + + esw_acl_egress_ofld_rules_destroy(vport); + return esw_acl_egress_ofld_rules_create(esw, vport, NULL); +} diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c new file mode 100644 index 000000000000..22f4c1c28006 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" + +struct mlx5_flow_table * +esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *acl; + int acl_supported; + int vport_index; + int err; + + acl_supported = (ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS) ? + MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support) : + MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support); + + if (!acl_supported) + return ERR_PTR(-EOPNOTSUPP); + + esw_debug(dev, "Create vport[%d] %s ACL table\n", vport_num, + ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS ? "ingress" : "egress"); + + vport_index = mlx5_eswitch_vport_num_to_index(esw, vport_num); + root_ns = mlx5_get_flow_vport_acl_namespace(dev, ns, vport_index); + if (!root_ns) { + esw_warn(dev, "Failed to get E-Switch root namespace for vport (%d)\n", + vport_num); + return ERR_PTR(-EOPNOTSUPP); + } + + acl = mlx5_create_vport_flow_table(root_ns, 0, size, 0, vport_num); + if (IS_ERR(acl)) { + err = PTR_ERR(acl); + esw_warn(dev, "vport[%d] create %s ACL table, err(%d)\n", vport_num, + ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS ? "ingress" : "egress", err); + } + return acl; +} + +int esw_egress_acl_vlan_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest, + u16 vlan_id, u32 flow_action) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + int err = 0; + + if (vport->egress.allowed_vlan) + return -EEXIST; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vlan_id); + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = flow_action; + vport->egress.allowed_vlan = + mlx5_add_flow_rules(vport->egress.acl, spec, + &flow_act, fwd_dest, 0); + if (IS_ERR(vport->egress.allowed_vlan)) { + err = PTR_ERR(vport->egress.allowed_vlan); + esw_warn(esw->dev, + "vport[%d] configure egress vlan rule failed, err(%d)\n", + vport->vport, err); + vport->egress.allowed_vlan = NULL; + } + + kvfree(spec); + return err; +} + +void esw_acl_egress_vlan_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan)) { + mlx5_del_flow_rules(vport->egress.allowed_vlan); + vport->egress.allowed_vlan = NULL; + } +} + +int esw_acl_egress_vlan_grp_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *vlan_grp; + void *match_criteria; + u32 *flow_group_in; + int ret = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, flow_group_in, + match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, + flow_group_in, match_criteria); + 
MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + vlan_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(vlan_grp)) { + ret = PTR_ERR(vlan_grp); + esw_warn(esw->dev, + "Failed to create E-Switch vport[%d] egress pop vlans flow group, err(%d)\n", + vport->vport, ret); + goto out; + } + vport->egress.vlan_grp = vlan_grp; + +out: + kvfree(flow_group_in); + return ret; +} + +void esw_acl_egress_vlan_grp_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.vlan_grp)) { + mlx5_destroy_flow_group(vport->egress.vlan_grp); + vport->egress.vlan_grp = NULL; + } +} + +void esw_acl_egress_table_destroy(struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->egress.acl)) + return; + + mlx5_destroy_flow_table(vport->egress.acl); + vport->egress.acl = NULL; +} + +void esw_acl_ingress_table_destroy(struct mlx5_vport *vport) +{ + if (!vport->ingress.acl) + return; + + mlx5_destroy_flow_table(vport->ingress.acl); + vport->ingress.acl = NULL; +} + +void esw_acl_ingress_allow_rule_destroy(struct mlx5_vport *vport) +{ + if (!vport->ingress.allow_rule) + return; + + mlx5_del_flow_rules(vport->ingress.allow_rule); + vport->ingress.allow_rule = NULL; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h new file mode 100644 index 000000000000..8dc4cab66a71 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#ifndef __MLX5_ESWITCH_ACL_HELPER_H__ +#define __MLX5_ESWITCH_ACL_HELPER_H__ + +#include "eswitch.h" + +/* General acl helper functions */ +struct mlx5_flow_table * +esw_acl_table_create(struct mlx5_eswitch *esw, u16 vport_num, int ns, int size); + +/* Egress acl helper functions */ +void esw_acl_egress_table_destroy(struct mlx5_vport *vport); +int esw_egress_acl_vlan_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest, + u16 vlan_id, u32 flow_action); +void esw_acl_egress_vlan_destroy(struct mlx5_vport *vport); +int esw_acl_egress_vlan_grp_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_egress_vlan_grp_destroy(struct mlx5_vport *vport); + +/* Ingress acl helper functions */ +void esw_acl_ingress_table_destroy(struct mlx5_vport *vport); +void esw_acl_ingress_allow_rule_destroy(struct mlx5_vport *vport); + +#endif /* __MLX5_ESWITCH_ACL_HELPER_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c new file mode 100644 index 000000000000..9bda4fe2eafa --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. 
*/ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "lgcy.h" + +static void esw_acl_ingress_lgcy_rules_destroy(struct mlx5_vport *vport) +{ + if (vport->ingress.legacy.drop_rule) { + mlx5_del_flow_rules(vport->ingress.legacy.drop_rule); + vport->ingress.legacy.drop_rule = NULL; + } + esw_acl_ingress_allow_rule_destroy(vport); +} + +static int esw_acl_ingress_lgcy_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_group *g; + void *match_criteria; + u32 *flow_group_in; + int err; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "vport[%d] ingress create untagged spoofchk flow group, err(%d)\n", + vport->vport, err); + goto spoof_err; + } + vport->ingress.legacy.allow_untagged_spoofchk_grp = g; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "vport[%d] ingress create untagged flow group, err(%d)\n", + vport->vport, err); + goto untagged_err; + } + vport->ingress.legacy.allow_untagged_only_grp = g; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 2); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 2); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "vport[%d] ingress create spoofchk flow group, err(%d)\n", + vport->vport, err); + goto allow_spoof_err; + } + vport->ingress.legacy.allow_spoofchk_only_grp = g; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 3); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 3); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "vport[%d] ingress create drop flow group, err(%d)\n", + vport->vport, err); + goto drop_err; + } + vport->ingress.legacy.drop_grp = g; + kvfree(flow_group_in); + return 0; + +drop_err: + if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_spoofchk_only_grp)) { + 
mlx5_destroy_flow_group(vport->ingress.legacy.allow_spoofchk_only_grp); + vport->ingress.legacy.allow_spoofchk_only_grp = NULL; + } +allow_spoof_err: + if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_untagged_only_grp)) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_only_grp); + vport->ingress.legacy.allow_untagged_only_grp = NULL; + } +untagged_err: + if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_untagged_spoofchk_grp)) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_spoofchk_grp); + vport->ingress.legacy.allow_untagged_spoofchk_grp = NULL; + } +spoof_err: + kvfree(flow_group_in); + return err; +} + +static void esw_acl_ingress_lgcy_groups_destroy(struct mlx5_vport *vport) +{ + if (vport->ingress.legacy.allow_spoofchk_only_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_spoofchk_only_grp); + vport->ingress.legacy.allow_spoofchk_only_grp = NULL; + } + if (vport->ingress.legacy.allow_untagged_only_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_only_grp); + vport->ingress.legacy.allow_untagged_only_grp = NULL; + } + if (vport->ingress.legacy.allow_untagged_spoofchk_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_spoofchk_grp); + vport->ingress.legacy.allow_untagged_spoofchk_grp = NULL; + } + if (vport->ingress.legacy.drop_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.drop_grp); + vport->ingress.legacy.drop_grp = NULL; + } +} + +int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_flow_destination drop_ctr_dst = {}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec = NULL; + struct mlx5_fc *counter = NULL; + /* The ingress acl table contains 4 groups + * (2 active rules at the same time - + * 1 allow rule from one of the first 3 groups. + * 1 drop rule from the last group): + * 1)Allow untagged traffic with smac=original mac. + * 2)Allow untagged traffic. + * 3)Allow traffic with smac=original mac. + * 4)Drop all other traffic. 
+ */ + int table_size = 4; + int dest_num = 0; + int err = 0; + u8 *smac_v; + + esw_acl_ingress_lgcy_rules_destroy(vport); + + if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { + counter = mlx5_fc_create(esw->dev, false); + if (IS_ERR(counter)) + esw_warn(esw->dev, + "vport[%d] configure ingress drop rule counter failed\n", + vport->vport); + vport->ingress.legacy.drop_counter = counter; + } + + if (!vport->info.vlan && !vport->info.qos && !vport->info.spoofchk) { + esw_acl_ingress_lgcy_cleanup(esw, vport); + return 0; + } + + if (!vport->ingress.acl) { + vport->ingress.acl = esw_acl_table_create(esw, vport->vport, + MLX5_FLOW_NAMESPACE_ESW_INGRESS, + table_size); + if (IS_ERR_OR_NULL(vport->ingress.acl)) { + err = PTR_ERR(vport->ingress.acl); + vport->ingress.acl = NULL; + return err; + } + + err = esw_acl_ingress_lgcy_groups_create(esw, vport); + if (err) + goto out; + } + + esw_debug(esw->dev, + "vport[%d] configure ingress rules, vlan(%d) qos(%d)\n", + vport->vport, vport->info.vlan, vport->info.qos); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + + if (vport->info.vlan || vport->info.qos) + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + + if (vport->info.spoofchk) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.smac_15_0); + smac_v = MLX5_ADDR_OF(fte_match_param, + spec->match_value, + outer_headers.smac_47_16); + ether_addr_copy(smac_v, vport->info.mac); + } + + /* Create ingress allow rule */ + memset(spec, 0, sizeof(*spec)); + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + vport->ingress.allow_rule = mlx5_add_flow_rules(vport->ingress.acl, spec, + &flow_act, NULL, 0); + if (IS_ERR(vport->ingress.allow_rule)) { + err = PTR_ERR(vport->ingress.allow_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress allow rule, err(%d)\n", + vport->vport, err); + vport->ingress.allow_rule = NULL; + goto out; + } + + memset(&flow_act, 0, sizeof(flow_act)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + /* Attach drop flow counter */ + if (counter) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter_id = mlx5_fc_id(counter); + dst = &drop_ctr_dst; + dest_num++; + } + vport->ingress.legacy.drop_rule = + mlx5_add_flow_rules(vport->ingress.acl, NULL, + &flow_act, dst, dest_num); + if (IS_ERR(vport->ingress.legacy.drop_rule)) { + err = PTR_ERR(vport->ingress.legacy.drop_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress drop rule, err(%d)\n", + vport->vport, err); + vport->ingress.legacy.drop_rule = NULL; + goto out; + } + kvfree(spec); + return 0; + +out: + esw_acl_ingress_lgcy_cleanup(esw, vport); + kvfree(spec); + return err; +} + +void esw_acl_ingress_lgcy_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->ingress.acl)) + goto clean_drop_counter; + + esw_debug(esw->dev, "Destroy vport[%d] E-Switch ingress ACL\n", vport->vport); + + esw_acl_ingress_lgcy_rules_destroy(vport); + esw_acl_ingress_lgcy_groups_destroy(vport); + esw_acl_ingress_table_destroy(vport); + +clean_drop_counter: + if (!IS_ERR_OR_NULL(vport->ingress.legacy.drop_counter)) { + mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); + vport->ingress.legacy.drop_counter = NULL; + } +} diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c new file mode 100644 index 000000000000..4e55d7225a26 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "ofld.h" + +static bool +esw_acl_ingress_prio_tag_enabled(const struct mlx5_eswitch *esw, + const struct mlx5_vport *vport) +{ + return (MLX5_CAP_GEN(esw->dev, prio_tag_required) && + mlx5_eswitch_is_vf_vport(esw, vport->vport)); +} + +static int esw_acl_ingress_prio_tag_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + int err = 0; + + /* For prio tag mode, there is only 1 FTE: + * 1) Untagged packets - push prio tag VLAN and modify metadata if + * required, allow + * Unmatched traffic is allowed by default + */ + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + /* Untagged packets - push prio tag VLAN, allow */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 0); + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | + MLX5_FLOW_CONTEXT_ACTION_ALLOW; + flow_act.vlan[0].ethtype = ETH_P_8021Q; + flow_act.vlan[0].vid = 0; + flow_act.vlan[0].prio = 0; + + if (vport->ingress.offloads.modify_metadata_rule) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; + } + + vport->ingress.allow_rule = mlx5_add_flow_rules(vport->ingress.acl, spec, + &flow_act, NULL, 0); + if (IS_ERR(vport->ingress.allow_rule)) { + err = PTR_ERR(vport->ingress.allow_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress untagged allow rule, err(%d)\n", + vport->vport, err); + vport->ingress.allow_rule = NULL; + } + + kvfree(spec); + return err; +} + +static int esw_acl_ingress_mod_metadata_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct mlx5_flow_act flow_act = {}; + int err = 0; + u32 key; + + key = mlx5_eswitch_get_vport_metadata_for_match(esw, vport->vport); + key >>= ESW_SOURCE_PORT_METADATA_OFFSET; + + MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, action, field, + MLX5_ACTION_IN_FIELD_METADATA_REG_C_0); + MLX5_SET(set_action_in, action, data, key); + MLX5_SET(set_action_in, action, offset, + ESW_SOURCE_PORT_METADATA_OFFSET); + MLX5_SET(set_action_in, action, length, + ESW_SOURCE_PORT_METADATA_BITS); + + vport->ingress.offloads.modify_metadata = + mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS, + 1, action); + if (IS_ERR(vport->ingress.offloads.modify_metadata)) { + err = PTR_ERR(vport->ingress.offloads.modify_metadata); + esw_warn(esw->dev, + "failed to alloc modify header for vport %d ingress acl (%d)\n", + vport->vport, err); + return err; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | MLX5_FLOW_CONTEXT_ACTION_ALLOW; + flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; + vport->ingress.offloads.modify_metadata_rule = + mlx5_add_flow_rules(vport->ingress.acl, + NULL, &flow_act, NULL, 0); + if
(IS_ERR(vport->ingress.offloads.modify_metadata_rule)) { + err = PTR_ERR(vport->ingress.offloads.modify_metadata_rule); + esw_warn(esw->dev, + "failed to add setting metadata rule for vport %d ingress acl, err(%d)\n", + vport->vport, err); + mlx5_modify_header_dealloc(esw->dev, vport->ingress.offloads.modify_metadata); + vport->ingress.offloads.modify_metadata_rule = NULL; + } + return err; +} + +static void esw_acl_ingress_mod_metadata_destroy(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (!vport->ingress.offloads.modify_metadata_rule) + return; + + mlx5_del_flow_rules(vport->ingress.offloads.modify_metadata_rule); + mlx5_modify_header_dealloc(esw->dev, vport->ingress.offloads.modify_metadata); + vport->ingress.offloads.modify_metadata_rule = NULL; +} + +static int esw_acl_ingress_ofld_rules_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int err; + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + err = esw_acl_ingress_mod_metadata_create(esw, vport); + if (err) { + esw_warn(esw->dev, + "vport(%d) create ingress modify metadata, err(%d)\n", + vport->vport, err); + return err; + } + } + + if (esw_acl_ingress_prio_tag_enabled(esw, vport)) { + err = esw_acl_ingress_prio_tag_create(esw, vport); + if (err) { + esw_warn(esw->dev, + "vport(%d) create ingress prio tag rule, err(%d)\n", + vport->vport, err); + goto prio_tag_err; + } + } + + return 0; + +prio_tag_err: + esw_acl_ingress_mod_metadata_destroy(esw, vport); + return err; +} + +static void esw_acl_ingress_ofld_rules_destroy(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + esw_acl_ingress_allow_rule_destroy(vport); + esw_acl_ingress_mod_metadata_destroy(esw, vport); +} + +static int esw_acl_ingress_ofld_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + void *match_criteria; + u32 *flow_group_in; + u32 flow_index = 0; + int ret = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + if (esw_acl_ingress_prio_tag_enabled(esw, vport)) { + /* This group is to hold FTE to match untagged packets when prio_tag + * is enabled. + */ + match_criteria = MLX5_ADDR_OF(create_flow_group_in, + flow_group_in, match_criteria); + MLX5_SET(create_flow_group_in, flow_group_in, + match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + ret = PTR_ERR(g); + esw_warn(esw->dev, "vport[%d] ingress create untagged flow group, err(%d)\n", + vport->vport, ret); + goto prio_tag_err; + } + vport->ingress.offloads.metadata_prio_tag_grp = g; + flow_index++; + } + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + /* This group holds an FTE with no match to add metadata for + * tagged packets if prio-tag is enabled, or for all untagged + * traffic in case prio-tag is disabled. 
+ */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + ret = PTR_ERR(g); + esw_warn(esw->dev, "vport[%d] ingress create drop flow group, err(%d)\n", + vport->vport, ret); + goto metadata_err; + } + vport->ingress.offloads.metadata_allmatch_grp = g; + } + + kvfree(flow_group_in); + return 0; + +metadata_err: + if (!IS_ERR_OR_NULL(vport->ingress.offloads.metadata_prio_tag_grp)) { + mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); + vport->ingress.offloads.metadata_prio_tag_grp = NULL; + } +prio_tag_err: + kvfree(flow_group_in); + return ret; +} + +static void esw_acl_ingress_ofld_groups_destroy(struct mlx5_vport *vport) +{ + if (vport->ingress.offloads.metadata_allmatch_grp) { + mlx5_destroy_flow_group(vport->ingress.offloads.metadata_allmatch_grp); + vport->ingress.offloads.metadata_allmatch_grp = NULL; + } + + if (vport->ingress.offloads.metadata_prio_tag_grp) { + mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); + vport->ingress.offloads.metadata_prio_tag_grp = NULL; + } +} + +int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int num_ftes = 0; + int err; + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw) && + !esw_acl_ingress_prio_tag_enabled(esw, vport)) + return 0; + + esw_acl_ingress_allow_rule_destroy(vport); + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) + num_ftes++; + if (esw_acl_ingress_prio_tag_enabled(esw, vport)) + num_ftes++; + + vport->ingress.acl = esw_acl_table_create(esw, vport->vport, + MLX5_FLOW_NAMESPACE_ESW_INGRESS, + num_ftes); + if (IS_ERR_OR_NULL(vport->ingress.acl)) { + err = PTR_ERR(vport->ingress.acl); + vport->ingress.acl = NULL; + return err; + } + + err = esw_acl_ingress_ofld_groups_create(esw, vport); + if (err) + goto group_err; + + esw_debug(esw->dev, + "vport[%d] configure ingress rules\n", vport->vport); + + err = esw_acl_ingress_ofld_rules_create(esw, vport); + if (err) + goto rules_err; + + return 0; + +rules_err: + esw_acl_ingress_ofld_groups_destroy(vport); +group_err: + esw_acl_ingress_table_destroy(vport); + return err; +} + +void esw_acl_ingress_ofld_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + esw_acl_ingress_ofld_rules_destroy(esw, vport); + esw_acl_ingress_ofld_groups_destroy(vport); + esw_acl_ingress_table_destroy(vport); +} + +/* Caller must hold rtnl_lock */ +int mlx5_esw_acl_ingress_vport_bond_update(struct mlx5_eswitch *esw, u16 vport_num, + u32 metadata) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + int err; + + if (WARN_ON_ONCE(IS_ERR(vport))) { + esw_warn(esw->dev, "vport(%d) invalid!\n", vport_num); + err = PTR_ERR(vport); + goto out; + } + + esw_acl_ingress_ofld_rules_destroy(esw, vport); + + vport->metadata = metadata ? 
metadata : vport->default_metadata; + + /* Recreate ingress acl rules with vport->metadata */ + err = esw_acl_ingress_ofld_rules_create(esw, vport); + if (err) + goto out; + + return 0; + +out: + vport->metadata = vport->default_metadata; + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h new file mode 100644 index 000000000000..44c152da3d83 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#ifndef __MLX5_ESWITCH_ACL_LGCY_H__ +#define __MLX5_ESWITCH_ACL_LGCY_H__ + +#include "eswitch.h" + +/* Eswitch acl egress external APIs */ +int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_egress_lgcy_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + +/* Eswitch acl ingress external APIs */ +int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_ingress_lgcy_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + +#endif /* __MLX5_ESWITCH_ACL_LGCY_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h new file mode 100644 index 000000000000..c57869b93d60 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#ifndef __MLX5_ESWITCH_ACL_OFLD_H__ +#define __MLX5_ESWITCH_ACL_OFLD_H__ + +#include "eswitch.h" + +/* Eswitch acl egress external APIs */ +int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_egress_ofld_cleanup(struct mlx5_vport *vport); +int mlx5_esw_acl_egress_vport_bond(struct mlx5_eswitch *esw, u16 active_vport_num, + u16 passive_vport_num); +int mlx5_esw_acl_egress_vport_unbond(struct mlx5_eswitch *esw, u16 vport_num); + +static inline bool mlx5_esw_acl_egress_fwd2vport_supported(struct mlx5_eswitch *esw) +{ + return esw && esw->mode == MLX5_ESWITCH_OFFLOADS && + mlx5_eswitch_vport_match_metadata_enabled(esw) && + MLX5_CAP_ESW_FLOWTABLE(esw->dev, egress_acl_forward_to_vport); +} + +/* Eswitch acl ingress external APIs */ +int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_ingress_ofld_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +int mlx5_esw_acl_ingress_vport_bond_update(struct mlx5_eswitch *esw, u16 vport_num, + u32 metadata); + +#endif /* __MLX5_ESWITCH_ACL_OFLD_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index ac79b7c9aeb3..1116ab9bea6c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -35,6 +35,7 @@ #include <linux/mlx5/mlx5_ifc.h> #include <linux/mlx5/vport.h> #include <linux/mlx5/fs.h> +#include "esw/acl/lgcy.h" #include "mlx5_core.h" #include "lib/eq.h" #include "eswitch.h" @@ -936,512 +937,6 @@ static void esw_vport_change_handler(struct work_struct *work) mutex_unlock(&esw->state_lock); } -int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); - struct mlx5_flow_group *vlan_grp = NULL; - struct mlx5_flow_group *drop_grp = NULL; - 
struct mlx5_core_dev *dev = esw->dev; - struct mlx5_flow_namespace *root_ns; - struct mlx5_flow_table *acl; - void *match_criteria; - u32 *flow_group_in; - /* The egress acl table contains 2 rules: - * 1)Allow traffic with vlan_tag=vst_vlan_id - * 2)Drop all other traffic. - */ - int table_size = 2; - int err = 0; - - if (!MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) - return -EOPNOTSUPP; - - if (!IS_ERR_OR_NULL(vport->egress.acl)) - return 0; - - esw_debug(dev, "Create vport[%d] egress ACL log_max_size(%d)\n", - vport->vport, MLX5_CAP_ESW_EGRESS_ACL(dev, log_max_ft_size)); - - root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_EGRESS, - mlx5_eswitch_vport_num_to_index(esw, vport->vport)); - if (!root_ns) { - esw_warn(dev, "Failed to get E-Switch egress flow namespace for vport (%d)\n", vport->vport); - return -EOPNOTSUPP; - } - - flow_group_in = kvzalloc(inlen, GFP_KERNEL); - if (!flow_group_in) - return -ENOMEM; - - acl = mlx5_create_vport_flow_table(root_ns, 0, table_size, 0, vport->vport); - if (IS_ERR(acl)) { - err = PTR_ERR(acl); - esw_warn(dev, "Failed to create E-Switch vport[%d] egress flow Table, err(%d)\n", - vport->vport, err); - goto out; - } - - MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); - - vlan_grp = mlx5_create_flow_group(acl, flow_group_in); - if (IS_ERR(vlan_grp)) { - err = PTR_ERR(vlan_grp); - esw_warn(dev, "Failed to create E-Switch vport[%d] egress allowed vlans flow group, err(%d)\n", - vport->vport, err); - goto out; - } - - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); - drop_grp = mlx5_create_flow_group(acl, flow_group_in); - if (IS_ERR(drop_grp)) { - err = PTR_ERR(drop_grp); - esw_warn(dev, "Failed to create E-Switch vport[%d] egress drop flow group, err(%d)\n", - vport->vport, err); - goto out; - } - - vport->egress.acl = acl; - vport->egress.drop_grp = drop_grp; - vport->egress.allowed_vlans_grp = vlan_grp; -out: - kvfree(flow_group_in); - if (err && !IS_ERR_OR_NULL(vlan_grp)) - mlx5_destroy_flow_group(vlan_grp); - if (err && !IS_ERR_OR_NULL(acl)) - mlx5_destroy_flow_table(acl); - return err; -} - -void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan)) { - mlx5_del_flow_rules(vport->egress.allowed_vlan); - vport->egress.allowed_vlan = NULL; - } - - if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_rule)) { - mlx5_del_flow_rules(vport->egress.legacy.drop_rule); - vport->egress.legacy.drop_rule = NULL; - } -} - -void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (IS_ERR_OR_NULL(vport->egress.acl)) - return; - - esw_debug(esw->dev, "Destroy vport[%d] E-Switch egress ACL\n", vport->vport); - - esw_vport_cleanup_egress_rules(esw, vport); - mlx5_destroy_flow_group(vport->egress.allowed_vlans_grp); - mlx5_destroy_flow_group(vport->egress.drop_grp); - mlx5_destroy_flow_table(vport->egress.acl); - vport->egress.allowed_vlans_grp = NULL; - vport->egress.drop_grp = NULL; - 
vport->egress.acl = NULL; -} - -static int -esw_vport_create_legacy_ingress_acl_groups(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); - struct mlx5_core_dev *dev = esw->dev; - struct mlx5_flow_group *g; - void *match_criteria; - u32 *flow_group_in; - int err; - - flow_group_in = kvzalloc(inlen, GFP_KERNEL); - if (!flow_group_in) - return -ENOMEM; - - match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); - - MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - err = PTR_ERR(g); - esw_warn(dev, "vport[%d] ingress create untagged spoofchk flow group, err(%d)\n", - vport->vport, err); - goto spoof_err; - } - vport->ingress.legacy.allow_untagged_spoofchk_grp = g; - - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - err = PTR_ERR(g); - esw_warn(dev, "vport[%d] ingress create untagged flow group, err(%d)\n", - vport->vport, err); - goto untagged_err; - } - vport->ingress.legacy.allow_untagged_only_grp = g; - - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 2); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 2); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - err = PTR_ERR(g); - esw_warn(dev, "vport[%d] ingress create spoofchk flow group, err(%d)\n", - vport->vport, err); - goto allow_spoof_err; - } - vport->ingress.legacy.allow_spoofchk_only_grp = g; - - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 3); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 3); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - err = PTR_ERR(g); - esw_warn(dev, "vport[%d] ingress create drop flow group, err(%d)\n", - vport->vport, err); - goto drop_err; - } - vport->ingress.legacy.drop_grp = g; - kvfree(flow_group_in); - return 0; - -drop_err: - if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_spoofchk_only_grp)) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_spoofchk_only_grp); - vport->ingress.legacy.allow_spoofchk_only_grp = NULL; - } -allow_spoof_err: - if (!IS_ERR_OR_NULL(vport->ingress.legacy.allow_untagged_only_grp)) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_only_grp); - vport->ingress.legacy.allow_untagged_only_grp = NULL; - } -untagged_err: - if 
(!IS_ERR_OR_NULL(vport->ingress.legacy.allow_untagged_spoofchk_grp)) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_spoofchk_grp); - vport->ingress.legacy.allow_untagged_spoofchk_grp = NULL; - } -spoof_err: - kvfree(flow_group_in); - return err; -} - -int esw_vport_create_ingress_acl_table(struct mlx5_eswitch *esw, - struct mlx5_vport *vport, int table_size) -{ - struct mlx5_core_dev *dev = esw->dev; - struct mlx5_flow_namespace *root_ns; - struct mlx5_flow_table *acl; - int vport_index; - int err; - - if (!MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) - return -EOPNOTSUPP; - - esw_debug(dev, "Create vport[%d] ingress ACL log_max_size(%d)\n", - vport->vport, MLX5_CAP_ESW_INGRESS_ACL(dev, log_max_ft_size)); - - vport_index = mlx5_eswitch_vport_num_to_index(esw, vport->vport); - root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS, - vport_index); - if (!root_ns) { - esw_warn(dev, "Failed to get E-Switch ingress flow namespace for vport (%d)\n", - vport->vport); - return -EOPNOTSUPP; - } - - acl = mlx5_create_vport_flow_table(root_ns, 0, table_size, 0, vport->vport); - if (IS_ERR(acl)) { - err = PTR_ERR(acl); - esw_warn(dev, "vport[%d] ingress create flow Table, err(%d)\n", - vport->vport, err); - return err; - } - vport->ingress.acl = acl; - return 0; -} - -void esw_vport_destroy_ingress_acl_table(struct mlx5_vport *vport) -{ - if (!vport->ingress.acl) - return; - - mlx5_destroy_flow_table(vport->ingress.acl); - vport->ingress.acl = NULL; -} - -void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (vport->ingress.legacy.drop_rule) { - mlx5_del_flow_rules(vport->ingress.legacy.drop_rule); - vport->ingress.legacy.drop_rule = NULL; - } - - if (vport->ingress.allow_rule) { - mlx5_del_flow_rules(vport->ingress.allow_rule); - vport->ingress.allow_rule = NULL; - } -} - -static void esw_vport_disable_legacy_ingress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (!vport->ingress.acl) - return; - - esw_debug(esw->dev, "Destroy vport[%d] E-Switch ingress ACL\n", vport->vport); - - esw_vport_cleanup_ingress_rules(esw, vport); - if (vport->ingress.legacy.allow_spoofchk_only_grp) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_spoofchk_only_grp); - vport->ingress.legacy.allow_spoofchk_only_grp = NULL; - } - if (vport->ingress.legacy.allow_untagged_only_grp) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_only_grp); - vport->ingress.legacy.allow_untagged_only_grp = NULL; - } - if (vport->ingress.legacy.allow_untagged_spoofchk_grp) { - mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_spoofchk_grp); - vport->ingress.legacy.allow_untagged_spoofchk_grp = NULL; - } - if (vport->ingress.legacy.drop_grp) { - mlx5_destroy_flow_group(vport->ingress.legacy.drop_grp); - vport->ingress.legacy.drop_grp = NULL; - } - esw_vport_destroy_ingress_acl_table(vport); -} - -static int esw_vport_ingress_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - struct mlx5_fc *counter = vport->ingress.legacy.drop_counter; - struct mlx5_flow_destination drop_ctr_dst = {0}; - struct mlx5_flow_destination *dst = NULL; - struct mlx5_flow_act flow_act = {0}; - struct mlx5_flow_spec *spec = NULL; - int dest_num = 0; - int err = 0; - u8 *smac_v; - - /* The ingress acl table contains 4 groups - * (2 active rules at the same time - - * 1 allow rule from one of the first 3 groups. - * 1 drop rule from the last group): - * 1)Allow untagged traffic with smac=original mac. 
- * 2)Allow untagged traffic. - * 3)Allow traffic with smac=original mac. - * 4)Drop all other traffic. - */ - int table_size = 4; - - esw_vport_cleanup_ingress_rules(esw, vport); - - if (!vport->info.vlan && !vport->info.qos && !vport->info.spoofchk) { - esw_vport_disable_legacy_ingress_acl(esw, vport); - return 0; - } - - if (!vport->ingress.acl) { - err = esw_vport_create_ingress_acl_table(esw, vport, table_size); - if (err) { - esw_warn(esw->dev, - "vport[%d] enable ingress acl err (%d)\n", - err, vport->vport); - return err; - } - - err = esw_vport_create_legacy_ingress_acl_groups(esw, vport); - if (err) - goto out; - } - - esw_debug(esw->dev, - "vport[%d] configure ingress rules, vlan(%d) qos(%d)\n", - vport->vport, vport->info.vlan, vport->info.qos); - - spec = kvzalloc(sizeof(*spec), GFP_KERNEL); - if (!spec) { - err = -ENOMEM; - goto out; - } - - if (vport->info.vlan || vport->info.qos) - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); - - if (vport->info.spoofchk) { - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_47_16); - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_15_0); - smac_v = MLX5_ADDR_OF(fte_match_param, - spec->match_value, - outer_headers.smac_47_16); - ether_addr_copy(smac_v, vport->info.mac); - } - - spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; - vport->ingress.allow_rule = - mlx5_add_flow_rules(vport->ingress.acl, spec, - &flow_act, NULL, 0); - if (IS_ERR(vport->ingress.allow_rule)) { - err = PTR_ERR(vport->ingress.allow_rule); - esw_warn(esw->dev, - "vport[%d] configure ingress allow rule, err(%d)\n", - vport->vport, err); - vport->ingress.allow_rule = NULL; - goto out; - } - - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; - - /* Attach drop flow counter */ - if (counter) { - flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; - drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - drop_ctr_dst.counter_id = mlx5_fc_id(counter); - dst = &drop_ctr_dst; - dest_num++; - } - vport->ingress.legacy.drop_rule = - mlx5_add_flow_rules(vport->ingress.acl, NULL, - &flow_act, dst, dest_num); - if (IS_ERR(vport->ingress.legacy.drop_rule)) { - err = PTR_ERR(vport->ingress.legacy.drop_rule); - esw_warn(esw->dev, - "vport[%d] configure ingress drop rule, err(%d)\n", - vport->vport, err); - vport->ingress.legacy.drop_rule = NULL; - goto out; - } - kvfree(spec); - return 0; - -out: - esw_vport_disable_legacy_ingress_acl(esw, vport); - kvfree(spec); - return err; -} - -int mlx5_esw_create_vport_egress_acl_vlan(struct mlx5_eswitch *esw, - struct mlx5_vport *vport, - u16 vlan_id, u32 flow_action) -{ - struct mlx5_flow_act flow_act = {}; - struct mlx5_flow_spec *spec; - int err = 0; - - if (vport->egress.allowed_vlan) - return -EEXIST; - - spec = kvzalloc(sizeof(*spec), GFP_KERNEL); - if (!spec) - return -ENOMEM; - - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); - MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag); - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid); - MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vlan_id); - - spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - flow_act.action = flow_action; - vport->egress.allowed_vlan = - mlx5_add_flow_rules(vport->egress.acl, spec, - &flow_act, NULL, 0); - if (IS_ERR(vport->egress.allowed_vlan)) { - err = PTR_ERR(vport->egress.allowed_vlan); - 
esw_warn(esw->dev, - "vport[%d] configure egress vlan rule failed, err(%d)\n", - vport->vport, err); - vport->egress.allowed_vlan = NULL; - } - - kvfree(spec); - return err; -} - -static int esw_vport_egress_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - struct mlx5_fc *counter = vport->egress.legacy.drop_counter; - struct mlx5_flow_destination drop_ctr_dst = {0}; - struct mlx5_flow_destination *dst = NULL; - struct mlx5_flow_act flow_act = {0}; - int dest_num = 0; - int err = 0; - - esw_vport_cleanup_egress_rules(esw, vport); - - if (!vport->info.vlan && !vport->info.qos) { - esw_vport_disable_egress_acl(esw, vport); - return 0; - } - - err = esw_vport_enable_egress_acl(esw, vport); - if (err) { - mlx5_core_warn(esw->dev, - "failed to enable egress acl (%d) on vport[%d]\n", - err, vport->vport); - return err; - } - - esw_debug(esw->dev, - "vport[%d] configure egress rules, vlan(%d) qos(%d)\n", - vport->vport, vport->info.vlan, vport->info.qos); - - /* Allowed vlan rule */ - err = mlx5_esw_create_vport_egress_acl_vlan(esw, vport, vport->info.vlan, - MLX5_FLOW_CONTEXT_ACTION_ALLOW); - if (err) - return err; - - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; - - /* Attach egress drop flow counter */ - if (counter) { - flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; - drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - drop_ctr_dst.counter_id = mlx5_fc_id(counter); - dst = &drop_ctr_dst; - dest_num++; - } - vport->egress.legacy.drop_rule = - mlx5_add_flow_rules(vport->egress.acl, NULL, - &flow_act, dst, dest_num); - if (IS_ERR(vport->egress.legacy.drop_rule)) { - err = PTR_ERR(vport->egress.legacy.drop_rule); - esw_warn(esw->dev, - "vport[%d] configure egress drop rule failed, err(%d)\n", - vport->vport, err); - vport->egress.legacy.drop_rule = NULL; - } - - return err; -} - static bool element_type_supported(struct mlx5_eswitch *esw, int type) { const struct mlx5_core_dev *dev = esw->dev; @@ -1653,44 +1148,19 @@ static int esw_vport_create_legacy_acl_tables(struct mlx5_eswitch *esw, if (mlx5_esw_is_manager_vport(esw, vport->vport)) return 0; - if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { - vport->ingress.legacy.drop_counter = mlx5_fc_create(esw->dev, false); - if (IS_ERR(vport->ingress.legacy.drop_counter)) { - esw_warn(esw->dev, - "vport[%d] configure ingress drop rule counter failed\n", - vport->vport); - vport->ingress.legacy.drop_counter = NULL; - } - } - - ret = esw_vport_ingress_config(esw, vport); + ret = esw_acl_ingress_lgcy_setup(esw, vport); if (ret) goto ingress_err; - if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, flow_counter)) { - vport->egress.legacy.drop_counter = mlx5_fc_create(esw->dev, false); - if (IS_ERR(vport->egress.legacy.drop_counter)) { - esw_warn(esw->dev, - "vport[%d] configure egress drop rule counter failed\n", - vport->vport); - vport->egress.legacy.drop_counter = NULL; - } - } - - ret = esw_vport_egress_config(esw, vport); + ret = esw_acl_egress_lgcy_setup(esw, vport); if (ret) goto egress_err; return 0; egress_err: - esw_vport_disable_legacy_ingress_acl(esw, vport); - mlx5_fc_destroy(esw->dev, vport->egress.legacy.drop_counter); - vport->egress.legacy.drop_counter = NULL; - + esw_acl_ingress_lgcy_cleanup(esw, vport); ingress_err: - mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); - vport->ingress.legacy.drop_counter = NULL; return ret; } @@ -1710,13 +1180,8 @@ static void esw_vport_destroy_legacy_acl_tables(struct mlx5_eswitch *esw, if (mlx5_esw_is_manager_vport(esw, vport->vport)) return; - 
esw_vport_disable_egress_acl(esw, vport); - mlx5_fc_destroy(esw->dev, vport->egress.legacy.drop_counter); - vport->egress.legacy.drop_counter = NULL; - - esw_vport_disable_legacy_ingress_acl(esw, vport); - mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); - vport->ingress.legacy.drop_counter = NULL; + esw_acl_egress_lgcy_cleanup(esw, vport); + esw_acl_ingress_lgcy_cleanup(esw, vport); } static void esw_vport_cleanup_acl(struct mlx5_eswitch *esw, @@ -2265,6 +1730,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) mutex_init(&esw->offloads.decap_tbl_lock); hash_init(esw->offloads.decap_tbl); atomic64_set(&esw->offloads.num_flows, 0); + ida_init(&esw->offloads.vport_metadata_ida); mutex_init(&esw->state_lock); mutex_init(&esw->mode_lock); @@ -2303,6 +1769,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) esw_offloads_cleanup_reps(esw); mutex_destroy(&esw->mode_lock); mutex_destroy(&esw->state_lock); + ida_destroy(&esw->offloads.vport_metadata_ida); mutex_destroy(&esw->offloads.mod_hdr.lock); mutex_destroy(&esw->offloads.encap_tbl_lock); mutex_destroy(&esw->offloads.decap_tbl_lock); @@ -2348,7 +1815,7 @@ int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, ether_addr_copy(evport->info.mac, mac); evport->info.node_guid = node_guid; if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) - err = esw_vport_ingress_config(esw, evport); + err = esw_acl_ingress_lgcy_setup(esw, evport); unlock: mutex_unlock(&esw->state_lock); @@ -2430,10 +1897,10 @@ int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, evport->info.vlan = vlan; evport->info.qos = qos; if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) { - err = esw_vport_ingress_config(esw, evport); + err = esw_acl_ingress_lgcy_setup(esw, evport); if (err) return err; - err = esw_vport_egress_config(esw, evport); + err = esw_acl_egress_lgcy_setup(esw, evport); } return err; @@ -2475,7 +1942,7 @@ int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw, "Spoofchk in set while MAC is invalid, vport(%d)\n", evport->vport); if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) - err = esw_vport_ingress_config(esw, evport); + err = esw_acl_ingress_lgcy_setup(esw, evport); if (err) evport->info.spoofchk = pschk; mutex_unlock(&esw->state_lock); @@ -2734,7 +2201,7 @@ static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev, if (!vport->enabled) goto unlock; - if (vport->egress.legacy.drop_counter) + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_counter)) mlx5_fc_query(dev, vport->egress.legacy.drop_counter, &stats->rx_dropped, &bytes); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index ccbbea3e0505..a5175e98c0b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -99,13 +99,19 @@ struct vport_ingress { struct vport_egress { struct mlx5_flow_table *acl; - struct mlx5_flow_group *allowed_vlans_grp; - struct mlx5_flow_group *drop_grp; struct mlx5_flow_handle *allowed_vlan; - struct { - struct mlx5_flow_handle *drop_rule; - struct mlx5_fc *drop_counter; - } legacy; + struct mlx5_flow_group *vlan_grp; + union { + struct { + struct mlx5_flow_group *drop_grp; + struct mlx5_flow_handle *drop_rule; + struct mlx5_fc *drop_counter; + } legacy; + struct { + struct mlx5_flow_group *fwd_grp; + struct mlx5_flow_handle *fwd_rule; + } offloads; + }; }; struct mlx5_vport_drop_stats { @@ -143,6 +149,8 @@ struct mlx5_vport { struct vport_ingress ingress; struct vport_egress 
egress; + u32 default_metadata; + u32 metadata; struct mlx5_vport_info info; @@ -218,6 +226,7 @@ struct mlx5_esw_offload { u8 inline_mode; atomic64_t num_flows; enum devlink_eswitch_encap_mode encap; + struct ida vport_metadata_ida; }; /* E-Switch MC FDB table hash node */ @@ -285,18 +294,10 @@ void esw_offloads_disable(struct mlx5_eswitch *esw); int esw_offloads_enable(struct mlx5_eswitch *esw); void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw); int esw_offloads_init_reps(struct mlx5_eswitch *esw); -void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw, - struct mlx5_vport *vport); -int esw_vport_create_ingress_acl_table(struct mlx5_eswitch *esw, - struct mlx5_vport *vport, - int table_size); -void esw_vport_destroy_ingress_acl_table(struct mlx5_vport *vport); -void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw, - struct mlx5_vport *vport); -int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport); -void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw, - struct mlx5_vport *vport); + +u32 mlx5_esw_match_metadata_alloc(struct mlx5_eswitch *esw); +void mlx5_esw_match_metadata_free(struct mlx5_eswitch *esw, u32 metadata); + int mlx5_esw_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32 rate_mbps); @@ -458,10 +459,6 @@ int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw, int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, u16 vport, u16 vlan, u8 qos, u8 set_flags); -int mlx5_esw_create_vport_egress_acl_vlan(struct mlx5_eswitch *esw, - struct mlx5_vport *vport, - u16 vlan_id, u32 flow_action); - static inline bool mlx5_esw_qos_enabled(struct mlx5_eswitch *esw) { return esw->qos.enabled; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 554fc64d8ef6..060354bb211a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -31,12 +31,14 @@ */ #include <linux/etherdevice.h> +#include <linux/idr.h> #include <linux/mlx5/driver.h> #include <linux/mlx5/mlx5_ifc.h> #include <linux/mlx5/vport.h> #include <linux/mlx5/fs.h> #include "mlx5_core.h" #include "eswitch.h" +#include "esw/acl/ofld.h" #include "esw/chains.h" #include "rdma.h" #include "en.h" @@ -234,13 +236,6 @@ static struct mlx5_eswitch_rep *mlx5_eswitch_get_rep(struct mlx5_eswitch *esw, return &esw->offloads.vport_reps[idx]; } -static bool -esw_check_ingress_prio_tag_enabled(const struct mlx5_eswitch *esw, - const struct mlx5_vport *vport) -{ - return (MLX5_CAP_GEN(esw->dev, prio_tag_required) && - mlx5_eswitch_is_vf_vport(esw, vport->vport)); -} static void mlx5_eswitch_set_rule_source_port(struct mlx5_eswitch *esw, @@ -1851,279 +1846,6 @@ static void esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw) mlx5_devcom_unregister_component(devcom, MLX5_DEVCOM_ESW_OFFLOADS); } -static int esw_vport_ingress_prio_tag_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - struct mlx5_flow_act flow_act = {0}; - struct mlx5_flow_spec *spec; - int err = 0; - - /* For prio tag mode, there is only 1 FTEs: - * 1) Untagged packets - push prio tag VLAN and modify metadata if - * required, allow - * Unmatched traffic is allowed by default - */ - spec = kvzalloc(sizeof(*spec), GFP_KERNEL); - if (!spec) - return -ENOMEM; - - /* Untagged packets - push prio tag VLAN, allow */ - MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); - MLX5_SET(fte_match_param, 
spec->match_value, outer_headers.cvlan_tag, 0); - spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | - MLX5_FLOW_CONTEXT_ACTION_ALLOW; - flow_act.vlan[0].ethtype = ETH_P_8021Q; - flow_act.vlan[0].vid = 0; - flow_act.vlan[0].prio = 0; - - if (vport->ingress.offloads.modify_metadata_rule) { - flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; - flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; - } - - vport->ingress.allow_rule = - mlx5_add_flow_rules(vport->ingress.acl, spec, - &flow_act, NULL, 0); - if (IS_ERR(vport->ingress.allow_rule)) { - err = PTR_ERR(vport->ingress.allow_rule); - esw_warn(esw->dev, - "vport[%d] configure ingress untagged allow rule, err(%d)\n", - vport->vport, err); - vport->ingress.allow_rule = NULL; - } - - kvfree(spec); - return err; -} - -static int esw_vport_add_ingress_acl_modify_metadata(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; - struct mlx5_flow_act flow_act = {}; - int err = 0; - u32 key; - - key = mlx5_eswitch_get_vport_metadata_for_match(esw, vport->vport); - key >>= ESW_SOURCE_PORT_METADATA_OFFSET; - - MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); - MLX5_SET(set_action_in, action, field, - MLX5_ACTION_IN_FIELD_METADATA_REG_C_0); - MLX5_SET(set_action_in, action, data, key); - MLX5_SET(set_action_in, action, offset, - ESW_SOURCE_PORT_METADATA_OFFSET); - MLX5_SET(set_action_in, action, length, - ESW_SOURCE_PORT_METADATA_BITS); - - vport->ingress.offloads.modify_metadata = - mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS, - 1, action); - if (IS_ERR(vport->ingress.offloads.modify_metadata)) { - err = PTR_ERR(vport->ingress.offloads.modify_metadata); - esw_warn(esw->dev, - "failed to alloc modify header for vport %d ingress acl (%d)\n", - vport->vport, err); - return err; - } - - flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | MLX5_FLOW_CONTEXT_ACTION_ALLOW; - flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; - vport->ingress.offloads.modify_metadata_rule = - mlx5_add_flow_rules(vport->ingress.acl, - NULL, &flow_act, NULL, 0); - if (IS_ERR(vport->ingress.offloads.modify_metadata_rule)) { - err = PTR_ERR(vport->ingress.offloads.modify_metadata_rule); - esw_warn(esw->dev, - "failed to add setting metadata rule for vport %d ingress acl, err(%d)\n", - vport->vport, err); - mlx5_modify_header_dealloc(esw->dev, vport->ingress.offloads.modify_metadata); - vport->ingress.offloads.modify_metadata_rule = NULL; - } - return err; -} - -static void esw_vport_del_ingress_acl_modify_metadata(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - if (vport->ingress.offloads.modify_metadata_rule) { - mlx5_del_flow_rules(vport->ingress.offloads.modify_metadata_rule); - mlx5_modify_header_dealloc(esw->dev, vport->ingress.offloads.modify_metadata); - - vport->ingress.offloads.modify_metadata_rule = NULL; - } -} - -static int esw_vport_create_ingress_acl_group(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); - struct mlx5_flow_group *g; - void *match_criteria; - u32 *flow_group_in; - u32 flow_index = 0; - int ret = 0; - - flow_group_in = kvzalloc(inlen, GFP_KERNEL); - if (!flow_group_in) - return -ENOMEM; - - if (esw_check_ingress_prio_tag_enabled(esw, vport)) { - /* This group is to hold FTE to match untagged packets when prio_tag - * is enabled. 
- */ - memset(flow_group_in, 0, inlen); - - match_criteria = MLX5_ADDR_OF(create_flow_group_in, - flow_group_in, match_criteria); - MLX5_SET(create_flow_group_in, flow_group_in, - match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - ret = PTR_ERR(g); - esw_warn(esw->dev, "vport[%d] ingress create untagged flow group, err(%d)\n", - vport->vport, ret); - goto prio_tag_err; - } - vport->ingress.offloads.metadata_prio_tag_grp = g; - flow_index++; - } - - if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { - /* This group holds an FTE with no matches for add metadata for - * tagged packets, if prio-tag is enabled (as a fallthrough), - * or all traffic in case prio-tag is disabled. - */ - memset(flow_group_in, 0, inlen); - MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); - MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); - - g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); - if (IS_ERR(g)) { - ret = PTR_ERR(g); - esw_warn(esw->dev, "vport[%d] ingress create drop flow group, err(%d)\n", - vport->vport, ret); - goto metadata_err; - } - vport->ingress.offloads.metadata_allmatch_grp = g; - } - - kvfree(flow_group_in); - return 0; - -metadata_err: - if (!IS_ERR_OR_NULL(vport->ingress.offloads.metadata_prio_tag_grp)) { - mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); - vport->ingress.offloads.metadata_prio_tag_grp = NULL; - } -prio_tag_err: - kvfree(flow_group_in); - return ret; -} - -static void esw_vport_destroy_ingress_acl_group(struct mlx5_vport *vport) -{ - if (vport->ingress.offloads.metadata_allmatch_grp) { - mlx5_destroy_flow_group(vport->ingress.offloads.metadata_allmatch_grp); - vport->ingress.offloads.metadata_allmatch_grp = NULL; - } - - if (vport->ingress.offloads.metadata_prio_tag_grp) { - mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); - vport->ingress.offloads.metadata_prio_tag_grp = NULL; - } -} - -static int esw_vport_ingress_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int num_ftes = 0; - int err; - - if (!mlx5_eswitch_vport_match_metadata_enabled(esw) && - !esw_check_ingress_prio_tag_enabled(esw, vport)) - return 0; - - esw_vport_cleanup_ingress_rules(esw, vport); - - if (mlx5_eswitch_vport_match_metadata_enabled(esw)) - num_ftes++; - if (esw_check_ingress_prio_tag_enabled(esw, vport)) - num_ftes++; - - err = esw_vport_create_ingress_acl_table(esw, vport, num_ftes); - if (err) { - esw_warn(esw->dev, - "failed to enable ingress acl (%d) on vport[%d]\n", - err, vport->vport); - return err; - } - - err = esw_vport_create_ingress_acl_group(esw, vport); - if (err) - goto group_err; - - esw_debug(esw->dev, - "vport[%d] configure ingress rules\n", vport->vport); - - if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { - err = esw_vport_add_ingress_acl_modify_metadata(esw, vport); - if (err) - goto metadata_err; - } - - if (esw_check_ingress_prio_tag_enabled(esw, vport)) { - err = esw_vport_ingress_prio_tag_config(esw, vport); - if (err) - goto prio_tag_err; - } - return 0; - -prio_tag_err: - esw_vport_del_ingress_acl_modify_metadata(esw, vport); -metadata_err: - esw_vport_destroy_ingress_acl_group(vport); -group_err: - 
esw_vport_destroy_ingress_acl_table(vport); - return err; -} - -static int esw_vport_egress_config(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) -{ - int err; - - if (!MLX5_CAP_GEN(esw->dev, prio_tag_required)) - return 0; - - esw_vport_cleanup_egress_rules(esw, vport); - - err = esw_vport_enable_egress_acl(esw, vport); - if (err) - return err; - - /* For prio tag mode, there is only 1 FTEs: - * 1) prio tag packets - pop the prio tag VLAN, allow - * Unmatched traffic is allowed by default - */ - esw_debug(esw->dev, - "vport[%d] configure prio tag egress rules\n", vport->vport); - - /* prio tag vlan rule - pop it so VF receives untagged packets */ - err = mlx5_esw_create_vport_egress_acl_vlan(esw, vport, 0, - MLX5_FLOW_CONTEXT_ACTION_VLAN_POP | - MLX5_FLOW_CONTEXT_ACTION_ALLOW); - if (err) - esw_vport_disable_egress_acl(esw, vport); - - return err; -} - static bool esw_check_vport_match_metadata_supported(const struct mlx5_eswitch *esw) { @@ -2156,25 +1878,83 @@ static bool esw_use_vport_metadata(const struct mlx5_eswitch *esw) esw_check_vport_match_metadata_supported(esw); } +u32 mlx5_esw_match_metadata_alloc(struct mlx5_eswitch *esw) +{ + u32 num_vports = GENMASK(ESW_VPORT_BITS - 1, 0) - 1; + u32 vhca_id_mask = GENMASK(ESW_VHCA_ID_BITS - 1, 0); + u32 vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); + u32 start; + u32 end; + int id; + + /* Make sure the vhca_id fits the ESW_VHCA_ID_BITS */ + WARN_ON_ONCE(vhca_id >= BIT(ESW_VHCA_ID_BITS)); + + /* Trim vhca_id to ESW_VHCA_ID_BITS */ + vhca_id &= vhca_id_mask; + + start = (vhca_id << ESW_VPORT_BITS); + end = start + num_vports; + if (!vhca_id) + start += 1; /* zero is reserved/invalid metadata */ + id = ida_alloc_range(&esw->offloads.vport_metadata_ida, start, end, GFP_KERNEL); + + return (id < 0) ? 0 : id; +} + +void mlx5_esw_match_metadata_free(struct mlx5_eswitch *esw, u32 metadata) +{ + ida_free(&esw->offloads.vport_metadata_ida, metadata); +} + +static int esw_offloads_vport_metadata_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (vport->vport == MLX5_VPORT_UPLINK) + return 0; + + vport->default_metadata = mlx5_esw_match_metadata_alloc(esw); + vport->metadata = vport->default_metadata; + return vport->metadata ? 
0 : -ENOSPC; +} + +static void esw_offloads_vport_metadata_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (vport->vport == MLX5_VPORT_UPLINK || !vport->default_metadata) + return; + + WARN_ON(vport->metadata != vport->default_metadata); + mlx5_esw_match_metadata_free(esw, vport->default_metadata); +} + int esw_vport_create_offloads_acl_tables(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { int err; - err = esw_vport_ingress_config(esw, vport); + err = esw_offloads_vport_metadata_setup(esw, vport); if (err) - return err; + goto metadata_err; + + err = esw_acl_ingress_ofld_setup(esw, vport); + if (err) + goto ingress_err; if (mlx5_eswitch_is_vf_vport(esw, vport->vport)) { - err = esw_vport_egress_config(esw, vport); - if (err) { - esw_vport_cleanup_ingress_rules(esw, vport); - esw_vport_del_ingress_acl_modify_metadata(esw, vport); - esw_vport_destroy_ingress_acl_group(vport); - esw_vport_destroy_ingress_acl_table(vport); - } + err = esw_acl_egress_ofld_setup(esw, vport); + if (err) + goto egress_err; } + + return 0; + +egress_err: + esw_acl_ingress_ofld_cleanup(esw, vport); +ingress_err: + esw_offloads_vport_metadata_cleanup(esw, vport); +metadata_err: return err; } @@ -2182,11 +1962,9 @@ void esw_vport_destroy_offloads_acl_tables(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { - esw_vport_disable_egress_acl(esw, vport); - esw_vport_cleanup_ingress_rules(esw, vport); - esw_vport_del_ingress_acl_modify_metadata(esw, vport); - esw_vport_destroy_ingress_acl_group(vport); - esw_vport_destroy_ingress_acl_table(vport); + esw_acl_egress_ofld_cleanup(vport); + esw_acl_ingress_ofld_cleanup(esw, vport); + esw_offloads_vport_metadata_cleanup(esw, vport); } static int esw_create_uplink_offloads_acl_tables(struct mlx5_eswitch *esw) @@ -2852,38 +2630,11 @@ EXPORT_SYMBOL(mlx5_eswitch_vport_match_metadata_enabled); u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, u16 vport_num) { - u32 vport_num_mask = GENMASK(ESW_VPORT_BITS - 1, 0); - u32 vhca_id_mask = GENMASK(ESW_VHCA_ID_BITS - 1, 0); - u32 vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); - u32 val; + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); - /* Make sure the vhca_id fits the ESW_VHCA_ID_BITS */ - WARN_ON_ONCE(vhca_id >= BIT(ESW_VHCA_ID_BITS)); - - /* Trim vhca_id to ESW_VHCA_ID_BITS */ - vhca_id &= vhca_id_mask; - - /* Make sure pf and ecpf map to end of ESW_VPORT_BITS range so they - * don't overlap with VF numbers, and themselves, after trimming. - */ - WARN_ON_ONCE((MLX5_VPORT_UPLINK & vport_num_mask) < - vport_num_mask - 1); - WARN_ON_ONCE((MLX5_VPORT_ECPF & vport_num_mask) < - vport_num_mask - 1); - WARN_ON_ONCE((MLX5_VPORT_UPLINK & vport_num_mask) == - (MLX5_VPORT_ECPF & vport_num_mask)); - - /* Make sure that the VF vport_num fits ESW_VPORT_BITS and don't - * overlap with pf and ecpf. 
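
Aside (not part of this series): each enabled vport now gets its match metadata from the per-eswitch ida via mlx5_esw_match_metadata_alloc() above, rather than encoding the raw vport number, so the trimming and overlap warnings being deleted here become unnecessary and the lookup side can simply shift the stored per-vport value into place. The vport keeps default_metadata separately from the active metadata, and the cleanup path WARNs if they differ, which suggests a later user may substitute its own value as long as it restores the default before teardown. A hypothetical sketch of such a user (example_override_metadata() and the elided ACL reconfiguration are assumptions, not code from this series):

static int example_override_metadata(struct mlx5_eswitch *esw,
				     struct mlx5_vport *vport)
{
	u32 tmp = mlx5_esw_match_metadata_alloc(esw);

	if (!tmp)
		return -ENOSPC;		/* ida range for this function exhausted */
	vport->metadata = tmp;
	/* ... re-create the ingress ACL so it matches on the new value ... */
	vport->metadata = vport->default_metadata;
	mlx5_esw_match_metadata_free(esw, tmp);
	return 0;
}

Restoring the default before the vport is torn down keeps the WARN_ON in esw_offloads_vport_metadata_cleanup() quiet.
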
- */ - if (vport_num != MLX5_VPORT_UPLINK && - vport_num != MLX5_VPORT_ECPF) - WARN_ON_ONCE(vport_num >= vport_num_mask - 1); - - /* We can now trim vport_num to ESW_VPORT_BITS */ - vport_num &= vport_num_mask; + if (WARN_ON_ONCE(IS_ERR(vport))) + return 0; - val = (vhca_id << ESW_VPORT_BITS) | vport_num; - return val << (32 - ESW_SOURCE_PORT_METADATA_BITS); + return vport->metadata << (32 - ESW_SOURCE_PORT_METADATA_BITS); } EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_match); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 92f2395dd31a..30de3bf35c6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1272,7 +1272,7 @@ static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) mlx5_debugfs_root); if (!priv->dbg_root) { dev_err(dev->device, "mlx5_core: error, Cannot create debugfs dir, aborting\n"); - return -ENOMEM; + goto err_dbg_root; } err = mlx5_health_init(dev); @@ -1289,15 +1289,27 @@ err_pagealloc_init: mlx5_health_cleanup(dev); err_health_init: debugfs_remove(dev->priv.dbg_root); - +err_dbg_root: + mutex_destroy(&priv->pgdir_mutex); + mutex_destroy(&priv->alloc_mutex); + mutex_destroy(&priv->bfregs.wc_head.lock); + mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->intf_state_mutex); return err; } static void mlx5_mdev_uninit(struct mlx5_core_dev *dev) { + struct mlx5_priv *priv = &dev->priv; + mlx5_pagealloc_cleanup(dev); mlx5_health_cleanup(dev); debugfs_remove_recursive(dev->priv.dbg_root); + mutex_destroy(&priv->pgdir_mutex); + mutex_destroy(&priv->alloc_mutex); + mutex_destroy(&priv->bfregs.wc_head.lock); + mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->intf_state_mutex); } #define MLX5_IB_MOD "mlx5_ib" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c index 48b6358b6845..890767a2a7cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c @@ -297,7 +297,8 @@ mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type) dmn->mdev = mdev; dmn->type = type; refcount_set(&dmn->refcount, 1); - mutex_init(&dmn->mutex); + mutex_init(&dmn->info.rx.mutex); + mutex_init(&dmn->info.tx.mutex); if (dr_domain_caps_init(mdev, dmn)) { mlx5dr_err(dmn, "Failed init domain, no caps\n"); @@ -345,9 +346,9 @@ int mlx5dr_domain_sync(struct mlx5dr_domain *dmn, u32 flags) int ret = 0; if (flags & MLX5DR_DOMAIN_SYNC_FLAGS_SW) { - mutex_lock(&dmn->mutex); + mlx5dr_domain_lock(dmn); ret = mlx5dr_send_ring_force_drain(dmn); - mutex_unlock(&dmn->mutex); + mlx5dr_domain_unlock(dmn); if (ret) { mlx5dr_err(dmn, "Force drain failed flags: %d, ret: %d\n", flags, ret); @@ -371,7 +372,8 @@ int mlx5dr_domain_destroy(struct mlx5dr_domain *dmn) dr_domain_uninit_cache(dmn); dr_domain_uninit_resources(dmn); dr_domain_caps_uninit(dmn); - mutex_destroy(&dmn->mutex); + mutex_destroy(&dmn->info.tx.mutex); + mutex_destroy(&dmn->info.rx.mutex); kfree(dmn); return 0; } @@ -379,7 +381,7 @@ int mlx5dr_domain_destroy(struct mlx5dr_domain *dmn) void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, struct mlx5dr_domain *peer_dmn) { - mutex_lock(&dmn->mutex); + mlx5dr_domain_lock(dmn); if (dmn->peer_dmn) refcount_dec(&dmn->peer_dmn->refcount); @@ -389,5 +391,5 @@ void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, if (dmn->peer_dmn) 
refcount_inc(&dmn->peer_dmn->refcount); - mutex_unlock(&dmn->mutex); + mlx5dr_domain_unlock(dmn); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index a95938874798..31abcbb95ca2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -690,7 +690,7 @@ mlx5dr_matcher_create(struct mlx5dr_table *tbl, refcount_set(&matcher->refcount, 1); INIT_LIST_HEAD(&matcher->matcher_list); - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); ret = dr_matcher_init(matcher, mask); if (ret) @@ -700,14 +700,14 @@ mlx5dr_matcher_create(struct mlx5dr_table *tbl, if (ret) goto matcher_uninit; - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); return matcher; matcher_uninit: dr_matcher_uninit(matcher); free_matcher: - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); kfree(matcher); dec_ref: refcount_dec(&tbl->refcount); @@ -791,13 +791,13 @@ int mlx5dr_matcher_destroy(struct mlx5dr_matcher *matcher) if (refcount_read(&matcher->refcount) > 1) return -EBUSY; - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); dr_matcher_remove_from_tbl(matcher); dr_matcher_uninit(matcher); refcount_dec(&matcher->tbl->refcount); - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); kfree(matcher); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c index cce3ee7a6614..cd708dcc2e3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -938,7 +938,10 @@ static bool dr_rule_verify(struct mlx5dr_matcher *matcher, static int dr_rule_destroy_rule_nic(struct mlx5dr_rule *rule, struct mlx5dr_rule_rx_tx *nic_rule) { + mlx5dr_domain_nic_lock(nic_rule->nic_matcher->nic_tbl->nic_dmn); dr_rule_clean_rule_members(rule, nic_rule); + mlx5dr_domain_nic_unlock(nic_rule->nic_matcher->nic_tbl->nic_dmn); + return 0; } @@ -1039,18 +1042,18 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, if (dr_rule_skip(dmn->type, nic_dmn->ste_type, &matcher->mask, param)) return 0; + hw_ste_arr = kzalloc(DR_RULE_MAX_STE_CHAIN * DR_STE_SIZE, GFP_KERNEL); + if (!hw_ste_arr) + return -ENOMEM; + + mlx5dr_domain_nic_lock(nic_dmn); + ret = mlx5dr_matcher_select_builders(matcher, nic_matcher, dr_rule_get_ipv(¶m->outer), dr_rule_get_ipv(¶m->inner)); if (ret) - goto out_err; - - hw_ste_arr = kzalloc(DR_RULE_MAX_STE_CHAIN * DR_STE_SIZE, GFP_KERNEL); - if (!hw_ste_arr) { - ret = -ENOMEM; - goto out_err; - } + goto free_hw_ste; /* Set the tag values inside the ste array */ ret = mlx5dr_ste_build_ste_arr(matcher, nic_matcher, param, hw_ste_arr); @@ -1115,6 +1118,8 @@ dr_rule_create_rule_nic(struct mlx5dr_rule *rule, if (htbl) mlx5dr_htbl_put(htbl); + mlx5dr_domain_nic_unlock(nic_dmn); + kfree(hw_ste_arr); return 0; @@ -1129,8 +1134,8 @@ free_rule: kfree(ste_info); } free_hw_ste: + mlx5dr_domain_nic_unlock(nic_dmn); kfree(hw_ste_arr); -out_err: return ret; } @@ -1232,31 +1237,23 @@ struct mlx5dr_rule *mlx5dr_rule_create(struct mlx5dr_matcher *matcher, { struct mlx5dr_rule *rule; - mutex_lock(&matcher->tbl->dmn->mutex); refcount_inc(&matcher->refcount); rule = dr_rule_create_rule(matcher, value, num_actions, actions); if (!rule) refcount_dec(&matcher->refcount); - mutex_unlock(&matcher->tbl->dmn->mutex); - return rule; } int mlx5dr_rule_destroy(struct 
mlx5dr_rule *rule) { struct mlx5dr_matcher *matcher = rule->matcher; - struct mlx5dr_table *tbl = rule->matcher->tbl; int ret; - mutex_lock(&tbl->dmn->mutex); - ret = dr_rule_destroy_rule(rule); - - mutex_unlock(&tbl->dmn->mutex); - if (!ret) refcount_dec(&matcher->refcount); + return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index b8d97d44be7b..f421013b0b54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -357,9 +357,11 @@ static int dr_postsend_icm_data(struct mlx5dr_domain *dmn, u32 buff_offset; int ret; + spin_lock(&send_ring->lock); + ret = dr_handle_pending_wc(dmn, send_ring); if (ret) - return ret; + goto out_unlock; if (send_info->write.length > dmn->info.max_inline_size) { buff_offset = (send_ring->tx_head & @@ -377,7 +379,9 @@ static int dr_postsend_icm_data(struct mlx5dr_domain *dmn, dr_fill_data_segs(send_ring, send_info); dr_post_send(send_ring->qp, send_info); - return 0; +out_unlock: + spin_unlock(&send_ring->lock); + return ret; } static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn, @@ -563,9 +567,7 @@ int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn, send_info.remote_addr = action->rewrite.chunk->mr_addr; send_info.rkey = action->rewrite.chunk->rkey; - mutex_lock(&dmn->mutex); ret = dr_postsend_icm_data(dmn, &send_info); - mutex_unlock(&dmn->mutex); return ret; } @@ -886,6 +888,7 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn) init_attr.pdn = dmn->pdn; init_attr.uar = dmn->uar; init_attr.max_send_wr = QUEUE_SIZE; + spin_lock_init(&dmn->send_ring->lock); dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr); if (!dmn->send_ring->qp) { @@ -990,7 +993,9 @@ int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn) return ret; } + spin_lock(&send_ring->lock); ret = dr_handle_pending_wc(dmn, send_ring); + spin_unlock(&send_ring->lock); return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c index c2fe48d7b75a..b599b6beb5b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c @@ -14,7 +14,7 @@ int mlx5dr_table_set_miss_action(struct mlx5dr_table *tbl, if (action && action->action_type != DR_ACTION_TYP_FT) return -EOPNOTSUPP; - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); if (!list_empty(&tbl->matcher_list)) last_matcher = list_last_entry(&tbl->matcher_list, @@ -78,7 +78,7 @@ int mlx5dr_table_set_miss_action(struct mlx5dr_table *tbl, refcount_inc(&action->refcount); out: - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); return ret; } @@ -95,7 +95,7 @@ static void dr_table_uninit_fdb(struct mlx5dr_table *tbl) static void dr_table_uninit(struct mlx5dr_table *tbl) { - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); switch (tbl->dmn->type) { case MLX5DR_DOMAIN_TYPE_NIC_RX: @@ -112,7 +112,7 @@ static void dr_table_uninit(struct mlx5dr_table *tbl) break; } - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); } static int dr_table_init_nic(struct mlx5dr_domain *dmn, @@ -177,7 +177,7 @@ static int dr_table_init(struct mlx5dr_table *tbl) INIT_LIST_HEAD(&tbl->matcher_list); - mutex_lock(&tbl->dmn->mutex); + mlx5dr_domain_lock(tbl->dmn); switch (tbl->dmn->type) { case MLX5DR_DOMAIN_TYPE_NIC_RX: @@ -201,7 +201,7 @@ static int 
dr_table_init(struct mlx5dr_table *tbl) break; } - mutex_unlock(&tbl->dmn->mutex); + mlx5dr_domain_unlock(tbl->dmn); return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 984783238baa..c6d5a81d138b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -636,6 +636,7 @@ struct mlx5dr_domain_rx_tx { u64 drop_icm_addr; u64 default_icm_addr; enum mlx5dr_ste_entry_type ste_type; + struct mutex mutex; /* protect rx/tx domain */ }; struct mlx5dr_domain_info { @@ -660,7 +661,6 @@ struct mlx5dr_domain { struct mlx5_uars_page *uar; enum mlx5dr_domain_type type; refcount_t refcount; - struct mutex mutex; /* protect domain */ struct mlx5dr_icm_pool *ste_icm_pool; struct mlx5dr_icm_pool *action_icm_pool; struct mlx5dr_send_ring *send_ring; @@ -814,6 +814,28 @@ struct mlx5dr_icm_chunk { struct list_head *miss_list; }; +static inline void mlx5dr_domain_nic_lock(struct mlx5dr_domain_rx_tx *nic_dmn) +{ + mutex_lock(&nic_dmn->mutex); +} + +static inline void mlx5dr_domain_nic_unlock(struct mlx5dr_domain_rx_tx *nic_dmn) +{ + mutex_unlock(&nic_dmn->mutex); +} + +static inline void mlx5dr_domain_lock(struct mlx5dr_domain *dmn) +{ + mlx5dr_domain_nic_lock(&dmn->info.rx); + mlx5dr_domain_nic_lock(&dmn->info.tx); +} + +static inline void mlx5dr_domain_unlock(struct mlx5dr_domain *dmn) +{ + mlx5dr_domain_nic_unlock(&dmn->info.tx); + mlx5dr_domain_nic_unlock(&dmn->info.rx); +} + static inline int mlx5dr_matcher_supp_flex_parser_icmp_v4(struct mlx5dr_cmd_caps *caps) { @@ -1043,6 +1065,7 @@ struct mlx5dr_send_ring { struct ib_wc wc[MAX_SEND_CQE]; u8 sync_buff[MIN_READ_SYNC]; struct mlx5dr_mr *sync_mr; + spinlock_t lock; /* Protect the data path of the send ring */ }; int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c15a92163c1f..4c972d8abf31 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1313,8 +1313,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; - int ret, opt, rcv_pdu_size; + int ret, rcv_pdu_size; queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); @@ -1337,60 +1336,24 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } /* Single syn retry */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_SYNCNT sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_syncnt(queue->sock->sk, 1); /* Set TCP no delay */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_nodelay(queue->sock->sk); /* * Cleanup whatever is sitting in the TCP transmit queue on socket * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. 
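
Aside (not part of this series): the nvme-tcp queue setup in this hunk is converted from per-option kernel_setsockopt() calls to the new typed helpers. All of them take and release the socket lock internally; only the range-checked ones, such as tcp_sock_set_syncnt(), can return an error. A minimal consolidated sketch of the same setup, assuming a hypothetical example_setup_tcp_sock() wrapper and the <linux/tcp.h>, <net/ip.h> and <net/sock.h> declarations added by this series:

static int example_setup_tcp_sock(struct socket *sock, int tos, u32 prio)
{
	int ret;

	ret = tcp_sock_set_syncnt(sock->sk, 1);	/* single SYN retry */
	if (ret)
		return ret;
	tcp_sock_set_nodelay(sock->sk);		/* disable Nagle */
	sock_no_linger(sock->sk);		/* drop queued data on close */
	if (prio > 0)
		sock_set_priority(sock->sk, prio);
	if (tos >= 0)
		ip_sock_set_tos(sock->sk, tos);
	return 0;
}
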
*/ - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) { - dev_err(nctrl->device, - "failed to set SO_LINGER sock opt %d\n", ret); - goto err_sock; - } + sock_no_linger(queue->sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - dev_err(ctrl->ctrl.device, - "failed to set SO_PRIORITY sock opt, ret %d\n", - ret); - goto err_sock; - } - } + if (so_priority > 0) + sock_set_priority(queue->sock->sk, so_priority); /* Set socket type of service */ - if (nctrl->opts->tos >= 0) { - opt = nctrl->opts->tos; - ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(nctrl->device, - "failed to set IP_TOS sock opt %d\n", ret); - goto err_sock; - } - } + if (nctrl->opts->tos >= 0) + ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos); queue->sock->sk->sk_allocation = GFP_ATOMIC; nvme_tcp_set_queue_io_cpu(queue); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index f0da04e960f4..4546049a96b3 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1429,7 +1429,6 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; struct inet_sock *inet = inet_sk(sock->sk); - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret; ret = kernel_getsockname(sock, @@ -1447,27 +1446,14 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. */ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) - return ret; + sock_no_linger(sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) - return ret; - } + if (so_priority > 0) + sock_set_priority(sock->sk, so_priority); /* Set socket type of service */ - if (inet->rcv_tos > 0) { - int tos = inet->rcv_tos; - - ret = kernel_setsockopt(sock, SOL_IP, IP_TOS, - (char *)&tos, sizeof(tos)); - if (ret) - return ret; - } + if (inet->rcv_tos > 0) + ip_sock_set_tos(sock->sk, inet->rcv_tos); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = queue; @@ -1588,7 +1574,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) { struct nvmet_tcp_port *port; __kernel_sa_family_t af; - int opt, ret; + int ret; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) @@ -1632,30 +1618,10 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) port->sock->sk->sk_user_data = port; port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; - - opt = 1; - ret = kernel_setsockopt(port->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } - - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret); - goto err_sock; - } - - if (so_priority > 0) { - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - pr_err("failed to set SO_PRIORITY sock opt %d\n", ret); - goto err_sock; - } - } + sock_set_reuseaddr(port->sock->sk); + tcp_sock_set_nodelay(port->sock->sk); + if (so_priority > 0) + 
sock_set_priority(port->sock->sk, so_priority); ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, sizeof(port->addr)); diff --git a/drivers/target/iscsi/Kconfig b/drivers/target/iscsi/Kconfig index 1f93ea381353..922484ea4e30 100644 --- a/drivers/target/iscsi/Kconfig +++ b/drivers/target/iscsi/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISCSI_TARGET tristate "Linux-iSCSI.org iSCSI Target Mode Stack" - depends on NET + depends on INET select CRYPTO select CRYPTO_CRC32C select CRYPTO_CRC32C_INTEL if X86 diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 731ee67fe914..85748e338858 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -15,6 +15,7 @@ #include <linux/sched/signal.h> #include <linux/idr.h> #include <linux/tcp.h> /* TCP_NODELAY */ +#include <net/ip.h> #include <net/ipv6.h> /* ipv6_addr_v4mapped() */ #include <scsi/iscsi_proto.h> #include <target/target_core_base.h> @@ -855,7 +856,7 @@ int iscsit_setup_np( struct sockaddr_storage *sockaddr) { struct socket *sock = NULL; - int backlog = ISCSIT_TCP_BACKLOG, ret, opt = 0, len; + int backlog = ISCSIT_TCP_BACKLOG, ret, len; switch (np->np_network_transport) { case ISCSI_TCP: @@ -897,34 +898,10 @@ int iscsit_setup_np( /* * Set SO_REUSEADDR, and disable Nagel Algorithm with TCP_NODELAY. */ - /* FIXME: Someone please explain why this is endian-safe */ - opt = 1; - if (np->np_network_transport == ISCSI_TCP) { - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for TCP_NODELAY" - " failed: %d\n", ret); - goto fail; - } - } - - /* FIXME: Someone please explain why this is endian-safe */ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for SO_REUSEADDR" - " failed\n"); - goto fail; - } - - ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for IP_FREEBIND" - " failed\n"); - goto fail; - } + if (np->np_network_transport == ISCSI_TCP) + tcp_sock_set_nodelay(sock->sk); + sock_set_reuseaddr(sock->sk); + ip_sock_set_freebind(sock->sk); ret = kernel_bind(sock, (struct sockaddr *)&np->np_sockaddr, len); if (ret < 0) { diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 1ecc67da6c1a..e313dae01674 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -37,7 +37,6 @@ int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; - unsigned int min_level; int ret; _enter(""); @@ -57,9 +56,8 @@ int afs_open_socket(struct afs_net *net) srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); - min_level = RXRPC_SECURITY_ENCRYPT; - ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, - (void *)&min_level, sizeof(min_level)); + ret = rxrpc_sock_set_min_security_level(socket->sk, + RXRPC_SECURITY_ENCRYPT); if (ret < 0) goto error_2; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 28268ed461b8..ad8fb53b3682 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3929,14 +3929,8 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_rcvbuf = 140 * 1024; } - if (server->tcp_nodelay) { - int val = 1; - rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - if (rc) - cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n", - rc); - } + if 
(server->tcp_nodelay) + tcp_sock_set_nodelay(socket->sk); cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n", socket->sk->sk_sndbuf, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c97570eb2c18..99760063e000 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -325,7 +325,6 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; - int val = 1; __be32 rfc1002_marker; if (cifs_rdma_enabled(server)) { @@ -345,8 +344,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, } /* cork the socket */ - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, true); for (j = 0; j < num_rqst; j++) send_length += smb_rqst_len(server, &rqst[j]); @@ -435,9 +433,7 @@ unmask: } /* uncork it */ - val = 0; - kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, - (char *)&val, sizeof(val)); + tcp_sock_set_cork(ssocket->sk, false); if ((total_len > 0) && (total_len != send_length)) { cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index cdfaf4f0e11a..69333728d871 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -724,7 +724,7 @@ out_close: } /* Listening socket is busy, accept a connection */ -static int tcp_accept_from_sock(struct connection *con) +static int accept_from_sock(struct connection *con) { int result; struct sockaddr_storage peeraddr; @@ -852,123 +852,6 @@ accept_err: return result; } -static int sctp_accept_from_sock(struct connection *con) -{ - /* Check that the new node is in the lockspace */ - struct sctp_prim prim; - int nodeid; - int prim_len, ret; - int addr_len; - struct connection *newcon; - struct connection *addcon; - struct socket *newsock; - - mutex_lock(&connections_lock); - if (!dlm_allow_conn) { - mutex_unlock(&connections_lock); - return -1; - } - mutex_unlock(&connections_lock); - - mutex_lock_nested(&con->sock_mutex, 0); - - ret = kernel_accept(con->sock, &newsock, O_NONBLOCK); - if (ret < 0) - goto accept_err; - - memset(&prim, 0, sizeof(struct sctp_prim)); - prim_len = sizeof(struct sctp_prim); - - ret = kernel_getsockopt(newsock, IPPROTO_SCTP, SCTP_PRIMARY_ADDR, - (char *)&prim, &prim_len); - if (ret < 0) { - log_print("getsockopt/sctp_primary_addr failed: %d", ret); - goto accept_err; - } - - make_sockaddr(&prim.ssp_addr, 0, &addr_len); - ret = addr_to_nodeid(&prim.ssp_addr, &nodeid); - if (ret) { - unsigned char *b = (unsigned char *)&prim.ssp_addr; - - log_print("reject connect from unknown addr"); - print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, - b, sizeof(struct sockaddr_storage)); - goto accept_err; - } - - newcon = nodeid2con(nodeid, GFP_NOFS); - if (!newcon) { - ret = -ENOMEM; - goto accept_err; - } - - mutex_lock_nested(&newcon->sock_mutex, 1); - - if (newcon->sock) { - struct connection *othercon = newcon->othercon; - - if (!othercon) { - othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); - if (!othercon) { - log_print("failed to allocate incoming socket"); - mutex_unlock(&newcon->sock_mutex); - ret = -ENOMEM; - goto accept_err; - } - othercon->nodeid = nodeid; - othercon->rx_action = receive_from_sock; - mutex_init(&othercon->sock_mutex); - INIT_LIST_HEAD(&othercon->writequeue); - spin_lock_init(&othercon->writequeue_lock); - INIT_WORK(&othercon->swork, process_send_sockets); - INIT_WORK(&othercon->rwork, process_recv_sockets); - set_bit(CF_IS_OTHERCON, &othercon->flags); - } - 
mutex_lock_nested(&othercon->sock_mutex, 2); - if (!othercon->sock) { - newcon->othercon = othercon; - add_sock(newsock, othercon); - addcon = othercon; - mutex_unlock(&othercon->sock_mutex); - } else { - printk("Extra connection from node %d attempted\n", nodeid); - ret = -EAGAIN; - mutex_unlock(&othercon->sock_mutex); - mutex_unlock(&newcon->sock_mutex); - goto accept_err; - } - } else { - newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon); - addcon = newcon; - } - - log_print("connected to %d", nodeid); - - mutex_unlock(&newcon->sock_mutex); - - /* - * Add it to the active queue in case we got data - * between processing the accept adding the socket - * to the read_sockets list - */ - if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) - queue_work(recv_workqueue, &addcon->rwork); - mutex_unlock(&con->sock_mutex); - - return 0; - -accept_err: - mutex_unlock(&con->sock_mutex); - if (newsock) - sock_release(newsock); - if (ret != -EAGAIN) - log_print("error accepting connection from node: %d", ret); - - return ret; -} - static void free_entry(struct writequeue_entry *e) { __free_page(e->page); @@ -1035,7 +918,6 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; - struct __kernel_sock_timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -1087,13 +969,10 @@ static void sctp_connect_to_sock(struct connection *con) * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. */ - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 5); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, 0); - memset(&tv, 0, sizeof(tv)); - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 0); if (result == -EINPROGRESS) result = 0; @@ -1132,7 +1011,6 @@ static void tcp_connect_to_sock(struct connection *con) struct sockaddr_storage saddr, src_addr; int addr_len; struct socket *sock = NULL; - int one = 1; int result; if (con->nodeid == 0) { @@ -1181,8 +1059,7 @@ static void tcp_connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, O_NONBLOCK); @@ -1224,7 +1101,6 @@ static struct socket *tcp_create_listen_sock(struct connection *con, { struct socket *sock = NULL; int result = 0; - int one = 1; int addr_len; if (dlm_local_addr[0]->ss_family == AF_INET) @@ -1241,19 +1117,14 @@ static struct socket *tcp_create_listen_sock(struct connection *con, } /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, - sizeof(one)); + tcp_sock_set_nodelay(sock->sk); - result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&one, sizeof(one)); + sock_set_reuseaddr(sock->sk); - if (result < 0) { - log_print("Failed to set SO_REUSEADDR on socket: %d", result); - } write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = con; save_listen_callbacks(sock); - con->rx_action = tcp_accept_from_sock; + con->rx_action = accept_from_sock; con->connect_action = tcp_connect_to_sock; write_unlock_bh(&sock->sk->sk_callback_lock); @@ -1267,11 +1138,7 @@ static struct socket 
*tcp_create_listen_sock(struct connection *con, con->sock = NULL; goto create_out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&one, sizeof(one)); - if (result < 0) { - log_print("Set keepalive failed: %d", result); - } + sock_set_keepalive(sock->sk); result = sock->ops->listen(sock, 5); if (result < 0) { @@ -1309,7 +1176,6 @@ static int sctp_listen_for_all(void) struct socket *sock = NULL; int result = -EINVAL; struct connection *con = nodeid2con(0, GFP_NOFS); - int bufsize = NEEDED_RMEM; int one = 1; if (!con) @@ -1324,11 +1190,7 @@ static int sctp_listen_for_all(void) goto out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, - (char *)&bufsize, sizeof(bufsize)); - if (result) - log_print("Error increasing buffer space on socket %d", result); - + sock_set_rcvbuf(sock->sk, NEEDED_RMEM); result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); if (result < 0) @@ -1340,7 +1202,7 @@ static int sctp_listen_for_all(void) save_listen_callbacks(sock); con->sock = sock; con->sock->sk->sk_data_ready = lowcomms_data_ready; - con->rx_action = sctp_accept_from_sock; + con->rx_action = accept_from_sock; con->connect_action = sctp_connect_to_sock; write_unlock_bh(&sock->sk->sk_callback_lock); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2c512b40a940..79a231719460 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1441,22 +1441,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_nodelay(struct socket *sock) -{ - int val = 1; - - return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (void *)&val, sizeof(val)); -} - -static int o2net_set_usertimeout(struct socket *sock) -{ - int user_timeout = O2NET_TCP_USER_TIMEOUT; - - return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (void *)&user_timeout, sizeof(user_timeout)); -} - static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1636,17 +1620,8 @@ static void o2net_start_connect(struct work_struct *work) goto out; } - ret = o2net_set_nodelay(sc->sc_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } - - ret = o2net_set_usertimeout(sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(sc->sc_sock->sk); + tcp_sock_set_user_timeout(sock->sk, O2NET_TCP_USER_TIMEOUT); o2net_register_callbacks(sc->sc_sock->sk, sc); @@ -1832,17 +1807,8 @@ static int o2net_accept_one(struct socket *sock, int *more) *more = 1; new_sock->sk->sk_allocation = GFP_ATOMIC; - ret = o2net_set_nodelay(new_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } - - ret = o2net_set_usertimeout(new_sock); - if (ret) { - mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(new_sock->sk); + tcp_sock_set_user_timeout(new_sock->sk, O2NET_TCP_USER_TIMEOUT); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) diff --git a/include/linux/net.h b/include/linux/net.h index 6451425e828f..74ef5d7315f7 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_getsockopt(struct socket *sock, int level, int 
optname, char *optval, - int *optlen); int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bf44e85d709d..9aac824c523c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -497,4 +497,13 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); +void tcp_sock_set_cork(struct sock *sk, bool on); +int tcp_sock_set_keepcnt(struct sock *sk, int val); +int tcp_sock_set_keepidle(struct sock *sk, int val); +int tcp_sock_set_keepintvl(struct sock *sk, int val); +void tcp_sock_set_nodelay(struct sock *sk); +void tcp_sock_set_quickack(struct sock *sk, int val); +int tcp_sock_set_syncnt(struct sock *sk, int val); +void tcp_sock_set_user_timeout(struct sock *sk, u32 val); + #endif /* _LINUX_TCP_H */ diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ab988940bf04..91eacbdcf33d 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -72,4 +72,6 @@ bool rxrpc_kernel_call_is_complete(struct rxrpc_call *); void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); + #endif /* _NET_RXRPC_H */ diff --git a/include/net/ip.h b/include/net/ip.h index 5b317c9f4470..04ebe7bf54c6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -765,4 +765,10 @@ static inline bool inetdev_valid_mtu(unsigned int mtu) return likely(mtu >= IPV4_MIN_MTU); } +void ip_sock_set_freebind(struct sock *sk); +int ip_sock_set_mtu_discover(struct sock *sk, int val); +void ip_sock_set_pktinfo(struct sock *sk); +void ip_sock_set_recverr(struct sock *sk); +void ip_sock_set_tos(struct sock *sk, int val); + #endif /* _IP_H */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 39a00d3ef5e2..5e65bf2fd32d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1177,4 +1177,96 @@ int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); + +static inline int ip6_sock_set_v6only(struct sock *sk) +{ + if (inet_sk(sk)->inet_num) + return -EINVAL; + lock_sock(sk); + sk->sk_ipv6only = true; + release_sock(sk); + return 0; +} + +static inline void ip6_sock_set_recverr(struct sock *sk) +{ + lock_sock(sk); + inet6_sk(sk)->recverr = true; + release_sock(sk); +} + +static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) +{ + unsigned int pref = 0; + unsigned int prefmask = ~0; + + /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ + switch (val & (IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP | + IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { + case IPV6_PREFER_SRC_PUBLIC: + pref |= IPV6_PREFER_SRC_PUBLIC; + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case IPV6_PREFER_SRC_TMP: + pref |= IPV6_PREFER_SRC_TMP; + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case IPV6_PREFER_SRC_PUBTMP_DEFAULT: + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | + IPV6_PREFER_SRC_TMP); + break; + case 0: + break; + default: + return -EINVAL; + } + + /* check HOME/COA conflicts */ + switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { + case IPV6_PREFER_SRC_HOME: + prefmask &= ~IPV6_PREFER_SRC_COA; + break; + case IPV6_PREFER_SRC_COA: + pref |= IPV6_PREFER_SRC_COA; 
+ break; + case 0: + break; + default: + return -EINVAL; + } + + /* check CGA/NONCGA conflicts */ + switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { + case IPV6_PREFER_SRC_CGA: + case IPV6_PREFER_SRC_NONCGA: + case 0: + break; + default: + return -EINVAL; + } + + inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref; + return 0; +} + +static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) +{ + int ret; + + lock_sock(sk); + ret = __ip6_sock_set_addr_preferences(sk, val); + release_sock(sk); + return ret; +} + +static inline void ip6_sock_set_recvpktinfo(struct sock *sk) +{ + lock_sock(sk); + inet6_sk(sk)->rxopt.bits.rxinfo = true; + release_sock(sk); +} + #endif /* _NET_IPV6_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 3e8c6d4b4b59..d994daa418ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,4 +2688,14 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +int sock_bindtoindex(struct sock *sk, int ifindex); +void sock_enable_timestamps(struct sock *sk); +void sock_no_linger(struct sock *sk); +void sock_set_keepalive(struct sock *sk); +void sock_set_priority(struct sock *sk, u32 priority); +void sock_set_rcvbuf(struct sock *sk, int val); +void sock_set_reuseaddr(struct sock *sk); +void sock_set_reuseport(struct sock *sk); +void sock_set_sndtimeo(struct sock *sk, s64 secs); + #endif /* _SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index b681338a8320..66e4b8331850 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -437,6 +437,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); +void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f8ca5edc5f2c..27d6ab11f9ee 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -490,15 +490,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } - if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) { - int optval = 1; - - ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)&optval, sizeof(optval)); - if (ret) - pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", - ret); - } + if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) + tcp_sock_set_nodelay(sock->sk); con->sock = sock; return 0; diff --git a/net/core/sock.c b/net/core/sock.c index fd85e651ce28..2ca3425b519c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -566,7 +566,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -594,6 +594,18 @@ out: return ret; } +int sock_bindtoindex(struct sock *sk, int ifindex) +{ + int ret; + + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + static int sock_setbindtodevice(struct sock *sk, char __user *optval, int optlen) { @@ -634,10 +646,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - lock_sock(sk); - ret = 
sock_setbindtodevice_locked(sk, index); - release_sock(sk); - + return sock_bindtoindex(sk, index); out: #endif @@ -712,6 +721,111 @@ bool sk_mc_loop(struct sock *sk) } EXPORT_SYMBOL(sk_mc_loop); +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + sk->sk_lingertime = 0; + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + +void sock_set_priority(struct sock *sk, u32 priority) +{ + lock_sock(sk); + sk->sk_priority = priority; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_priority); + +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + lock_sock(sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + sk->sk_sndtimeo = secs * HZ; + else + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + } +} + +void sock_enable_timestamps(struct sock *sk) +{ + lock_sock(sk); + __sock_set_timestamps(sk, true, false, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_enable_timestamps); + +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. + */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -808,30 +922,7 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_rmem_max); -set_rcvbuf: - /* Ensure val * 2 fits into an int, to prevent max_t() - * from treating it as a negative value. - */ - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. 
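
Aside (not part of this series): the userspace SO_RCVBUF path keeps its clamp to sysctl_rmem_max (see the replacement hunk just below), while the exported sock_set_rcvbuf() above hands the value straight to __sock_set_rcvbuf(), so in-kernel callers get SO_RCVBUFFORCE-like unclamped behaviour. A hypothetical caller (the 256 KiB figure is only an illustration):

	/* sk->sk_rcvbuf ends up at 512 KiB: the value is doubled internally
	 * to account for struct sk_buff and bookkeeping overhead. */
	sock_set_rcvbuf(sk, 256 * 1024);
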
Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. - * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); + __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); break; case SO_RCVBUFFORCE: @@ -843,9 +934,8 @@ set_rcvbuf: /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ - if (val < 0) - val = 0; - goto set_rcvbuf; + __sock_set_rcvbuf(sk, max(val, 0)); + break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) @@ -903,28 +993,17 @@ set_rcvbuf: break; case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; case SO_TIMESTAMPNS_NEW: - if (valbool) { - if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) - sock_set_flag(sk, SOCK_TSTAMP_NEW); - else - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - - if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - else - sock_set_flag(sk, SOCK_RCVTSTAMPNS); - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - } + __sock_set_timestamps(sk, valbool, true, true); break; - case SO_TIMESTAMPING_NEW: sock_set_flag(sk, SOCK_TSTAMP_NEW); /* fall through */ @@ -1180,7 +1259,7 @@ set_rcvbuf: break; case SO_BINDTOIFINDEX: - ret = sock_setbindtodevice_locked(sk, val); + ret = sock_bindtoindex_locked(sk, val); break; default: diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f43d5f12aa86..84ec3703c909 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -560,6 +560,61 @@ out: return err; } +static void __ip_sock_set_tos(struct sock *sk, int val) +{ + if (sk->sk_type == SOCK_STREAM) { + val &= ~INET_ECN_MASK; + val |= inet_sk(sk)->tos & INET_ECN_MASK; + } + if (inet_sk(sk)->tos != val) { + inet_sk(sk)->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } +} + +void ip_sock_set_tos(struct sock *sk, int val) +{ + lock_sock(sk); + __ip_sock_set_tos(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_tos); + +void ip_sock_set_freebind(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->freebind = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_freebind); + +void ip_sock_set_recverr(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->recverr = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_recverr); + +int ip_sock_set_mtu_discover(struct sock *sk, int val) +{ + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) + return -EINVAL; + lock_sock(sk); + inet_sk(sk)->pmtudisc = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(ip_sock_set_mtu_discover); + +void ip_sock_set_pktinfo(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->cmsg_flags |= IP_CMSG_PKTINFO; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_pktinfo); /* * Socket option code for IP. 
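
Aside (not part of this series): the ip_sock_set_* helpers added above mirror the corresponding IP-level socket options for kernel sockets; only ip_sock_set_mtu_discover() can fail, rejecting values outside IP_PMTUDISC_DONT..IP_PMTUDISC_OMIT with -EINVAL. A hypothetical caller preparing a kernel UDP socket for ICMP error reporting and path-MTU discovery:

	ip_sock_set_recverr(sk);	/* queue ICMP errors on the socket error queue */
	ip_sock_set_pktinfo(sk);	/* deliver IP_PKTINFO control messages */
	if (ip_sock_set_mtu_discover(sk, IP_PMTUDISC_DO))
		return -EINVAL;
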
This is the end of the line after any @@ -823,15 +878,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE; break; case IP_TOS: /* This sets both TOS and Precedence */ - if (sk->sk_type == SOCK_STREAM) { - val &= ~INET_ECN_MASK; - val |= inet->tos & INET_ECN_MASK; - } - if (inet->tos != val) { - inet->tos = val; - sk->sk_priority = rt_tos2priority(val); - sk_dst_reset(sk); - } + __ip_sock_set_tos(sk, val); break; case IP_TTL: if (optlen < 1) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 970064996377..15d47d5e7951 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2801,6 +2801,163 @@ static void tcp_enable_tx_delay(void) } } +/* When set indicates to always queue non-full frames. Later the user clears + * this option and we transmit any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly filled frames when the + * user (for example) must write out headers with a write() call first and then + * use sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is stronger than + * TCP_NODELAY. + */ +static void __tcp_sock_set_cork(struct sock *sk, bool on) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (on) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle & TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } +} + +void tcp_sock_set_cork(struct sock *sk, bool on) +{ + lock_sock(sk); + __tcp_sock_set_cork(sk, on); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_cork); + +/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is + * remembered, but it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make an explicit push, which overrides + * even TCP_CORK for currently queued segments. 
+ */ +static void __tcp_sock_set_nodelay(struct sock *sk, bool on) +{ + if (on) { + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } else { + tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; + } +} + +void tcp_sock_set_nodelay(struct sock *sk) +{ + lock_sock(sk); + __tcp_sock_set_nodelay(sk, true); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_nodelay); + +static void __tcp_sock_set_quickack(struct sock *sk, int val) +{ + if (!val) { + inet_csk_enter_pingpong_mode(sk); + return; + } + + inet_csk_exit_pingpong_mode(sk); + if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + inet_csk_ack_scheduled(sk)) { + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; + tcp_cleanup_rbuf(sk, 1); + if (!(val & 1)) + inet_csk_enter_pingpong_mode(sk); + } +} + +void tcp_sock_set_quickack(struct sock *sk, int val) +{ + lock_sock(sk); + __tcp_sock_set_quickack(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_quickack); + +int tcp_sock_set_syncnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_SYNCNT) + return -EINVAL; + + lock_sock(sk); + inet_csk(sk)->icsk_syn_retries = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_syncnt); + +void tcp_sock_set_user_timeout(struct sock *sk, u32 val) +{ + lock_sock(sk); + inet_csk(sk)->icsk_user_timeout = val; + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_user_timeout); + +static int __tcp_sock_set_keepidle(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (val < 1 || val > MAX_TCP_KEEPIDLE) + return -EINVAL; + + tp->keepalive_time = val * HZ; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + u32 elapsed = keepalive_time_elapsed(tp); + + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + inet_csk_reset_keepalive_timer(sk, elapsed); + } + + return 0; +} + +int tcp_sock_set_keepidle(struct sock *sk, int val) +{ + int err; + + lock_sock(sk); + err = __tcp_sock_set_keepidle(sk, val); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(tcp_sock_set_keepidle); + +int tcp_sock_set_keepintvl(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPINTVL) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_intvl = val * HZ; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepintvl); + +int tcp_sock_set_keepcnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPCNT) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_probes = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepcnt); + /* * Socket option code for TCP. */ @@ -2898,20 +3055,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_NODELAY: - if (val) { - /* TCP_NODELAY is weaker than TCP_CORK, so that - * this option on corked socket is remembered, but - * it is not activated until cork is cleared. - * - * However, when TCP_NODELAY is set we make - * an explicit push, which overrides even TCP_CORK - * for currently queued segments. - */ - tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } else { - tp->nonagle &= ~TCP_NAGLE_OFF; - } + __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: @@ -2979,43 +3123,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_CORK: - /* When set indicates to always queue non-full frames. - * Later the user clears this option and we transmit - * any pending partial frames in the queue. 
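To illustrate the cork/nodelay semantics spelled out above, a rough sketch of
bracketing a two-part send with the new helpers, similar to the cork/uncork
pattern used by the transport conversions elsewhere in this series; the
wrapper function, its msghdr parameters and the error handling are
assumptions:

        #include <linux/net.h>
        #include <linux/tcp.h>

        static int example_corked_send(struct socket *sock,
                                       struct msghdr *hdr, struct msghdr *payload)
        {
                int ret;

                /* Nagle off for the connection as a whole... */
                tcp_sock_set_nodelay(sock->sk);

                /* ...but still coalesce this header + payload pair. */
                tcp_sock_set_cork(sock->sk, true);
                ret = sock_sendmsg(sock, hdr);
                if (ret >= 0)
                        ret = sock_sendmsg(sock, payload);
                tcp_sock_set_cork(sock->sk, false);     /* pushes pending frames */

                return ret;
        }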
This is - * meant to be used alongside sendfile() to get properly - * filled frames when the user (for example) must write - * out headers with a write() call first and then use - * sendfile to send out the data parts. - * - * TCP_CORK can be set together with TCP_NODELAY and it is - * stronger than TCP_NODELAY. - */ - if (val) { - tp->nonagle |= TCP_NAGLE_CORK; - } else { - tp->nonagle &= ~TCP_NAGLE_CORK; - if (tp->nonagle&TCP_NAGLE_OFF) - tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } + __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: - if (val < 1 || val > MAX_TCP_KEEPIDLE) - err = -EINVAL; - else { - tp->keepalive_time = val * HZ; - if (sock_flag(sk, SOCK_KEEPOPEN) && - !((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN))) { - u32 elapsed = keepalive_time_elapsed(tp); - if (tp->keepalive_time > elapsed) - elapsed = tp->keepalive_time - elapsed; - else - elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); - } - } + err = __tcp_sock_set_keepidle(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) @@ -3072,19 +3184,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_QUICKACK: - if (!val) { - inet_csk_enter_pingpong_mode(sk); - } else { - inet_csk_exit_pingpong_mode(sk); - if ((1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - inet_csk_ack_scheduled(sk)) { - icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - tcp_cleanup_rbuf(sk, 1); - if (!(val & 1)) - inet_csk_enter_pingpong_mode(sk); - } - } + __tcp_sock_set_quickack(sk, val); break; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4eef5b84fff1..ad6435ba6d72 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -404,7 +404,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) EXPORT_SYMBOL(tcp_req_err); /* TCP-LD (RFC 6069) logic */ -static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +void tcp_ld_RTO_revert(struct sock *sk, u32 seq) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -441,6 +441,7 @@ static void tcp_ld_RTO_revert(struct sock *sk, u32 seq) tcp_retransmit_timer(sk); } } +EXPORT_SYMBOL(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 150e6f0fdbf5..2158e8bddf41 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,9 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 58956a6b66a2..2e0ad1bc84a8 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -25,17 +25,12 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->ipv6_v6only) { - int val = 1; - - err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - (char *) &val, sizeof(val)); + err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e10258c2210e..adbfed6adf11 100644 --- 
a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -845,67 +845,10 @@ done: break; case IPV6_ADDR_PREFERENCES: - { - unsigned int pref = 0; - unsigned int prefmask = ~0; - if (optlen < sizeof(int)) goto e_inval; - - retv = -EINVAL; - - /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ - switch (val & (IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP| - IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { - case IPV6_PREFER_SRC_PUBLIC: - pref |= IPV6_PREFER_SRC_PUBLIC; - break; - case IPV6_PREFER_SRC_TMP: - pref |= IPV6_PREFER_SRC_TMP; - break; - case IPV6_PREFER_SRC_PUBTMP_DEFAULT: - break; - case 0: - goto pref_skip_pubtmp; - default: - goto e_inval; - } - - prefmask &= ~(IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP); -pref_skip_pubtmp: - - /* check HOME/COA conflicts */ - switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) { - case IPV6_PREFER_SRC_HOME: - break; - case IPV6_PREFER_SRC_COA: - pref |= IPV6_PREFER_SRC_COA; - case 0: - goto pref_skip_coa; - default: - goto e_inval; - } - - prefmask &= ~IPV6_PREFER_SRC_COA; -pref_skip_coa: - - /* check CGA/NONCGA conflicts */ - switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { - case IPV6_PREFER_SRC_CGA: - case IPV6_PREFER_SRC_NONCGA: - case 0: - break; - default: - goto e_inval; - } - - np->srcprefs = (np->srcprefs & prefmask) | pref; - retv = 0; - + retv = __ip6_sock_set_addr_preferences(sk, val); break; - } case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) goto e_inval; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 01a6f5111a77..b7415ca75c2d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -473,6 +473,15 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else sk->sk_err_soft = err; goto out; + case TCP_LISTEN: + break; + default: + /* check if this ICMP message allows revert of backoff. 
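A small sketch of the IPv6-level helpers used by the tunnel and sunrpc
conversions in this series; the surrounding function is hypothetical, and
ip6_sock_set_v6only() is expected to be applied before the socket is bound,
just like the IPV6_V6ONLY option it wraps:

        #include <linux/in6.h>
        #include <linux/net.h>
        #include <net/ipv6.h>

        static int example_ipv6_sock_opts(struct socket *sock)
        {
                int err;

                /* Restrict a PF_INET6 socket to IPv6 traffic only. */
                err = ip6_sock_set_v6only(sock->sk);
                if (err)
                        return err;

                /* Prefer public over temporary source addresses. */
                return ip6_sock_set_addr_preferences(sock->sk,
                                                     IPV6_PREFER_SRC_PUBLIC);
        }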
+ * (see RFC 6069) + */ + if (!fastopen && type == ICMPV6_DEST_UNREACH && + code == ICMPV6_NOROUTE) + tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && np->recverr) { diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 46782fac4c16..43db0eca911f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -89,15 +89,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = { { } }; -/* doing it this way avoids calling tcp_sk() */ -void rds_tcp_nonagle(struct socket *sock) -{ - int val = 1; - - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val, - sizeof(val)); -} - u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) { /* seq# of the last byte of data in tcp send buffer */ @@ -502,7 +493,7 @@ void rds_tcp_tune(struct socket *sock) struct net *net = sock_net(sk); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); lock_sock(sk); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 3c69361d21c7..bad9cf49d565 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -50,7 +50,6 @@ struct rds_tcp_statistics { /* tcp.c */ void rds_tcp_tune(struct socket *sock); -void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, @@ -71,9 +70,8 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); -int rds_tcp_keepalive(struct socket *sock); +void rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); -void rds_tcp_set_linger(struct socket *sock); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 008f50fb25dd..4e64598176b0 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -207,7 +207,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) if (sock) { if (rds_destroy_pending(cp->cp_conn)) - rds_tcp_set_linger(sock); + sock_no_linger(sock->sk); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 810a3a49e947..101cf14215a0 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -38,36 +38,19 @@ #include "rds.h" #include "tcp.h" -int rds_tcp_keepalive(struct socket *sock) +void rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int keepalive = 1; - int ret = 0; - - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&keepalive, sizeof(keepalive)); - if (ret < 0) - goto bail; - - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); - if (ret < 0) - goto bail; - - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - if (ret < 0) - goto bail; + sock_set_keepalive(sock->sk); + tcp_sock_set_keepcnt(sock->sk, keepcnt); + tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. 
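For comparison, a hedged sketch of the same keepalive tuning on an arbitrary
kernel TCP socket; unlike rds_tcp_keepalive() here it checks the helpers'
return values, and the 5/5/5 and 30 second numbers are purely illustrative:

        #include <linux/tcp.h>
        #include <net/sock.h>

        static int example_tcp_keepalive(struct sock *sk)
        {
                int err;

                sock_set_keepalive(sk);                 /* SO_KEEPALIVE */

                err = tcp_sock_set_keepidle(sk, 5);     /* TCP_KEEPIDLE, seconds */
                if (err)
                        return err;
                err = tcp_sock_set_keepintvl(sk, 5);    /* TCP_KEEPINTVL, seconds */
                if (err)
                        return err;
                err = tcp_sock_set_keepcnt(sk, 5);      /* TCP_KEEPCNT, probes */
                if (err)
                        return err;

                /* TCP_USER_TIMEOUT takes milliseconds and cannot fail. */
                tcp_sock_set_user_timeout(sk, 30 * 1000);
                return 0;
        }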
We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. */ - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); -bail: - return ret; + tcp_sock_set_keepintvl(sock->sk, keepidle); } /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the @@ -111,17 +94,6 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) return NULL; } -void rds_tcp_set_linger(struct socket *sock) -{ - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; - - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); -} - int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; @@ -160,10 +132,7 @@ int rds_tcp_accept_one(struct socket *sock) new_sock->ops = sock->ops; __module_get(new_sock->ops->owner); - ret = rds_tcp_keepalive(new_sock); - if (ret < 0) - goto out; - + rds_tcp_keepalive(new_sock); rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); @@ -241,7 +210,7 @@ rst_nsk: * be pending on it. By setting linger, we achieve the side-effect * of avoiding TIME_WAIT state on new_sock. */ - rds_tcp_set_linger(new_sock); + sock_no_linger(new_sock->sk); kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: @@ -303,7 +272,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) } sock->sk->sk_reuse = SK_CAN_REUSE; - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = sock->sk->sk_data_ready; diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 78a2554a4497..8c4d1d6e9249 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -38,23 +38,18 @@ #include "rds.h" #include "tcp.h" -static void rds_tcp_cork(struct socket *sock, int val) -{ - kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val)); -} - void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 1); + tcp_sock_set_cork(tc->t_sock->sk, true); } void rds_tcp_xmit_path_complete(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 0); + tcp_sock_set_cork(tc->t_sock->sk, false); } /* the core send_sem serializes this with other xmit and shutdown */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 15ee92d79581..394189b81849 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -571,6 +571,19 @@ out: return ret; } +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val) +{ + if (sk->sk_state != RXRPC_UNBOUND) + return -EISCONN; + if (val > RXRPC_SECURITY_MAX) + return -EINVAL; + lock_sock(sk); + rxrpc_sk(sk)->min_sec_level = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); + /* * set RxRPC socket options */ diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 01135e54d95d..c8b2097f499c 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -107,7 +107,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct sock *usk; - int ret, opt; + int ret; _enter("%p{%d,%d}", local, local->srx.transport_type, local->srx.transport.family); @@ -157,13 +157,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) switch (local->srx.transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ - opt = 1; - ret = 
kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip6_sock_set_recverr(local->socket->sk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. @@ -171,31 +165,13 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* Fall through */ case AF_INET: /* we want to receive ICMP errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_recverr(local->socket->sk); /* we want to set the don't fragment bit */ - opt = IP_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); /* We want receive timestamps. */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS_OLD, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + sock_enable_timestamps(local->socket->sk); break; default: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f8b632a5c619..1ba43c3df4ad 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -321,7 +321,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, struct kvec iov[2]; rxrpc_serial_t serial; size_t len; - int ret, opt; + int ret; _enter(",{%d}", skb->len); @@ -473,18 +473,14 @@ send_fragmentable: switch (conn->params.local->srx.transport.family) { case AF_INET6: case AF_INET: - opt = IP_PMTUDISC_DONT; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DONT); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); - opt = IP_PMTUDISC_DO; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DO); break; default: diff --git a/net/socket.c b/net/socket.c index 80422fc3c836..81a98b6cbd08 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3625,40 +3625,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) EXPORT_SYMBOL(kernel_getpeername); /** - * kernel_getsockopt - get a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Assigns the option length to @optlen. - * Returns 0 or an error. - */ - -int kernel_getsockopt(struct socket *sock, int level, int optname, - char *optval, int *optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int __user *uoptlen; - int err; - - uoptval = (char __user __force *) optval; - uoptlen = (int __user __force *) optlen; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); - else - err = sock->ops->getsockopt(sock, level, optname, uoptval, - uoptlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_getsockopt); - -/** * kernel_setsockopt - set a socket option (kernel space) * @sock: socket * @level: API level (SOL_SOCKET, ...) 
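The path-MTU toggling in rxrpc's fragmentable send path above reduces to a
small reusable pattern; the sketch below is an illustration only (the wrapper,
its parameters and the decision to ignore the setter's return value are not
part of the patch):

        #include <linux/in.h>
        #include <linux/net.h>
        #include <linux/uio.h>
        #include <net/ip.h>

        static int example_fragmentable_send(struct socket *sock,
                                             struct msghdr *msg,
                                             struct kvec *iov, size_t num,
                                             size_t len)
        {
                int ret;

                /* Let the stack fragment this message if it has to... */
                ip_sock_set_mtu_discover(sock->sk, IP_PMTUDISC_DONT);
                ret = kernel_sendmsg(sock, msg, iov, num, len);
                /* ...then restore path-MTU discovery afterwards. */
                ip_sock_set_mtu_discover(sock->sk, IP_PMTUDISC_DO);

                return ret;
        }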
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 023514e392b3..e7a0037d9b56 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -323,17 +323,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) { - struct svc_sock *svsk; - struct socket *sock; - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + sock_no_linger(svsk->sk_sock->sk); } /* @@ -603,8 +595,6 @@ static struct svc_xprt_class svc_udp_class = { static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { - int err, level, optname, one = 1; - svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); @@ -624,19 +614,14 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { case AF_INET: - level = SOL_IP; - optname = IP_PKTINFO; + ip_sock_set_pktinfo(svsk->sk_sock->sk); break; case AF_INET6: - level = SOL_IPV6; - optname = IPV6_RECVPKTINFO; + ip6_sock_set_recvpktinfo(svsk->sk_sock->sk); break; default: BUG(); } - err = kernel_setsockopt(svsk->sk_sock, level, optname, - (char *)&one, sizeof(one)); - dprintk("svc: kernel_setsockopt returned %d\n", err); } /* @@ -1337,7 +1322,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; int family; - int val; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("svc: svc_create_socket(%s, %d, %s)\n", @@ -1373,11 +1357,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, * getting requests from IPv4 remotes. Those should * be shunted to a PF_INET listener via rpcbind. */ - val = 1; if (family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - + ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ error = kernel_bind(sock, sin, len); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 845d0be805ec..3a143e250b9a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1594,21 +1594,6 @@ static int xs_get_random_port(void) return rand + min; } -/** - * xs_set_reuseaddr_port - set the socket's port and address reuse options - * @sock: socket - * - * Note that this function has to be called on all sockets that share the - * same port, and it must be called before binding. 
- */ -static void xs_sock_set_reuseport(struct socket *sock) -{ - int opt = 1; - - kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - (char *)&opt, sizeof(opt)); -} - static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; @@ -1801,7 +1786,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, xs_reclassify_socket(family, sock); if (reuseport) - xs_sock_set_reuseport(sock); + sock_set_reuseport(sock->sk); err = xs_bind(transport, sock); if (err) { @@ -2110,7 +2095,6 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); unsigned int keepidle; unsigned int keepcnt; - unsigned int opt_on = 1; unsigned int timeo; spin_lock(&xprt->transport_lock); @@ -2122,18 +2106,13 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, spin_unlock(&xprt->transport_lock); /* TCP Keepalive options */ - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&opt_on, sizeof(opt_on)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); + sock_set_keepalive(sock->sk); + tcp_sock_set_keepidle(sock->sk, keepidle); + tcp_sock_set_keepintvl(sock->sk, keepidle); + tcp_sock_set_keepcnt(sock->sk, keepcnt); /* TCP user timeout (see RFC5482) */ - kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&timeo, sizeof(timeo)); + tcp_sock_set_user_timeout(sock->sk, timeo); } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, @@ -2171,7 +2150,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!transport->inet) { struct sock *sk = sock->sk; - unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. @@ -2180,8 +2158,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) * knowledge about the normal duration of connections, * MAY override this as appropriate. 
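Putting the pre-bind options together, a hypothetical sketch; only
sock_set_reuseport() and sock_bindtoindex() come from this series, and, as the
removed xs_sock_set_reuseport() comment notes, SO_REUSEPORT has to be applied
before binding:

        #include <linux/net.h>
        #include <net/sock.h>

        static int example_prebind_setup(struct socket *sock,
                                         struct sockaddr *addr, int addrlen,
                                         int ifindex)
        {
                int err;

                sock_set_reuseport(sock->sk);              /* SO_REUSEPORT */

                err = sock_bindtoindex(sock->sk, ifindex); /* SO_BINDTOIFINDEX */
                if (err)
                        return err;

                return kernel_bind(sock, addr, addrlen);
        }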
*/ - kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, - (char *)&addr_pref, sizeof(addr_pref)); + if (xs_addr(xprt)->sa_family == PF_INET6) { + ip6_sock_set_addr_preferences(sk, + IPV6_PREFER_SRC_PUBLIC); + } xs_tcp_set_socket_timeouts(xprt, sock); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d6b67d07d22e..3734cdbedc9c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -196,17 +196,17 @@ static int tsk_importance(struct tipc_sock *tsk) return msg_importance(&tsk->phdr); } -static int tsk_set_importance(struct tipc_sock *tsk, int imp) +static struct tipc_sock *tipc_sk(const struct sock *sk) { - if (imp > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL; - msg_set_importance(&tsk->phdr, (u32)imp); - return 0; + return container_of(sk, struct tipc_sock, sk); } -static struct tipc_sock *tipc_sk(const struct sock *sk) +int tsk_set_importance(struct sock *sk, int imp) { - return container_of(sk, struct tipc_sock, sk); + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; + msg_set_importance(&tipc_sk(sk)->phdr, (u32)imp); + return 0; } static bool tsk_conn_cong(struct tipc_sock *tsk) @@ -2721,7 +2721,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, /* Connect new socket to it's peer */ tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); - tsk_set_importance(new_tsock, msg_importance(msg)); + tsk_set_importance(new_sk, msg_importance(msg)); if (msg_named(msg)) { new_tsock->conn_type = msg_nametype(msg); new_tsock->conn_instance = msg_nameinst(msg); @@ -3139,7 +3139,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - res = tsk_set_importance(tsk, value); + res = tsk_set_importance(sk, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 235b9679acee..b11575afc66f 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -75,4 +75,6 @@ u32 tipc_sock_get_portid(struct sock *sk); bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb); bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb); +int tsk_set_importance(struct sock *sk, int imp); + #endif diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 446af7bbd13e..1489cfb941d8 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -497,7 +497,6 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk) static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) { - int imp = TIPC_CRITICAL_IMPORTANCE; struct socket *lsock = NULL; struct sockaddr_tipc saddr; struct sock *sk; @@ -514,8 +513,9 @@ static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) sk->sk_user_data = srv; write_unlock_bh(&sk->sk_callback_lock); - rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE, - (char *)&imp, sizeof(imp)); + lock_sock(sk); + rc = tsk_set_importance(sk, TIPC_CRITICAL_IMPORTANCE); + release_sock(sk); if (rc < 0) goto err; diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 1e2f61262e4e..dee567f7576a 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. 
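Finally, a sketch of the reworked tsk_set_importance() calling convention; the
helper is declared only in net/tipc/socket.h and is not exported, so the
example assumes a caller inside TIPC itself that holds the socket lock, as the
topsrv listener setup above now does:

        #include <net/sock.h>

        #include "socket.h"     /* net/tipc/socket.h: tsk_set_importance() */

        static int example_set_importance(struct sock *sk, int imp)
        {
                int rc;

                lock_sock(sk);
                rc = tsk_set_importance(sk, imp); /* -EINVAL above TIPC_CRITICAL_IMPORTANCE */
                release_sock(sk);

                return rc;
        }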
Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal ipv4_torture" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -767,6 +767,62 @@ ipv6_large_grp() $IP nexthop flush >/dev/null 2>&1 } +ipv6_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 2001:db8:91::2 dev veth1 + done >/dev/null 2>&1 +} + +ipv6_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv6_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv6 runtime torture" + echo "--------------------" + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 2001:db8:101::1 nhid 102" + run_cmd "$IP route add 2001:db8:101::2 nhid 102" + + ipv6_del_add_loop1 & + pid1=$! + ipv6_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 2001:db8:101::1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv6 torture test" +} + + ipv4_fcnal() { local rc @@ -1313,6 +1369,61 @@ ipv4_compat_mode() sysctl_nexthop_compat_mode_set 1 "IPv4" } +ipv4_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 172.16.1.2 dev veth1 + done >/dev/null 2>&1 +} + +ipv4_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv4_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv4 runtime torture" + echo "--------------------" + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 101 via 172.16.2.2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 172.16.101.1 nhid 102" + run_cmd "$IP route add 172.16.101.2 nhid 102" + + ipv4_del_add_loop1 & + pid1=$! + ipv4_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 172.16.101.1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 172.16.101.2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 -B 172.16.101.2 -A 172.16.1.1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv4 torture test" +} + basic() { echo |