diff options
Diffstat (limited to 'net')
50 files changed, 539 insertions, 405 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 55275ef9a31a..a3a0a5e994f5 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -123,9 +123,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); - - /* Get rid of the vlan's reference to real_dev */ - dev_put(real_dev); } int vlan_check_real_dev(struct net_device *real_dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 90330b893134..ab6dee28536d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -843,6 +843,9 @@ static void vlan_dev_free(struct net_device *dev) free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; + + /* Get rid of the vlan's reference to real_dev */ + dev_put(vlan->real_dev); } void vlan_setup(struct net_device *dev) diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index fbcb15c7c29b..93730d30af54 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -890,7 +890,7 @@ out: batadv_tp_vars_put(tp_vars); - do_exit(0); + return 0; } /** diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 9bc55ecb37f9..8452b0fbb78c 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -75,6 +75,13 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; /* set default message type */ skcb->addr.type = J1939_TP; + + if (!j1939_address_is_valid(skcb->addr.sa)) { + netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n", + __func__); + goto done; + } + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { /* Type 1: with destination address */ skcb->addr.da = skcb->addr.pgn; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 6c0a0ebdd024..a271688780a2 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2023,6 +2023,11 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) extd = J1939_ETP; fallthrough; case J1939_TP_CMD_BAM: + if (cmd == J1939_TP_CMD_BAM && !j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: BAM to unicast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } fallthrough; case J1939_TP_CMD_RTS: if (skcb->addr.type != extd) @@ -2085,6 +2090,12 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) break; case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ + if (j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: abort to broadcast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } + if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_abort(priv, skb, true); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 013cbdb6cfe2..6a6898ee4049 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1153,12 +1153,11 @@ static int build_initial_monmap(struct ceph_mon_client *monc) int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) { - int err = 0; + int err; dout("init\n"); memset(monc, 0, sizeof(*monc)); monc->client = cl; - monc->monmap = NULL; mutex_init(&monc->mutex); err = build_initial_monmap(monc); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ff8624a7c964..1c5815530e0d 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5310,14 +5310,14 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) ceph_msgpool_destroy(&osdc->msgpool_op_reply); } -static int osd_req_op_copy_from_init(struct ceph_osd_request *req, - u64 src_snapid, u64 src_version, - struct ceph_object_id *src_oid, - struct ceph_object_locator *src_oloc, - u32 src_fadvise_flags, - u32 dst_fadvise_flags, - u32 truncate_seq, u64 truncate_size, - u8 copy_from_flags) +int osd_req_op_copy_from_init(struct ceph_osd_request *req, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, + u8 copy_from_flags) { struct ceph_osd_req_op *op; struct page **pages; @@ -5346,49 +5346,7 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, op->indata_len, 0, false, true); return 0; } - -int ceph_osdc_copy_from(struct ceph_osd_client *osdc, - u64 src_snapid, u64 src_version, - struct ceph_object_id *src_oid, - struct ceph_object_locator *src_oloc, - u32 src_fadvise_flags, - struct ceph_object_id *dst_oid, - struct ceph_object_locator *dst_oloc, - u32 dst_fadvise_flags, - u32 truncate_seq, u64 truncate_size, - u8 copy_from_flags) -{ - struct ceph_osd_request *req; - int ret; - - req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); - if (!req) - return -ENOMEM; - - req->r_flags = CEPH_OSD_FLAG_WRITE; - - ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); - ceph_oid_copy(&req->r_t.base_oid, dst_oid); - - ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, - src_oloc, src_fadvise_flags, - dst_fadvise_flags, truncate_seq, - truncate_size, copy_from_flags); - if (ret) - goto out; - - ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); - if (ret) - goto out; - - ceph_osdc_start_request(osdc, req, false); - ret = ceph_osdc_wait_request(osdc, req); - -out: - ceph_osdc_put_request(req); - return ret; -} -EXPORT_SYMBOL(ceph_osdc_copy_from); +EXPORT_SYMBOL(osd_req_op_copy_from_init); int __init ceph_osdc_setup(void) { diff --git a/net/core/datagram.c b/net/core/datagram.c index 15ab9ffb27fe..ee290776c661 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); - sk_mem_charge(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } diff --git a/net/core/dev.c b/net/core/dev.c index edeb811c454e..15ac064b5562 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6928,7 +6928,7 @@ void napi_disable(struct napi_struct *n) might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); - do { + for ( ; ; ) { val = READ_ONCE(n->state); if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); @@ -6937,7 +6937,10 @@ void napi_disable(struct napi_struct *n) new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); - } while (cmpxchg(&n->state, val, new) != val); + + if (cmpxchg(&n->state, val, new) == val) + break; + } hrtimer_cancel(&n->timer); diff --git a/net/core/devlink.c b/net/core/devlink.c index 6b5ee862429e..5ba4f9434acd 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -66,7 +66,7 @@ struct devlink { u8 reload_failed:1; refcount_t refcount; struct completion comp; - char priv[0] __aligned(NETDEV_ALIGN); + char priv[] __aligned(NETDEV_ALIGN); }; void *devlink_priv(struct devlink *devlink) diff --git a/net/core/filter.c b/net/core/filter.c index 8e8d3b49c297..e471c9b09670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9756,22 +9756,46 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, struct bpf_insn *insn) { - /* si->dst_reg = skb->data */ + int reg; + int temp_reg_off = offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, temp_reg); + + if (si->src_reg == si->dst_reg) { + /* We need an extra register, choose and save a register. */ + reg = BPF_REG_9; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); + } else { + reg = si->dst_reg; + } + + /* reg = skb->data */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), - si->dst_reg, si->src_reg, + reg, si->src_reg, offsetof(struct sk_buff, data)); /* AX = skb->len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, len)); - /* si->dst_reg = skb->data + skb->len */ - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); /* AX = skb->data_len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, data_len)); - /* si->dst_reg = skb->data + skb->len - skb->data_len */ - *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + /* reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); + + if (si->src_reg == si->dst_reg) { + /* Restore the saved register */ + *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, reg); + *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); + } return insn; } @@ -9782,11 +9806,33 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): insn = bpf_convert_data_end_access(si, insn); break; + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct sk_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); @@ -10423,8 +10469,10 @@ BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, return -EINVAL; if (unlikely(sk && sk_is_refcounted(sk))) return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ - if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) - return -ESOCKTNOSUPPORT; /* reject connected sockets */ + if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) + return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ + if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) + return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ /* Check if socket is suitable for packet L3/L4 protocol */ if (sk && sk->sk_protocol != ctx->protocol) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 67a9188d8a49..ba2f38246f07 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb, void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; - skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); @@ -3449,19 +3450,7 @@ EXPORT_SYMBOL(skb_split); */ static int skb_prepare_for_shift(struct sk_buff *skb) { - int ret = 0; - - if (skb_cloned(skb)) { - /* Save and restore truesize: pskb_expand_head() may reallocate - * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we - * cannot change truesize at this point. - */ - unsigned int save_truesize = skb->truesize; - - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - skb->truesize = save_truesize; - } - return ret; + return skb_unclone_keeptruesize(skb, GFP_ATOMIC); } /** diff --git a/net/core/sock.c b/net/core/sock.c index 9862eefce21e..8f2b2f2c0e7b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -976,7 +976,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if (!mem_cgroup_sockets_enabled || !sk->sk_memcg) + if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e252b8ec2b85..f39ef79ced67 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -511,12 +511,6 @@ static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } -static bool sk_is_tcp(const struct sock *sk) -{ - return sk->sk_type == SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP; -} - static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index cd60b94fc175..de1c849a0a70 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -101,6 +101,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct dsa_port *dp; u8 *extraction; u16 vlan_tpid; + u64 rew_val; /* Revert skb->data by the amount consumed by the DSA master, * so it points to the beginning of the frame. @@ -130,6 +131,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, ocelot_xfh_get_qos_class(extraction, &qos_class); ocelot_xfh_get_tag_type(extraction, &tag_type); ocelot_xfh_get_vlan_tci(extraction, &vlan_tci); + ocelot_xfh_get_rew_val(extraction, &rew_val); skb->dev = dsa_master_find_slave(netdev, 0, src_port); if (!skb->dev) @@ -143,6 +145,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, dsa_default_offload_fwd_mark(skb); skb->priority = qos_class; + OCELOT_SKB_CB(skb)->tstamp_lo = rew_val; /* Ocelot switches copy frames unmodified to the CPU. However, it is * possible for the user to request a VLAN modification through diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 9009f412151e..ee1e5806bc93 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -56,8 +56,7 @@ static int pause_reply_size(const struct ethnl_req_info *req_base, if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ - nla_total_size_64bit(sizeof(u64)) * - (ETHTOOL_A_PAUSE_STAT_MAX - 2); + nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3c6498dab6bd..b7796b4cf0a0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -862,6 +862,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, if (likely(skb)) { bool mem_scheduled; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); @@ -1318,6 +1319,15 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); + /* skb changing from pure zc to mixed, must charge zc */ + if (unlikely(skb_zcopy_pure(skb))) { + if (!sk_wmem_schedule(sk, skb->data_len)) + goto wait_for_space; + + sk_mem_charge(sk, skb->data_len); + skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + } + if (!sk_wmem_schedule(sk, copy)) goto wait_for_space; @@ -1338,8 +1348,16 @@ new_segment: } pfrag->offset += copy; } else { - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_space; + /* First append to a fragless skb builds initial + * pure zerocopy skb + */ + if (!skb->len) + skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; + + if (!skb_zcopy_pure(skb)) { + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_space; + } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 5f4d6f45d87f..f70aa0932bd6 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -172,6 +172,41 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, return ret; } +static int tcp_bpf_recvmsg_parser(struct sock *sk, + struct msghdr *msg, + size_t len, + int nonblock, + int flags, + int *addr_len) +{ + struct sk_psock *psock; + int copied; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + long timeo; + int data; + + timeo = sock_rcvtimeo(sk, nonblock); + data = tcp_msg_wait_data(sk, psock, timeo); + if (data && !sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + copied = -EAGAIN; + } + release_sock(sk); + sk_psock_put(sk, psock); + return copied; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -464,6 +499,8 @@ enum { enum { TCP_BPF_BASE, TCP_BPF_TX, + TCP_BPF_RX, + TCP_BPF_TXRX, TCP_BPF_NUM_CFGS, }; @@ -475,7 +512,6 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; - prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; @@ -483,6 +519,12 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; + + prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; + prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; + + prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; + prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) @@ -520,6 +562,10 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (psock->progs.stream_verdict || psock->progs.skb_verdict) { + config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX; + } + if (restore) { if (inet_csk_has_ulp(sk)) { /* TLS does not have an unhash proto in SW cases, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6fbbf1558033..2e6e5a70168e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -408,13 +408,13 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) return tp->snd_una != tp->snd_up; } -#define OPTION_SACK_ADVERTISE (1 << 0) -#define OPTION_TS (1 << 1) -#define OPTION_MD5 (1 << 2) -#define OPTION_WSCALE (1 << 3) -#define OPTION_FAST_OPEN_COOKIE (1 << 8) -#define OPTION_SMC (1 << 9) -#define OPTION_MPTCP (1 << 10) +#define OPTION_SACK_ADVERTISE BIT(0) +#define OPTION_TS BIT(1) +#define OPTION_MD5 BIT(2) +#define OPTION_WSCALE BIT(3) +#define OPTION_FAST_OPEN_COOKIE BIT(8) +#define OPTION_SMC BIT(9) +#define OPTION_MPTCP BIT(10) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -1559,7 +1559,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; } - if (skb_unclone(skb, gfp)) + if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ @@ -1667,7 +1667,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); @@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); - sk_mem_uncharge(sk, delta_truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, delta_truesize); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) + if (unlikely(TCP_SKB_CB(skb)->eor) || + tcp_has_tx_tstamp(skb) || + !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len; @@ -3166,7 +3169,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 5daa1c3ed83b..a8b5784afb1a 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -378,7 +378,7 @@ static int __net_init seg6_net_init(struct net *net) kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); return -ENOMEM; - }; + } #endif return 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2cc9b0e53ad1..551fce49841d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1263,7 +1263,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); - newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 12c12619ee35..e43b31d25fb6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -700,9 +700,9 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - __UDP_INC_STATS(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + __UDP6_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index d344b02a1cde..871cf6266125 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -33,6 +33,19 @@ static int mctp_release(struct socket *sock) return 0; } +/* Generic sockaddr checks, padding checks only so far */ +static bool mctp_sockaddr_is_ok(const struct sockaddr_mctp *addr) +{ + return !addr->__smctp_pad0 && !addr->__smctp_pad1; +} + +static bool mctp_sockaddr_ext_is_ok(const struct sockaddr_mctp_ext *addr) +{ + return !addr->__smctp_pad0[0] && + !addr->__smctp_pad0[1] && + !addr->__smctp_pad0[2]; +} + static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { struct sock *sk = sock->sk; @@ -52,6 +65,9 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) /* it's a valid sockaddr for MCTP, cast and do protocol checks */ smctp = (struct sockaddr_mctp *)addr; + if (!mctp_sockaddr_is_ok(smctp)) + return -EINVAL; + lock_sock(sk); /* TODO: allow rebind */ @@ -87,6 +103,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) return -EINVAL; if (addr->smctp_family != AF_MCTP) return -EINVAL; + if (!mctp_sockaddr_is_ok(addr)) + return -EINVAL; if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER)) return -EINVAL; @@ -124,7 +142,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, extaddr, msg->msg_name); - if (extaddr->smctp_halen > sizeof(cb->haddr)) { + if (!mctp_sockaddr_ext_is_ok(extaddr) || + extaddr->smctp_halen > sizeof(cb->haddr)) { rc = -EINVAL; goto err_free; } @@ -198,11 +217,13 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, addr = msg->msg_name; addr->smctp_family = AF_MCTP; + addr->__smctp_pad0 = 0; addr->smctp_network = cb->net; addr->smctp_addr.s_addr = hdr->src; addr->smctp_type = type; addr->smctp_tag = hdr->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + addr->__smctp_pad1 = 0; msg->msg_namelen = sizeof(*addr); if (msk->addr_ext) { @@ -211,6 +232,7 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(*ae); ae->smctp_ifindex = cb->ifindex; ae->smctp_halen = cb->halen; + memset(ae->__smctp_pad0, 0x0, sizeof(ae->__smctp_pad0)); memset(ae->smctp_haddr, 0x0, sizeof(ae->smctp_haddr)); memcpy(ae->smctp_haddr, cb->haddr, cb->halen); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 816f74dadfa3..39c523bd775c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -47,6 +47,8 @@ #include <net/ip_vs.h> +MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); + /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ static DEFINE_MUTEX(__ip_vs_mutex); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 4c3fbaaeb103..4acc4b8e9fe5 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -560,7 +560,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nla_put_failure; if (indev && entskb->dev && - entskb->mac_header != entskb->network_header) { + skb_mac_header_was_set(entskb)) { struct nfqnl_msg_packet_hw phw; int len; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 49089c50872e..334f63c9529e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1664,31 +1664,37 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, @@ -1706,26 +1712,31 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, @@ -1737,21 +1748,25 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, + .flags = GENL_ADMIN_PERM, }, }; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9ab068fa2672..377f896bdedc 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -95,18 +95,22 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched) return ns_to_ktime(sched->base_time); } -static ktime_t taprio_get_time(struct taprio_sched *q) +static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { - ktime_t mono = ktime_get(); + /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ + enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); - switch (q->tk_offset) { + switch (tk_offset) { case TK_OFFS_MAX: return mono; default: - return ktime_mono_to_any(mono, q->tk_offset); + return ktime_mono_to_any(mono, tk_offset); } +} - return KTIME_MAX; +static ktime_t taprio_get_time(const struct taprio_sched *q) +{ + return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) @@ -319,7 +323,7 @@ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) return 0; } - return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset); + return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from @@ -1352,6 +1356,7 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. @@ -1366,22 +1371,24 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, switch (clockid) { case CLOCK_REALTIME: - q->tk_offset = TK_OFFS_REAL; + tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: - q->tk_offset = TK_OFFS_MAX; + tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: - q->tk_offset = TK_OFFS_BOOT; + tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: - q->tk_offset = TK_OFFS_TAI; + tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } + /* This pairs with READ_ONCE() in taprio_mono_to_any */ + WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index fb3da4d8f4a3..354c1c4de19b 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -326,11 +326,6 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, struct sctp_packet *packet; int len; - /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* 6.10 Bundling * An endpoint MUST NOT bundle INIT, INIT ACK or * SHUTDOWN COMPLETE with any other chunks. @@ -415,6 +410,12 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, if (!new_asoc) goto nomem; + /* Update socket peer label if first association. */ + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) @@ -780,6 +781,10 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, } } + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } /* Delay state machine commands until later. * @@ -1517,11 +1522,6 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( struct sctp_packet *packet; int len; - /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* 6.10 Bundling * An endpoint MUST NOT bundle INIT, INIT ACK or * SHUTDOWN COMPLETE with any other chunks. @@ -1594,6 +1594,12 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( if (!new_asoc) goto nomem; + /* Update socket peer label if first association. */ + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) goto nomem; @@ -2255,8 +2261,7 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( } /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) { + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { sctp_association_free(new_asoc); return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } @@ -4893,9 +4898,6 @@ static enum sctp_disposition sctp_sf_violation_chunk( { static const char err_str[] = "The following chunk violates protocol:"; - if (!asoc) - return sctp_sf_violation(net, ep, asoc, type, arg, commands); - return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str, sizeof(err_str)); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6b937bfd4751..33391254fa82 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9412,7 +9412,6 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; struct sctp_sock *sp = sctp_sk(sk); - struct sctp_endpoint *ep = sp->ep; newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; @@ -9457,9 +9456,9 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, net_enable_timestamp(); /* Set newsk security attributes from original sk and connection - * security attribute from ep. + * security attribute from asoc. */ - security_sctp_sk_clone(ep, sk, newsk); + security_sctp_sk_clone(asoc, sk, newsk); } static inline void sctp_copy_descendant(struct sock *sk_to, diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0cf7ed2f5d41..59284da9116d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -149,14 +149,18 @@ static int __smc_release(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { - if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) - sock_put(sk); /* passive closing */ - if (sk->sk_state == SMC_LISTEN) { - /* wake up clcsock accept */ - rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); + if (sk->sk_state != SMC_CLOSED) { + if (sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_INIT) + sock_put(sk); /* passive closing */ + if (sk->sk_state == SMC_LISTEN) { + /* wake up clcsock accept */ + rc = kernel_sock_shutdown(smc->clcsock, + SHUT_RDWR); + } + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); } - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); smc_restore_fallback_changes(smc); } diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index b4c36795a928..ec17f29646f5 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -99,7 +99,7 @@ TRACE_EVENT(smcr_link_down, __entry->location = location; ), - TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%p", + TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%pS", __entry->lnk, __entry->lgr, __entry->state, __get_str(name), __entry->location) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 9c0343568d2a..1a72c67afed5 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -27,18 +27,10 @@ static struct workqueue_struct *strp_wq; -struct _strp_msg { - /* Internal cb structure. struct strp_msg must be first for passing - * to upper layer. - */ - struct strp_msg strp; - int accum_len; -}; - static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { return (struct _strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Lower lock held */ diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 6e4dbd577a39..d435bffc6199 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -162,8 +162,10 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, const size_t buflen, const char *delim, struct sockaddr_in6 *sin6) { - char *p; + char p[IPV6_SCOPE_ID_LEN + 1]; size_t len; + u32 scope_id = 0; + struct net_device *dev; if ((buf + buflen) == delim) return 1; @@ -175,29 +177,23 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, return 0; len = (buf + buflen) - delim - 1; - p = kmemdup_nul(delim + 1, len, GFP_KERNEL); - if (p) { - u32 scope_id = 0; - struct net_device *dev; - - dev = dev_get_by_name(net, p); - if (dev != NULL) { - scope_id = dev->ifindex; - dev_put(dev); - } else { - if (kstrtou32(p, 10, &scope_id) != 0) { - kfree(p); - return 0; - } - } - - kfree(p); - - sin6->sin6_scope_id = scope_id; - return 1; + if (len > IPV6_SCOPE_ID_LEN) + return 0; + + memcpy(p, delim + 1, len); + p[len] = 0; + + dev = dev_get_by_name(net, p); + if (dev != NULL) { + scope_id = dev->ifindex; + dev_put(dev); + } else { + if (kstrtou32(p, 10, &scope_id) != 0) + return 0; } - return 0; + sin6->sin6_scope_id = scope_id; + return 1; } static size_t rpc_pton6(struct net *net, const char *buf, const size_t buflen, diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 1f2817195549..b87565b64928 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -781,7 +781,7 @@ gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS); xdr_seq = kmalloc(4, GFP_KERNEL); if (!xdr_seq) - return -1; + return -ENOMEM; *xdr_seq = htonl(seq); iov.iov_base = xdr_seq; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f056ff931444..a312ea2bc440 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1076,24 +1076,21 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) static void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) { - - if (clnt != NULL) { - rpc_task_set_transport(task, clnt); - task->tk_client = clnt; - refcount_inc(&clnt->cl_count); - if (clnt->cl_softrtry) - task->tk_flags |= RPC_TASK_SOFT; - if (clnt->cl_softerr) - task->tk_flags |= RPC_TASK_TIMEOUT; - if (clnt->cl_noretranstimeo) - task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; - if (atomic_read(&clnt->cl_swapper)) - task->tk_flags |= RPC_TASK_SWAPPER; - /* Add to the client's list of all tasks */ - spin_lock(&clnt->cl_lock); - list_add_tail(&task->tk_task, &clnt->cl_tasks); - spin_unlock(&clnt->cl_lock); - } + rpc_task_set_transport(task, clnt); + task->tk_client = clnt; + refcount_inc(&clnt->cl_count); + if (clnt->cl_softrtry) + task->tk_flags |= RPC_TASK_SOFT; + if (clnt->cl_softerr) + task->tk_flags |= RPC_TASK_TIMEOUT; + if (clnt->cl_noretranstimeo) + task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; + if (atomic_read(&clnt->cl_swapper)) + task->tk_flags |= RPC_TASK_SWAPPER; + /* Add to the client's list of all tasks */ + spin_lock(&clnt->cl_lock); + list_add_tail(&task->tk_task, &clnt->cl_tasks); + spin_unlock(&clnt->cl_lock); } static void diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index c045f63d11fa..e2c835482791 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -277,9 +277,17 @@ static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) static void rpc_task_set_debuginfo(struct rpc_task *task) { - static atomic_t rpc_pid; + struct rpc_clnt *clnt = task->tk_client; - task->tk_pid = atomic_inc_return(&rpc_pid); + /* Might be a task carrying a reverse-direction operation */ + if (!clnt) { + static atomic_t rpc_pid; + + task->tk_pid = atomic_inc_return(&rpc_pid); + return; + } + + task->tk_pid = atomic_inc_return(&clnt->cl_pid); } #else static inline void rpc_task_set_debuginfo(struct rpc_task *task) @@ -829,6 +837,7 @@ void rpc_exit_task(struct rpc_task *task) else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); if (task->tk_ops->rpc_call_done != NULL) { + trace_rpc_task_call_done(task, task->tk_ops->rpc_call_done); task->tk_ops->rpc_call_done(task, task->tk_calldata); if (task->tk_action != NULL) { /* Always release the RPC slot and buffer memory */ @@ -903,8 +912,10 @@ static void __rpc_execute(struct rpc_task *task) /* * Lockless check for whether task is sleeping or not. */ - if (!RPC_IS_QUEUED(task)) + if (!RPC_IS_QUEUED(task)) { + cond_resched(); continue; + } /* * Signalled tasks should exit rather than sleep. @@ -1230,8 +1241,7 @@ static int rpciod_start(void) if (!wq) goto out_failed; rpciod_workqueue = wq; - /* Note: highpri because network receive is latency sensitive */ - wq = alloc_workqueue("xprtiod", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_HIGHPRI, 0); + wq = alloc_workqueue("xprtiod", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!wq) goto free_rpciod; xprtiod_workqueue = wq; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a3bbe5ce4570..4292278a9552 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1186,45 +1186,6 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {} #endif -static int -svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp) -{ - struct kvec *argv = &rqstp->rq_arg.head[0]; - struct kvec *resv = &rqstp->rq_res.head[0]; - const struct svc_procedure *procp = rqstp->rq_procinfo; - - /* - * Decode arguments - * XXX: why do we ignore the return value? - */ - if (procp->pc_decode && - !procp->pc_decode(rqstp, argv->iov_base)) { - *statp = rpc_garbage_args; - return 1; - } - - *statp = procp->pc_func(rqstp); - - if (*statp == rpc_drop_reply || - test_bit(RQ_DROPME, &rqstp->rq_flags)) - return 0; - - if (rqstp->rq_auth_stat != rpc_auth_ok) - return 1; - - if (*statp != rpc_success) - return 1; - - /* Encode reply */ - if (procp->pc_encode && - !procp->pc_encode(rqstp, resv->iov_base + resv->iov_len)) { - dprintk("svc: failed to encode reply\n"); - /* serv->sv_stats->rpcsystemerr++; */ - *statp = rpc_system_err; - } - return 1; -} - __be32 svc_generic_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, @@ -1291,7 +1252,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) __be32 *statp; u32 prog, vers; __be32 rpc_stat; - int auth_res; + int auth_res, rc; __be32 *reply_statp; rpc_stat = rpc_success; @@ -1392,28 +1353,18 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) svc_reserve_auth(rqstp, procp->pc_xdrressize<<2); /* Call the function that processes the request. */ - if (!process.dispatch) { - if (!svc_generic_dispatch(rqstp, statp)) - goto release_dropit; - if (*statp == rpc_garbage_args) - goto err_garbage; - } else { - dprintk("svc: calling dispatcher\n"); - if (!process.dispatch(rqstp, statp)) - goto release_dropit; /* Release reply info */ - } - + rc = process.dispatch(rqstp, statp); + if (procp->pc_release) + procp->pc_release(rqstp); + if (!rc) + goto dropit; if (rqstp->rq_auth_stat != rpc_auth_ok) - goto err_release_bad_auth; + goto err_bad_auth; /* Check RPC status result */ if (*statp != rpc_success) resv->iov_len = ((void*)statp) - resv->iov_base + 4; - /* Release reply info */ - if (procp->pc_release) - procp->pc_release(rqstp); - if (procp->pc_encode == NULL) goto dropit; @@ -1422,9 +1373,6 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto close_xprt; return 1; /* Caller can now send it */ -release_dropit: - if (procp->pc_release) - procp->pc_release(rqstp); dropit: svc_authorise(rqstp); /* doesn't hurt to call this twice */ dprintk("svc: svc_process dropit\n"); @@ -1451,9 +1399,6 @@ err_bad_rpc: svc_putnl(resv, 2); goto sendit; -err_release_bad_auth: - if (procp->pc_release) - procp->pc_release(rqstp); err_bad_auth: dprintk("svc: authentication failed (%d)\n", be32_to_cpu(rqstp->rq_auth_stat)); @@ -1676,16 +1621,17 @@ EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** * svc_fill_write_vector - Construct data argument for VFS write call * @rqstp: svc_rqst to operate on - * @pages: list of pages containing data payload - * @first: buffer containing first section of write payload - * @total: total number of bytes of write payload + * @payload: xdr_buf containing only the write data payload * * Fills in rqstp::rq_vec, and returns the number of elements. */ -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, - struct kvec *first, size_t total) +unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, + struct xdr_buf *payload) { + struct page **pages = payload->pages; + struct kvec *first = payload->head; struct kvec *vec = rqstp->rq_vec; + size_t total = payload->len; unsigned int i; /* Some types of transport can present the write payload diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 6316bd2b8f37..1e99ba1b9d72 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -687,6 +687,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) set_current_state(TASK_RUNNING); return -EINTR; } + trace_svc_alloc_arg_err(pages); schedule_timeout(msecs_to_jiffies(500)); } rqstp->rq_page_end = &rqstp->rq_pages[pages]; diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 9a6f17e18f73..2766dd21935b 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -109,8 +109,10 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, struct sock_xprt *sock; ssize_t ret = -1; - if (!xprt) - return 0; + if (!xprt || !xprt_connected(xprt)) { + xprt_put(xprt); + return -ENOTCONN; + } sock = container_of(xprt, struct sock_xprt, xprt); if (kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0) @@ -129,8 +131,10 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; - if (!xprt) - return 0; + if (!xprt || !xprt_connected(xprt)) { + xprt_put(xprt); + return -ENOTCONN; + } ret = sprintf(buf, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index ca10ba2626f2..df194cc07035 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1633,7 +1633,7 @@ EXPORT_SYMBOL_GPL(xdr_buf_subsegment); * Sets up @subbuf to represent a portion of @xdr. The portion * starts at the current offset in @xdr, and extends for a length * of @nbytes. If this is successful, @xdr is advanced to the next - * position following that portion. + * XDR data item following that portion. * * Return values: * %true: @subbuf has been initialized, and @xdr has been advanced. @@ -1642,29 +1642,31 @@ EXPORT_SYMBOL_GPL(xdr_buf_subsegment); bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, unsigned int nbytes) { - unsigned int remaining, offset, len; + unsigned int start = xdr_stream_pos(xdr); + unsigned int remaining, len; - if (xdr_buf_subsegment(xdr->buf, subbuf, xdr_stream_pos(xdr), nbytes)) + /* Extract @subbuf and bounds-check the fn arguments */ + if (xdr_buf_subsegment(xdr->buf, subbuf, start, nbytes)) return false; - if (subbuf->head[0].iov_len) - if (!__xdr_inline_decode(xdr, subbuf->head[0].iov_len)) - return false; - - remaining = subbuf->page_len; - offset = subbuf->page_base; - while (remaining) { - len = min_t(unsigned int, remaining, PAGE_SIZE) - offset; - + /* Advance @xdr by @nbytes */ + for (remaining = nbytes; remaining;) { if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr)) return false; - if (!__xdr_inline_decode(xdr, len)) - return false; + len = (char *)xdr->end - (char *)xdr->p; + if (remaining <= len) { + xdr->p = (__be32 *)((char *)xdr->p + + (remaining + xdr_pad_size(nbytes))); + break; + } + + xdr->p = (__be32 *)((char *)xdr->p + len); + xdr->end = xdr->p; remaining -= len; - offset = 0; } + xdr_stream_set_pos(xdr, start + nbytes); return true; } EXPORT_SYMBOL_GPL(xdr_stream_subsegment); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index cfd681700d1a..a02de2bddb28 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -246,11 +246,9 @@ EXPORT_SYMBOL_GPL(xprt_find_transport_ident); static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; - if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_atomic(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_atomic(); - } else + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) + clear_bit_unlock(XPRT_LOCKED, &xprt->state); + else queue_work(xprtiod_workqueue, &xprt->task_cleanup); } @@ -737,6 +735,8 @@ static void xprt_autoclose(struct work_struct *work) unsigned int pflags = memalloc_nofs_save(); trace_xprt_disconnect_auto(xprt); + xprt->connect_cookie++; + smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); xprt->ops->close(xprt); xprt_release_write(xprt, NULL); @@ -767,7 +767,8 @@ EXPORT_SYMBOL_GPL(xprt_disconnect_done); */ static void xprt_schedule_autoclose_locked(struct rpc_xprt *xprt) { - set_bit(XPRT_CLOSE_WAIT, &xprt->state); + if (test_and_set_bit(XPRT_CLOSE_WAIT, &xprt->state)) + return; if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(xprtiod_workqueue, &xprt->task_cleanup); else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) @@ -1603,15 +1604,14 @@ xprt_transmit(struct rpc_task *task) { struct rpc_rqst *next, *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - int counter, status; + int status; spin_lock(&xprt->queue_lock); - counter = 0; - while (!list_empty(&xprt->xmit_queue)) { - if (++counter == 20) + for (;;) { + next = list_first_entry_or_null(&xprt->xmit_queue, + struct rpc_rqst, rq_xmit); + if (!next) break; - next = list_first_entry(&xprt->xmit_queue, - struct rpc_rqst, rq_xmit); xprt_pin_rqst(next); spin_unlock(&xprt->queue_lock); status = xprt_request_transmit(next, task); @@ -1619,13 +1619,16 @@ xprt_transmit(struct rpc_task *task) status = 0; spin_lock(&xprt->queue_lock); xprt_unpin_rqst(next); - if (status == 0) { - if (!xprt_request_data_received(task) || - test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) - continue; - } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) - task->tk_status = status; - break; + if (status < 0) { + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + task->tk_status = status; + break; + } + /* Was @task transmitted, and has it received a reply? */ + if (xprt_request_data_received(task) && + !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + break; + cond_resched_lock(&xprt->queue_lock); } spin_unlock(&xprt->queue_lock); } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index f700b34a5bfd..ff699307e820 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -515,8 +515,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * a single ib_post_send() call. */ prev = &first; - while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { - + mr = rpcrdma_mr_pop(&req->rl_registered); + do { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; @@ -533,7 +533,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) *prev = last; prev = &last->next; - } + } while ((mr = rpcrdma_mr_pop(&req->rl_registered))); + mr = container_of(last, struct rpcrdma_mr, mr_invwr); /* Strong send queue ordering guarantees that when the @@ -617,8 +618,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * a single ib_post_send() call. */ prev = &first; - while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { - + mr = rpcrdma_mr_pop(&req->rl_registered); + do { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; @@ -635,7 +636,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) *prev = last; prev = &last->next; - } + } while ((mr = rpcrdma_mr_pop(&req->rl_registered))); /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain @@ -666,3 +667,38 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ rpcrdma_force_disconnect(ep); } + +/** + * frwr_wp_create - Create an MR for padding Write chunks + * @r_xprt: transport resources to use + * + * Return 0 on success, negative errno on failure. + */ +int frwr_wp_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rpcrdma_mr_seg seg; + struct rpcrdma_mr *mr; + + mr = rpcrdma_mr_get(r_xprt); + if (!mr) + return -EAGAIN; + mr->mr_req = NULL; + ep->re_write_pad_mr = mr; + + seg.mr_len = XDR_UNIT; + seg.mr_page = virt_to_page(ep->re_write_pad); + seg.mr_offset = offset_in_page(ep->re_write_pad); + if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr))) + return -EIO; + trace_xprtrdma_mr_fastreg(mr); + + mr->mr_cqe.done = frwr_wc_fastreg; + mr->mr_regwr.wr.next = NULL; + mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe; + mr->mr_regwr.wr.num_sge = 0; + mr->mr_regwr.wr.opcode = IB_WR_REG_MR; + mr->mr_regwr.wr.send_flags = 0; + + return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); +} diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c335c1361564..8035a983c8ce 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -255,15 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, page_base = 0; } - if (type == rpcrdma_readch) - goto out; - - /* When encoding a Write chunk, some servers need to see an - * extra segment for non-XDR-aligned Write chunks. The upper - * layer provides space in the tail iovec that may be used - * for this purpose. - */ - if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup) + if (type == rpcrdma_readch || type == rpcrdma_writech) goto out; if (xdrbuf->tail[0].iov_len) @@ -405,6 +397,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr_seg *seg; struct rpcrdma_mr *mr; int nsegs, nchunks; @@ -443,6 +436,18 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, nsegs -= mr->mr_nents; } while (nsegs); + if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { + if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) + return -EMSGSIZE; + + trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, + nsegs); + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; + nsegs -= mr->mr_nents; + } + /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 6be23ce7a93d..cf76a6ad127b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -330,9 +330,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); - trace_svcrdma_wc_receive(wc, &ctxt->rc_cid); if (wc->status != IB_WC_SUCCESS) goto flushed; + trace_svcrdma_wc_recv(wc, &ctxt->rc_cid); /* If receive posting fails, the connection is about to be * lost anyway. The server will not be able to send a reply @@ -345,7 +345,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) */ if (rdma->sc_pending_recvs < rdma->sc_max_requests) if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false)) - goto flushed; + goto dropped; /* All wc fields are now known to be valid */ ctxt->rc_byte_len = wc->byte_len; @@ -360,6 +360,11 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) return; flushed: + if (wc->status == IB_WC_WR_FLUSH_ERR) + trace_svcrdma_wc_recv_flush(wc, &ctxt->rc_cid); + else + trace_svcrdma_wc_recv_err(wc, &ctxt->rc_cid); +dropped: svc_rdma_recv_ctxt_put(rdma, ctxt); svc_xprt_deferred_close(&rdma->sc_xprt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index e27433f08ca7..5f0155fdefc7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -155,6 +155,7 @@ struct svc_rdma_chunk_ctxt { struct ib_cqe cc_cqe; struct svcxprt_rdma *cc_rdma; struct list_head cc_rwctxts; + ktime_t cc_posttime; int cc_sqecount; enum ib_wc_status cc_status; struct completion cc_done; @@ -267,7 +268,16 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_write_info *info = container_of(cc, struct svc_rdma_write_info, wi_cc); - trace_svcrdma_wc_write(wc, &cc->cc_cid); + switch (wc->status) { + case IB_WC_SUCCESS: + trace_svcrdma_wc_write(wc, &cc->cc_cid); + break; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_write_err(wc, &cc->cc_cid); + } svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); @@ -320,11 +330,22 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_read_info *info; - trace_svcrdma_wc_read(wc, &cc->cc_cid); + switch (wc->status) { + case IB_WC_SUCCESS: + info = container_of(cc, struct svc_rdma_read_info, ri_cc); + trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_totalbytes, + cc->cc_posttime); + break; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_read_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_read_err(wc, &cc->cc_cid); + } - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + svc_rdma_wake_send_waiters(cc->cc_rdma, cc->cc_sqecount); cc->cc_status = wc->status; complete(&cc->cc_done); return; @@ -363,6 +384,7 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) do { if (atomic_sub_return(cc->cc_sqecount, &rdma->sc_sq_avail) > 0) { + cc->cc_posttime = ktime_get(); ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); if (ret) break; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 599021b2391d..22a871e6fe4d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -280,13 +280,21 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_send_ctxt *ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - trace_svcrdma_wc_send(wc, &ctxt->sc_cid); - svc_rdma_wake_send_waiters(rdma, 1); complete(&ctxt->sc_done); if (unlikely(wc->status != IB_WC_SUCCESS)) - svc_xprt_deferred_close(&rdma->sc_xprt); + goto flushed; + + trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + return; + +flushed: + if (wc->status != IB_WC_WR_FLUSH_ERR) + trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid); + else + trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid); + svc_xprt_deferred_close(&rdma->sc_xprt); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index aaec3c9be8db..3d3673ba9e1e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -205,14 +205,12 @@ static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ - ep->re_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - ep->re_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } @@ -551,6 +549,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) goto out; } rpcrdma_mrs_create(r_xprt); + frwr_wp_create(r_xprt); out: trace_xprtrdma_connect(r_xprt, rc); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index d91f54eae00b..c79f92eeda76 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -68,13 +68,14 @@ /* * RDMA Endpoint -- connection endpoint details */ +struct rpcrdma_mr; struct rpcrdma_ep { struct kref re_kref; struct rdma_cm_id *re_id; struct ib_pd *re_pd; unsigned int re_max_rdma_segs; unsigned int re_max_fr_depth; - bool re_implicit_roundup; + struct rpcrdma_mr *re_write_pad_mr; enum ib_mr_type re_mrtype; struct completion re_done; unsigned int re_send_count; @@ -97,6 +98,8 @@ struct rpcrdma_ep { unsigned int re_inline_recv; /* negotiated */ atomic_t re_completion_ids; + + char re_write_pad[XDR_UNIT]; }; /* Pre-allocate extra Work Requests for handling reverse-direction @@ -535,6 +538,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); +int frwr_wp_create(struct rpcrdma_xprt *r_xprt); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 04f1b78bcbca..ae48c9c84ee1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1134,6 +1134,7 @@ static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { + xprt->connect_cookie++; smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); @@ -1153,14 +1154,13 @@ static void xs_error_report(struct sock *sk) struct sock_xprt *transport; struct rpc_xprt *xprt; - read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) - goto out; + return; transport = container_of(xprt, struct sock_xprt, xprt); transport->xprt_err = -sk->sk_err; if (transport->xprt_err == 0) - goto out; + return; dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -transport->xprt_err); trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); @@ -1168,8 +1168,6 @@ static void xs_error_report(struct sock *sk) /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); - out: - read_unlock_bh(&sk->sk_callback_lock); } static void xs_reset_transport(struct sock_xprt *transport) @@ -1188,7 +1186,7 @@ static void xs_reset_transport(struct sock_xprt *transport) kernel_sock_shutdown(sock, SHUT_RDWR); mutex_lock(&transport->recv_mutex); - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); transport->inet = NULL; transport->sock = NULL; transport->file = NULL; @@ -1197,10 +1195,10 @@ static void xs_reset_transport(struct sock_xprt *transport) xs_restore_old_callbacks(transport, sk); xprt_clear_connected(xprt); - write_unlock_bh(&sk->sk_callback_lock); xs_sock_reset_connection_flags(xprt); /* Reset stream record info */ xs_stream_reset_connect(transport); + release_sock(sk); mutex_unlock(&transport->recv_mutex); trace_rpc_socket_close(xprt, sock); @@ -1364,7 +1362,6 @@ static void xs_data_ready(struct sock *sk) { struct rpc_xprt *xprt; - read_lock_bh(&sk->sk_callback_lock); dprintk("RPC: xs_data_ready...\n"); xprt = xprt_from_sock(sk); if (xprt != NULL) { @@ -1379,7 +1376,6 @@ static void xs_data_ready(struct sock *sk) if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); } - read_unlock_bh(&sk->sk_callback_lock); } /* @@ -1408,9 +1404,8 @@ static void xs_tcp_state_change(struct sock *sk) struct rpc_xprt *xprt; struct sock_xprt *transport; - read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) - goto out; + return; dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n", sk->sk_state, xprt_connected(xprt), @@ -1471,8 +1466,6 @@ static void xs_tcp_state_change(struct sock *sk) /* Trigger the socket release */ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); } - out: - read_unlock_bh(&sk->sk_callback_lock); } static void xs_write_space(struct sock *sk) @@ -1511,13 +1504,9 @@ out: */ static void xs_udp_write_space(struct sock *sk) { - read_lock_bh(&sk->sk_callback_lock); - /* from net/core/sock.c:sock_def_write_space */ if (sock_writeable(sk)) xs_write_space(sk); - - read_unlock_bh(&sk->sk_callback_lock); } /** @@ -1532,13 +1521,9 @@ static void xs_udp_write_space(struct sock *sk) */ static void xs_tcp_write_space(struct sock *sk) { - read_lock_bh(&sk->sk_callback_lock); - /* from net/core/stream.c:sk_stream_write_space */ if (sk_stream_is_writeable(sk)) xs_write_space(sk); - - read_unlock_bh(&sk->sk_callback_lock); } static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) @@ -1833,7 +1818,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, if (!transport->inet) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); xs_save_old_callbacks(transport, sk); @@ -1849,7 +1834,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, transport->sock = sock; transport->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } xs_stream_start_connect(transport); @@ -2031,7 +2016,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!transport->inet) { struct sock *sk = sock->sk; - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); xs_save_old_callbacks(transport, sk); @@ -2048,7 +2033,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_set_memalloc(xprt); - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } xs_udp_do_set_buffer_size(xprt); @@ -2174,7 +2159,6 @@ static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - int ret = -ENOTCONN; if (!transport->inet) { struct sock *sk = sock->sk; @@ -2194,7 +2178,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_tcp_set_socket_timeouts(xprt, sock); tcp_sock_set_nodelay(sk); - write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); xs_save_old_callbacks(transport, sk); @@ -2214,11 +2198,11 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) transport->sock = sock; transport->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); } if (!xprt_bound(xprt)) - goto out; + return -ENOTCONN; xs_set_memalloc(xprt); @@ -2226,22 +2210,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... */ set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); - ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); - switch (ret) { - case 0: - xs_set_srcport(transport, sock); - fallthrough; - case -EINPROGRESS: - /* SYN_SENT! */ - if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) - xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; - break; - case -EADDRNOTAVAIL: - /* Source port number is unavailable. Try a new one! */ - transport->srcport = 0; - } -out: - return ret; + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); } /** @@ -2256,14 +2225,14 @@ static void xs_tcp_setup_socket(struct work_struct *work) container_of(work, struct sock_xprt, connect_worker.work); struct socket *sock = transport->sock; struct rpc_xprt *xprt = &transport->xprt; - int status = -EIO; + int status; if (!sock) { sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP, true); if (IS_ERR(sock)) { - status = PTR_ERR(sock); + xprt_wake_pending_tasks(xprt, PTR_ERR(sock)); goto out; } } @@ -2280,21 +2249,21 @@ static void xs_tcp_setup_socket(struct work_struct *work) xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { - default: - printk("%s: connect returned unhandled error %d\n", - __func__, status); - fallthrough; - case -EADDRNOTAVAIL: - /* We're probably in TIME_WAIT. Get rid of existing socket, - * and retry - */ - xs_tcp_force_close(xprt); - break; case 0: + xs_set_srcport(transport, sock); + fallthrough; case -EINPROGRESS: + /* SYN_SENT! */ + if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + fallthrough; case -EALREADY: - xprt_unlock_connect(xprt, transport); - return; + goto out_unlock; + case -EADDRNOTAVAIL: + /* Source port number is unavailable. Try a new one! */ + transport->srcport = 0; + status = -EAGAIN; + break; case -EINVAL: /* Happens, for instance, if the user specified a link * local IPv6 address without a scope-id. @@ -2306,18 +2275,22 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: - /* xs_tcp_force_close() wakes tasks with a fixed error code. - * We need to wake them first to ensure the correct error code. - */ - xprt_wake_pending_tasks(xprt, status); - xs_tcp_force_close(xprt); - goto out; + break; + default: + printk("%s: connect returned unhandled error %d\n", + __func__, status); + status = -EAGAIN; } - status = -EAGAIN; + + /* xs_tcp_force_close() wakes tasks with a fixed error code. + * We need to wake them first to ensure the correct error code. + */ + xprt_wake_pending_tasks(xprt, status); + xs_tcp_force_close(xprt); out: xprt_clear_connecting(xprt); +out_unlock: xprt_unlock_connect(xprt, transport); - xprt_wake_pending_tasks(xprt, status); } /** @@ -2341,7 +2314,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); - if (transport->sock != NULL) { + if (transport->sock != NULL && !xprt_connecting(xprt)) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", xprt, xprt->reestablish_timeout / HZ); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7d851eb3a683..ed0df839c38c 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1322,6 +1322,8 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, * non-blocking call. */ err = -EALREADY; + if (flags & O_NONBLOCK) + goto out; break; default: if ((sk->sk_state == TCP_LISTEN) || |