diff options
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r-- | net/packet/af_packet.c | 280 |
1 files changed, 203 insertions, 77 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f8db7064d81c..ed458b315ef4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -216,10 +216,16 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *, static void packet_flush_mclist(struct sock *sk); struct packet_skb_cb { - unsigned int origlen; union { struct sockaddr_pkt pkt; - struct sockaddr_ll ll; + union { + /* Trick: alias skb original length with + * ll.sll_family and ll.protocol in order + * to save room. + */ + unsigned int origlen; + struct sockaddr_ll ll; + }; } sa; }; @@ -537,15 +543,11 @@ static void prb_init_blk_timer(struct packet_sock *po, pkc->retire_blk_timer.expires = jiffies; } -static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) +static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; - if (tx_ring) - BUG(); - - pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : - GET_PBDQC_FROM_RB(&po->rx_ring); + pkc = GET_PBDQC_FROM_RB(&po->rx_ring); prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); } @@ -601,7 +603,7 @@ static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, - union tpacket_req_u *req_u, int tx_ring) + union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; @@ -628,7 +630,7 @@ static void init_prb_bdqc(struct packet_sock *po, p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); - prb_setup_retire_blk_timer(po, tx_ring); + prb_setup_retire_blk_timer(po); prb_open_block(p1, pbd); } @@ -1228,27 +1230,81 @@ static void packet_free_pending(struct packet_sock *po) free_percpu(po->tx_ring.pending_refcnt); } -static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +#define ROOM_POW_OFF 2 +#define ROOM_NONE 0x0 +#define ROOM_LOW 0x1 +#define ROOM_NORMAL 0x2 + +static bool __tpacket_has_room(struct packet_sock *po, int pow_off) +{ + int idx, len; + + len = po->rx_ring.frame_max + 1; + idx = po->rx_ring.head; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} + +static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) +{ + int idx, len; + + len = po->rx_ring.prb_bdqc.knum_blocks; + idx = po->rx_ring.prb_bdqc.kactive_blk_num; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} + +static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { struct sock *sk = &po->sk; - bool has_room; + int ret = ROOM_NONE; + + if (po->prot_hook.func != tpacket_rcv) { + int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - (skb ? skb->truesize : 0); + if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + return ROOM_NORMAL; + else if (avail > 0) + return ROOM_LOW; + else + return ROOM_NONE; + } - if (po->prot_hook.func != tpacket_rcv) - return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) - <= sk->sk_rcvbuf; + if (po->tp_version == TPACKET_V3) { + if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_v3_has_room(po, 0)) + ret = ROOM_LOW; + } else { + if (__tpacket_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_has_room(po, 0)) + ret = ROOM_LOW; + } - spin_lock(&sk->sk_receive_queue.lock); - if (po->tp_version == TPACKET_V3) - has_room = prb_lookup_block(po, &po->rx_ring, - po->rx_ring.prb_bdqc.kactive_blk_num, - TP_STATUS_KERNEL); - else - has_room = packet_lookup_frame(po, &po->rx_ring, - po->rx_ring.head, - TP_STATUS_KERNEL); - spin_unlock(&sk->sk_receive_queue.lock); + return ret; +} + +static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +{ + int ret; + bool has_room; + + spin_lock_bh(&po->sk.sk_receive_queue.lock); + ret = __packet_rcv_has_room(po, skb); + has_room = ret == ROOM_NORMAL; + if (po->pressure == has_room) + po->pressure = !has_room; + spin_unlock_bh(&po->sk.sk_receive_queue.lock); - return has_room; + return ret; } static void packet_sock_destruct(struct sock *sk) @@ -1266,14 +1322,18 @@ static void packet_sock_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } -static int fanout_rr_next(struct packet_fanout *f, unsigned int num) +static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { - int x = atomic_read(&f->rr_cur) + 1; + u32 rxhash; + int i, count = 0; - if (x >= num) - x = 0; + rxhash = skb_get_hash(skb); + for (i = 0; i < ROLLOVER_HLEN; i++) + if (po->rollover->history[i] == rxhash) + count++; - return x; + po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + return count > (ROLLOVER_HLEN >> 1); } static unsigned int fanout_demux_hash(struct packet_fanout *f, @@ -1287,13 +1347,9 @@ static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { - int cur, old; + unsigned int val = atomic_inc_return(&f->rr_cur); - cur = atomic_read(&f->rr_cur); - while ((old = atomic_cmpxchg(&f->rr_cur, cur, - fanout_rr_next(f, num))) != cur) - cur = old; - return cur; + return val % num; } static unsigned int fanout_demux_cpu(struct packet_fanout *f, @@ -1312,22 +1368,40 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f, static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, - unsigned int idx, unsigned int skip, + unsigned int idx, bool try_self, unsigned int num) { - unsigned int i, j; + struct packet_sock *po, *po_next, *po_skip = NULL; + unsigned int i, j, room = ROOM_NONE; + + po = pkt_sk(f->arr[idx]); - i = j = min_t(int, f->next[idx], num - 1); + if (try_self) { + room = packet_rcv_has_room(po, skb); + if (room == ROOM_NORMAL || + (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) + return idx; + po_skip = po; + } + + i = j = min_t(int, po->rollover->sock, num - 1); do { - if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { + po_next = pkt_sk(f->arr[i]); + if (po_next != po_skip && !po_next->pressure && + packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) - f->next[idx] = i; + po->rollover->sock = i; + atomic_long_inc(&po->rollover->num); + if (room == ROOM_LOW) + atomic_long_inc(&po->rollover->num_huge); return i; } + if (++i == num) i = 0; } while (i != j); + atomic_long_inc(&po->rollover->num_failed); return idx; } @@ -1347,7 +1421,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; - unsigned int num = f->num_members; + unsigned int num = READ_ONCE(f->num_members); struct packet_sock *po; unsigned int idx; @@ -1380,17 +1454,14 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: - idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); + idx = fanout_demux_rollover(f, skb, 0, false, num); break; } - po = pkt_sk(f->arr[idx]); - if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && - unlikely(!packet_rcv_has_room(po, skb))) { - idx = fanout_demux_rollover(f, skb, idx, idx, num); - po = pkt_sk(f->arr[idx]); - } + if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) + idx = fanout_demux_rollover(f, skb, idx, true, num); + po = pkt_sk(f->arr[idx]); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } @@ -1461,6 +1532,16 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (po->fanout) return -EALREADY; + if (type == PACKET_FANOUT_ROLLOVER || + (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { + po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); + if (!po->rollover) + return -ENOMEM; + atomic_long_set(&po->rollover->num, 0); + atomic_long_set(&po->rollover->num_huge, 0); + atomic_long_set(&po->rollover->num_failed, 0); + } + mutex_lock(&fanout_mutex); match = NULL; list_for_each_entry(f, &fanout_list, list) { @@ -1509,6 +1590,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } out: mutex_unlock(&fanout_mutex); + if (err) { + kfree(po->rollover); + po->rollover = NULL; + } return err; } @@ -1530,6 +1615,9 @@ static void fanout_release(struct sock *sk) kfree(f); } mutex_unlock(&fanout_mutex); + + if (po->rollover) + kfree_rcu(po->rollover, rcu); } static const struct proto_ops packet_ops; @@ -1608,8 +1696,8 @@ oom: * protocol layers and you must therefore supply it with a complete frame */ -static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); @@ -1818,13 +1906,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb = nskb; } - BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 > - sizeof(skb->cb)); + sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); sll = &PACKET_SKB_CB(skb)->sa.ll; - sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; - sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(po->origdev)) sll->sll_ifindex = orig_dev->ifindex; @@ -1833,7 +1918,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, sll->sll_halen = dev_parse_header(skb, sll->sll_addr); - PACKET_SKB_CB(skb)->origlen = skb->len; + /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). + * Use their space for storing the original skb length. + */ + PACKET_SKB_CB(skb)->sa.origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; @@ -1847,7 +1935,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; - skb->dropcount = atomic_read(&sk->sk_drops); + sock_skb_set_dropcount(sk, skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); @@ -1910,14 +1998,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, } } - if (skb->ip_summed == CHECKSUM_PARTIAL) - status |= TP_STATUS_CSUMNOTREADY; - snaplen = skb->len; res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + status |= TP_STATUS_CSUMNOTREADY; + else if (skb->pkt_type != PACKET_OUTGOING && + (skb->ip_summed == CHECKSUM_COMPLETE || + skb_csum_unnecessary(skb))) + status |= TP_STATUS_CSUM_VALID; + if (snaplen > res) snaplen = res; @@ -2300,14 +2393,18 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll), - 0, &err); + !need_wait, &err); - if (unlikely(skb == NULL)) + if (unlikely(skb == NULL)) { + /* we assume the socket was initially writeable ... */ + if (likely(len_sum > 0)) + err = len_sum; goto out_status; - + } tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, addr, hlen); - if (tp_len > dev->mtu + dev->hard_header_len) { + if (likely(tp_len >= 0) && + tp_len > dev->mtu + dev->hard_header_len) { struct ethhdr *ehdr; /* Earlier code assumed this would be a VLAN pkt, * double-check this now that we have the actual @@ -2603,8 +2700,7 @@ out: return err; } -static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); @@ -2689,7 +2785,7 @@ static int packet_release(struct socket *sock) static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) { struct packet_sock *po = pkt_sk(sk); - const struct net_device *dev_curr; + struct net_device *dev_curr; __be16 proto_curr; bool need_rehook; @@ -2713,15 +2809,13 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) po->num = proto; po->prot_hook.type = proto; - - if (po->prot_hook.dev) - dev_put(po->prot_hook.dev); - po->prot_hook.dev = dev; po->ifindex = dev ? dev->ifindex : 0; packet_cached_dev_assign(po, dev); } + if (dev_curr) + dev_put(dev_curr); if (proto == 0 || !need_rehook) goto out_unlock; @@ -2822,7 +2916,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, sock->state = SS_UNCONNECTED; err = -ENOBUFS; - sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); + sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; @@ -2852,6 +2946,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); + po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) @@ -2884,13 +2979,14 @@ out: * If necessary we block. */ -static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = 0; + unsigned int origlen = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) @@ -2928,6 +3024,9 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb == NULL) goto out; + if (pkt_sk(sk)->pressure) + packet_rcv_has_room(pkt_sk(sk), NULL); + if (pkt_sk(sk)->has_vnet_hdr) { struct virtio_net_hdr vnet_hdr = { 0 }; @@ -2990,6 +3089,15 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, if (err) goto out_free; + if (sock->type != SOCK_PACKET) { + struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; + + /* Original length was stored in sockaddr_ll fields */ + origlen = PACKET_SKB_CB(skb)->sa.origlen; + sll->sll_family = AF_PACKET; + sll->sll_protocol = skb->protocol; + } + sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { @@ -3001,6 +3109,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_namelen = sizeof(struct sockaddr_pkt); } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; + msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); } @@ -3014,7 +3123,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; - aux.tp_len = PACKET_SKB_CB(skb)->origlen; + else if (skb->pkt_type != PACKET_OUTGOING && + (skb->ip_summed == CHECKSUM_COMPLETE || + skb_csum_unnecessary(skb))) + aux.tp_status |= TP_STATUS_CSUM_VALID; + + aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); @@ -3456,6 +3570,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; + struct tpacket_rollover_stats rstats; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3531,6 +3646,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ((u32)po->fanout->flags << 24)) : 0); break; + case PACKET_ROLLOVER_STATS: + if (!po->rollover) + return -EINVAL; + rstats.tp_all = atomic_long_read(&po->rollover->num); + rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); + rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); + data = &rstats; + lv = sizeof(rstats); + break; case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; break; @@ -3668,6 +3792,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= POLLIN | POLLRDNORM; } + if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + po->pressure = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { @@ -3857,7 +3983,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, * it above but just being paranoid */ if (!tx_ring) - init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); + init_prb_bdqc(po, rb, pg_vec, req_u); break; default: break; |