aboutsummaryrefslogtreecommitdiff
path: root/net/packet/af_packet.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--net/packet/af_packet.c280
1 files changed, 203 insertions, 77 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f8db7064d81c..ed458b315ef4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -216,10 +216,16 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
static void packet_flush_mclist(struct sock *sk);
struct packet_skb_cb {
- unsigned int origlen;
union {
struct sockaddr_pkt pkt;
- struct sockaddr_ll ll;
+ union {
+ /* Trick: alias skb original length with
+ * ll.sll_family and ll.protocol in order
+ * to save room.
+ */
+ unsigned int origlen;
+ struct sockaddr_ll ll;
+ };
} sa;
};
@@ -537,15 +543,11 @@ static void prb_init_blk_timer(struct packet_sock *po,
pkc->retire_blk_timer.expires = jiffies;
}
-static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
+static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
struct tpacket_kbdq_core *pkc;
- if (tx_ring)
- BUG();
-
- pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
- GET_PBDQC_FROM_RB(&po->rx_ring);
+ pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}
@@ -601,7 +603,7 @@ static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
static void init_prb_bdqc(struct packet_sock *po,
struct packet_ring_buffer *rb,
struct pgv *pg_vec,
- union tpacket_req_u *req_u, int tx_ring)
+ union tpacket_req_u *req_u)
{
struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd;
@@ -628,7 +630,7 @@ static void init_prb_bdqc(struct packet_sock *po,
p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
prb_init_ft_ops(p1, req_u);
- prb_setup_retire_blk_timer(po, tx_ring);
+ prb_setup_retire_blk_timer(po);
prb_open_block(p1, pbd);
}
@@ -1228,27 +1230,81 @@ static void packet_free_pending(struct packet_sock *po)
free_percpu(po->tx_ring.pending_refcnt);
}
-static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+#define ROOM_POW_OFF 2
+#define ROOM_NONE 0x0
+#define ROOM_LOW 0x1
+#define ROOM_NORMAL 0x2
+
+static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+{
+ int idx, len;
+
+ len = po->rx_ring.frame_max + 1;
+ idx = po->rx_ring.head;
+ if (pow_off)
+ idx += len >> pow_off;
+ if (idx >= len)
+ idx -= len;
+ return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+{
+ int idx, len;
+
+ len = po->rx_ring.prb_bdqc.knum_blocks;
+ idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+ if (pow_off)
+ idx += len >> pow_off;
+ if (idx >= len)
+ idx -= len;
+ return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
struct sock *sk = &po->sk;
- bool has_room;
+ int ret = ROOM_NONE;
+
+ if (po->prot_hook.func != tpacket_rcv) {
+ int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+ - (skb ? skb->truesize : 0);
+ if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+ return ROOM_NORMAL;
+ else if (avail > 0)
+ return ROOM_LOW;
+ else
+ return ROOM_NONE;
+ }
- if (po->prot_hook.func != tpacket_rcv)
- return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
- <= sk->sk_rcvbuf;
+ if (po->tp_version == TPACKET_V3) {
+ if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
+ ret = ROOM_NORMAL;
+ else if (__tpacket_v3_has_room(po, 0))
+ ret = ROOM_LOW;
+ } else {
+ if (__tpacket_has_room(po, ROOM_POW_OFF))
+ ret = ROOM_NORMAL;
+ else if (__tpacket_has_room(po, 0))
+ ret = ROOM_LOW;
+ }
- spin_lock(&sk->sk_receive_queue.lock);
- if (po->tp_version == TPACKET_V3)
- has_room = prb_lookup_block(po, &po->rx_ring,
- po->rx_ring.prb_bdqc.kactive_blk_num,
- TP_STATUS_KERNEL);
- else
- has_room = packet_lookup_frame(po, &po->rx_ring,
- po->rx_ring.head,
- TP_STATUS_KERNEL);
- spin_unlock(&sk->sk_receive_queue.lock);
+ return ret;
+}
+
+static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+ int ret;
+ bool has_room;
+
+ spin_lock_bh(&po->sk.sk_receive_queue.lock);
+ ret = __packet_rcv_has_room(po, skb);
+ has_room = ret == ROOM_NORMAL;
+ if (po->pressure == has_room)
+ po->pressure = !has_room;
+ spin_unlock_bh(&po->sk.sk_receive_queue.lock);
- return has_room;
+ return ret;
}
static void packet_sock_destruct(struct sock *sk)
@@ -1266,14 +1322,18 @@ static void packet_sock_destruct(struct sock *sk)
sk_refcnt_debug_dec(sk);
}
-static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
+static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
- int x = atomic_read(&f->rr_cur) + 1;
+ u32 rxhash;
+ int i, count = 0;
- if (x >= num)
- x = 0;
+ rxhash = skb_get_hash(skb);
+ for (i = 0; i < ROLLOVER_HLEN; i++)
+ if (po->rollover->history[i] == rxhash)
+ count++;
- return x;
+ po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
+ return count > (ROLLOVER_HLEN >> 1);
}
static unsigned int fanout_demux_hash(struct packet_fanout *f,
@@ -1287,13 +1347,9 @@ static unsigned int fanout_demux_lb(struct packet_fanout *f,
struct sk_buff *skb,
unsigned int num)
{
- int cur, old;
+ unsigned int val = atomic_inc_return(&f->rr_cur);
- cur = atomic_read(&f->rr_cur);
- while ((old = atomic_cmpxchg(&f->rr_cur, cur,
- fanout_rr_next(f, num))) != cur)
- cur = old;
- return cur;
+ return val % num;
}
static unsigned int fanout_demux_cpu(struct packet_fanout *f,
@@ -1312,22 +1368,40 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f,
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
struct sk_buff *skb,
- unsigned int idx, unsigned int skip,
+ unsigned int idx, bool try_self,
unsigned int num)
{
- unsigned int i, j;
+ struct packet_sock *po, *po_next, *po_skip = NULL;
+ unsigned int i, j, room = ROOM_NONE;
+
+ po = pkt_sk(f->arr[idx]);
- i = j = min_t(int, f->next[idx], num - 1);
+ if (try_self) {
+ room = packet_rcv_has_room(po, skb);
+ if (room == ROOM_NORMAL ||
+ (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
+ return idx;
+ po_skip = po;
+ }
+
+ i = j = min_t(int, po->rollover->sock, num - 1);
do {
- if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
+ po_next = pkt_sk(f->arr[i]);
+ if (po_next != po_skip && !po_next->pressure &&
+ packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j)
- f->next[idx] = i;
+ po->rollover->sock = i;
+ atomic_long_inc(&po->rollover->num);
+ if (room == ROOM_LOW)
+ atomic_long_inc(&po->rollover->num_huge);
return i;
}
+
if (++i == num)
i = 0;
} while (i != j);
+ atomic_long_inc(&po->rollover->num_failed);
return idx;
}
@@ -1347,7 +1421,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct packet_fanout *f = pt->af_packet_priv;
- unsigned int num = f->num_members;
+ unsigned int num = READ_ONCE(f->num_members);
struct packet_sock *po;
unsigned int idx;
@@ -1380,17 +1454,14 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
idx = fanout_demux_qm(f, skb, num);
break;
case PACKET_FANOUT_ROLLOVER:
- idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
+ idx = fanout_demux_rollover(f, skb, 0, false, num);
break;
}
- po = pkt_sk(f->arr[idx]);
- if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
- unlikely(!packet_rcv_has_room(po, skb))) {
- idx = fanout_demux_rollover(f, skb, idx, idx, num);
- po = pkt_sk(f->arr[idx]);
- }
+ if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
+ idx = fanout_demux_rollover(f, skb, idx, true, num);
+ po = pkt_sk(f->arr[idx]);
return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
@@ -1461,6 +1532,16 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
if (po->fanout)
return -EALREADY;
+ if (type == PACKET_FANOUT_ROLLOVER ||
+ (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
+ po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
+ if (!po->rollover)
+ return -ENOMEM;
+ atomic_long_set(&po->rollover->num, 0);
+ atomic_long_set(&po->rollover->num_huge, 0);
+ atomic_long_set(&po->rollover->num_failed, 0);
+ }
+
mutex_lock(&fanout_mutex);
match = NULL;
list_for_each_entry(f, &fanout_list, list) {
@@ -1509,6 +1590,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
}
out:
mutex_unlock(&fanout_mutex);
+ if (err) {
+ kfree(po->rollover);
+ po->rollover = NULL;
+ }
return err;
}
@@ -1530,6 +1615,9 @@ static void fanout_release(struct sock *sk)
kfree(f);
}
mutex_unlock(&fanout_mutex);
+
+ if (po->rollover)
+ kfree_rcu(po->rollover, rcu);
}
static const struct proto_ops packet_ops;
@@ -1608,8 +1696,8 @@ oom:
* protocol layers and you must therefore supply it with a complete frame
*/
-static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len)
+static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
+ size_t len)
{
struct sock *sk = sock->sk;
DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
@@ -1818,13 +1906,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
skb = nskb;
}
- BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
- sizeof(skb->cb));
+ sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
sll = &PACKET_SKB_CB(skb)->sa.ll;
- sll->sll_family = AF_PACKET;
sll->sll_hatype = dev->type;
- sll->sll_protocol = skb->protocol;
sll->sll_pkttype = skb->pkt_type;
if (unlikely(po->origdev))
sll->sll_ifindex = orig_dev->ifindex;
@@ -1833,7 +1918,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
- PACKET_SKB_CB(skb)->origlen = skb->len;
+ /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
+ * Use their space for storing the original skb length.
+ */
+ PACKET_SKB_CB(skb)->sa.origlen = skb->len;
if (pskb_trim(skb, snaplen))
goto drop_n_acct;
@@ -1847,7 +1935,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
spin_lock(&sk->sk_receive_queue.lock);
po->stats.stats1.tp_packets++;
- skb->dropcount = atomic_read(&sk->sk_drops);
+ sock_skb_set_dropcount(sk, skb);
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk);
@@ -1910,14 +1998,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
}
}
- if (skb->ip_summed == CHECKSUM_PARTIAL)
- status |= TP_STATUS_CSUMNOTREADY;
-
snaplen = skb->len;
res = run_filter(skb, sk, snaplen);
if (!res)
goto drop_n_restore;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ status |= TP_STATUS_CSUMNOTREADY;
+ else if (skb->pkt_type != PACKET_OUTGOING &&
+ (skb->ip_summed == CHECKSUM_COMPLETE ||
+ skb_csum_unnecessary(skb)))
+ status |= TP_STATUS_CSUM_VALID;
+
if (snaplen > res)
snaplen = res;
@@ -2300,14 +2393,18 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
tlen = dev->needed_tailroom;
skb = sock_alloc_send_skb(&po->sk,
hlen + tlen + sizeof(struct sockaddr_ll),
- 0, &err);
+ !need_wait, &err);
- if (unlikely(skb == NULL))
+ if (unlikely(skb == NULL)) {
+ /* we assume the socket was initially writeable ... */
+ if (likely(len_sum > 0))
+ err = len_sum;
goto out_status;
-
+ }
tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
addr, hlen);
- if (tp_len > dev->mtu + dev->hard_header_len) {
+ if (likely(tp_len >= 0) &&
+ tp_len > dev->mtu + dev->hard_header_len) {
struct ethhdr *ehdr;
/* Earlier code assumed this would be a VLAN pkt,
* double-check this now that we have the actual
@@ -2603,8 +2700,7 @@ out:
return err;
}
-static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len)
+static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);
@@ -2689,7 +2785,7 @@ static int packet_release(struct socket *sock)
static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
{
struct packet_sock *po = pkt_sk(sk);
- const struct net_device *dev_curr;
+ struct net_device *dev_curr;
__be16 proto_curr;
bool need_rehook;
@@ -2713,15 +2809,13 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
po->num = proto;
po->prot_hook.type = proto;
-
- if (po->prot_hook.dev)
- dev_put(po->prot_hook.dev);
-
po->prot_hook.dev = dev;
po->ifindex = dev ? dev->ifindex : 0;
packet_cached_dev_assign(po, dev);
}
+ if (dev_curr)
+ dev_put(dev_curr);
if (proto == 0 || !need_rehook)
goto out_unlock;
@@ -2822,7 +2916,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
sock->state = SS_UNCONNECTED;
err = -ENOBUFS;
- sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
+ sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
if (sk == NULL)
goto out;
@@ -2852,6 +2946,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
spin_lock_init(&po->bind_lock);
mutex_init(&po->pg_vec_lock);
+ po->rollover = NULL;
po->prot_hook.func = packet_rcv;
if (sock->type == SOCK_PACKET)
@@ -2884,13 +2979,14 @@ out:
* If necessary we block.
*/
-static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len, int flags)
+static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
{
struct sock *sk = sock->sk;
struct sk_buff *skb;
int copied, err;
int vnet_hdr_len = 0;
+ unsigned int origlen = 0;
err = -EINVAL;
if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
@@ -2928,6 +3024,9 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
if (skb == NULL)
goto out;
+ if (pkt_sk(sk)->pressure)
+ packet_rcv_has_room(pkt_sk(sk), NULL);
+
if (pkt_sk(sk)->has_vnet_hdr) {
struct virtio_net_hdr vnet_hdr = { 0 };
@@ -2990,6 +3089,15 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
if (err)
goto out_free;
+ if (sock->type != SOCK_PACKET) {
+ struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
+
+ /* Original length was stored in sockaddr_ll fields */
+ origlen = PACKET_SKB_CB(skb)->sa.origlen;
+ sll->sll_family = AF_PACKET;
+ sll->sll_protocol = skb->protocol;
+ }
+
sock_recv_ts_and_drops(msg, sk, skb);
if (msg->msg_name) {
@@ -3001,6 +3109,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
msg->msg_namelen = sizeof(struct sockaddr_pkt);
} else {
struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
+
msg->msg_namelen = sll->sll_halen +
offsetof(struct sockaddr_ll, sll_addr);
}
@@ -3014,7 +3123,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
aux.tp_status = TP_STATUS_USER;
if (skb->ip_summed == CHECKSUM_PARTIAL)
aux.tp_status |= TP_STATUS_CSUMNOTREADY;
- aux.tp_len = PACKET_SKB_CB(skb)->origlen;
+ else if (skb->pkt_type != PACKET_OUTGOING &&
+ (skb->ip_summed == CHECKSUM_COMPLETE ||
+ skb_csum_unnecessary(skb)))
+ aux.tp_status |= TP_STATUS_CSUM_VALID;
+
+ aux.tp_len = origlen;
aux.tp_snaplen = skb->len;
aux.tp_mac = 0;
aux.tp_net = skb_network_offset(skb);
@@ -3456,6 +3570,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
struct packet_sock *po = pkt_sk(sk);
void *data = &val;
union tpacket_stats_u st;
+ struct tpacket_rollover_stats rstats;
if (level != SOL_PACKET)
return -ENOPROTOOPT;
@@ -3531,6 +3646,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
((u32)po->fanout->flags << 24)) :
0);
break;
+ case PACKET_ROLLOVER_STATS:
+ if (!po->rollover)
+ return -EINVAL;
+ rstats.tp_all = atomic_long_read(&po->rollover->num);
+ rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
+ rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
+ data = &rstats;
+ lv = sizeof(rstats);
+ break;
case PACKET_TX_HAS_OFF:
val = po->tp_tx_has_off;
break;
@@ -3668,6 +3792,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
TP_STATUS_KERNEL))
mask |= POLLIN | POLLRDNORM;
}
+ if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+ po->pressure = 0;
spin_unlock_bh(&sk->sk_receive_queue.lock);
spin_lock_bh(&sk->sk_write_queue.lock);
if (po->tx_ring.pg_vec) {
@@ -3857,7 +3983,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
* it above but just being paranoid
*/
if (!tx_ring)
- init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
+ init_prb_bdqc(po, rb, pg_vec, req_u);
break;
default:
break;