From 681f130f39e10087475383e6771b9366e26bab0c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 20 Jun 2013 05:52:22 -0700
Subject: netfilter: xt_socket: add XT_SOCKET_NOWILDCARD flag

xt_socket module can be a nice replacement to conntrack module
in some cases (SYN filtering for example)

But it lacks the ability to match the 3rd packet of TCP handshake
(ACK coming from the client).

Add a XT_SOCKET_NOWILDCARD flag to disable the wildcard mechanism.
The wildcard is the legacy socket match behavior, that ignores
LISTEN sockets bound to INADDR_ANY (or ipv6 equivalent)

iptables -I INPUT -p tcp --syn -j SYN_CHAIN
iptables -I INPUT -m socket --nowildcard -j ACCEPT

Signed-off-by: Eric Dumazet
Cc: Patrick McHardy
Cc: Jesper Dangaard Brouer
Signed-off-by: Pablo Neira Ayuso
---
 include/uapi/linux/netfilter/xt_socket.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/xt_socket.h b/include/uapi/linux/netfilter/xt_socket.h
index 26d7217bd4f1..6315e2ac3474 100644
--- a/include/uapi/linux/netfilter/xt_socket.h
+++ b/include/uapi/linux/netfilter/xt_socket.h
@@ -5,10 +5,17 @@
 
 enum {
 	XT_SOCKET_TRANSPARENT	= 1 << 0,
+	XT_SOCKET_NOWILDCARD	= 1 << 1,
 };
 
 struct xt_socket_mtinfo1 {
 	__u8 flags;
 };
+#define XT_SOCKET_FLAGS_V1 XT_SOCKET_TRANSPARENT
+
+struct xt_socket_mtinfo2 {
+	__u8 flags;
+};
+#define XT_SOCKET_FLAGS_V2 (XT_SOCKET_TRANSPARENT | XT_SOCKET_NOWILDCARD)
 
 #endif /* _XT_SOCKET_H */
-- cgit

From eba3b5a78799d21dea05118b294524958f0ab592 Mon Sep 17 00:00:00 2001
From: Alexander Frolkin
Date: Wed, 19 Jun 2013 10:54:25 +0100
Subject: ipvs: SH fallback and L4 hashing

By default the SH scheduler rejects connections that are hashed onto a
realserver of weight 0.  This patch adds a flag to make SH choose a
different realserver in this case, instead of rejecting the connection.

The patch also adds a flag to make SH include the source port (TCP, UDP,
SCTP) in the hash as well as the source address.  This basically allows
for deterministic round-robin load balancing (i.e., where any director
in a cluster of directors with identical config will send the same
packet the same way).

The flags are service flags (IP_VS_SVC_F_SCHED*) so that these options
can be set per service.  They are set using a new option to ipvsadm.

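For illustration only (not part of this patch): a minimal userspace sketch
of the bucket selection the patch introduces in ip_vs_sh_hashkey(),
assuming the default table of 1 << 8 buckets; sh_bucket() and the sample
address/port below are made-up names for the sketch.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define SH_TAB_SIZE	(1 << 8)		/* assumed IP_VS_SH_TAB_SIZE */
#define SH_TAB_MASK	(SH_TAB_SIZE - 1)

/* Same arithmetic as the patched ip_vs_sh_hashkey(): fold the address,
 * optionally mix in the source port, multiply by 2654435761 and mask. */
static unsigned int sh_bucket(uint32_t saddr_be, uint16_t sport_be,
			      unsigned int offset)
{
	return (offset + (ntohs(sport_be) + ntohl(saddr_be)) * 2654435761UL) &
		SH_TAB_MASK;
}

int main(void)
{
	uint32_t saddr;

	inet_pton(AF_INET, "192.0.2.10", &saddr);

	/* Address-only hash (no IP_VS_SVC_F_SCHED_SH_PORT): all connections
	 * from this client land in one bucket, i.e. on one realserver. */
	printf("addr only: bucket %u\n", sh_bucket(saddr, 0, 0));

	/* With IP_VS_SVC_F_SCHED_SH_PORT the source port is hashed as well,
	 * so different client ports spread deterministically over buckets. */
	printf("addr+port: bucket %u\n", sh_bucket(saddr, htons(40001), 0));
	return 0;
}

Because every director computes the same bucket from the same address and
port, directors with identical configuration pick the same realserver for
a given packet, which is what makes the port-hash mode deterministic.
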
Signed-off-by: Alexander Frolkin
Acked-by: Julian Anastasov
Signed-off-by: Simon Horman
---
 include/uapi/linux/ip_vs.h    |   6 +++
 net/netfilter/ipvs/ip_vs_sh.c | 100 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 91 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index a24537725e80..29458223d044 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -20,6 +20,12 @@
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
 #define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
+#define IP_VS_SVC_F_SCHED1	0x0008		/* scheduler flag 1 */
+#define IP_VS_SVC_F_SCHED2	0x0010		/* scheduler flag 2 */
+#define IP_VS_SVC_F_SCHED3	0x0020		/* scheduler flag 3 */
+
+#define IP_VS_SVC_F_SCHED_SH_FALLBACK	IP_VS_SVC_F_SCHED1 /* SH fallback */
+#define IP_VS_SVC_F_SCHED_SH_PORT	IP_VS_SVC_F_SCHED2 /* SH use port */
 
 /*
  *	Destination Server Flags
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e0d5d1653566..f16c027df15b 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -48,6 +48,10 @@
 
 #include <net/ip_vs.h>
 
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
 
 /*
  *	IPVS SH bucket
@@ -71,10 +75,19 @@ struct ip_vs_sh_state {
 	struct ip_vs_sh_bucket		buckets[IP_VS_SH_TAB_SIZE];
 };
 
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+	return atomic_read(&dest->weight) <= 0 ||
+		dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
 
 /*
  *	Returns hash value for IPVS SH entry
  */
-static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+		 __be16 port, unsigned int offset)
 {
 	__be32 addr_fold = addr->ip;
 
@@ -83,7 +96,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+		IP_VS_SH_TAB_MASK;
 }
 
 
@@ -91,12 +105,42 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
  *	Get ip_vs_dest associated with supplied parameters.
  */
 static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+	     const union nf_inet_addr *addr, __be16 port)
 {
-	return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
+	unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+	struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+	return (!dest || is_unavailable(dest)) ?
+		NULL : dest;
 }
 
 
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+		      const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int offset;
+	unsigned int hash;
+	struct ip_vs_dest *dest;
+
+	for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+		hash = ip_vs_sh_hashkey(svc->af, addr, port, offset);
+		dest = rcu_dereference(s->buckets[hash].dest);
+		if (!dest)
+			break;
+		if (is_unavailable(dest))
+			IP_VS_DBG_BUF(6, "SH: selected unavailable server "
+				      "%s:%d (offset %d)",
+				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+				      ntohs(dest->port), offset);
+		else
+			return dest;
+	}
+
+	return NULL;
+}
+
 /*
  *	Assign all the hash buckets of the specified table with the service.
  */
@@ -213,13 +257,33 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
 }
 
 
-/*
- *	If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- *	consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
+/* Helper function to get port number */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
 {
-	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+	__be16 port;
+	struct tcphdr _tcph, *th;
+	struct udphdr _udph, *uh;
+	sctp_sctphdr_t _sctph, *sh;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+		port = th->source;
+		break;
+	case IPPROTO_UDP:
+		uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+		port = uh->source;
+		break;
+	case IPPROTO_SCTP:
+		sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+		port = sh->source;
+		break;
+	default:
+		port = 0;
+	}
+
+	return port;
 }
 
 
@@ -232,15 +296,21 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_sh_state *s;
+	__be16 port = 0;
 
 	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+		port = ip_vs_sh_get_port(skb, iph);
+
 	s = (struct ip_vs_sh_state *) svc->sched_data;
-	dest = ip_vs_sh_get(svc->af, s, &iph->saddr);
-	if (!dest
-	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-	    || atomic_read(&dest->weight) <= 0
-	    || is_overloaded(dest)) {
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+		dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+	else
+		dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+
+	if (!dest) {
 		ip_vs_scheduler_err(svc, "no destination available");
 		return NULL;
 	}
-- cgit

From 496e4ae7dc944faa1721bfda7e9d834d5611a874 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Sat, 29 Jun 2013 14:15:47 +0200
Subject: netfilter: nf_queue: add NFQA_SKB_CSUM_NOTVERIFIED info flag

The common case is that TCP/IP checksums have already been verified, e.g.
by hardware (rx checksum offload), or by conntrack.

Userspace can use this flag to determine when the checksum has not been
validated yet.  If the flag is set, this doesn't necessarily mean that the
packet has an invalid checksum, e.g. if the NIC doesn't support rx
checksum offload.

Userspace that successfully enabled the NFQA_CFG_F_GSO queue feature flag
can infer that the IP/TCP checksum has already been validated if either
the SKB_INFO attribute is not present or the NFQA_SKB_CSUM_NOTVERIFIED
flag is unset.

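For illustration only (not part of this patch): a minimal userspace sketch
of the inference rule described above, for a queue handler that enabled
NFQA_CFG_F_GSO. csum_already_verified() is a made-up helper name, and the
flag value simply mirrors the uapi definition added below.

#include <stdbool.h>
#include <stdint.h>

#define NFQA_SKB_CSUM_NOTVERIFIED (1 << 2)	/* mirrors the uapi header below */

/* have_skb_info: the NFQA_SKB_INFO attribute was present in the packet message
 * skb_flags:     its flag payload in host byte order (0 if absent) */
static bool csum_already_verified(bool have_skb_info, uint32_t skb_flags)
{
	/* Per the description above: the checksum can be treated as already
	 * validated unless the kernel explicitly says otherwise, i.e. the
	 * attribute is present and the NOTVERIFIED bit is set. */
	return !have_skb_info || !(skb_flags & NFQA_SKB_CSUM_NOTVERIFIED);
}
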
Signed-off-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 include/uapi/linux/netfilter/nfnetlink_queue.h |  2 ++
 net/netfilter/nfnetlink_queue_core.c           | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index a2308ae5a73d..3a9b92147339 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -105,5 +105,7 @@ enum nfqnl_attr_config {
 #define NFQA_SKB_CSUMNOTREADY (1 << 0)
 /* packet is GSO (i.e., exceeds device mtu) */
 #define NFQA_SKB_GSO (1 << 1)
+/* csum not validated (incoming device doesn't support hw checksum, etc.) */
+#define NFQA_SKB_CSUM_NOTVERIFIED (1 << 2)
 
 #endif /* _NFNETLINK_QUEUE_H */
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 299a48ae5dc9..971ea145ab3e 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -280,12 +280,17 @@ nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
 	skb_shinfo(to)->nr_frags = j;
 }
 
-static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet)
+static int
+nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
+		      bool csum_verify)
 {
 	__u32 flags = 0;
 
 	if (packet->ip_summed == CHECKSUM_PARTIAL)
 		flags = NFQA_SKB_CSUMNOTREADY;
+	else if (csum_verify)
+		flags = NFQA_SKB_CSUM_NOTVERIFIED;
+
 	if (skb_is_gso(packet))
 		flags |= NFQA_SKB_GSO;
 
@@ -310,6 +315,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	struct net_device *outdev;
 	struct nf_conn *ct = NULL;
 	enum ip_conntrack_info uninitialized_var(ctinfo);
+	bool csum_verify;
 
 	size = nlmsg_total_size(sizeof(struct nfgenmsg))
 		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
@@ -327,6 +333,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	if (entskb->tstamp.tv64)
 		size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
 
+	if (entry->hook <= NF_INET_FORWARD ||
+	   (entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
+		csum_verify = !skb_csum_unnecessary(entskb);
+	else
+		csum_verify = false;
+
 	outdev = entry->outdev;
 
 	switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
@@ -476,7 +488,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	    nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
 		goto nla_put_failure;
 
-	if (nfqnl_put_packet_info(skb, entskb))
+	if (nfqnl_put_packet_info(skb, entskb, csum_verify))
 		goto nla_put_failure;
 
 	if (data_len) {
-- cgit