diff options
author | Martin KaFai Lau <martin.lau@kernel.org> | 2023-01-15 12:56:17 -0800 |
---|---|---|
committer | Martin KaFai Lau <martin.lau@kernel.org> | 2023-01-15 12:56:17 -0800 |
commit | 81bbbb697481700ef8f31619f9d1e96cca894b5a (patch) | |
tree | 905aca8486fed33eadf5853d56c4d8bd681e3b04 | |
parent | 1c48391bc6739f5e3306919d4b887b92c35d5490 (diff) | |
parent | 7105f76fb56f5ed66a59bc048bc71e9f100e1d39 (diff) |
Merge branch 'bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()'
Ziyang Xuan says:
====================
Add ipip6 and ip6ip decap support for bpf_skb_adjust_room().
Main use case is for using cls_bpf on ingress hook to decapsulate
IPv4 over IPv6 and IPv6 over IPv4 tunnel packets.
And add ipip6 and ip6ip decap testcases to verify that
bpf_skb_adjust_room() correctly decapsulate ipip6 and ip6ip
tunnel packets.
$./test_tc_tunnel.sh
ipip
encap 192.168.1.1 to 192.168.1.2, type ipip, mac none len 100
test basic connectivity
0
test bpf encap without decap (expect failure)
Ncat: TIMEOUT.
1
test bpf encap with tunnel device decap
0
test bpf encap with bpf decap
0
OK
ipip6
encap 192.168.1.1 to 192.168.1.2, type ipip6, mac none len 100
test basic connectivity
0
test bpf encap without decap (expect failure)
Ncat: TIMEOUT.
1
test bpf encap with tunnel device decap
0
test bpf encap with bpf decap
0
OK
ip6ip6
encap fd::1 to fd::2, type ip6tnl, mac none len 100
test basic connectivity
0
test bpf encap without decap (expect failure)
Ncat: TIMEOUT.
1
test bpf encap with tunnel device decap
0
test bpf encap with bpf decap
0
OK
sit
encap fd::1 to fd::2, type sit, mac none len 100
test basic connectivity
0
test bpf encap without decap (expect failure)
Ncat: TIMEOUT.
1
test bpf encap with tunnel device decap
0
test bpf encap with bpf decap
0
OK
...
OK. All tests passed
v3:
- Fix compilation failure of selftests/bpf.
- Combine two new branches in bpf_skb_adjust_room().
- Simplify description for new flags BPF_F_ADJ_ROOM_DECAP_L3_IP*.
v2:
- Use decap flags to indicate the new IP header.
Do not rely on skb->encapsulation.
====================
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
-rw-r--r-- | include/uapi/linux/bpf.h | 7 | ||||
-rw-r--r-- | net/core/filter.c | 31 | ||||
-rw-r--r-- | tools/include/uapi/linux/bpf.h | 7 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 91 | ||||
-rwxr-xr-x | tools/testing/selftests/bpf/test_tc_tunnel.sh | 15 |
5 files changed, 142 insertions, 9 deletions
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc1a3d232ae4..adae5b168f9d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2647,6 +2647,11 @@ union bpf_attr { * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * L2 type as Ethernet. * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -5807,6 +5812,8 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index d9befa6ba04e..b4547a2c02f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3381,13 +3381,17 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) +#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ - BPF_ADJ_ROOM_ENCAP_L2_MASK)) + BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ + BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) @@ -3501,6 +3505,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; @@ -3519,6 +3524,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + skb->protocol = htons(ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) + skb->protocol = htons(ETH_P_IP); + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -3608,6 +3621,22 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOTSUPP; } + if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + if (!shrink) + return -EINVAL; + + switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: + len_min = sizeof(struct iphdr); + break; + case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: + len_min = sizeof(struct ipv6hdr); + break; + default: + return -EINVAL; + } + } + len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bc1a3d232ae4..142b81bcbb2e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2647,6 +2647,11 @@ union bpf_attr { * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * L2 type as Ethernet. * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -5807,6 +5812,8 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), }; enum { diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index a0e7762b1e5a..e6e678aa9874 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -38,6 +38,10 @@ static const int cfg_udp_src = 20000; #define VXLAN_FLAGS 0x8 #define VXLAN_VNI 1 +#ifndef NEXTHDR_DEST +#define NEXTHDR_DEST 60 +#endif + /* MPLS label 1000 with S bit (last label) set and ttl of 255. */ static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | MPLS_LS_S_MASK | 0xff); @@ -363,6 +367,61 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } +static int encap_ipv6_ipip6(struct __sk_buff *skb) +{ + struct iphdr iph_inner; + struct v6hdr h_outer; + struct tcphdr tcph; + struct ethhdr eth; + __u64 flags; + int olen; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, + sizeof(iph_inner)) < 0) + return TC_ACT_OK; + + /* filter only packets we want */ + if (bpf_skb_load_bytes(skb, ETH_HLEN + (iph_inner.ihl << 2), + &tcph, sizeof(tcph)) < 0) + return TC_ACT_OK; + + if (tcph.dest != __bpf_constant_htons(cfg_port)) + return TC_ACT_OK; + + olen = sizeof(h_outer.ip); + + flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6; + + /* add room between mac and network header */ + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) + return TC_ACT_SHOT; + + /* prepare new outer network header */ + memset(&h_outer.ip, 0, sizeof(h_outer.ip)); + h_outer.ip.version = 6; + h_outer.ip.hop_limit = iph_inner.ttl; + h_outer.ip.saddr.s6_addr[1] = 0xfd; + h_outer.ip.saddr.s6_addr[15] = 1; + h_outer.ip.daddr.s6_addr[1] = 0xfd; + h_outer.ip.daddr.s6_addr[15] = 2; + h_outer.ip.payload_len = iph_inner.tot_len; + h_outer.ip.nexthdr = IPPROTO_IPIP; + + /* store new outer network header */ + if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen, + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + /* update eth->h_proto */ + if (bpf_skb_load_bytes(skb, 0, ð, sizeof(eth)) < 0) + return TC_ACT_SHOT; + eth.h_proto = bpf_htons(ETH_P_IPV6); + if (bpf_skb_store_bytes(skb, 0, ð, sizeof(eth), 0) < 0) + return TC_ACT_SHOT; + + return TC_ACT_OK; +} + static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, __u16 l2_proto) { @@ -461,6 +520,15 @@ int __encap_ip6tnl_none(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_ipip6_none") +int __encap_ipip6_none(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return encap_ipv6_ipip6(skb); + else + return TC_ACT_OK; +} + SEC("encap_ip6gre_none") int __encap_ip6gre_none(struct __sk_buff *skb) { @@ -528,13 +596,33 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb) static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { + __u64 flags = BPF_F_ADJ_ROOM_FIXED_GSO; + struct ipv6_opt_hdr ip6_opt_hdr; struct gre_hdr greh; struct udphdr udph; int olen = len; switch (proto) { case IPPROTO_IPIP: + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4; + break; case IPPROTO_IPV6: + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6; + break; + case NEXTHDR_DEST: + if (bpf_skb_load_bytes(skb, off + len, &ip6_opt_hdr, + sizeof(ip6_opt_hdr)) < 0) + return TC_ACT_OK; + switch (ip6_opt_hdr.nexthdr) { + case IPPROTO_IPIP: + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4; + break; + case IPPROTO_IPV6: + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6; + break; + default: + return TC_ACT_OK; + } break; case IPPROTO_GRE: olen += sizeof(struct gre_hdr); @@ -569,8 +657,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) return TC_ACT_OK; } - if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, - BPF_F_ADJ_ROOM_FIXED_GSO)) + if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, flags)) return TC_ACT_SHOT; return TC_ACT_OK; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 334bdfeab940..910044f08908 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -100,6 +100,9 @@ if [[ "$#" -eq "0" ]]; then echo "ipip" $0 ipv4 ipip none 100 + echo "ipip6" + $0 ipv4 ipip6 none 100 + echo "ip6ip6" $0 ipv6 ip6tnl none 100 @@ -224,6 +227,9 @@ elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then ttype="vxlan" targs="id 1 dstport 8472 udp6zerocsumrx" +elif [[ "$tuntype" == "ipip6" ]]; then + ttype="ip6tnl" + targs="" else ttype=$tuntype targs="" @@ -233,6 +239,9 @@ fi if [[ "${tuntype}" == "sit" ]]; then link_addr1="${ns1_v4}" link_addr2="${ns2_v4}" +elif [[ "${tuntype}" == "ipip6" ]]; then + link_addr1="${ns1_v6}" + link_addr2="${ns2_v6}" else link_addr1="${addr1}" link_addr2="${addr2}" @@ -287,12 +296,6 @@ else server_listen fi -# bpf_skb_net_shrink does not take tunnel flags yet, cannot update L3. -if [[ "${tuntype}" == "sit" ]]; then - echo OK - exit 0 -fi - # serverside, use BPF for decap ip netns exec "${ns2}" ip link del dev testtun0 ip netns exec "${ns2}" tc qdisc add dev veth2 clsact |