Skip to content

Commit 81bbbb6

Browse files
author
Martin KaFai Lau
committed
Merge branch 'bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()'
Ziyang Xuan says: ==================== Add ipip6 and ip6ip decap support for bpf_skb_adjust_room(). The main use case is using cls_bpf on the ingress hook to decapsulate IPv4 over IPv6 and IPv6 over IPv4 tunnel packets. Also add ipip6 and ip6ip decap test cases to verify that bpf_skb_adjust_room() correctly decapsulates ipip6 and ip6ip tunnel packets. $ ./test_tc_tunnel.sh ipip encap 192.168.1.1 to 192.168.1.2, type ipip, mac none len 100 test basic connectivity 0 test bpf encap without decap (expect failure) Ncat: TIMEOUT. 1 test bpf encap with tunnel device decap 0 test bpf encap with bpf decap 0 OK ipip6 encap 192.168.1.1 to 192.168.1.2, type ipip6, mac none len 100 test basic connectivity 0 test bpf encap without decap (expect failure) Ncat: TIMEOUT. 1 test bpf encap with tunnel device decap 0 test bpf encap with bpf decap 0 OK ip6ip6 encap fd::1 to fd::2, type ip6tnl, mac none len 100 test basic connectivity 0 test bpf encap without decap (expect failure) Ncat: TIMEOUT. 1 test bpf encap with tunnel device decap 0 test bpf encap with bpf decap 0 OK sit encap fd::1 to fd::2, type sit, mac none len 100 test basic connectivity 0 test bpf encap without decap (expect failure) Ncat: TIMEOUT. 1 test bpf encap with tunnel device decap 0 test bpf encap with bpf decap 0 OK ... OK. All tests passed v3: - Fix compilation failure of selftests/bpf. - Combine two new branches in bpf_skb_adjust_room(). - Simplify description for new flags BPF_F_ADJ_ROOM_DECAP_L3_IP*. v2: - Use decap flags to indicate the new IP header. Do not rely on skb->encapsulation. ==================== Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2 parents 1c48391 + 7105f76 commit 81bbbb6

File tree

5 files changed

+142
-9
lines changed

5 files changed

+142
-9
lines changed

include/uapi/linux/bpf.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2647,6 +2647,11 @@ union bpf_attr {
26472647
* Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
26482648
* L2 type as Ethernet.
26492649
*
2650+
* * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**,
2651+
* **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**:
2652+
* Indicate the new IP header version after decapsulating the outer
2653+
* IP header. Used when the inner and outer IP versions are different.
2654+
*
26502655
* A call to this helper is susceptible to change the underlying
26512656
* packet buffer. Therefore, at load time, all checks on pointers
26522657
* previously done by the verifier are invalidated and must be
@@ -5807,6 +5812,8 @@ enum {
58075812
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
58085813
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
58095814
BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
5815+
BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7),
5816+
BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8),
58105817
};
58115818

58125819
enum {

net/core/filter.c

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3381,13 +3381,17 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
33813381
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
33823382
BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
33833383

3384+
#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
3385+
BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
3386+
33843387
#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
33853388
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
33863389
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
33873390
BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
33883391
BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
33893392
BPF_F_ADJ_ROOM_ENCAP_L2( \
3390-
BPF_ADJ_ROOM_ENCAP_L2_MASK))
3393+
BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
3394+
BPF_F_ADJ_ROOM_DECAP_L3_MASK)
33913395

33923396
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
33933397
u64 flags)
@@ -3501,6 +3505,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
35013505
int ret;
35023506

35033507
if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
3508+
BPF_F_ADJ_ROOM_DECAP_L3_MASK |
35043509
BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
35053510
return -EINVAL;
35063511

@@ -3519,6 +3524,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
35193524
if (unlikely(ret < 0))
35203525
return ret;
35213526

3527+
/* Match skb->protocol to new outer l3 protocol */
3528+
if (skb->protocol == htons(ETH_P_IP) &&
3529+
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
3530+
skb->protocol = htons(ETH_P_IPV6);
3531+
else if (skb->protocol == htons(ETH_P_IPV6) &&
3532+
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
3533+
skb->protocol = htons(ETH_P_IP);
3534+
35223535
if (skb_is_gso(skb)) {
35233536
struct skb_shared_info *shinfo = skb_shinfo(skb);
35243537

@@ -3608,6 +3621,22 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
36083621
return -ENOTSUPP;
36093622
}
36103623

3624+
if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
3625+
if (!shrink)
3626+
return -EINVAL;
3627+
3628+
switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
3629+
case BPF_F_ADJ_ROOM_DECAP_L3_IPV4:
3630+
len_min = sizeof(struct iphdr);
3631+
break;
3632+
case BPF_F_ADJ_ROOM_DECAP_L3_IPV6:
3633+
len_min = sizeof(struct ipv6hdr);
3634+
break;
3635+
default:
3636+
return -EINVAL;
3637+
}
3638+
}
3639+
36113640
len_cur = skb->len - skb_network_offset(skb);
36123641
if ((shrink && (len_diff_abs >= len_cur ||
36133642
len_cur - len_diff_abs < len_min)) ||

tools/include/uapi/linux/bpf.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2647,6 +2647,11 @@ union bpf_attr {
26472647
* Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
26482648
* L2 type as Ethernet.
26492649
*
2650+
* * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**,
2651+
* **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**:
2652+
* Indicate the new IP header version after decapsulating the outer
2653+
* IP header. Used when the inner and outer IP versions are different.
2654+
*
26502655
* A call to this helper is susceptible to change the underlying
26512656
* packet buffer. Therefore, at load time, all checks on pointers
26522657
* previously done by the verifier are invalidated and must be
@@ -5807,6 +5812,8 @@ enum {
58075812
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
58085813
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
58095814
BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
5815+
BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7),
5816+
BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8),
58105817
};
58115818

58125819
enum {

tools/testing/selftests/bpf/progs/test_tc_tunnel.c

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ static const int cfg_udp_src = 20000;
3838
#define VXLAN_FLAGS 0x8
3939
#define VXLAN_VNI 1
4040

41+
#ifndef NEXTHDR_DEST
42+
#define NEXTHDR_DEST 60
43+
#endif
44+
4145
/* MPLS label 1000 with S bit (last label) set and ttl of 255. */
4246
static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
4347
MPLS_LS_S_MASK | 0xff);
@@ -363,6 +367,61 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
363367
return TC_ACT_OK;
364368
}
365369

370+
static int encap_ipv6_ipip6(struct __sk_buff *skb)
371+
{
372+
struct iphdr iph_inner;
373+
struct v6hdr h_outer;
374+
struct tcphdr tcph;
375+
struct ethhdr eth;
376+
__u64 flags;
377+
int olen;
378+
379+
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
380+
sizeof(iph_inner)) < 0)
381+
return TC_ACT_OK;
382+
383+
/* filter only packets we want */
384+
if (bpf_skb_load_bytes(skb, ETH_HLEN + (iph_inner.ihl << 2),
385+
&tcph, sizeof(tcph)) < 0)
386+
return TC_ACT_OK;
387+
388+
if (tcph.dest != __bpf_constant_htons(cfg_port))
389+
return TC_ACT_OK;
390+
391+
olen = sizeof(h_outer.ip);
392+
393+
flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6;
394+
395+
/* add room between mac and network header */
396+
if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
397+
return TC_ACT_SHOT;
398+
399+
/* prepare new outer network header */
400+
memset(&h_outer.ip, 0, sizeof(h_outer.ip));
401+
h_outer.ip.version = 6;
402+
h_outer.ip.hop_limit = iph_inner.ttl;
403+
h_outer.ip.saddr.s6_addr[1] = 0xfd;
404+
h_outer.ip.saddr.s6_addr[15] = 1;
405+
h_outer.ip.daddr.s6_addr[1] = 0xfd;
406+
h_outer.ip.daddr.s6_addr[15] = 2;
407+
h_outer.ip.payload_len = iph_inner.tot_len;
408+
h_outer.ip.nexthdr = IPPROTO_IPIP;
409+
410+
/* store new outer network header */
411+
if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
412+
BPF_F_INVALIDATE_HASH) < 0)
413+
return TC_ACT_SHOT;
414+
415+
/* update eth->h_proto */
416+
if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth)) < 0)
417+
return TC_ACT_SHOT;
418+
eth.h_proto = bpf_htons(ETH_P_IPV6);
419+
if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0) < 0)
420+
return TC_ACT_SHOT;
421+
422+
return TC_ACT_OK;
423+
}
424+
366425
static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
367426
__u16 l2_proto)
368427
{
@@ -461,6 +520,15 @@ int __encap_ip6tnl_none(struct __sk_buff *skb)
461520
return TC_ACT_OK;
462521
}
463522

523+
SEC("encap_ipip6_none")
524+
int __encap_ipip6_none(struct __sk_buff *skb)
525+
{
526+
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
527+
return encap_ipv6_ipip6(skb);
528+
else
529+
return TC_ACT_OK;
530+
}
531+
464532
SEC("encap_ip6gre_none")
465533
int __encap_ip6gre_none(struct __sk_buff *skb)
466534
{
@@ -528,13 +596,33 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb)
528596

529597
static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
530598
{
599+
__u64 flags = BPF_F_ADJ_ROOM_FIXED_GSO;
600+
struct ipv6_opt_hdr ip6_opt_hdr;
531601
struct gre_hdr greh;
532602
struct udphdr udph;
533603
int olen = len;
534604

535605
switch (proto) {
536606
case IPPROTO_IPIP:
607+
flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4;
608+
break;
537609
case IPPROTO_IPV6:
610+
flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6;
611+
break;
612+
case NEXTHDR_DEST:
613+
if (bpf_skb_load_bytes(skb, off + len, &ip6_opt_hdr,
614+
sizeof(ip6_opt_hdr)) < 0)
615+
return TC_ACT_OK;
616+
switch (ip6_opt_hdr.nexthdr) {
617+
case IPPROTO_IPIP:
618+
flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4;
619+
break;
620+
case IPPROTO_IPV6:
621+
flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6;
622+
break;
623+
default:
624+
return TC_ACT_OK;
625+
}
538626
break;
539627
case IPPROTO_GRE:
540628
olen += sizeof(struct gre_hdr);
@@ -569,8 +657,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
569657
return TC_ACT_OK;
570658
}
571659

572-
if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC,
573-
BPF_F_ADJ_ROOM_FIXED_GSO))
660+
if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, flags))
574661
return TC_ACT_SHOT;
575662

576663
return TC_ACT_OK;

tools/testing/selftests/bpf/test_tc_tunnel.sh

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ if [[ "$#" -eq "0" ]]; then
100100
echo "ipip"
101101
$0 ipv4 ipip none 100
102102

103+
echo "ipip6"
104+
$0 ipv4 ipip6 none 100
105+
103106
echo "ip6ip6"
104107
$0 ipv6 ip6tnl none 100
105108

@@ -224,6 +227,9 @@ elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
224227
elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then
225228
ttype="vxlan"
226229
targs="id 1 dstport 8472 udp6zerocsumrx"
230+
elif [[ "$tuntype" == "ipip6" ]]; then
231+
ttype="ip6tnl"
232+
targs=""
227233
else
228234
ttype=$tuntype
229235
targs=""
@@ -233,6 +239,9 @@ fi
233239
if [[ "${tuntype}" == "sit" ]]; then
234240
link_addr1="${ns1_v4}"
235241
link_addr2="${ns2_v4}"
242+
elif [[ "${tuntype}" == "ipip6" ]]; then
243+
link_addr1="${ns1_v6}"
244+
link_addr2="${ns2_v6}"
236245
else
237246
link_addr1="${addr1}"
238247
link_addr2="${addr2}"
@@ -287,12 +296,6 @@ else
287296
server_listen
288297
fi
289298

290-
# bpf_skb_net_shrink does not take tunnel flags yet, cannot update L3.
291-
if [[ "${tuntype}" == "sit" ]]; then
292-
echo OK
293-
exit 0
294-
fi
295-
296299
# serverside, use BPF for decap
297300
ip netns exec "${ns2}" ip link del dev testtun0
298301
ip netns exec "${ns2}" tc qdisc add dev veth2 clsact

0 commit comments

Comments
 (0)