Skip to content

Commit

Permalink
Merge pull request #2230 from tomastigera/tomas-mtu-fix-3.13-cherry-pick
Browse files Browse the repository at this point in the history
backport of to 3.13 PR 2229 BPF: MTU fix for 3.13
  • Loading branch information
caseydavenport authored Mar 11, 2020
2 parents 30c47af + 7a07922 commit aa0a22d
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 19 deletions.
3 changes: 1 addition & 2 deletions bpf-gpl/icmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,7 @@ static CALI_BPF_INLINE int icmp_v4_too_big(struct __sk_buff *skb)
__be16 unused;
__be16 mtu;
} frag = {
// ICMP MTU refers to the IP packet size.
.mtu = host_to_be16(TNNL_INNER_IP_MTU),
.mtu = host_to_be16(TUNNEL_MTU),
};

CALI_DEBUG("Sending ICMP too big mtu=%d\n", be16_to_host(frag.mtu));
Expand Down
37 changes: 23 additions & 14 deletions bpf-gpl/nat.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,35 +37,39 @@
#define dnat_return_should_encap() (CALI_F_FROM_WEP && !CALI_F_TUNNEL)
#define dnat_should_decap() (CALI_F_FROM_HEP && !CALI_F_TUNNEL)

// Base MTU of the host's network.
/* Number of bytes we add to a packet when we do encap. */
#define VXLAN_ENCAP_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
sizeof(struct udphdr) + sizeof(struct vxlanhdr))

#define CALI_ENCAP_SIZE VXLAN_ENCAP_SIZE


#ifndef CALI_MTU
#define CALI_MTU 1460
#endif

// Number of bytes we add to a packet when we do encap.
#define CALI_ENCAP_EXTRA_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(struct vxlanhdr))

// Inner MTU for packets that we encap.
#ifndef CALI_NAT_TUNNEL_MTU
#define CALI_NAT_TUNNEL_MTU (CALI_MTU - CALI_ENCAP_EXTRA_SIZE) /* defaults to 1410 */
#define CALI_NAT_TUNNEL_MTU (CALI_MTU - CALI_ENCAP_SIZE) /* defaults to 1410 */
#endif

#ifndef CALI_NAT_TUNNEL_HEP_MTU
#define CALI_NAT_TUNNEL_HEP_MTU CALI_NAT_TUNNEL_MTU
#endif

#ifndef CALI_NAT_TUNNEL_WEP_MTU
#define CALI_NAT_TUNNEL_WEP_MTU (CALI_NAT_TUNNEL_MTU - 20) /* cali ifaces reserve 20 for ipip */
#define CALI_NAT_TUNNEL_WEP_MTU (CALI_NAT_TUNNEL_MTU - 50) /* defaults to 1360 as cali ifaces' mtu
* is 50 bytes smaller than the host
* ifaces mtu in the anticipation of ipip
* or vxlan overlay
*/
#endif

#if CALI_F_HEP
#define TNNL_INNER_ETH_MTU CALI_NAT_TUNNEL_HEP_MTU
#define TUNNEL_MTU CALI_NAT_TUNNEL_HEP_MTU
#elif CALI_F_WEP
#define TNNL_INNER_ETH_MTU CALI_NAT_TUNNEL_WEP_MTU
#define TUNNEL_MTU CALI_NAT_TUNNEL_WEP_MTU
#endif

#define TNNL_INNER_IP_MTU (TNNL_INNER_ETH_MTU - sizeof(struct ethhdr))

static CALI_BPF_INLINE __be32 cali_host_ip()
{
#ifdef CALI_HOST_IP
Expand Down Expand Up @@ -362,7 +366,7 @@ static CALI_BPF_INLINE int vxlan_v4_encap(struct __sk_buff *skb, __be32 ip_src,
ip_inner = (void*)(eth_inner+1);

/* Copy the original IP header. Since it is already DNATed, the dest IP is
* already set. All we need to do it to change the source IP
* already set. All we need to do is to change the source IP
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,2,0)
*ip = *ip_inner;
Expand Down Expand Up @@ -444,9 +448,14 @@ static CALI_BPF_INLINE int is_vxlan_tunnel(struct iphdr *ip)

static CALI_BPF_INLINE bool vxlan_v4_encap_too_big(struct __sk_buff *skb)
{
__u32 mtu = TNNL_INNER_ETH_MTU;
__u32 mtu = TUNNEL_MTU;

if (skb->len > mtu) {
/* RFC-1191: MTU is the size in octets of the largest datagram that
* could be forwarded, along the path of the original datagram, without
* being fragmented at this router. The size includes the IP header and
* IP data, and does not include any lower-level headers.
*/
if (skb->len > sizeof(struct ethhdr) + mtu) {
CALI_DEBUG("SKB too long (len=%d) vs limit=%d\n", skb->len, mtu);
return true;
}
Expand Down
16 changes: 15 additions & 1 deletion bpf-gpl/tc.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,20 @@ static CALI_BPF_INLINE int forward_or_drop(struct __sk_buff *skb,
if (CALI_F_FROM_HOST) {
redir_flags = BPF_F_INGRESS;
}

/* Revalidate the access to the packet */
if ((void *)(long)skb->data + sizeof(struct ethhdr) > (void *)(long)skb->data_end) {
reason = CALI_REASON_SHORT;
goto deny;
}

/* Swap the MACs as we are turning it back */
struct ethhdr *eth_hdr = (void *)(long)skb->data;
unsigned char mac[ETH_ALEN];
__builtin_memcpy(mac, &eth_hdr->h_dest, ETH_ALEN);
__builtin_memcpy(&eth_hdr->h_dest, &eth_hdr->h_source, ETH_ALEN);
__builtin_memcpy(&eth_hdr->h_source, mac, ETH_ALEN);

rc = bpf_redirect(skb->ifindex, redir_flags);
if (rc == TC_ACT_REDIRECT) {
CALI_DEBUG("Redirect to the same interface (%d) succeeded\n", skb->ifindex);
Expand Down Expand Up @@ -876,7 +890,7 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
CALI_DEBUG("DSR enabled, skipping SNAT + encap\n");
goto allow;
}
/* XXX do this before NAT until we can track the icmp back */

if (!(state->ip_proto == IPPROTO_TCP && skb_is_gso(skb)) &&
ip_is_dnf(ip_header) && vxlan_v4_encap_too_big(skb)) {
CALI_DEBUG("Return ICMP mtu is too big\n");
Expand Down
5 changes: 4 additions & 1 deletion bpf/ut/icmp_too_big_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ func TestICMPTooBig(t *testing.T) {
pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default)
fmt.Printf("pktR = %+v\n", pktR)

checkICMPTooBig(pktR, ipv4, udp, natTunnelMTU-ethernetHeaderSize-20 /* compiled as WEP */)
// The program is compiled and run as WEP and thus the expected MTU is
// 50 less due to the 50 byte difference between HEP and WEP mtu. See
// definition of TUNNEL_MTU in bpf-gpl/nat.h
checkICMPTooBig(pktR, ipv4, udp, natTunnelMTU-50)
})
}

Expand Down
2 changes: 1 addition & 1 deletion bpf/ut/nat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ func TestNATNodePortICMPTooBig(t *testing.T) {
pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default)
fmt.Printf("pktR = %+v\n", pktR)

checkICMPTooBig(pktR, ipv4, udp, natTunnelMTU-ethernetHeaderSize)
checkICMPTooBig(pktR, ipv4, udp, natTunnelMTU)
})

// clean up
Expand Down

0 comments on commit aa0a22d

Please sign in to comment.