Skip to content

Commit

Permalink
Merge pull request #2223 from fasaxc/wep-mtu-fix-is-gso
Browse files Browse the repository at this point in the history
Send ICMP too bigs directly
  • Loading branch information
fasaxc authored Mar 2, 2020
2 parents 3d751b0 + 7f86380 commit ffd3291
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 31 deletions.
2 changes: 2 additions & 0 deletions bpf-gpl/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ struct bpf_map_def_extended {
#define CALI_F_CGROUP (((CALI_COMPILE_FLAGS) & CALI_CGROUP) != 0)
#define CALI_F_DSR (CALI_COMPILE_FLAGS & CALI_TC_DSR)

/* Internal result code asking forward_or_drop() to bpf_redirect() the packet
 * to skb->ifindex, i.e. the interface the packet was seen on.
 * NOTE(review): presumably offset past TC_ACT_VALUE_MAX so it cannot collide
 * with a real TC_ACT_* verdict -- confirm.
 */
#define CALI_RES_REDIR_IFINDEX (TC_ACT_VALUE_MAX + 100) /* packet should be sent back the same iface */

/* Compile-time assertion: when expr is false the typedef'd array gets a
 * size of -1, which the compiler rejects. Expands to a braced block, so it
 * has to appear inside a function body.
 */
#define COMPILE_TIME_ASSERT(expr) {typedef char array[(expr) ? 1 : -1];}
static CALI_BPF_INLINE void __compile_asserts(void) {
#pragma clang diagnostic push
Expand Down
6 changes: 3 additions & 3 deletions bpf-gpl/icmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ static CALI_BPF_INLINE int icmp_v4_reply(struct __sk_buff *skb,
#ifdef CALI_PARANOID
/* XXX verify that ip_orig.daddr is always the node's IP
*
* we only call this function because of NodePOrt encap
* we only call this function because of NodePort encap
*/
if (ip_orig.daddr != cali_host_ip()) {
CALI_DEBUG("ICMP v4 reply: ip_orig.daddr != cali_host_ip() 0x%x\n", ip_orig.daddr);
Expand Down Expand Up @@ -180,8 +180,8 @@ static CALI_BPF_INLINE int icmp_v4_too_big(struct __sk_buff *skb)
__be16 unused;
__be16 mtu;
} frag = {
// ICMP MTU ignores the ethernet header.
.mtu = host_to_be16(TUNNEL_MTU - sizeof(struct ethhdr)),
// ICMP MTU refers to the IP packet size.
.mtu = host_to_be16(TNNL_INNER_IP_MTU),
};

CALI_DEBUG("Sending ICMP too big mtu=%d\n", be16_to_host(frag.mtu));
Expand Down
17 changes: 11 additions & 6 deletions bpf-gpl/nat.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,15 @@
#define dnat_return_should_encap() (CALI_F_FROM_WEP && !CALI_F_TUNNEL)
#define dnat_should_decap() (CALI_F_FROM_HEP && !CALI_F_TUNNEL)

#define CALI_ENCAP_EXTRA_SIZE 50

// Base MTU of the host's network.
#ifndef CALI_MTU
#define CALI_MTU 1460
#endif

// Number of bytes we add to a packet when we do encap.
#define CALI_ENCAP_EXTRA_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(struct vxlanhdr))

// Inner MTU for packets that we encap.
#ifndef CALI_NAT_TUNNEL_MTU
#define CALI_NAT_TUNNEL_MTU (CALI_MTU - CALI_ENCAP_EXTRA_SIZE) /* defaults to 1410 */
#endif
Expand All @@ -56,11 +59,13 @@
#endif

#if CALI_F_HEP
#define TUNNEL_MTU CALI_NAT_TUNNEL_HEP_MTU
#define TNNL_INNER_ETH_MTU CALI_NAT_TUNNEL_HEP_MTU
#elif CALI_F_WEP
#define TUNNEL_MTU CALI_NAT_TUNNEL_WEP_MTU
#define TNNL_INNER_ETH_MTU CALI_NAT_TUNNEL_WEP_MTU
#endif

/* Inner MTU expressed as an IP packet size: the inner Ethernet-frame limit
 * (TNNL_INNER_ETH_MTU) minus the Ethernet header.
 */
#define TNNL_INNER_IP_MTU (TNNL_INNER_ETH_MTU - sizeof(struct ethhdr))

static CALI_BPF_INLINE __be32 cali_host_ip()
{
#ifdef CALI_HOST_IP
Expand Down Expand Up @@ -356,7 +361,7 @@ static CALI_BPF_INLINE int vxlan_v4_encap(struct __sk_buff *skb, __be32 ip_src,
eth_inner = (void *)(vxlan+1);
ip_inner = (void*)(eth_inner+1);

/* Copy the original IP header. Since it is aready DNATed, the dest IP is
/* Copy the original IP header. Since it is already DNATed, the dest IP is
* already set. All we need to do is to change the source IP
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,2,0)
Expand Down Expand Up @@ -439,7 +444,7 @@ static CALI_BPF_INLINE int is_vxlan_tunnel(struct iphdr *ip)

static CALI_BPF_INLINE bool vxlan_v4_encap_too_big(struct __sk_buff *skb)
{
__u32 mtu = TUNNEL_MTU;
__u32 mtu = TNNL_INNER_ETH_MTU;

if (skb->len > mtu) {
CALI_DEBUG("SKB too long (len=%d) vs limit=%d\n", skb->len, mtu);
Expand Down
7 changes: 7 additions & 0 deletions bpf-gpl/skb.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,11 @@ static CALI_BPF_INLINE struct iphdr *skb_iphdr(struct __sk_buff *skb)
return ip;
}

/* Offset of the L4 header: the value of skb_iphdr_offset() advanced by ihl.
 *
 * ihl is the IP header length in bytes (callers pass ip_header->ihl * 4),
 * not the raw IHL field counted in 32-bit words.
 */
static CALI_BPF_INLINE long skb_l4hdr_offset(struct __sk_buff *skb, __u8 ihl)
{
	return ihl + skb_iphdr_offset(skb);
}

/* Evaluates to true when the skb's gso_segs count is greater than one. */
#define skb_is_gso(skb) ((skb)->gso_segs > 1)

#endif /* __SKB_H__ */
76 changes: 54 additions & 22 deletions bpf-gpl/tc.c
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,21 @@ static CALI_BPF_INLINE int forward_or_drop(struct __sk_buff *skb,
goto deny;
}

if (rc == CALI_RES_REDIR_IFINDEX) {
int redir_flags = 0;
if (CALI_F_FROM_HOST) {
redir_flags = BPF_F_INGRESS;
}
rc = bpf_redirect(skb->ifindex, redir_flags);
if (rc == TC_ACT_REDIRECT) {
CALI_DEBUG("Redirect to the same interface (%d) succeeded\n", skb->ifindex);
goto skip_fib;
}

CALI_DEBUG("Redirect to the same interface (%d) failed\n", skb->ifindex);
goto deny;
}

#if FIB_ENABLED
// Try a short-circuit FIB lookup.
if (fwd_fib(fwd)) {
Expand Down Expand Up @@ -261,6 +276,8 @@ static CALI_BPF_INLINE int forward_or_drop(struct __sk_buff *skb,
cancel_fib:
#endif /* FIB_ENABLED */

skip_fib:

if (CALI_F_TO_HOST) {
/* Packet is towards host namespace, mark it so that downstream
* programs know that they're not the first to see the packet.
Expand Down Expand Up @@ -349,9 +366,9 @@ static CALI_BPF_INLINE int calico_tc(struct __sk_buff *skb)

/* XXX do a proper CT lookup to find this */
ip_header->saddr = cali_host_ip();
int ip_csum_offset = skb_iphdr_offset(skb) + offsetof(struct iphdr, check);
int l3_csum_off = skb_iphdr_offset(skb) + offsetof(struct iphdr, check);

int res = bpf_l3_csum_replace(skb, ip_csum_offset, ip_src, cali_host_ip(), 4);
int res = bpf_l3_csum_replace(skb, l3_csum_off, ip_src, cali_host_ip(), 4);
if (res) {
fwd.reason = CALI_REASON_CSUM_FAIL;
goto deny;
Expand Down Expand Up @@ -675,7 +692,9 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
struct tcphdr *tcp_header = (void*)(ip_header+1);
struct udphdr *udp_header = (void*)(ip_header+1);

size_t csum_offset = 0, ip_csum_offset;
__u8 ihl = ip_header->ihl * 4;

size_t l4_csum_off = 0, l3_csum_off;
int res = 0;
bool encap_needed = false;
uint32_t fib_flags = 0;
Expand All @@ -697,6 +716,16 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
}
}

l3_csum_off = skb_iphdr_offset(skb) + offsetof(struct iphdr, check);
switch (state->ip_proto) {
case IPPROTO_TCP:
l4_csum_off = skb_l4hdr_offset(skb, ihl) + offsetof(struct tcphdr, check);
break;
case IPPROTO_UDP:
l4_csum_off = skb_l4hdr_offset(skb, ihl) + offsetof(struct udphdr, check);
break;
}

switch (state->ct_result.rc){
case CALI_CT_NEW:
switch (state->pol_rc) {
Expand Down Expand Up @@ -806,27 +835,26 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
}

ip_header->daddr = state->post_nat_ip_dst;
ip_csum_offset = skb_offset(skb, ip_header) + offsetof(struct iphdr, check);

switch (state->ip_proto) {
case IPPROTO_TCP:
tcp_header->dest = host_to_be16(state->post_nat_dport);
csum_offset = skb_offset(skb, tcp_header) + offsetof(struct tcphdr, check);
break;
case IPPROTO_UDP:
udp_header->dest = host_to_be16(state->post_nat_dport);
csum_offset = skb_offset(skb, udp_header) + offsetof(struct udphdr, check);
break;
}

if (csum_offset) {
res = skb_nat_l4_csum_ipv4(skb, csum_offset, state->ip_dst,
CALI_VERB("L3 csum at %d L4 csum at %d\n", l3_csum_off, l4_csum_off);

if (l4_csum_off) {
res = skb_nat_l4_csum_ipv4(skb, l4_csum_off, state->ip_dst,
state->post_nat_ip_dst, host_to_be16(state->dport),
host_to_be16(state->post_nat_dport),
state->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0);
}

res |= bpf_l3_csum_replace(skb, ip_csum_offset, state->ip_dst, state->post_nat_ip_dst, 4);
res |= bpf_l3_csum_replace(skb, l3_csum_off, state->ip_dst, state->post_nat_ip_dst, 4);

if (res) {
reason = CALI_REASON_CSUM_FAIL;
Expand All @@ -839,48 +867,48 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
goto allow;

case CALI_CT_ESTABLISHED_SNAT:
CALI_DEBUG("CT: SNAT to %x:%d\n",
CALI_DEBUG("CT: SNAT from %x:%d\n",
be32_to_host(state->ct_result.nat_ip), state->ct_result.nat_port);

if (dnat_return_should_encap() && state->ct_result.tun_ret_ip) {
/* XXX do this before NAT until we can track the icmp back */
if (ip_is_dnf(ip_header) && vxlan_v4_encap_too_big(skb)) {
CALI_DEBUG("Return ICMP mtu is too big\n");
goto icmp_too_big;
}
if (CALI_F_DSR) {
/* SNAT will be done after routing, when leaving HEP */
CALI_DEBUG("DSR enabled, skipping SNAT + encap\n");
goto allow;
}
/* XXX do this before NAT until we can track the icmp back */
if (!(state->ip_proto == IPPROTO_TCP && skb_is_gso(skb)) &&
ip_is_dnf(ip_header) && vxlan_v4_encap_too_big(skb)) {
CALI_DEBUG("Return ICMP mtu is too big\n");
goto icmp_too_big;
}
}

// Actually do the NAT.
ip_header->saddr = state->ct_result.nat_ip;
ip_csum_offset = skb_offset(skb, ip_header) + offsetof(struct iphdr, check);

switch (state->ip_proto) {
case IPPROTO_TCP:
tcp_header->source = host_to_be16(state->ct_result.nat_port);
csum_offset = skb_offset(skb, tcp_header) + offsetof(struct tcphdr, check);
break;
case IPPROTO_UDP:
udp_header->source = host_to_be16(state->ct_result.nat_port);
csum_offset = skb_offset(skb, udp_header) + offsetof(struct udphdr, check);
break;
}

if (csum_offset) {
res = skb_nat_l4_csum_ipv4(skb, csum_offset, state->ip_src,
CALI_VERB("L3 csum at %d L4 csum at %d\n", l3_csum_off, l4_csum_off);

if (l4_csum_off) {
res = skb_nat_l4_csum_ipv4(skb, l4_csum_off, state->ip_src,
state->ct_result.nat_ip, host_to_be16(state->sport),
host_to_be16(state->ct_result.nat_port),
state->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0);
}

CALI_VERB("L3 checksum update (csum is at %d) port from %x to %x\n",
ip_csum_offset, state->ip_src, state->ct_result.nat_ip);
l3_csum_off, state->ip_src, state->ct_result.nat_ip);

int csum_rc = bpf_l3_csum_replace(skb, ip_csum_offset,
int csum_rc = bpf_l3_csum_replace(skb, l3_csum_off,
state->ip_src, state->ct_result.nat_ip, 4);
CALI_VERB("bpf_l3_csum_replace(IP): %d\n", csum_rc);
res |= csum_rc;
Expand Down Expand Up @@ -971,6 +999,10 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
state->ip_proto = IPPROTO_ICMP;

fib_flags |= BPF_FIB_LOOKUP_OUTPUT;
if (CALI_F_FROM_WEP) {
/* we know it came from workload, just send it back the same way */
rc = CALI_RES_REDIR_IFINDEX;
}

goto allow;

Expand Down

0 comments on commit ffd3291

Please sign in to comment.