Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Send ICMP too bigs directly #2223

Merged
merged 10 commits into from
Mar 2, 2020
2 changes: 2 additions & 0 deletions bpf-gpl/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ struct bpf_map_def_extended {
#define CALI_F_CGROUP (((CALI_COMPILE_FLAGS) & CALI_CGROUP) != 0)
#define CALI_F_DSR (CALI_COMPILE_FLAGS & CALI_TC_DSR)

#define CALI_RES_REDIR_IFINDEX (TC_ACT_VALUE_MAX + 100) /* packet should be sent back the same iface */

#define COMPILE_TIME_ASSERT(expr) {typedef char array[(expr) ? 1 : -1];}
static CALI_BPF_INLINE void __compile_asserts(void) {
#pragma clang diagnostic push
Expand Down
6 changes: 3 additions & 3 deletions bpf-gpl/icmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ static CALI_BPF_INLINE int icmp_v4_reply(struct __sk_buff *skb,
#ifdef CALI_PARANOID
/* XXX verify that ip_orig.daddr is always the node's IP
*
* we only call this function because of NodePOrt encap
* we only call this function because of NodePort encap
*/
if (ip_orig.daddr != cali_host_ip()) {
CALI_DEBUG("ICMP v4 reply: ip_orig.daddr != cali_host_ip() 0x%x\n", ip_orig.daddr);
Expand Down Expand Up @@ -180,8 +180,8 @@ static CALI_BPF_INLINE int icmp_v4_too_big(struct __sk_buff *skb)
__be16 unused;
__be16 mtu;
} frag = {
// ICMP MTU ignores the ethernet header.
.mtu = host_to_be16(TUNNEL_MTU - sizeof(struct ethhdr)),
// ICMP MTU refers to the IP packet size.
.mtu = host_to_be16(TNNL_INNER_IP_MTU),
};

CALI_DEBUG("Sending ICMP too big mtu=%d\n", be16_to_host(frag.mtu));
Expand Down
17 changes: 11 additions & 6 deletions bpf-gpl/nat.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,15 @@
#define dnat_return_should_encap() (CALI_F_FROM_WEP && !CALI_F_TUNNEL)
#define dnat_should_decap() (CALI_F_FROM_HEP && !CALI_F_TUNNEL)

#define CALI_ENCAP_EXTRA_SIZE 50

// Base MTU of the host's network.
#ifndef CALI_MTU
#define CALI_MTU 1460
#endif

// Number of bytes we add to a packet when we do encap.
#define CALI_ENCAP_EXTRA_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(struct vxlanhdr))

// Inner MTU for packets that we encap.
#ifndef CALI_NAT_TUNNEL_MTU
#define CALI_NAT_TUNNEL_MTU (CALI_MTU - CALI_ENCAP_EXTRA_SIZE) /* defaults to 1410 */
#endif
Expand All @@ -56,11 +59,13 @@
#endif

#if CALI_F_HEP
#define TUNNEL_MTU CALI_NAT_TUNNEL_HEP_MTU
#define TNNL_INNER_ETH_MTU CALI_NAT_TUNNEL_HEP_MTU
#elif CALI_F_WEP
#define TUNNEL_MTU CALI_NAT_TUNNEL_WEP_MTU
#define TNNL_INNER_ETH_MTU CALI_NAT_TUNNEL_WEP_MTU
#endif

#define TNNL_INNER_IP_MTU (TNNL_INNER_ETH_MTU - sizeof(struct ethhdr))

static CALI_BPF_INLINE __be32 cali_host_ip()
{
#ifdef CALI_HOST_IP
Expand Down Expand Up @@ -356,7 +361,7 @@ static CALI_BPF_INLINE int vxlan_v4_encap(struct __sk_buff *skb, __be32 ip_src,
eth_inner = (void *)(vxlan+1);
ip_inner = (void*)(eth_inner+1);

/* Copy the original IP header. Since it is aready DNATed, the dest IP is
/* Copy the original IP header. Since it is already DNATed, the dest IP is
* already set. All we need to do is to change the source IP
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,2,0)
Expand Down Expand Up @@ -439,7 +444,7 @@ static CALI_BPF_INLINE int is_vxlan_tunnel(struct iphdr *ip)

static CALI_BPF_INLINE bool vxlan_v4_encap_too_big(struct __sk_buff *skb)
{
__u32 mtu = TUNNEL_MTU;
__u32 mtu = TNNL_INNER_ETH_MTU;

if (skb->len > mtu) {
CALI_DEBUG("SKB too long (len=%d) vs limit=%d\n", skb->len, mtu);
Expand Down
7 changes: 7 additions & 0 deletions bpf-gpl/skb.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,11 @@ static CALI_BPF_INLINE struct iphdr *skb_iphdr(struct __sk_buff *skb)
return ip;
}

/* skb_l4hdr_offset returns the offset of the L4 header: the IP header offset
 * reported by skb_iphdr_offset() plus the IP header length.
 *
 * @skb: packet whose IP header offset is looked up.
 * @ihl: IP header length to add; callers are expected to pass it in bytes
 *       (i.e. already multiplied by 4, not the raw iphdr.ihl field).
 */
static CALI_BPF_INLINE long skb_l4hdr_offset(struct __sk_buff *skb, __u8 ihl)
{
long l3_off = skb_iphdr_offset(skb);

return l3_off + ihl;
}

#define skb_is_gso(skb) ((skb)->gso_segs > 1)

#endif /* __SKB_H__ */
76 changes: 54 additions & 22 deletions bpf-gpl/tc.c
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,21 @@ static CALI_BPF_INLINE int forward_or_drop(struct __sk_buff *skb,
goto deny;
}

if (rc == CALI_RES_REDIR_IFINDEX) {
int redir_flags = 0;
if (CALI_F_FROM_HOST) {
redir_flags = BPF_F_INGRESS;
}
rc = bpf_redirect(skb->ifindex, redir_flags);
if (rc == TC_ACT_REDIRECT) {
CALI_DEBUG("Redirect to the same interface (%d) succeeded\n", skb->ifindex);
goto skip_fib;
}

CALI_DEBUG("Redirect to the same interface (%d) failed\n", skb->ifindex);
goto deny;
}

#if FIB_ENABLED
// Try a short-circuit FIB lookup.
if (fwd_fib(fwd)) {
Expand Down Expand Up @@ -261,6 +276,8 @@ static CALI_BPF_INLINE int forward_or_drop(struct __sk_buff *skb,
cancel_fib:
#endif /* FIB_ENABLED */

skip_fib:

if (CALI_F_TO_HOST) {
/* Packet is towards host namespace, mark it so that downstream
* programs know that they're not the first to see the packet.
Expand Down Expand Up @@ -349,9 +366,9 @@ static CALI_BPF_INLINE int calico_tc(struct __sk_buff *skb)

/* XXX do a proper CT lookup to find this */
ip_header->saddr = cali_host_ip();
int ip_csum_offset = skb_iphdr_offset(skb) + offsetof(struct iphdr, check);
int l3_csum_off = skb_iphdr_offset(skb) + offsetof(struct iphdr, check);

int res = bpf_l3_csum_replace(skb, ip_csum_offset, ip_src, cali_host_ip(), 4);
int res = bpf_l3_csum_replace(skb, l3_csum_off, ip_src, cali_host_ip(), 4);
if (res) {
fwd.reason = CALI_REASON_CSUM_FAIL;
goto deny;
Expand Down Expand Up @@ -675,7 +692,9 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
struct tcphdr *tcp_header = (void*)(ip_header+1);
struct udphdr *udp_header = (void*)(ip_header+1);

size_t csum_offset = 0, ip_csum_offset;
__u8 ihl = ip_header->ihl * 4;

size_t l4_csum_off = 0, l3_csum_off;
int res = 0;
bool encap_needed = false;
uint32_t fib_flags = 0;
Expand All @@ -697,6 +716,16 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
}
}

l3_csum_off = skb_iphdr_offset(skb) + offsetof(struct iphdr, check);
switch (state->ip_proto) {
case IPPROTO_TCP:
l4_csum_off = skb_l4hdr_offset(skb, ihl) + offsetof(struct tcphdr, check);
break;
case IPPROTO_UDP:
l4_csum_off = skb_l4hdr_offset(skb, ihl) + offsetof(struct udphdr, check);
break;
}

switch (state->ct_result.rc){
case CALI_CT_NEW:
switch (state->pol_rc) {
Expand Down Expand Up @@ -806,27 +835,26 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
}

ip_header->daddr = state->post_nat_ip_dst;
ip_csum_offset = skb_offset(skb, ip_header) + offsetof(struct iphdr, check);

switch (state->ip_proto) {
case IPPROTO_TCP:
tcp_header->dest = host_to_be16(state->post_nat_dport);
csum_offset = skb_offset(skb, tcp_header) + offsetof(struct tcphdr, check);
break;
case IPPROTO_UDP:
udp_header->dest = host_to_be16(state->post_nat_dport);
csum_offset = skb_offset(skb, udp_header) + offsetof(struct udphdr, check);
break;
}

if (csum_offset) {
res = skb_nat_l4_csum_ipv4(skb, csum_offset, state->ip_dst,
CALI_VERB("L3 csum at %d L4 csum at %d\n", l3_csum_off, l4_csum_off);

if (l4_csum_off) {
res = skb_nat_l4_csum_ipv4(skb, l4_csum_off, state->ip_dst,
state->post_nat_ip_dst, host_to_be16(state->dport),
host_to_be16(state->post_nat_dport),
state->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0);
}

res |= bpf_l3_csum_replace(skb, ip_csum_offset, state->ip_dst, state->post_nat_ip_dst, 4);
res |= bpf_l3_csum_replace(skb, l3_csum_off, state->ip_dst, state->post_nat_ip_dst, 4);

if (res) {
reason = CALI_REASON_CSUM_FAIL;
Expand All @@ -839,48 +867,48 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
goto allow;

case CALI_CT_ESTABLISHED_SNAT:
CALI_DEBUG("CT: SNAT to %x:%d\n",
CALI_DEBUG("CT: SNAT from %x:%d\n",
be32_to_host(state->ct_result.nat_ip), state->ct_result.nat_port);

if (dnat_return_should_encap() && state->ct_result.tun_ret_ip) {
/* XXX do this before NAT until we can track the icmp back */
if (ip_is_dnf(ip_header) && vxlan_v4_encap_too_big(skb)) {
CALI_DEBUG("Return ICMP mtu is too big\n");
goto icmp_too_big;
}
if (CALI_F_DSR) {
/* SNAT will be done after routing, when leaving HEP */
CALI_DEBUG("DSR enabled, skipping SNAT + encap\n");
goto allow;
}
/* XXX do this before NAT until we can track the icmp back */
if (!(state->ip_proto == IPPROTO_TCP && skb_is_gso(skb)) &&
ip_is_dnf(ip_header) && vxlan_v4_encap_too_big(skb)) {
CALI_DEBUG("Return ICMP mtu is too big\n");
goto icmp_too_big;
}
}

// Actually do the NAT.
ip_header->saddr = state->ct_result.nat_ip;
ip_csum_offset = skb_offset(skb, ip_header) + offsetof(struct iphdr, check);

switch (state->ip_proto) {
case IPPROTO_TCP:
tcp_header->source = host_to_be16(state->ct_result.nat_port);
csum_offset = skb_offset(skb, tcp_header) + offsetof(struct tcphdr, check);
break;
case IPPROTO_UDP:
udp_header->source = host_to_be16(state->ct_result.nat_port);
csum_offset = skb_offset(skb, udp_header) + offsetof(struct udphdr, check);
break;
}

if (csum_offset) {
res = skb_nat_l4_csum_ipv4(skb, csum_offset, state->ip_src,
CALI_VERB("L3 csum at %d L4 csum at %d\n", l3_csum_off, l4_csum_off);

if (l4_csum_off) {
res = skb_nat_l4_csum_ipv4(skb, l4_csum_off, state->ip_src,
state->ct_result.nat_ip, host_to_be16(state->sport),
host_to_be16(state->ct_result.nat_port),
state->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0);
}

CALI_VERB("L3 checksum update (csum is at %d) port from %x to %x\n",
ip_csum_offset, state->ip_src, state->ct_result.nat_ip);
l3_csum_off, state->ip_src, state->ct_result.nat_ip);

int csum_rc = bpf_l3_csum_replace(skb, ip_csum_offset,
int csum_rc = bpf_l3_csum_replace(skb, l3_csum_off,
state->ip_src, state->ct_result.nat_ip, 4);
CALI_VERB("bpf_l3_csum_replace(IP): %d\n", csum_rc);
res |= csum_rc;
Expand Down Expand Up @@ -971,6 +999,10 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
state->ip_proto = IPPROTO_ICMP;

fib_flags |= BPF_FIB_LOOKUP_OUTPUT;
if (CALI_F_FROM_WEP) {
/* we know it came from workload, just send it back the same way */
rc = CALI_RES_REDIR_IFINDEX;
}

goto allow;

Expand Down