bpf: New approach for BPF MTU handling #270

Closed
wants to merge 6 commits

33 changes: 33 additions & 0 deletions .travis.yml
@@ -0,0 +1,33 @@
sudo: required
language: bash
dist: bionic
services:
  - docker

env:
  global:
    - PROJECT_NAME='libbpf'
    - AUTHOR_EMAIL="$(git log -1 --pretty=\"%aE\")"
    - REPO_ROOT="$TRAVIS_BUILD_DIR"
    - CI_ROOT="$REPO_ROOT/travis-ci"
    - VMTEST_ROOT="$CI_ROOT/vmtest"

addons:
  apt:
    packages:
      - qemu-kvm
      - zstd
      - binutils-dev
      - elfutils
      - libcap-dev
      - libelf-dev
      - libdw-dev
      - python3-docutils

jobs:
  include:
    - stage: Builds & Tests
      name: Kernel LATEST + selftests
      language: bash
      env: KERNEL=LATEST
      script: $CI_ROOT/vmtest/run_vmtest.sh || travis_terminate 1
31 changes: 29 additions & 2 deletions include/linux/netdevice.h
@@ -3891,11 +3891,38 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
			const struct sk_buff *skb);

static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
						 const struct sk_buff *skb,
						 const bool check_mtu)
{
	const u32 vlan_hdr_len = 4; /* VLAN_HLEN */
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	if (!check_mtu)
		return true;

	len = dev->mtu + dev->hard_header_len + vlan_hdr_len;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}

static __always_inline int ____dev_forward_skb(struct net_device *dev,
-					       struct sk_buff *skb)
+					       struct sk_buff *skb,
+					       const bool check_mtu)
{
	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
-	    unlikely(!is_skb_forwardable(dev, skb))) {
+	    unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
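
The new *check_mtu* parameter lets a caller that does its own MTU handling (for example a BPF redirect path) skip the length check while keeping the IFF_UP and frag-orphaning checks. A minimal sketch of such a caller, hypothetical and not part of this diff:

#include <linux/etherdevice.h>	/* eth_type_trans(), eth_hdr() */
#include <linux/netdevice.h>	/* ____dev_forward_skb() */

/* Hypothetical caller, not part of this diff: forward an skb whose size
 * has already been validated elsewhere (or is deliberately exempt from
 * the MTU check), so only the IFF_UP and frag-orphaning checks remain.
 */
static int example_forward_skb_nomtu(struct net_device *dev,
				     struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb, false);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}
	return ret;
}
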
81 changes: 79 additions & 2 deletions include/uapi/linux/bpf.h
@@ -2219,6 +2219,9 @@ union bpf_attr {
* * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
* packet is not forwarded or needs assist from full stack
*
* If the lookup fails with **BPF_FIB_LKUP_RET_FRAG_NEEDED**, then the
* MTU was exceeded and the output *params*->mtu contains the MTU.
*
* long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
* Description
* Add an entry to, or update a sockhash *map* referencing sockets.
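
To illustrate the *params*->mtu output described above, here is a minimal, hypothetical XDP sketch (not part of this diff): the program name and the omitted header parsing are assumptions, and the *mtu* field follows this patchset's uapi. It falls back to the stack when the FIB lookup reports that the MTU was exceeded.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_fwd_mtu_aware(struct xdp_md *ctx)
{
	struct bpf_fib_lookup params = {};
	long rc;

	/* params.family, addresses, ifindex and tot_len would be filled
	 * in from the parsed packet headers here (omitted for brevity).
	 */
	rc = bpf_fib_lookup(ctx, &params, sizeof(params), 0);

	if (rc == BPF_FIB_LKUP_RET_FRAG_NEEDED) {
		/* MTU exceeded: params.mtu now holds the egress MTU,
		 * e.g. for building an ICMP "frag needed" reply.
		 */
		return XDP_PASS;
	}
	if (rc == BPF_FIB_LKUP_RET_SUCCESS)
		/* MAC rewrite from params.dmac/smac omitted. */
		return bpf_redirect(params.ifindex, 0);

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
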
@@ -3742,6 +3745,63 @@ union bpf_attr {
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
*
* int bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_result, s32 len_diff, u64 flags)
* Description
* Check the packet size in *ctx* against the MTU of the net device
* selected by *ifindex*. This helper will likely be used in
* combination with helpers that adjust/change the packet size.
* The argument *len_diff* can be used to query with a planned
* size change, which allows checking the MTU before the packet
* *ctx* is actually changed.
*
* Specifying *ifindex* zero means the MTU check is performed
* against the current net device. This is practical if the helper
* isn't used prior to a redirect.
*
* The Linux kernel route table can configure MTUs at a more
* specific per-route level, which this helper does not cover.
* For route-level MTU checks use the **bpf_fib_lookup**\ ()
* helper.
*
* *ctx* is either **struct xdp_md** for XDP programs or
* **struct sk_buff** for tc cls_act programs.
*
* The *flags* argument can be a combination of one or more of the
* following values:
*
* **BPF_MTU_CHK_RELAX**
* This flag relaxes (increases) the MTU check to allow room for
* one VLAN header (4 bytes). The same relaxation is used by the
* kernel's own forwarding MTU checks.
*
* **BPF_MTU_CHK_SEGS**
* This flag only works for a *ctx* of type **struct sk_buff**.
* If the packet context contains extra packet segment buffers
* (often known as a GSO skb), the MTU check is partly skipped,
* because on the transmit path the skb may get re-segmented
* (depending on net device features). This could still be an MTU
* violation, so this flag enables checking the MTU against the
* individual segments, with a different violation return code to
* tell the cases apart.
*
* The *mtu_result* pointer is filled with the MTU value of the
* net device including the L2 header size (usually a 14-byte
* Ethernet header). The MTU configured on the net device is the
* L3 size, but as XDP and TX lengths operate at L2, this helper
* includes the L2 header size in the reported MTU.
*
* Return
* * 0 on success, with the MTU value populated in the
*   *mtu_result* pointer.
*
* * < 0 if any input argument is invalid (*mtu_result* not
*   updated).
*
* MTU violations return positive values, but also populate the
* MTU value in the *mtu_result* pointer, as this can be needed
* for implementing PMTU handling:
*
* * **BPF_MTU_CHK_RET_FRAG_NEEDED**
* * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
*
*/
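
As a usage illustration of the helper documented above, here is a minimal XDP sketch (an assumption, not part of this series): it assumes a helper declaration regenerated from this patch's uapi, and the egress ifindex plus the planned 8-byte encapsulation growth are made-up values. The MTU is checked with *len_diff* before the packet is actually grown.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define REDIR_IFINDEX	3	/* hypothetical egress device */
#define ENCAP_GROWTH	8	/* planned header growth in bytes */

SEC("xdp")
int xdp_encap_mtu_check(struct xdp_md *ctx)
{
	__u32 mtu = 0;
	int ret;

	/* Would the packet, grown by ENCAP_GROWTH bytes, still fit the
	 * MTU of REDIR_IFINDEX? Checked before the packet is touched.
	 */
	ret = bpf_check_mtu(ctx, REDIR_IFINDEX, &mtu, ENCAP_GROWTH, 0);
	if (ret < 0)
		return XDP_ABORTED;	/* invalid argument */
	if (ret != BPF_MTU_CHK_RET_SUCCESS)
		return XDP_DROP;	/* too big; mtu holds the L2 MTU */

	/* Safe to grow the headroom and redirect. */
	if (bpf_xdp_adjust_head(ctx, -ENCAP_GROWTH))
		return XDP_ABORTED;

	return bpf_redirect(REDIR_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";
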
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3900,6 +3960,7 @@ union bpf_attr {
FN(bpf_per_cpu_ptr), \
FN(bpf_this_cpu_ptr), \
FN(redirect_peer), \
FN(check_mtu), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -4872,9 +4933,13 @@ struct bpf_fib_lookup {
	__be16	sport;
	__be16	dport;

-	/* total length of packet from network header - used for MTU check */
-	__u16	tot_len;
+	union {	/* used for MTU check */
+		/* input to lookup */
+		__u16	tot_len; /* total length of packet from network hdr */
+
+		/* output: MTU value (if check_mtu was requested) */
+		__u16	mtu;
+	};
	/* input: L3 device index for lookup
	 * output: device index from FIB lookup
	 */
@@ -4920,6 +4985,18 @@ struct bpf_redir_neigh {
};
};

/* bpf_check_mtu flags */
enum bpf_check_mtu_flags {
	BPF_MTU_CHK_RELAX = (1U << 0),
	BPF_MTU_CHK_SEGS = (1U << 1),
};

enum bpf_check_mtu_ret {
	BPF_MTU_CHK_RET_SUCCESS,     /* check and lookup successful */
	BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */
	BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */
};

enum bpf_task_fd_type {
BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
BPF_FD_TYPE_TRACEPOINT, /* tp name */
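
For the tc side, a hypothetical cls_act sketch using the flags and return codes added above (same assumptions as the XDP sketch regarding the helper declaration): *ifindex* zero checks against the current net device, and **BPF_MTU_CHK_SEGS** lets GSO-segment violations be told apart from plain oversized packets. The section name and the drop policy are assumptions, not part of this series.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("classifier")
int tc_mtu_enforce(struct __sk_buff *skb)
{
	__u32 mtu = 0;
	int ret;

	/* ifindex 0: check against the current net device; BPF_MTU_CHK_SEGS
	 * lets GSO skbs be validated per segment instead of by total length.
	 */
	ret = bpf_check_mtu(skb, 0, &mtu, 0, BPF_MTU_CHK_SEGS);

	switch (ret) {
	case BPF_MTU_CHK_RET_SUCCESS:
		return TC_ACT_OK;
	case BPF_MTU_CHK_RET_FRAG_NEEDED:
		/* Non-GSO packet larger than the MTU; mtu is available
		 * for PMTU signalling (e.g. an ICMP reply).
		 */
		return TC_ACT_SHOT;
	case BPF_MTU_CHK_RET_SEGS_TOOBIG:
		/* Individual GSO segments exceed the device MTU. */
		return TC_ACT_SHOT;
	default:
		return TC_ACT_SHOT;	/* ret < 0: invalid argument */
	}
}

char _license[] SEC("license") = "GPL";
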
21 changes: 4 additions & 17 deletions net/core/dev.c
@@ -2188,28 +2188,13 @@ static inline void net_timestamp_set(struct sk_buff *skb)

bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
-	unsigned int len;
-
-	if (!(dev->flags & IFF_UP))
-		return false;
-
-	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
-	if (skb->len <= len)
-		return true;
-
-	/* if TSO is enabled, we don't care about the length as the packet
-	 * could be forwarded without being segmented before
-	 */
-	if (skb_is_gso(skb))
-		return true;
-
-	return false;
+	return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
-	int ret = ____dev_forward_skb(dev, skb);
+	int ret = ____dev_forward_skb(dev, skb, true);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
@@ -3885,6 +3870,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_set_redirected(skb, false);
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
@@ -4974,6 +4960,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_set_redirected(skb, true);
		if (skb_do_redirect(skb) == -EAGAIN) {
			__skb_pull(skb, skb->mac_len);
			*another = true;