Skip to content

Commit

Permalink
Merge branch 'nexthop_exceptions'
Browse files Browse the repository at this point in the history
These patches implement the final mechanism necessary to really allow
us to go without the route cache in ipv4.

We need a place to have long-term storage of PMTU/redirect information
which is independent of the routes themselves, yet does not get us
back into a situation where we have to write to metrics or anything
like that.

For this we use a "next-hop exception" table in the FIB nexthops.

The one thing I desperately want to avoid is having to create clone
routes in the FIB trie for this purpose, because that is very
expensive.   However, I'm willing to entertain such an idea later
if this current scheme proves to have downsides that the FIB trie
variant would not have.

In order to accommodate any such scheme, we need to be able to
produce a full flow key at PMTU/redirect time.  That required an
adjustment of the interface call-sites used to propagate these events.

For a PMTU/redirect with a fully specified socket, we pass that socket
and use it to produce the flow key.

Otherwise we use a passed in SKB to formulate the key.  There are two
cases that need to be distinguished, ICMP message processing (in which
case the IP header is at skb->data) and output packet processing
(mostly tunnels, and in all such cases the IP header is at ip_hdr(skb)).

We also have to make the code able to handle the case where the dst
itself passed into the dst_ops->{update_pmtu,redirect} method is
invalidated.  This matters for calls from sockets that have cached
that route.  We provide an inet{,6} helper function for this purpose,
and edit SCTP specially since it caches routes at the transport rather
than socket level.

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
davem330 committed Jul 17, 2012
2 parents bd2d083 + 4895c77 commit a6ff1a2
Show file tree
Hide file tree
Showing 30 changed files with 449 additions and 183 deletions.
2 changes: 1 addition & 1 deletion drivers/infiniband/ulp/ipoib/ipoib_cm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1397,7 +1397,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
int e = skb_queue_empty(&priv->cm.skb_queue);

if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

skb_queue_tail(&priv->cm.skb_queue, skb);
if (e)
Expand Down
6 changes: 4 additions & 2 deletions include/net/dst_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ struct dst_ops {
struct net_device *dev, int how);
struct dst_entry * (*negative_advice)(struct dst_entry *);
void (*link_failure)(struct sk_buff *);
void (*update_pmtu)(struct dst_entry *dst, u32 mtu);
void (*redirect)(struct dst_entry *dst, struct sk_buff *skb);
void (*update_pmtu)(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
void (*redirect)(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
int (*local_out)(struct sk_buff *skb);
struct neighbour * (*neigh_lookup)(const struct dst_entry *dst,
struct sk_buff *skb,
Expand Down
2 changes: 2 additions & 0 deletions include/net/inet6_connection_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,6 @@ extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
extern void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);

extern int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl);

extern struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu);
#endif /* _INET6_CONNECTION_SOCK_H */
2 changes: 2 additions & 0 deletions include/net/inet_connection_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,4 +337,6 @@ extern int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
extern int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen);

extern struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
#endif /* _INET_CONNECTION_SOCK_H */
18 changes: 18 additions & 0 deletions include/net/ip_fib.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <net/flow.h>
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
#include <net/fib_rules.h>
#include <net/inetpeer.h>

Expand Down Expand Up @@ -46,6 +47,22 @@ struct fib_config {

struct fib_info;

/*
 * One next-hop exception record.  Records are linked through fnhe_next
 * into a chain headed by a struct fnhe_hash_bucket.
 *
 * NOTE(review): the per-field meanings below are inferred from the field
 * names; confirm against the code that fills them in.
 */
struct fib_nh_exception {
	struct fib_nh_exception __rcu	*fnhe_next;	/* next record in this chain */
	__be32				fnhe_daddr;	/* presumably the destination the exception applies to */
	u32				fnhe_pmtu;	/* presumably the learned path MTU */
	/*
	 * Annotated __be32 (was u32) so it agrees with fnhe_daddr above and
	 * with fib_nh.nh_gw; presumably a redirect gateway address.
	 */
	__be32				fnhe_gw;
	unsigned long			fnhe_expires;	/* presumably an expiry time — TODO confirm units */
	unsigned long			fnhe_stamp;	/* presumably a last-use/update stamp — TODO confirm */
};

/* Head of one chain of next-hop exception records, linked via fnhe_next. */
struct fnhe_hash_bucket {
struct fib_nh_exception __rcu *chain;
};

#define FNHE_HASH_SIZE 2048
#define FNHE_RECLAIM_DEPTH 5

struct fib_nh {
struct net_device *nh_dev;
struct hlist_node nh_hash;
Expand All @@ -63,6 +80,7 @@ struct fib_nh {
__be32 nh_gw;
__be32 nh_saddr;
int nh_saddr_genid;
struct fnhe_hash_bucket *nh_exceptions;
};

/*
Expand Down
4 changes: 2 additions & 2 deletions include/net/sctp/sctp.h
Original file line number Diff line number Diff line change
Expand Up @@ -519,10 +519,10 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu)
return frag;
}

static inline void sctp_assoc_pending_pmtu(struct sctp_association *asoc)
static inline void sctp_assoc_pending_pmtu(struct sock *sk, struct sctp_association *asoc)
{

sctp_assoc_sync_pmtu(asoc);
sctp_assoc_sync_pmtu(sk, asoc);
asoc->pmtu_pending = 0;
}

Expand Down
4 changes: 2 additions & 2 deletions include/net/sctp/structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -1091,7 +1091,7 @@ void sctp_transport_burst_limited(struct sctp_transport *);
void sctp_transport_burst_reset(struct sctp_transport *);
unsigned long sctp_transport_timeout(struct sctp_transport *);
void sctp_transport_reset(struct sctp_transport *);
void sctp_transport_update_pmtu(struct sctp_transport *, u32);
void sctp_transport_update_pmtu(struct sock *, struct sctp_transport *, u32);
void sctp_transport_immediate_rtx(struct sctp_transport *);


Expand Down Expand Up @@ -2003,7 +2003,7 @@ void sctp_assoc_update(struct sctp_association *old,

__u32 sctp_association_get_next_tsn(struct sctp_association *);

void sctp_assoc_sync_pmtu(struct sctp_association *);
void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *);
void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int);
void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int);
void sctp_assoc_set_primary(struct sctp_association *,
Expand Down
6 changes: 4 additions & 2 deletions net/bridge/br_netfilter.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,13 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
pppoe_proto(skb) == htons(PPP_IPV6) && \
brnf_filter_pppoe_tagged)

static void fake_update_pmtu(struct dst_entry *dst, u32 mtu)
static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu)
{
}

static void fake_redirect(struct dst_entry *dst, struct sk_buff *skb)
static void fake_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb)
{
}

Expand Down
13 changes: 3 additions & 10 deletions net/dccp/ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,17 +161,10 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
if (sk->sk_state == DCCP_LISTEN)
return;

/* We don't check in the destentry if pmtu discovery is forbidden
* on this route. We just assume that no packet_to_big packets
* are send back when pmtu discovery is not active.
* There is a small race when the user changes this flag in the
* route, but I think that's acceptable.
*/
if ((dst = __sk_dst_check(sk, 0)) == NULL)
dst = inet_csk_update_pmtu(sk, mtu);
if (!dst)
return;

dst->ops->update_pmtu(dst, mtu);

/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not able to recover.
*/
Expand Down Expand Up @@ -200,7 +193,7 @@ static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
struct dst_entry *dst = __sk_dst_check(sk, 0);

if (dst)
dst->ops->redirect(dst, skb);
dst->ops->redirect(dst, sk, skb);
}

/*
Expand Down
37 changes: 5 additions & 32 deletions net/dccp/ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);

if (dst)
dst->ops->redirect(dst, skb);
dst->ops->redirect(dst, sk, skb);
}

if (type == ICMPV6_PKT_TOOBIG) {
Expand All @@ -145,39 +145,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
goto out;

/* icmp should have updated the destination cache entry */
dst = __sk_dst_check(sk, np->dst_cookie);
if (dst == NULL) {
struct inet_sock *inet = inet_sk(sk);
struct flowi6 fl6;

/* BUGGG_FUTURE: Again, it is not clear how
to handle rthdr case. Ignore this complexity
for now.
*/
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_DCCP;
fl6.daddr = np->daddr;
fl6.saddr = np->saddr;
fl6.flowi6_oif = sk->sk_bound_dev_if;
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));

dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
if (IS_ERR(dst)) {
sk->sk_err_soft = -PTR_ERR(dst);
goto out;
}
} else
dst_hold(dst);

dst->ops->update_pmtu(dst, ntohl(info));
dst = inet6_csk_update_pmtu(sk, ntohl(info));
if (!dst)
goto out;

if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst))
dccp_sync_mss(sk, dst_mtu(dst));
} /* else let the usual retransmit timer handle it */
dst_release(dst);
goto out;
}

Expand Down
12 changes: 8 additions & 4 deletions net/decnet/dn_route.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,10 @@ static void dn_dst_destroy(struct dst_entry *);
static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how);
static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
static void dn_dst_link_failure(struct sk_buff *);
static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb);
static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb , u32 mtu);
static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr);
Expand Down Expand Up @@ -266,7 +268,8 @@ static int dn_dst_gc(struct dst_ops *ops)
* We update both the mtu and the advertised mss (i.e. the segment size we
* advertise to the other end).
*/
static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu)
{
struct dn_route *rt = (struct dn_route *) dst;
struct neighbour *n = rt->n;
Expand Down Expand Up @@ -294,7 +297,8 @@ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
}
}

static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb)
static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb)
{
}

Expand Down
23 changes: 23 additions & 0 deletions net/ipv4/fib_semantics.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
},
};

/*
 * Free every exception record reachable from @nh->nh_exceptions, then the
 * bucket array itself.
 *
 * This is a teardown path and takes no rcu_read_lock(), so the __rcu
 * pointers are read with rcu_dereference_protected(..., 1) rather than
 * rcu_dereference(), which expects a read-side critical section.
 */
static void free_nh_exceptions(struct fib_nh *nh)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	int i;

	/* Nothing was ever allocated for this next hop. */
	if (!hash)
		return;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		fnhe = rcu_dereference_protected(hash[i].chain, 1);
		while (fnhe) {
			struct fib_nh_exception *next;

			/* Fetch the successor before the record is freed. */
			next = rcu_dereference_protected(fnhe->fnhe_next, 1);
			kfree(fnhe);

			fnhe = next;
		}
	}
	kfree(hash);
}

/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
Expand All @@ -148,6 +169,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
dev_put(nexthop_nh->nh_dev);
if (nexthop_nh->nh_exceptions)
free_nh_exceptions(nexthop_nh);
} endfor_nexthops(fi);

release_net(fi->fib_net);
Expand Down
46 changes: 46 additions & 0 deletions net/ipv4/inet_connection_sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -803,3 +803,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
}
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
#endif

/*
 * Look up a fresh output route for @sk and hand it to sk_setup_caps().
 *
 * The flow key is built in @fl->u.ip4 from the socket's addresses, ports,
 * protocol and bound device.  If the socket's IP options have srr set,
 * opt.faddr is used as the lookup destination instead of inet_daddr.
 *
 * Returns the route's dst_entry, or NULL if the lookup failed.
 */
static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;
	struct dst_entry *dst = NULL;
	struct flowi4 *fl4;
	struct rtable *rt;

	/* inet->inet_opt is RCU-managed; hold the read lock while it is used. */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	fl4 = &fl->u.ip4;
	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
				   inet->inet_saddr, inet->inet_dport,
				   inet->inet_sport, sk->sk_protocol,
				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
	if (IS_ERR(rt))
		rt = NULL;
	if (rt) {
		sk_setup_caps(sk, &rt->dst);
		/*
		 * Take the member address only for a real route; callers test
		 * the result against NULL, and &rt->dst on a NULL rt is NULL
		 * only if dst happens to sit at offset 0 of struct rtable.
		 */
		dst = &rt->dst;
	}
	rcu_read_unlock();

	return dst;
}

/*
 * Apply a new path MTU to the route used by @sk and return the route the
 * socket should use afterwards.
 *
 * The socket's current dst is used if __sk_dst_check() still returns one;
 * otherwise a route is rebuilt first.  After ->update_pmtu() has run, the
 * dst is checked again and rebuilt if that check now returns NULL.
 *
 * Returns the resulting dst_entry, or NULL if no route could be obtained.
 */
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
{
	struct dst_entry *route = __sk_dst_check(sk, 0);
	struct flowi *fl = &inet_sk(sk)->cork.fl;

	if (route == NULL) {
		route = inet_csk_rebuild_route(sk, fl);
		if (route == NULL)
			return NULL;
	}

	/* No skb is available on this path, so only the socket is passed. */
	route->ops->update_pmtu(route, sk, NULL, mtu);

	route = __sk_dst_check(sk, 0);
	if (route != NULL)
		return route;

	return inet_csk_rebuild_route(sk, fl);
}
EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
2 changes: 1 addition & 1 deletion net/ipv4/ip_gre.c
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

if (skb->protocol == htons(ETH_P_IP)) {
df |= (old_iph->frag_off&htons(IP_DF));
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/ipip.c
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}

if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

if ((old_iph->frag_off & htons(IP_DF)) &&
mtu < ntohs(old_iph->tot_len)) {
Expand Down
Loading

0 comments on commit a6ff1a2

Please sign in to comment.