Skip to content

Commit

Permalink
tcp/dccp: remove twchain
Browse files Browse the repository at this point in the history
TCP listener refactoring, part 3 :

Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.

Current inet_ehash_bucket contains two chains, one for ESTABLISHED (and
related states) sockets, another for TIME_WAIT sockets only.

As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.

If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.

[ INET_TW_MATCH() is no longer needed ]

I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()

This way, SYN_RECV pseudo sockets will be supported the same way.

A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].

Note this helper should only be called in the real slow path, when an rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but it could eventually be used in other contexts, like
sock_edemux().

Before patch :

dmesg | grep "TCP established"

TCP established hash table entries: 524288 (order: 11, 8388608 bytes)

After patch :

TCP established hash table entries: 524288 (order: 10, 4194304 bytes)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Eric Dumazet authored and davem330 committed Oct 9, 2013
1 parent 53af53a commit 05dbc7b
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 261 deletions.
9 changes: 2 additions & 7 deletions include/net/inet_hashtables.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,11 @@
#include <asm/byteorder.h>

/* This is for all connections with a full identity, no wildcards.
* One chain is dedicated to TIME_WAIT sockets.
* I'll experiment with dynamic table growth later.
* The 'e' prefix stands for Establish, but we really put all sockets
* but LISTEN ones.
*/
struct inet_ehash_bucket {
/* Sockets with a full identity in ESTABLISHED and related states. */
struct hlist_nulls_head chain;
/* Chain dedicated to TIME_WAIT sockets only. */
struct hlist_nulls_head twchain;
};

/* There are a few simple rules, which allow for local port reuse by
Expand Down Expand Up @@ -123,7 +122,6 @@ struct inet_hashinfo {
*
* TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
*
* TIME_WAIT sockets use a separate chain (twchain).
*/
struct inet_ehash_bucket *ehash;
spinlock_t *ehash_locks;
Expand Down Expand Up @@ -318,9 +316,6 @@ static inline struct sock *inet_lookup_listener(struct net *net,
net_eq(sock_net(__sk), (__net)))
#endif /* 64-bit arch */

/* TIME_WAIT flavour of the full-identity match: expands to INET_MATCH()
 * with exactly the same arguments.
 */
#define INET_TW_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)\
INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif)

/*
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
* not check it for lookups anymore, thanks Alexey. -DaveM
Expand Down
13 changes: 1 addition & 12 deletions include/net/inet_timewait_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,18 +141,6 @@ struct inet_timewait_sock {
};
#define tw_tclass tw_tos

/* Link @tw, through its tw_node member, at the head of the nulls list @list
 * using hlist_nulls_add_head_rcu().
 */
static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
					  struct hlist_nulls_head *list)
{
	hlist_nulls_add_head_rcu(&tw->tw_node, list);
}

/* Link @tw, through its tw_bind_node member, at the head of the list @list
 * using hlist_add_head().
 */
static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
					   struct hlist_head *list)
{
	hlist_add_head(&tw->tw_bind_node, list);
}

static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
{
return !hlist_unhashed(&tw->tw_death_node);
Expand Down Expand Up @@ -192,6 +180,7 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
return (struct inet_timewait_sock *)sk;
}

void inet_twsk_free(struct inet_timewait_sock *tw);
void inet_twsk_put(struct inet_timewait_sock *tw);

int inet_twsk_unhash(struct inet_timewait_sock *tw);
Expand Down
8 changes: 7 additions & 1 deletion include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ typedef __u64 __bitwise __addrpair;
*/
struct sock_common {
/* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
* address on 64bit arches : cf INET_MATCH() and INET_TW_MATCH()
* address on 64bit arches : cf INET_MATCH()
*/
union {
__addrpair skc_addrpair;
Expand Down Expand Up @@ -301,6 +301,8 @@ struct sock {
#define sk_dontcopy_end __sk_common.skc_dontcopy_end
#define sk_hash __sk_common.skc_hash
#define sk_portpair __sk_common.skc_portpair
#define sk_num __sk_common.skc_num
#define sk_dport __sk_common.skc_dport
#define sk_addrpair __sk_common.skc_addrpair
#define sk_daddr __sk_common.skc_daddr
#define sk_rcv_saddr __sk_common.skc_rcv_saddr
Expand Down Expand Up @@ -1653,6 +1655,10 @@ static inline void sock_put(struct sock *sk)
if (atomic_dec_and_test(&sk->sk_refcnt))
sk_free(sk);
}
/* Generic version of sock_put(), dealing with all sockets
* (TCP_TIME_WAIT, ESTABLISHED...)
*/
void sock_gen_put(struct sock *sk);

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested);

Expand Down
1 change: 0 additions & 1 deletion include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -1519,7 +1519,6 @@ enum tcp_seq_states {
TCP_SEQ_STATE_LISTENING,
TCP_SEQ_STATE_OPENREQ,
TCP_SEQ_STATE_ESTABLISHED,
TCP_SEQ_STATE_TIME_WAIT,
};

int tcp_seq_open(struct inode *inode, struct file *file);
Expand Down
4 changes: 1 addition & 3 deletions net/dccp/proto.c
Original file line number Diff line number Diff line change
Expand Up @@ -1158,10 +1158,8 @@ static int __init dccp_init(void)
goto out_free_bind_bucket_cachep;
}

for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) {
for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
}

if (inet_ehash_locks_alloc(&dccp_hashinfo))
goto out_free_dccp_ehash;
Expand Down
48 changes: 13 additions & 35 deletions net/ipv4/inet_diag.c
Original file line number Diff line number Diff line change
Expand Up @@ -635,12 +635,14 @@ static int inet_csk_diag_dump(struct sock *sk,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}

static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
static int inet_twsk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
struct inet_timewait_sock *tw = inet_twsk(sk);

if (bc != NULL) {
struct inet_diag_entry entry;

Expand Down Expand Up @@ -911,16 +913,15 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,

num = 0;

if (hlist_nulls_empty(&head->chain) &&
hlist_nulls_empty(&head->twchain))
if (hlist_nulls_empty(&head->chain))
continue;

if (i > s_i)
s_num = 0;

spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
struct inet_sock *inet = inet_sk(sk);
int res;

if (!net_eq(sock_net(sk), net))
continue;
Expand All @@ -929,49 +930,26 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
if (!(r->idiag_states & (1 << sk->sk_state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
sk->sk_family != r->sdiag_family)
goto next_normal;
if (r->id.idiag_sport != inet->inet_sport &&
if (r->id.idiag_sport != htons(sk->sk_num) &&
r->id.idiag_sport)
goto next_normal;
if (r->id.idiag_dport != inet->inet_dport &&
if (r->id.idiag_dport != sk->sk_dport &&
r->id.idiag_dport)
goto next_normal;
if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
if (sk->sk_state == TCP_TIME_WAIT)
res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
else
res = inet_csk_diag_dump(sk, skb, cb, r, bc);
if (res < 0) {
spin_unlock_bh(lock);
goto done;
}
next_normal:
++num;
}

if (r->idiag_states & TCPF_TIME_WAIT) {
struct inet_timewait_sock *tw;

inet_twsk_for_each(tw, node,
&head->twchain) {
if (!net_eq(twsk_net(tw), net))
continue;

if (num < s_num)
goto next_dying;
if (r->sdiag_family != AF_UNSPEC &&
tw->tw_family != r->sdiag_family)
goto next_dying;
if (r->id.idiag_sport != tw->tw_sport &&
r->id.idiag_sport)
goto next_dying;
if (r->id.idiag_dport != tw->tw_dport &&
r->id.idiag_dport)
goto next_dying;
if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) {
spin_unlock_bh(lock);
goto done;
}
next_dying:
++num;
}
}
spin_unlock_bh(lock);
}

Expand Down
83 changes: 29 additions & 54 deletions net/ipv4/inet_hashtables.c
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,19 @@ struct sock *__inet_lookup_listener(struct net *net,
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/* Drop one reference on @sk.
 *
 * Every socket flavour uses the same sk_refcnt counter, but the destructor
 * differs: when the count reaches zero, a socket whose sk_state is
 * TCP_TIME_WAIT is released with inet_twsk_free(), any other socket with
 * sk_free().
 */
void sock_gen_put(struct sock *sk)
{
	if (atomic_dec_and_test(&sk->sk_refcnt)) {
		/* Last reference is gone: pick the destructor by state. */
		if (sk->sk_state == TCP_TIME_WAIT)
			inet_twsk_free(inet_twsk(sk));
		else
			sk_free(sk);
	}
}
EXPORT_SYMBOL_GPL(sock_gen_put);

struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
Expand All @@ -255,13 +268,13 @@ struct sock *__inet_lookup_established(struct net *net,
if (likely(INET_MATCH(sk, net, acookie,
saddr, daddr, ports, dif))) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
goto begintw;
goto out;
if (unlikely(!INET_MATCH(sk, net, acookie,
saddr, daddr, ports, dif))) {
sock_put(sk);
sock_gen_put(sk);
goto begin;
}
goto out;
goto found;
}
}
/*
Expand All @@ -271,37 +284,9 @@ struct sock *__inet_lookup_established(struct net *net,
*/
if (get_nulls_value(node) != slot)
goto begin;

begintw:
/* Must check for a TIME_WAIT'er before going to listener hash. */
sk_nulls_for_each_rcu(sk, node, &head->twchain) {
if (sk->sk_hash != hash)
continue;
if (likely(INET_TW_MATCH(sk, net, acookie,
saddr, daddr, ports,
dif))) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
sk = NULL;
goto out;
}
if (unlikely(!INET_TW_MATCH(sk, net, acookie,
saddr, daddr, ports,
dif))) {
inet_twsk_put(inet_twsk(sk));
goto begintw;
}
goto out;
}
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != slot)
goto begintw;
sk = NULL;
out:
sk = NULL;
found:
rcu_read_unlock();
return sk;
}
Expand All @@ -326,39 +311,29 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw;
struct inet_timewait_sock *tw = NULL;
int twrefcnt = 0;

spin_lock(lock);

/* Check TIME-WAIT sockets first. */
sk_nulls_for_each(sk2, node, &head->twchain) {
if (sk2->sk_hash != hash)
continue;

if (likely(INET_TW_MATCH(sk2, net, acookie,
saddr, daddr, ports, dif))) {
tw = inet_twsk(sk2);
if (twsk_unique(sk, sk2, twp))
goto unique;
else
goto not_unique;
}
}
tw = NULL;

/* And established part... */
sk_nulls_for_each(sk2, node, &head->chain) {
if (sk2->sk_hash != hash)
continue;

if (likely(INET_MATCH(sk2, net, acookie,
saddr, daddr, ports, dif)))
saddr, daddr, ports, dif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
if (twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
}
}

unique:
/* Must record num and sport now. Otherwise we will see
* in hash table socket with a funny identity. */
* in hash table socket with a funny identity.
*/
inet->inet_num = lport;
inet->inet_sport = htons(lport);
sk->sk_hash = hash;
Expand Down
Loading

0 comments on commit 05dbc7b

Please sign in to comment.