Merge tag 'nf-next-23-06-26' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Pablo Neira Ayuso says:

====================
Netfilter/IPVS updates for net-next

1) Allow slightly larger IPVS connection table size from Kconfig for
   64-bit arch, from Abhijeet Rastogi.

2) Since IPVS connection table might be larger than 2^20 after previous
   patch, allow to limit it depending on the available memory.
   Moreover, use kvmalloc. From Julian Anastasov.

3) Do not rebuild VLAN header in nft_payload when matching source and
   destination MAC address.

4) Remove nested rcu read lock side in ip_set_test(), from Florian Westphal.

5) Allow to update set size, also from Florian.

6) Improve NAT tuple selection when connection is closing,
   from Florian Westphal.

7) Support for resetting set element stateful expression, from Phil Sutter.

8) Use NLA_POLICY_MAX to narrow down maximum attribute value in nf_tables,
   from Florian Westphal.

* tag 'nf-next-23-06-26' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
  netfilter: nf_tables: limit allowed range via nla_policy
  netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET
  netfilter: snat: evict closing tcp entries on reply tuple collision
  netfilter: nf_tables: permit update of set size
  netfilter: ipset: remove rcu_read_lock_bh pair from ip_set_test
  netfilter: nft_payload: rebuild vlan header when needed
  ipvs: dynamically limit the connection hash table
  ipvs: increase ip_vs_conn_tab_bits range for 64BIT
====================

Link: https://lore.kernel.org/r/20230626064749.75525-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
kuba-moo committed Jun 26, 2023
2 parents 771ca3d + a412dbf commit 61dc651
Showing 23 changed files with 199 additions and 70 deletions.
3 changes: 3 additions & 0 deletions include/net/netfilter/nf_tables.h
@@ -1611,6 +1611,7 @@ struct nft_trans_set {
u64 timeout;
bool update;
bool bound;
u32 size;
};

#define nft_trans_set(trans) \
@@ -1625,6 +1626,8 @@ struct nft_trans_set {
(((struct nft_trans_set *)trans->data)->timeout)
#define nft_trans_set_gc_int(trans) \
(((struct nft_trans_set *)trans->data)->gc_int)
#define nft_trans_set_size(trans) \
(((struct nft_trans_set *)trans->data)->size)

struct nft_trans_chain {
struct nft_chain *chain;
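
The new size field and nft_trans_set_size() accessor follow the usual nf_tables pattern: the requested value is stashed in the per-transaction record and applied to the live set only when the transaction commits. A minimal, self-contained sketch of that deferred-apply pattern (illustrative structures and names only, not the kernel implementation):

#include <stdio.h>

struct set { unsigned int size; };
struct trans_set { unsigned int size; int update; };
struct trans { void *data; };

#define trans_set_size(t)   (((struct trans_set *)(t)->data)->size)
#define trans_set_update(t) (((struct trans_set *)(t)->data)->update)

/* Commit applies the value recorded in the transaction to the live object;
 * an aborted transaction would simply be discarded instead. */
static void commit(struct set *set, struct trans *t)
{
	if (trans_set_update(t))
		set->size = trans_set_size(t);
}

int main(void)
{
	struct set s = { .size = 1024 };
	struct trans_set ts = { .size = 4096, .update = 1 };
	struct trans t = { .data = &ts };

	commit(&s, &t);
	printf("set size now %u\n", s.size);   /* prints 4096 */
	return 0;
}
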
2 changes: 2 additions & 0 deletions include/uapi/linux/netfilter/nf_tables.h
@@ -105,6 +105,7 @@ enum nft_verdicts {
* @NFT_MSG_DESTROYSETELEM: destroy a set element (enum nft_set_elem_attributes)
* @NFT_MSG_DESTROYOBJ: destroy a stateful object (enum nft_object_attributes)
* @NFT_MSG_DESTROYFLOWTABLE: destroy flow table (enum nft_flowtable_attributes)
* @NFT_MSG_GETSETELEM_RESET: get set elements and reset attached stateful expressions (enum nft_set_elem_attributes)
*/
enum nf_tables_msg_types {
NFT_MSG_NEWTABLE,
@@ -140,6 +141,7 @@ enum nf_tables_msg_types {
NFT_MSG_DESTROYSETELEM,
NFT_MSG_DESTROYOBJ,
NFT_MSG_DESTROYFLOWTABLE,
NFT_MSG_GETSETELEM_RESET,
NFT_MSG_MAX,
};

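
For orientation, here is a rough userspace sketch of how the new message type might be driven over nfnetlink with libmnl. The table and set names ("filter", "myset"), the inet family, and the presence of NFT_MSG_GETSETELEM_RESET in the installed uapi headers are assumptions, and parsing of the replies is omitted:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <libmnl/libmnl.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>

int main(void)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct mnl_socket *nl;
	struct nlmsghdr *nlh;
	struct nfgenmsg *nfg;
	ssize_t ret;

	/* GETSETELEM_RESET is a dump request on the nf_tables subsystem */
	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_GETSETELEM_RESET;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	nlh->nlmsg_seq = (unsigned int)time(NULL);

	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = NFPROTO_INET;   /* assumed table family */
	nfg->version = NFNETLINK_V0;
	nfg->res_id = 0;

	mnl_attr_put_strz(nlh, NFTA_SET_ELEM_LIST_TABLE, "filter"); /* assumed names */
	mnl_attr_put_strz(nlh, NFTA_SET_ELEM_LIST_SET, "myset");

	nl = mnl_socket_open(NETLINK_NETFILTER);
	if (!nl || mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0 ||
	    mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
		perror("nfnetlink request");
		return EXIT_FAILURE;
	}

	/* Replies look like ordinary set element dumps; stateful expressions
	 * (counters, quotas) on the returned elements are reset by the kernel
	 * as a side effect of this request. */
	ret = mnl_socket_recvfrom(nl, buf, sizeof(buf));
	printf("received %zd bytes of set element data\n", ret);

	mnl_socket_close(nl);
	return 0;
}
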
2 changes: 0 additions & 2 deletions net/netfilter/ipset/ip_set_core.c
@@ -739,9 +739,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return 0;

rcu_read_lock_bh();
ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
rcu_read_unlock_bh();

if (ret == -EAGAIN) {
/* Type requests element to be completed */
27 changes: 14 additions & 13 deletions net/netfilter/ipvs/Kconfig
@@ -44,7 +44,8 @@ config IP_VS_DEBUG

config IP_VS_TAB_BITS
int "IPVS connection table size (the Nth power of 2)"
range 8 20
range 8 20 if !64BIT
range 8 27 if 64BIT
default 12
help
The IPVS connection hash table uses the chaining scheme to handle
@@ -54,24 +55,24 @@ config IP_VS_TAB_BITS

Note the table size must be power of 2. The table size will be the
value of 2 to the your input number power. The number to choose is
from 8 to 20, the default number is 12, which means the table size
is 4096. Don't input the number too small, otherwise you will lose
performance on it. You can adapt the table size yourself, according
to your virtual server application. It is good to set the table size
not far less than the number of connections per second multiplying
average lasting time of connection in the table. For example, your
virtual server gets 200 connections per second, the connection lasts
for 200 seconds in average in the connection table, the table size
should be not far less than 200x200, it is good to set the table
size 32768 (2**15).
from 8 to 27 for 64BIT(20 otherwise), the default number is 12,
which means the table size is 4096. Don't input the number too
small, otherwise you will lose performance on it. You can adapt the
table size yourself, according to your virtual server application.
It is good to set the table size not far less than the number of
connections per second multiplying average lasting time of
connection in the table. For example, your virtual server gets 200
connections per second, the connection lasts for 200 seconds in
average in the connection table, the table size should be not far
less than 200x200, it is good to set the table size 32768 (2**15).

Another note that each connection occupies 128 bytes effectively and
each hash entry uses 8 bytes, so you can estimate how much memory is
needed for your box.

You can overwrite this number setting conn_tab_bits module parameter
or by appending ip_vs.conn_tab_bits=? to the kernel command line
if IP VS was compiled built-in.
or by appending ip_vs.conn_tab_bits=? to the kernel command line if
IP VS was compiled built-in.

comment "IPVS transport protocol load balancing support"

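
As a concrete illustration of the sizing rule in the help text, a small userspace calculation using the text's own example figures (200 connections/s, roughly 200 s average lifetime); this is only an arithmetic sketch, not part of the patch:

/* Pick the largest table size that is still "not far less than"
 * rate * duration concurrent connections. */
#include <stdio.h>

int main(void)
{
	unsigned long rate = 200;       /* new connections per second (example) */
	unsigned long duration = 200;   /* average lifetime in the table, seconds */
	unsigned long target = rate * duration;   /* ~40000 concurrent entries */
	unsigned int bits = 8;          /* Kconfig minimum */

	while (bits < 27 && (1UL << (bits + 1)) <= target)
		bits++;

	printf("target %lu entries -> conn_tab_bits=%u (table size %lu)\n",
	       target, bits, 1UL << bits);   /* prints bits=15, size 32768 */
	return 0;
}
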
26 changes: 17 additions & 9 deletions net/netfilter/ipvs/ip_vs_conn.c
@@ -26,7 +26,6 @@
#include <linux/net.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/proc_fs.h> /* for proc_net_* */
#include <linux/slab.h>
#include <linux/seq_file.h>
@@ -1482,13 +1481,21 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
int __init ip_vs_conn_init(void)
{
size_t tab_array_size;
int max_avail;
#if BITS_PER_LONG > 32
int max = 27;
#else
int max = 20;
#endif
int min = 8;
int idx;

/* Compute size and mask */
if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
}
max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT;
max_avail -= 2; /* ~4 in hash row */
max_avail -= 1; /* IPVS up to 1/2 of mem */
max_avail -= order_base_2(sizeof(struct ip_vs_conn));
max = clamp(max, min, max_avail);
ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max);
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;

@@ -1497,7 +1504,8 @@ int __init ip_vs_conn_init(void)
*/
tab_array_size = array_size(ip_vs_conn_tab_size,
sizeof(*ip_vs_conn_tab));
ip_vs_conn_tab = vmalloc(tab_array_size);
ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size,
sizeof(*ip_vs_conn_tab), GFP_KERNEL);
if (!ip_vs_conn_tab)
return -ENOMEM;

@@ -1506,7 +1514,7 @@ int __init ip_vs_conn_init(void)
sizeof(struct ip_vs_conn), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!ip_vs_conn_cachep) {
vfree(ip_vs_conn_tab);
kvfree(ip_vs_conn_tab);
return -ENOMEM;
}

@@ -1534,5 +1542,5 @@ void ip_vs_conn_cleanup(void)
rcu_barrier();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
vfree(ip_vs_conn_tab);
kvfree(ip_vs_conn_tab);
}
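
A back-of-the-envelope userspace rendering of the max_avail ceiling computed in ip_vs_conn_init() above; the 8 GiB RAM figure, the 4 KiB page size and the ~128-byte connection size are assumed example values, not read from a live system:

#include <stdio.h>

static unsigned int order_base_2(unsigned long n)   /* ceil(log2(n)) */
{
	unsigned int order = 0;

	while ((1UL << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned long ram_bytes = 8UL << 30;   /* assume 8 GiB of RAM */
	unsigned int page_shift = 12;          /* assume 4 KiB pages */
	unsigned long ram_pages = ram_bytes >> page_shift;
	unsigned int conn_size = 128;          /* rough sizeof(struct ip_vs_conn) */
	int max_avail;

	max_avail  = order_base_2(ram_pages) + page_shift; /* log2 of RAM in bytes */
	max_avail -= 2;                        /* aim for ~4 connections per hash row */
	max_avail -= 1;                        /* connections use at most 1/2 of memory */
	max_avail -= order_base_2(conn_size);

	printf("memory-derived ceiling for conn_tab_bits: %d\n", max_avail); /* 23 here */
	return 0;
}
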
92 changes: 88 additions & 4 deletions net/netfilter/nf_nat_core.c
@@ -27,6 +27,9 @@

#include "nf_internals.h"

#define NF_NAT_MAX_ATTEMPTS 128
#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)

static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -197,6 +200,88 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
IPS_DYING;
static const unsigned long flags_needed = IPS_SRC_NAT;
enum tcp_conntrack old_state;

old_state = READ_ONCE(ct->proto.tcp.state);
if (old_state < TCP_CONNTRACK_TIME_WAIT)
return false;

if (flags & flags_refuse)
return false;

return (flags & flags_needed) == flags_needed;
}

/* reverse direction will send packets to new source, so
* make sure such packets are invalid.
*/
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
{
return (__s32)(new->proto.tcp.seen[0].td_end -
old->proto.tcp.seen[0].td_end) > 0;
}

static int
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack,
unsigned int attempts_left)
{
static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
struct nf_conntrack_tuple_hash *thash;
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple reply;
unsigned long flags;
struct nf_conn *ct;
bool taken = true;
struct net *net;

nf_ct_invert_tuple(&reply, tuple);

if (attempts_left > NF_NAT_HARDER_THRESH ||
tuple->dst.protonum != IPPROTO_TCP ||
ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);

/* Last few attempts to find a free tcp port. Destructive
* action: evict colliding if its in timewait state and the
* tcp sequence number has advanced past the one used by the
* old entry.
*/
net = nf_ct_net(ignored_conntrack);
zone = nf_ct_zone(ignored_conntrack);

thash = nf_conntrack_find_get(net, zone, &reply);
if (!thash)
return false;

ct = nf_ct_tuplehash_to_ctrack(thash);

if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
goto out;

if (WARN_ON_ONCE(ct == ignored_conntrack))
goto out;

flags = READ_ONCE(ct->status);
if (!nf_nat_may_kill(ct, flags))
goto out;

if (!nf_seq_has_advanced(ct, ignored_conntrack))
goto out;

/* Even if we can evict do not reuse if entry is offloaded. */
if (nf_ct_kill(ct))
taken = flags & flags_offload;
out:
nf_ct_put(ct);
return taken;
}

static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
const struct nf_nat_range2 *range)
{
@@ -385,7 +470,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
unsigned int range_size, min, max, i, attempts;
__be16 *keyptr;
u16 off;
static const unsigned int max_attempts = 128;

switch (tuple->dst.protonum) {
case IPPROTO_ICMP:
@@ -471,8 +555,8 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
off = get_random_u16();

attempts = range_size;
if (attempts > max_attempts)
attempts = max_attempts;
if (attempts > NF_NAT_MAX_ATTEMPTS)
attempts = NF_NAT_MAX_ATTEMPTS;

/* We are in softirq; doing a search of the entire range risks
* soft lockup when all tuples are already used.
@@ -483,7 +567,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
another_round:
for (i = 0; i < attempts; i++, off++) {
*keyptr = htons(min + off % range_size);
if (!nf_nat_used_tuple(tuple, ct))
if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
return;
}

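
Finally, a standalone sketch of the gate applied by nf_nat_used_tuple_harder(): with the constants introduced in this patch, only the final quarter of the port-search probes (the last 32 of at most 128) may consider evicting a colliding entry that is already closing. Illustrative code only, not kernel code:

#include <stdio.h>
#include <stdbool.h>

#define NF_NAT_MAX_ATTEMPTS  128
#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)

/* The destructive path is only considered for the last
 * NF_NAT_HARDER_THRESH probes of the search. */
static bool may_try_eviction(unsigned int attempts_left)
{
	return attempts_left <= NF_NAT_HARDER_THRESH;
}

int main(void)
{
	unsigned int attempts = NF_NAT_MAX_ATTEMPTS, i;

	for (i = 0; i < attempts; i++) {
		if (may_try_eviction(attempts - i)) {
			/* prints: eviction first considered at probe 97 of 128 */
			printf("eviction first considered at probe %u of %u\n",
			       i + 1, attempts);
			break;
		}
	}
	return 0;
}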
