Skip to content
This repository was archived by the owner on Dec 20, 2023. It is now read-only.
16 changes: 16 additions & 0 deletions Documentation/sysctl/vm.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
- dirty_writeback_centisecs
- drop_caches
- extfrag_threshold
- extra_free_kbytes
- hugepages_treat_as_movable
- hugetlb_shm_group
- laptop_mode
Expand Down Expand Up @@ -204,6 +205,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.

==============================================================

extra_free_kbytes

This parameter tells the VM to keep extra free memory between the threshold
where background reclaim (kswapd) kicks in, and the threshold where direct
reclaim (by allocating processes) kicks in.

This is useful for workloads that require low-latency memory allocations
and have a bounded burstiness in memory allocations. For example, a
realtime application that receives and transmits network traffic
(causing in-kernel memory allocations) with a maximum total message burst
size of 200MB may need 200MB of extra free memory to avoid
direct-reclaim-related latencies.

==============================================================

hugepages_treat_as_movable

This parameter controls whether we can allocate hugepages from ZONE_MOVABLE
Expand Down
4 changes: 2 additions & 2 deletions drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -3250,9 +3250,9 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter)
IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), seed[i]);

/* Fill out redirection table */
for (i = 0, j = 0; i < 128; i++, j++) {
for (i = 0, j = 1; i < 128; i++, j++) {
if (j == rss_i)
j = 0;
j = 1;
/* reta = 4-byte sliding window of
* 0x00..(indices-1)(indices-1)00..etc. */
reta = (reta << 8) | (j * 0x11);
Expand Down
2 changes: 1 addition & 1 deletion include/linux/mmzone.h
Original file line number Diff line number Diff line change
Expand Up @@ -897,7 +897,7 @@ static inline int is_highmem(struct zone *zone)

/* These two functions are used to setup the per zone pages min values */
struct ctl_table;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
int free_kbytes_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
Expand Down
2 changes: 2 additions & 0 deletions include/linux/swap.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,8 @@ struct swap_info_struct {
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern unsigned long totalreserve_pages;
extern int min_free_kbytes;
extern int extra_free_kbytes;
extern unsigned long dirty_balance_reserve;
extern unsigned long nr_free_buffer_pages(void);
extern unsigned long nr_free_pagecache_pages(void);
Expand Down
10 changes: 6 additions & 4 deletions include/net/inet_connection_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,10 +253,12 @@ int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax);
int inet_csk_get_port(struct sock *sk, unsigned short snum);

struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4,
const struct request_sock *req);
struct dst_entry *inet_csk_route_child_sock(struct sock *sk, struct sock *newsk,
const struct request_sock *req);
struct dst_entry* inet_csk_route_req(struct sock *sk,
struct flowi4 *fl4,
const struct request_sock *req, int syncookie);
struct dst_entry* inet_csk_route_child_sock(struct sock *sk,
struct sock *newsk,
const struct request_sock *req);

static inline void inet_csk_reqsk_queue_add(struct sock *sk,
struct request_sock *req,
Expand Down
14 changes: 7 additions & 7 deletions include/net/route.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,15 @@ struct in_device;
int ip_rt_init(void);
void rt_cache_flush(struct net *net);
void rt_flush_dev(struct net_device *dev);
struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp, int syncookie);
struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
struct sock *sk);
struct sock *sk, int syncookie);
struct dst_entry *ipv4_blackhole_route(struct net *net,
struct dst_entry *dst_orig);

static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp)
{
return ip_route_output_flow(net, flp, NULL);
return ip_route_output_flow(net, flp, NULL, 0);
}

static inline struct rtable *ip_route_output(struct net *net, __be32 daddr,
Expand All @@ -143,7 +143,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi
daddr, saddr, dport, sport);
if (sk)
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(net, fl4, sk);
return ip_route_output_flow(net, fl4, sk, 0);
}

static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4,
Expand Down Expand Up @@ -266,14 +266,14 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
sport, dport, sk);

if (!dst || !src) {
rt = __ip_route_output_key(net, fl4);
rt = __ip_route_output_key(net, fl4, 0);
if (IS_ERR(rt))
return rt;
ip_rt_put(rt);
flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr);
}
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(net, fl4, sk);
return ip_route_output_flow(net, fl4, sk, 0);
}

static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt,
Expand All @@ -289,7 +289,7 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable
RT_CONN_FLAGS(sk), fl4->daddr,
fl4->saddr);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(sock_net(sk), fl4, sk);
return ip_route_output_flow(sock_net(sk), fl4, sk, 0);
}
return rt;
}
Expand Down
5 changes: 4 additions & 1 deletion include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ typedef __u64 __bitwise __addrpair;
* @skc_state: Connection state
* @skc_reuse: %SO_REUSEADDR setting
* @skc_reuseport: %SO_REUSEPORT setting
* @skc_tw_reuse: %SO_FASTLY_TW_REUSE setting
* @skc_bound_dev_if: bound device index if != 0
* @skc_bind_node: bind hash linkage for various protocol lookup tables
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
Expand Down Expand Up @@ -181,7 +182,8 @@ struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
unsigned char skc_reuse:4;
unsigned char skc_reuseport:4;
unsigned char skc_reuseport:2;
unsigned char skc_tw_reuse:2;
int skc_bound_dev_if;
union {
struct hlist_node skc_bind_node;
Expand Down Expand Up @@ -316,6 +318,7 @@ struct sock {
#define sk_state __sk_common.skc_state
#define sk_reuse __sk_common.skc_reuse
#define sk_reuseport __sk_common.skc_reuseport
#define sk_tw_reuse __sk_common.skc_tw_reuse
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot
Expand Down
1 change: 1 addition & 0 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ extern int sysctl_tcp_challenge_ack_limit;
extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs;
extern int sysctl_tcp_autocorking;
extern int sysctl_tcp_user_cwnd_max;

extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
Expand Down
2 changes: 2 additions & 0 deletions include/uapi/asm-generic/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,6 @@

#define SO_BPF_EXTENSIONS 48

#define SO_FASTLY_TW_REUSE 49

#endif /* __ASM_GENERIC_SOCKET_H */
18 changes: 17 additions & 1 deletion include/uapi/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,17 @@ enum {
#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/
#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */
#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */
#define TCP_REPAIR 19 /* TCP sock is under repair right now */
#define TCP_REPAIR 26 /* TCP sock is under repair right now */
#define TCP_REPAIR_QUEUE 20
#define TCP_QUEUE_SEQ 21
#define TCP_REPAIR_OPTIONS 22
#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
#define TCP_TIMESTAMP 24
#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
#define TCP_CWND 19 /* Set congestion window */
#define TCP_CWND2 99 /* Set congestion window */

#define TCP_FASTLY_INFO 66 /* Additional info about connection. */

struct tcp_repair_opt {
__u32 opt_code;
Expand Down Expand Up @@ -188,6 +192,18 @@ struct tcp_info {
__u32 tcpi_total_retrans;
};

/*
 * Extended connection info returned alongside tcp_info (TCP_FASTLY_INFO).
 * NOTE(review): this is a UAPI struct — field order, sizes, and names are
 * ABI; do not reorder or rename.
 */
struct tcp_fst_info {
	__u8 version;		/* presumably a struct-layout version tag — confirm with the kernel-side filler */
	__u8 tos;		/* presumably the IP TOS/DSCP of the connection — confirm */
	__u16 __unused;		/* explicit padding to align the union below */
	union {
		struct in_addr nexthop;		/* next-hop address, IPv4 case */
		struct in6_addr nexthop6;	/* next-hop address, IPv6 case */
	};
	__u32 __pad[11];	/* reserved for future expansion without breaking ABI */
	struct tcp_info info;	/* standard TCP_INFO payload embedded at the tail */
};

/* for TCP_MD5SIG socket option */
#define TCP_MD5SIG_MAXKEYLEN 80

Expand Down
10 changes: 9 additions & 1 deletion kernel/sysctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1314,9 +1314,17 @@ static struct ctl_table vm_table[] = {
.data = &min_free_kbytes,
.maxlen = sizeof(min_free_kbytes),
.mode = 0644,
.proc_handler = min_free_kbytes_sysctl_handler,
.proc_handler = free_kbytes_sysctl_handler,
.extra1 = &zero,
},
{
.procname = "extra_free_kbytes",
.data = &extra_free_kbytes,
.maxlen = sizeof(extra_free_kbytes),
.mode = 0644,
.proc_handler = free_kbytes_sysctl_handler,
.extra1 = &zero,
},
{
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
Expand Down
39 changes: 29 additions & 10 deletions mm/page_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,22 @@ static char * const zone_names[MAX_NR_ZONES] = {
"Movable",
};

/*
* Try to keep at least this much lowmem free. Do not allow normal
* allocations below this point, only high priority ones. Automatically
* tuned according to the amount of memory in the system.
*/
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;

/*
* Extra memory for the system to try freeing between the min and
* low watermarks. Useful for workloads that require low latency
* memory allocations in bursts larger than the normal gap between
* low and min.
*/
int extra_free_kbytes;

static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
Expand Down Expand Up @@ -5625,6 +5638,7 @@ static void setup_per_zone_lowmem_reserve(void)
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
Expand All @@ -5636,11 +5650,14 @@ static void __setup_per_zone_wmarks(void)
}

for_each_zone(zone) {
u64 tmp;
u64 min, low;

spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone->managed_pages;
do_div(tmp, lowmem_pages);
min = (u64)pages_min * zone->managed_pages;
do_div(min, lowmem_pages);
low = (u64)pages_low * zone->managed_pages;
do_div(low, vm_total_pages);

if (is_highmem(zone)) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
Expand All @@ -5661,11 +5678,13 @@ static void __setup_per_zone_wmarks(void)
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
zone->watermark[WMARK_MIN] = tmp;
zone->watermark[WMARK_MIN] = min;
}

zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
low + (min >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
low + (min >> 1);

__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
Expand Down Expand Up @@ -5787,11 +5806,11 @@ int __meminit init_per_zone_wmark_min(void)
module_init(init_per_zone_wmark_min)

/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
* changes.
* free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
* or extra_free_kbytes changes.
*/
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
int free_kbytes_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
int rc;
Expand Down
4 changes: 4 additions & 0 deletions net/core/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_max_pacing_rate);
break;

case SO_FASTLY_TW_REUSE:
sk->sk_tw_reuse = valbool;
break;

default:
ret = -ENOPROTOOPT;
break;
Expand Down
4 changes: 2 additions & 2 deletions net/ipv4/icmp.c
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
rt = __ip_route_output_key(net, fl4);
rt = __ip_route_output_key(net, fl4, 0);
if (IS_ERR(rt))
return rt;

Expand All @@ -421,7 +421,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
goto relookup_failed;

if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
rt2 = __ip_route_output_key(net, &fl4_dec);
rt2 = __ip_route_output_key(net, &fl4_dec, 0);
if (IS_ERR(rt2))
err = PTR_ERR(rt2);
} else {
Expand Down
7 changes: 4 additions & 3 deletions net/ipv4/inet_connection_sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,8 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);

struct dst_entry *inet_csk_route_req(struct sock *sk,
struct flowi4 *fl4,
const struct request_sock *req)
const struct request_sock *req,
int want_cookie)
{
struct rtable *rt;
const struct inet_request_sock *ireq = inet_rsk(req);
Expand All @@ -415,7 +416,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
rt = ip_route_output_flow(net, fl4, sk, want_cookie);
if (IS_ERR(rt))
goto no_route;
if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
Expand Down Expand Up @@ -451,7 +452,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
rt = ip_route_output_flow(net, fl4, sk, 0);
if (IS_ERR(rt))
goto no_route;
if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
Expand Down
Loading