Skip to content

Commit afd20b9

Browse files
q2ven authored and kuba-moo committed
af_unix: Replace the big lock with small locks.
The hash table of AF_UNIX sockets is protected by the single lock. This patch replaces it with per-hash locks.

The effect is noticeable when we handle multiple sockets simultaneously.

Here is a test result on an EC2 c5.24xlarge instance. It shows latency (under 10us only) in unix_insert_unbound_socket() while 64 CPUs creating 1024 sockets for each in parallel.

Without this patch:

     nsec : count   distribution
        0 : 179    |                                        |
      500 : 3021   |*********                               |
     1000 : 6271   |*******************                     |
     1500 : 6318   |*******************                     |
     2000 : 5828   |*****************                       |
     2500 : 5124   |***************                         |
     3000 : 4426   |*************                           |
     3500 : 3672   |***********                             |
     4000 : 3138   |*********                               |
     4500 : 2811   |********                                |
     5000 : 2384   |*******                                 |
     5500 : 2023   |******                                  |
     6000 : 1954   |*****                                   |
     6500 : 1737   |*****                                   |
     7000 : 1749   |*****                                   |
     7500 : 1520   |****                                    |
     8000 : 1469   |****                                    |
     8500 : 1394   |****                                    |
     9000 : 1232   |***                                     |
     9500 : 1138   |***                                     |
    10000 : 994    |***                                     |

With this patch:

     nsec : count   distribution
        0 : 1634   |****                                    |
      500 : 13170  |****************************************|
     1000 : 13156  |*************************************** |
     1500 : 9010   |***************************             |
     2000 : 6363   |*******************                     |
     2500 : 4443   |*************                           |
     3000 : 3240   |*********                               |
     3500 : 2549   |*******                                 |
     4000 : 1872   |*****                                   |
     4500 : 1504   |****                                    |
     5000 : 1247   |***                                     |
     5500 : 1035   |***                                     |
     6000 : 889    |**                                      |
     6500 : 744    |**                                      |
     7000 : 634    |*                                       |
     7500 : 498    |*                                       |
     8000 : 433    |*                                       |
     8500 : 355    |*                                       |
     9000 : 336    |*                                       |
     9500 : 284    |                                        |
    10000 : 243    |                                        |

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent e6b4b87 commit afd20b9

File tree

3 files changed

+71
-49
lines changed

3 files changed

+71
-49
lines changed

include/net/af_unix.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ struct sock *unix_peer_get(struct sock *sk);
2020
#define UNIX_HASH_BITS 8
2121

2222
extern unsigned int unix_tot_inflight;
23-
extern spinlock_t unix_table_lock;
23+
extern spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
2424
extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
2525

2626
struct unix_address {

net/unix/af_unix.c

Lines changed: 60 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,14 @@
117117

118118
#include "scm.h"
119119

120+
spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
121+
EXPORT_SYMBOL_GPL(unix_table_locks);
120122
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
121123
EXPORT_SYMBOL_GPL(unix_socket_table);
122-
DEFINE_SPINLOCK(unix_table_lock);
123-
EXPORT_SYMBOL_GPL(unix_table_lock);
124124
static atomic_long_t unix_nr_socks;
125125

126126
/* SMP locking strategy:
127-
* hash table is protected with spinlock unix_table_lock
127+
* hash table is protected with spinlock unix_table_locks
128128
* each socket state is protected by separate spin lock.
129129
*/
130130

@@ -157,6 +157,25 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
157157
return hash & (UNIX_HASH_SIZE - 1);
158158
}
159159

160+
static void unix_table_double_lock(unsigned int hash1, unsigned int hash2)
161+
{
162+
/* hash1 and hash2 is never the same because
163+
* one is between 0 and UNIX_HASH_SIZE - 1, and
164+
* another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2.
165+
*/
166+
if (hash1 > hash2)
167+
swap(hash1, hash2);
168+
169+
spin_lock(&unix_table_locks[hash1]);
170+
spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING);
171+
}
172+
173+
static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2)
174+
{
175+
spin_unlock(&unix_table_locks[hash1]);
176+
spin_unlock(&unix_table_locks[hash2]);
177+
}
178+
160179
#ifdef CONFIG_SECURITY_NETWORK
161180
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
162181
{
@@ -298,16 +317,16 @@ static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr,
298317

299318
static void unix_remove_socket(struct sock *sk)
300319
{
301-
spin_lock(&unix_table_lock);
320+
spin_lock(&unix_table_locks[sk->sk_hash]);
302321
__unix_remove_socket(sk);
303-
spin_unlock(&unix_table_lock);
322+
spin_unlock(&unix_table_locks[sk->sk_hash]);
304323
}
305324

306325
static void unix_insert_unbound_socket(struct sock *sk)
307326
{
308-
spin_lock(&unix_table_lock);
327+
spin_lock(&unix_table_locks[sk->sk_hash]);
309328
__unix_insert_socket(sk);
310-
spin_unlock(&unix_table_lock);
329+
spin_unlock(&unix_table_locks[sk->sk_hash]);
311330
}
312331

313332
static struct sock *__unix_find_socket_byname(struct net *net,
@@ -335,11 +354,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
335354
{
336355
struct sock *s;
337356

338-
spin_lock(&unix_table_lock);
357+
spin_lock(&unix_table_locks[hash]);
339358
s = __unix_find_socket_byname(net, sunname, len, hash);
340359
if (s)
341360
sock_hold(s);
342-
spin_unlock(&unix_table_lock);
361+
spin_unlock(&unix_table_locks[hash]);
343362
return s;
344363
}
345364

@@ -348,19 +367,18 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
348367
unsigned int hash = unix_bsd_hash(i);
349368
struct sock *s;
350369

351-
spin_lock(&unix_table_lock);
370+
spin_lock(&unix_table_locks[hash]);
352371
sk_for_each(s, &unix_socket_table[hash]) {
353372
struct dentry *dentry = unix_sk(s)->path.dentry;
354373

355374
if (dentry && d_backing_inode(dentry) == i) {
356375
sock_hold(s);
357-
goto found;
376+
spin_unlock(&unix_table_locks[hash]);
377+
return s;
358378
}
359379
}
360-
s = NULL;
361-
found:
362-
spin_unlock(&unix_table_lock);
363-
return s;
380+
spin_unlock(&unix_table_locks[hash]);
381+
return NULL;
364382
}
365383

366384
/* Support code for asymmetrically connected dgram sockets
@@ -1053,11 +1071,11 @@ static struct sock *unix_find_other(struct net *net,
10531071

10541072
static int unix_autobind(struct sock *sk)
10551073
{
1074+
unsigned int new_hash, old_hash = sk->sk_hash;
10561075
struct unix_sock *u = unix_sk(sk);
10571076
struct unix_address *addr;
10581077
unsigned int retries = 0;
10591078
static u32 ordernum = 1;
1060-
unsigned int new_hash;
10611079
int err;
10621080

10631081
err = mutex_lock_interruptible(&u->bindlock);
@@ -1081,12 +1099,13 @@ static int unix_autobind(struct sock *sk)
10811099
offsetof(struct sockaddr_un, sun_path) + 1;
10821100

10831101
new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1084-
spin_lock(&unix_table_lock);
1102+
unix_table_double_lock(old_hash, new_hash);
10851103
ordernum = (ordernum+1)&0xFFFFF;
10861104

10871105
if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
10881106
new_hash)) {
1089-
spin_unlock(&unix_table_lock);
1107+
unix_table_double_unlock(old_hash, new_hash);
1108+
10901109
/*
10911110
* __unix_find_socket_byname() may take long time if many names
10921111
* are already in use.
@@ -1102,7 +1121,7 @@ static int unix_autobind(struct sock *sk)
11021121
}
11031122

11041123
__unix_set_addr_hash(sk, addr, new_hash);
1105-
spin_unlock(&unix_table_lock);
1124+
unix_table_double_unlock(old_hash, new_hash);
11061125
err = 0;
11071126

11081127
out: mutex_unlock(&u->bindlock);
@@ -1114,10 +1133,10 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
11141133
{
11151134
umode_t mode = S_IFSOCK |
11161135
(SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1136+
unsigned int new_hash, old_hash = sk->sk_hash;
11171137
struct unix_sock *u = unix_sk(sk);
11181138
struct user_namespace *ns; // barf...
11191139
struct unix_address *addr;
1120-
unsigned int new_hash;
11211140
struct dentry *dentry;
11221141
struct path parent;
11231142
int err;
@@ -1156,11 +1175,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
11561175
goto out_unlock;
11571176

11581177
new_hash = unix_bsd_hash(d_backing_inode(dentry));
1159-
spin_lock(&unix_table_lock);
1178+
unix_table_double_lock(old_hash, new_hash);
11601179
u->path.mnt = mntget(parent.mnt);
11611180
u->path.dentry = dget(dentry);
11621181
__unix_set_addr_hash(sk, addr, new_hash);
1163-
spin_unlock(&unix_table_lock);
1182+
unix_table_double_unlock(old_hash, new_hash);
11641183
mutex_unlock(&u->bindlock);
11651184
done_path_create(&parent, dentry);
11661185
return 0;
@@ -1181,9 +1200,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
11811200
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
11821201
int addr_len)
11831202
{
1203+
unsigned int new_hash, old_hash = sk->sk_hash;
11841204
struct unix_sock *u = unix_sk(sk);
11851205
struct unix_address *addr;
1186-
unsigned int new_hash;
11871206
int err;
11881207

11891208
addr = unix_create_addr(sunaddr, addr_len);
@@ -1200,19 +1219,19 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
12001219
}
12011220

12021221
new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1203-
spin_lock(&unix_table_lock);
1222+
unix_table_double_lock(old_hash, new_hash);
12041223

12051224
if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
12061225
new_hash))
12071226
goto out_spin;
12081227

12091228
__unix_set_addr_hash(sk, addr, new_hash);
1210-
spin_unlock(&unix_table_lock);
1229+
unix_table_double_unlock(old_hash, new_hash);
12111230
mutex_unlock(&u->bindlock);
12121231
return 0;
12131232

12141233
out_spin:
1215-
spin_unlock(&unix_table_lock);
1234+
unix_table_double_unlock(old_hash, new_hash);
12161235
err = -EADDRINUSE;
12171236
out_mutex:
12181237
mutex_unlock(&u->bindlock);
@@ -1519,9 +1538,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
15191538
*
15201539
* The contents of *(otheru->addr) and otheru->path
15211540
* are seen fully set up here, since we have found
1522-
* otheru in hash under unix_table_lock. Insertion
1541+
* otheru in hash under unix_table_locks. Insertion
15231542
* into the hash chain we'd found it in had been done
1524-
* in an earlier critical area protected by unix_table_lock,
1543+
* in an earlier critical area protected by unix_table_locks,
15251544
* the same one where we'd set *(otheru->addr) contents,
15261545
* as well as otheru->path and otheru->addr itself.
15271546
*
@@ -3198,7 +3217,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
31983217
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
31993218

32003219
#define get_bucket(x) ((x) >> BUCKET_SPACE)
3201-
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
3220+
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
32023221
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
32033222

32043223
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
@@ -3222,7 +3241,7 @@ static struct sock *unix_next_socket(struct seq_file *seq,
32223241
struct sock *sk,
32233242
loff_t *pos)
32243243
{
3225-
unsigned long bucket;
3244+
unsigned long bucket = get_bucket(*pos);
32263245

32273246
while (sk > (struct sock *)SEQ_START_TOKEN) {
32283247
sk = sk_next(sk);
@@ -3233,23 +3252,21 @@ static struct sock *unix_next_socket(struct seq_file *seq,
32333252
}
32343253

32353254
do {
3255+
spin_lock(&unix_table_locks[bucket]);
32363256
sk = unix_from_bucket(seq, pos);
32373257
if (sk)
32383258
return sk;
32393259

32403260
next_bucket:
3241-
bucket = get_bucket(*pos) + 1;
3261+
spin_unlock(&unix_table_locks[bucket++]);
32423262
*pos = set_bucket_offset(bucket, 1);
32433263
} while (bucket < ARRAY_SIZE(unix_socket_table));
32443264

32453265
return NULL;
32463266
}
32473267

32483268
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3249-
__acquires(unix_table_lock)
32503269
{
3251-
spin_lock(&unix_table_lock);
3252-
32533270
if (!*pos)
32543271
return SEQ_START_TOKEN;
32553272

@@ -3266,9 +3283,11 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
32663283
}
32673284

32683285
static void unix_seq_stop(struct seq_file *seq, void *v)
3269-
__releases(unix_table_lock)
32703286
{
3271-
spin_unlock(&unix_table_lock);
3287+
struct sock *sk = v;
3288+
3289+
if (sk)
3290+
spin_unlock(&unix_table_locks[sk->sk_hash]);
32723291
}
32733292

32743293
static int unix_seq_show(struct seq_file *seq, void *v)
@@ -3293,7 +3312,7 @@ static int unix_seq_show(struct seq_file *seq, void *v)
32933312
(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
32943313
sock_i_ino(s));
32953314

3296-
if (u->addr) { // under unix_table_lock here
3315+
if (u->addr) { // under unix_table_locks here
32973316
int i, len;
32983317
seq_putc(seq, ' ');
32993318

@@ -3452,10 +3471,13 @@ static void __init bpf_iter_register(void)
34523471

34533472
static int __init af_unix_init(void)
34543473
{
3455-
int rc = -1;
3474+
int i, rc = -1;
34563475

34573476
BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
34583477

3478+
for (i = 0; i < 2 * UNIX_HASH_SIZE; i++)
3479+
spin_lock_init(&unix_table_locks[i]);
3480+
34593481
rc = proto_register(&unix_dgram_proto, 1);
34603482
if (rc != 0) {
34613483
pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);

net/unix/diag.c

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
1515
{
16-
/* might or might not have unix_table_lock */
16+
/* might or might not have unix_table_locks */
1717
struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
1818

1919
if (!addr)
@@ -204,13 +204,13 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
204204
s_slot = cb->args[0];
205205
num = s_num = cb->args[1];
206206

207-
spin_lock(&unix_table_lock);
208207
for (slot = s_slot;
209208
slot < ARRAY_SIZE(unix_socket_table);
210209
s_num = 0, slot++) {
211210
struct sock *sk;
212211

213212
num = 0;
213+
spin_lock(&unix_table_locks[slot]);
214214
sk_for_each(sk, &unix_socket_table[slot]) {
215215
if (!net_eq(sock_net(sk), net))
216216
continue;
@@ -221,14 +221,16 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
221221
if (sk_diag_dump(sk, skb, req,
222222
NETLINK_CB(cb->skb).portid,
223223
cb->nlh->nlmsg_seq,
224-
NLM_F_MULTI) < 0)
224+
NLM_F_MULTI) < 0) {
225+
spin_unlock(&unix_table_locks[slot]);
225226
goto done;
227+
}
226228
next:
227229
num++;
228230
}
231+
spin_unlock(&unix_table_locks[slot]);
229232
}
230233
done:
231-
spin_unlock(&unix_table_lock);
232234
cb->args[0] = slot;
233235
cb->args[1] = num;
234236

@@ -237,21 +239,19 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
237239

238240
static struct sock *unix_lookup_by_ino(unsigned int ino)
239241
{
240-
int i;
241242
struct sock *sk;
243+
int i;
242244

243-
spin_lock(&unix_table_lock);
244245
for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) {
246+
spin_lock(&unix_table_locks[i]);
245247
sk_for_each(sk, &unix_socket_table[i])
246248
if (ino == sock_i_ino(sk)) {
247249
sock_hold(sk);
248-
spin_unlock(&unix_table_lock);
249-
250+
spin_unlock(&unix_table_locks[i]);
250251
return sk;
251252
}
253+
spin_unlock(&unix_table_locks[i]);
252254
}
253-
254-
spin_unlock(&unix_table_lock);
255255
return NULL;
256256
}
257257

0 commit comments

Comments
 (0)