Skip to content

Commit

Permalink
ipc: optimize semget/shmget/msgget for lots of keys
Browse files Browse the repository at this point in the history
ipc_findkey() used to scan all objects to look for the wanted key.  This
is slow when using a high number of keys.  This change adds an rhashtable
of kern_ipc_perm objects in ipc_ids, so that one lookup cease to be O(n).

This change gives a 865% improvement of benchmark reaim.jobs_per_min on a
56 threads Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz with 256G memory [1]

Other (more micro) benchmark results, by the author: On an i5 laptop, the
following loop executed right after a reboot took, without and with this
change:

    for (int i = 0, k=0x424242; i < KEYS; ++i)
        semget(k++, 1, IPC_CREAT | 0600);

                 total       total          max single  max single
   KEYS        without        with        call without   call with

      1            3.5         4.9   µs            3.5         4.9
     10            7.6         8.6   µs            3.7         4.7
     32           16.2        15.9   µs            4.3         5.3
    100           72.9        41.8   µs            3.7         4.7
   1000        5,630.0       502.0   µs             *           *
  10000    1,340,000.0     7,240.0   µs             *           *
  31900   17,600,000.0    22,200.0   µs             *           *

 *: unreliable measure: high variance

The duration for a lookup-only usage was obtained by the same loop once
the keys are present:

                 total       total          max single  max single
   KEYS        without        with        call without   call with

      1            2.1         2.5   µs            2.1         2.5
     10            4.5         4.8   µs            2.2         2.3
     32           13.0        10.8   µs            2.3         2.8
    100           82.9        25.1   µs             *          2.3
   1000        5,780.0       217.0   µs             *           *
  10000    1,470,000.0     2,520.0   µs             *           *
  31900   17,400,000.0     7,810.0   µs             *           *

Finally, executing each semget() in a new process gave, when still
summing only the durations of these syscalls:

creation:
                 total       total
   KEYS        without        with

      1            3.7         5.0   µs
     10           32.9        36.7   µs
     32          125.0       109.0   µs
    100          523.0       353.0   µs
   1000       20,300.0     3,280.0   µs
  10000    2,470,000.0    46,700.0   µs
  31900   27,800,000.0   219,000.0   µs

lookup-only:
                 total       total
   KEYS        without        with

      1            2.5         2.7   µs
     10           25.4        24.4   µs
     32          106.0        72.6   µs
    100          591.0       352.0   µs
   1000       22,400.0     2,250.0   µs
  10000    2,510,000.0    25,700.0   µs
  31900   28,200,000.0   115,000.0   µs

[1] http://lkml.kernel.org/r/20170814060507.GE23258@yexl-desktop

Link: http://lkml.kernel.org/r/20170815194954.ck32ta2z35yuzpwp@debix
Signed-off-by: Guillaume Knispel <guillaume.knispel@supersonicimagine.com>
Reviewed-by: Marc Pardo <marc.pardo@supersonicimagine.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: Guillaume Knispel <guillaume.knispel@supersonicimagine.com>
Cc: Marc Pardo <marc.pardo@supersonicimagine.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  • Loading branch information
Guillaume Knispel authored and torvalds committed Sep 9, 2017
1 parent e4243b8 commit 0cfb6ae
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 50 deletions.
3 changes: 3 additions & 0 deletions include/linux/ipc.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <linux/spinlock.h>
#include <linux/uidgid.h>
#include <linux/rhashtable.h>
#include <uapi/linux/ipc.h>
#include <linux/refcount.h>

Expand All @@ -22,6 +23,8 @@ struct kern_ipc_perm {
unsigned long seq;
void *security;

struct rhash_head khtnode;

struct rcu_head rcu;
refcount_t refcount;
} ____cacheline_aligned_in_smp __randomize_layout;
Expand Down
3 changes: 3 additions & 0 deletions include/linux/ipc_namespace.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,18 @@
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/refcount.h>
#include <linux/rhashtable.h>

struct user_namespace;

struct ipc_ids {
int in_use;
unsigned short seq;
bool tables_initialized;
struct rw_semaphore rwsem;
struct idr ipcs_idr;
int next_id;
struct rhashtable key_ht;
};

struct ipc_namespace {
Expand Down
10 changes: 6 additions & 4 deletions ipc/msg.c
Original file line number Diff line number Diff line change
Expand Up @@ -1011,22 +1011,23 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
}


void msg_init_ns(struct ipc_namespace *ns)
int msg_init_ns(struct ipc_namespace *ns)
{
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
ns->msg_ctlmni = MSGMNI;

atomic_set(&ns->msg_bytes, 0);
atomic_set(&ns->msg_hdrs, 0);
ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
return ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
}

#ifdef CONFIG_IPC_NS
void msg_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &msg_ids(ns), freeque);
idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
}
#endif

Expand Down Expand Up @@ -1058,11 +1059,12 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
}
#endif

void __init msg_init(void)
int __init msg_init(void)
{
msg_init_ns(&init_ipc_ns);
const int err = msg_init_ns(&init_ipc_ns);

ipc_init_proc_interface("sysvipc/msg",
" key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
IPC_MSG_IDS, sysvipc_msg_proc_show);
return err;
}
20 changes: 16 additions & 4 deletions ipc/namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,28 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;

err = mq_init_ns(ns);
err = sem_init_ns(ns);
if (err)
goto fail_put;
err = msg_init_ns(ns);
if (err)
goto fail_destroy_sem;
err = shm_init_ns(ns);
if (err)
goto fail_destroy_msg;

sem_init_ns(ns);
msg_init_ns(ns);
shm_init_ns(ns);
err = mq_init_ns(ns);
if (err)
goto fail_destroy_shm;

return ns;

fail_destroy_shm:
shm_exit_ns(ns);
fail_destroy_msg:
msg_exit_ns(ns);
fail_destroy_sem:
sem_exit_ns(ns);
fail_put:
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
Expand Down
11 changes: 7 additions & 4 deletions ipc/sem.c
Original file line number Diff line number Diff line change
Expand Up @@ -183,30 +183,33 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#define sc_semopm sem_ctls[2]
#define sc_semmni sem_ctls[3]

void sem_init_ns(struct ipc_namespace *ns)
int sem_init_ns(struct ipc_namespace *ns)
{
ns->sc_semmsl = SEMMSL;
ns->sc_semmns = SEMMNS;
ns->sc_semopm = SEMOPM;
ns->sc_semmni = SEMMNI;
ns->used_sems = 0;
ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
return ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}

#ifdef CONFIG_IPC_NS
void sem_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &sem_ids(ns), freeary);
idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
}
#endif

void __init sem_init(void)
int __init sem_init(void)
{
sem_init_ns(&init_ipc_ns);
const int err = sem_init_ns(&init_ipc_ns);

ipc_init_proc_interface("sysvipc/sem",
" key semid perms nsems uid gid cuid cgid otime ctime\n",
IPC_SEM_IDS, sysvipc_sem_proc_show);
return err;
}

/**
Expand Down
12 changes: 7 additions & 5 deletions ipc/shm.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,14 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
#endif

void shm_init_ns(struct ipc_namespace *ns)
int shm_init_ns(struct ipc_namespace *ns)
{
ns->shm_ctlmax = SHMMAX;
ns->shm_ctlall = SHMALL;
ns->shm_ctlmni = SHMMNI;
ns->shm_rmid_forced = 0;
ns->shm_tot = 0;
ipc_init_ids(&shm_ids(ns));
return ipc_init_ids(&shm_ids(ns));
}

/*
Expand All @@ -95,7 +95,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
/* Do not find it any more */
shp->shm_perm.key = IPC_PRIVATE;
ipc_set_key_private(&shm_ids(ns), &shp->shm_perm);
shm_unlock(shp);
} else
shm_destroy(ns, shp);
Expand All @@ -106,13 +106,15 @@ void shm_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht);
}
#endif

static int __init ipc_ns_init(void)
{
shm_init_ns(&init_ipc_ns);
return 0;
const int err = shm_init_ns(&init_ipc_ns);
WARN(err, "ipc: sysv shm_init_ns failed: %d\n", err);
return err;
}

pure_initcall(ipc_ns_init);
Expand Down
100 changes: 76 additions & 24 deletions ipc/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,27 +83,46 @@ struct ipc_proc_iface {
*/
static int __init ipc_init(void)
{
sem_init();
msg_init();
int err_sem, err_msg;

err_sem = sem_init();
WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem);
err_msg = msg_init();
WARN(err_msg, "ipc: sysv msg_init failed: %d\n", err_msg);
shm_init();
return 0;

return err_msg ? err_msg : err_sem;
}
device_initcall(ipc_init);

static const struct rhashtable_params ipc_kht_params = {
.head_offset = offsetof(struct kern_ipc_perm, khtnode),
.key_offset = offsetof(struct kern_ipc_perm, key),
.key_len = FIELD_SIZEOF(struct kern_ipc_perm, key),
.locks_mul = 1,
.automatic_shrinking = true,
};

/**
* ipc_init_ids - initialise ipc identifiers
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
* below IPCMNI) then initialise the ids idr.
* below IPCMNI) then initialise the keys hashtable and ids idr.
*/
void ipc_init_ids(struct ipc_ids *ids)
int ipc_init_ids(struct ipc_ids *ids)
{
int err;
ids->in_use = 0;
ids->seq = 0;
ids->next_id = -1;
init_rwsem(&ids->rwsem);
err = rhashtable_init(&ids->key_ht, &ipc_kht_params);
if (err)
return err;
idr_init(&ids->ipcs_idr);
ids->tables_initialized = true;
return 0;
}

#ifdef CONFIG_PROC_FS
Expand Down Expand Up @@ -147,28 +166,20 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
* Returns the locked pointer to the ipc structure if found or NULL
* otherwise. If key is found ipc points to the owning ipc structure
*
* Called with ipc_ids.rwsem held.
* Called with writer ipc_ids.rwsem held.
*/
static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
{
struct kern_ipc_perm *ipc;
int next_id;
int total;

for (total = 0, next_id = 0; total < ids->in_use; next_id++) {
ipc = idr_find(&ids->ipcs_idr, next_id);

if (ipc == NULL)
continue;
struct kern_ipc_perm *ipcp = NULL;

if (ipc->key != key) {
total++;
continue;
}
if (likely(ids->tables_initialized))
ipcp = rhashtable_lookup_fast(&ids->key_ht, &key,
ipc_kht_params);

if (ipcp) {
rcu_read_lock();
ipc_lock_object(ipc);
return ipc;
ipc_lock_object(ipcp);
return ipcp;
}

return NULL;
Expand Down Expand Up @@ -221,13 +232,13 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
{
kuid_t euid;
kgid_t egid;
int id;
int id, err;
int next_id = ids->next_id;

if (size > IPCMNI)
size = IPCMNI;

if (ids->in_use >= size)
if (!ids->tables_initialized || ids->in_use >= size)
return -ENOSPC;

idr_preload(GFP_KERNEL);
Expand All @@ -246,6 +257,15 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
(next_id < 0) ? 0 : ipcid_to_idx(next_id), 0,
GFP_NOWAIT);
idr_preload_end();

if (id >= 0 && new->key != IPC_PRIVATE) {
err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
ipc_kht_params);
if (err < 0) {
idr_remove(&ids->ipcs_idr, id);
id = err;
}
}
if (id < 0) {
spin_unlock(&new->lock);
rcu_read_unlock();
Expand Down Expand Up @@ -377,6 +397,20 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
return err;
}

/**
* ipc_kht_remove - remove an ipc from the key hashtable
* @ids: ipc identifier set
* @ipcp: ipc perm structure containing the key to remove
*
* ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
* before this function is called, and remain locked on the exit.
*/
static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
if (ipcp->key != IPC_PRIVATE)
rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode,
ipc_kht_params);
}

/**
* ipc_rmid - remove an ipc identifier
Expand All @@ -391,10 +425,25 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
int lid = ipcid_to_idx(ipcp->id);

idr_remove(&ids->ipcs_idr, lid);
ipc_kht_remove(ids, ipcp);
ids->in_use--;
ipcp->deleted = true;
}

/**
* ipc_set_key_private - switch the key of an existing ipc to IPC_PRIVATE
* @ids: ipc identifier set
* @ipcp: ipc perm structure containing the key to modify
*
* ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
* before this function is called, and remain locked on the exit.
*/
void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
ipc_kht_remove(ids, ipcp);
ipcp->key = IPC_PRIVATE;
}

int ipc_rcu_getref(struct kern_ipc_perm *ptr)
{
return refcount_inc_not_zero(&ptr->refcount);
Expand Down Expand Up @@ -485,7 +534,7 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out)
}

/**
* ipc_obtain_object
* ipc_obtain_object_idr
* @ids: ipc identifier set
* @id: ipc id to look for
*
Expand All @@ -499,6 +548,9 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
struct kern_ipc_perm *out;
int lid = ipcid_to_idx(id);

if (unlikely(!ids->tables_initialized))
return ERR_PTR(-EINVAL);

out = idr_find(&ids->ipcs_idr, lid);
if (!out)
return ERR_PTR(-EINVAL);
Expand Down
Loading

0 comments on commit 0cfb6ae

Please sign in to comment.