Merge master.kernel.org:/pub/scm/linux/kernel/git/mingo/mutex-2.6

FriedrichWilhelmNietzsche · Jan 10, 2006 · 80c0531 · 80c0531
2 parents a457aa6 + 11b751a
commit 80c0531
Show file tree

Hide file tree

Showing 198 changed files with 2,754 additions and 649 deletions.
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
@@ -222,24 +222,30 @@
    <title>Two Main Types of Kernel Locks: Spinlocks and Semaphores</title>
 
    <para>
-     There are two main types of kernel locks.  The fundamental type
+     There are three main types of kernel locks.  The fundamental type
      is the spinlock 
      (<filename class="headerfile">include/asm/spinlock.h</filename>),
      which is a very simple single-holder lock: if you can't get the 
      spinlock, you keep trying (spinning) until you can.  Spinlocks are 
      very small and fast, and can be used anywhere.
    </para>
    <para>
-     The second type is a semaphore
+     The second type is a mutex
+     (<filename class="headerfile">include/linux/mutex.h</filename>): it
+     is like a spinlock, but you may block holding a mutex.
+     If you can't lock a mutex, your task will suspend itself, and be woken
+     up when the mutex is released.  This means the CPU can do something
+     else while you are waiting.  There are many cases when you simply
+     can't sleep (see <xref linkend="sleeping-things"/>), and so have to
+     use a spinlock instead.
+   </para>
+   <para>
+     The third type is a semaphore
      (<filename class="headerfile">include/asm/semaphore.h</filename>): it
      can have more than one holder at any time (the number decided at
      initialization time), although it is most commonly used as a
-     single-holder lock (a mutex).  If you can't get a semaphore,
-     your task will put itself on the queue, and be woken up when the
-     semaphore is released.  This means the CPU will do something
-     else while you are waiting, but there are many cases when you
-     simply can't sleep (see <xref linkend="sleeping-things"/>), and so
-     have to use a spinlock instead.
+     single-holder lock (a mutex).  If you can't get a semaphore, your
+     task will be suspended and later on woken up - just like for mutexes.
    </para>
    <para>
      Neither type of lock is recursive: see

diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt
@@ -0,0 +1,135 @@
+Generic Mutex Subsystem
+
+started by Ingo Molnar <mingo@redhat.com>
+
+  "Why on earth do we need a new mutex subsystem, and what's wrong
+   with semaphores?"
+
+firstly, there's nothing wrong with semaphores. But if the simpler
+mutex semantics are sufficient for your code, then there are a couple
+of advantages of mutexes:
+
+ - 'struct mutex' is smaller on most architectures: .e.g on x86,
+   'struct semaphore' is 20 bytes, 'struct mutex' is 16 bytes.
+   A smaller structure size means less RAM footprint, and better
+   CPU-cache utilization.
+
+ - tighter code. On x86 i get the following .text sizes when
+   switching all mutex-alike semaphores in the kernel to the mutex
+   subsystem:
+
+        text    data     bss     dec     hex filename
+     3280380  868188  396860 4545428  455b94 vmlinux-semaphore
+     3255329  865296  396732 4517357  44eded vmlinux-mutex
+
+   that's 25051 bytes of code saved, or a 0.76% win - off the hottest
+   codepaths of the kernel. (The .data savings are 2892 bytes, or 0.33%)
+   Smaller code means better icache footprint, which is one of the
+   major optimization goals in the Linux kernel currently.
+
+ - the mutex subsystem is slightly faster and has better scalability for
+   contended workloads. On an 8-way x86 system, running a mutex-based
+   kernel and testing creat+unlink+close (of separate, per-task files)
+   in /tmp with 16 parallel tasks, the average number of ops/sec is:
+
+    Semaphores:                        Mutexes:
+
+    $ ./test-mutex V 16 10             $ ./test-mutex V 16 10
+    8 CPUs, running 16 tasks.          8 CPUs, running 16 tasks.
+    checking VFS performance.          checking VFS performance.
+    avg loops/sec:      34713          avg loops/sec:      84153
+    CPU utilization:    63%            CPU utilization:    22%
+
+   i.e. in this workload, the mutex based kernel was 2.4 times faster
+   than the semaphore based kernel, _and_ it also had 2.8 times less CPU
+   utilization. (In terms of 'ops per CPU cycle', the semaphore kernel
+   performed 551 ops/sec per 1% of CPU time used, while the mutex kernel
+   performed 3825 ops/sec per 1% of CPU time used - it was 6.9 times
+   more efficient.)
+
+   the scalability difference is visible even on a 2-way P4 HT box:
+
+    Semaphores:                        Mutexes:
+
+    $ ./test-mutex V 16 10             $ ./test-mutex V 16 10
+    4 CPUs, running 16 tasks.          8 CPUs, running 16 tasks.
+    checking VFS performance.          checking VFS performance.
+    avg loops/sec:      127659         avg loops/sec:      181082
+    CPU utilization:    100%           CPU utilization:    34%
+
+   (the straight performance advantage of mutexes is 41%, the per-cycle
+    efficiency of mutexes is 4.1 times better.)
+
+ - there are no fastpath tradeoffs, the mutex fastpath is just as tight
+   as the semaphore fastpath. On x86, the locking fastpath is 2
+   instructions:
+
+    c0377ccb <mutex_lock>:
+    c0377ccb:       f0 ff 08                lock decl (%eax)
+    c0377cce:       78 0e                   js     c0377cde <.text.lock.mutex>
+    c0377cd0:       c3                      ret
+
+   the unlocking fastpath is equally tight:
+
+    c0377cd1 <mutex_unlock>:
+    c0377cd1:       f0 ff 00                lock incl (%eax)
+    c0377cd4:       7e 0f                   jle    c0377ce5 <.text.lock.mutex+0x7>
+    c0377cd6:       c3                      ret
+
+ - 'struct mutex' semantics are well-defined and are enforced if
+   CONFIG_DEBUG_MUTEXES is turned on. Semaphores on the other hand have
+   virtually no debugging code or instrumentation. The mutex subsystem
+   checks and enforces the following rules:
+
+   * - only one task can hold the mutex at a time
+   * - only the owner can unlock the mutex
+   * - multiple unlocks are not permitted
+   * - recursive locking is not permitted
+   * - a mutex object must be initialized via the API
+   * - a mutex object must not be initialized via memset or copying
+   * - task may not exit with mutex held
+   * - memory areas where held locks reside must not be freed
+   * - held mutexes must not be reinitialized
+   * - mutexes may not be used in irq contexts
+
+   furthermore, there are also convenience features in the debugging
+   code:
+
+   * - uses symbolic names of mutexes, whenever they are printed in debug output
+   * - point-of-acquire tracking, symbolic lookup of function names
+   * - list of all locks held in the system, printout of them
+   * - owner tracking
+   * - detects self-recursing locks and prints out all relevant info
+   * - detects multi-task circular deadlocks and prints out all affected
+   *   locks and tasks (and only those tasks)
+
+Disadvantages
+-------------
+
+The stricter mutex API means you cannot use mutexes the same way you
+can use semaphores: e.g. they cannot be used from an interrupt context,
+nor can they be unlocked from a different context that which acquired
+it. [ I'm not aware of any other (e.g. performance) disadvantages from
+using mutexes at the moment, please let me know if you find any. ]
+
+Implementation of mutexes
+-------------------------
+
+'struct mutex' is the new mutex type, defined in include/linux/mutex.h
+and implemented in kernel/mutex.c. It is a counter-based mutex with a
+spinlock and a wait-list. The counter has 3 states: 1 for "unlocked",
+0 for "locked" and negative numbers (usually -1) for "locked, potential
+waiters queued".
+
+the APIs of 'struct mutex' have been streamlined:
+
+ DEFINE_MUTEX(name);
+
+ mutex_init(mutex);
+
+ void mutex_lock(struct mutex *lock);
+ int  mutex_lock_interruptible(struct mutex *lock);
+ int  mutex_trylock(struct mutex *lock);
+ void mutex_unlock(struct mutex *lock);
+ int  mutex_is_locked(struct mutex *lock);
+
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
@@ -222,6 +222,10 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 {
 	if (PageHighMem(page))
 		return;
+	if (!enable)
+		mutex_debug_check_no_locks_freed(page_address(page),
+						 page_address(page+numpages));
+
 	/* the return value is ignored - the calls cannot fail,
 	 * large pages are disabled at boot time.
 	 */

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -137,7 +137,7 @@ spufs_delete_inode(struct inode *inode)
 static void spufs_prune_dir(struct dentry *dir)
 {
 	struct dentry *dentry, *tmp;
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) {
 		spin_lock(&dcache_lock);
 		spin_lock(&dentry->d_lock);
@@ -154,23 +154,23 @@ static void spufs_prune_dir(struct dentry *dir)
 		}
 	}
 	shrink_dcache_parent(dir);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 }
 
 static int spufs_rmdir(struct inode *root, struct dentry *dir_dentry)
 {
 	struct spu_context *ctx;
 
 	/* remove all entries */
-	down(&root->i_sem);
+	mutex_lock(&root->i_mutex);
 	spufs_prune_dir(dir_dentry);
-	up(&root->i_sem);
+	mutex_unlock(&root->i_mutex);
 
 	/* We have to give up the mm_struct */
 	ctx = SPUFS_I(dir_dentry->d_inode)->i_ctx;
 	spu_forget(ctx);
 
-	/* XXX Do we need to hold i_sem here ? */
+	/* XXX Do we need to hold i_mutex here ? */
 	return simple_rmdir(root, dir_dentry);
 }
 
@@ -330,7 +330,7 @@ long spufs_create_thread(struct nameidata *nd,
 out_dput:
 	dput(dentry);
 out_dir:
-	up(&nd->dentry->d_inode->i_sem);
+	mutex_unlock(&nd->dentry->d_inode->i_mutex);
 out:
 	return ret;
 }

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
@@ -215,7 +215,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	unsigned offset, bv_offs;
 	int len, ret;
 
-	down(&mapping->host->i_sem);
+	mutex_lock(&mapping->host->i_mutex);
 	index = pos >> PAGE_CACHE_SHIFT;
 	offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
 	bv_offs = bvec->bv_offset;
@@ -278,7 +278,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	}
 	ret = 0;
 out:
-	up(&mapping->host->i_sem);
+	mutex_unlock(&mapping->host->i_mutex);
 	return ret;
 unlock:
 	unlock_page(page);
@@ -527,12 +527,12 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio)
 	lo->lo_pending++;
 	loop_add_bio(lo, old_bio);
 	spin_unlock_irq(&lo->lo_lock);
-	up(&lo->lo_bh_mutex);
+	complete(&lo->lo_bh_done);
 	return 0;
 
 out:
 	if (lo->lo_pending == 0)
-		up(&lo->lo_bh_mutex);
+		complete(&lo->lo_bh_done);
 	spin_unlock_irq(&lo->lo_lock);
 	bio_io_error(old_bio, old_bio->bi_size);
 	return 0;
@@ -593,23 +593,20 @@ static int loop_thread(void *data)
 	lo->lo_pending = 1;
 
 	/*
-	 * up sem, we are running
+	 * complete it, we are running
 	 */
-	up(&lo->lo_sem);
+	complete(&lo->lo_done);
 
 	for (;;) {
 		int pending;
 
-		/*
-		 * interruptible just to not contribute to load avg
-		 */
-		if (down_interruptible(&lo->lo_bh_mutex))
+		if (wait_for_completion_interruptible(&lo->lo_bh_done))
 			continue;
 
 		spin_lock_irq(&lo->lo_lock);
 
 		/*
-		 * could be upped because of tear-down, not pending work
+		 * could be completed because of tear-down, not pending work
 		 */
 		if (unlikely(!lo->lo_pending)) {
 			spin_unlock_irq(&lo->lo_lock);
@@ -632,7 +629,7 @@ static int loop_thread(void *data)
 			break;
 	}
 
-	up(&lo->lo_sem);
+	complete(&lo->lo_done);
 	return 0;
 }
 
@@ -843,7 +840,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	set_blocksize(bdev, lo_blocksize);
 
 	kernel_thread(loop_thread, lo, CLONE_KERNEL);
-	down(&lo->lo_sem);
+	wait_for_completion(&lo->lo_done);
 	return 0;
 
  out_putf:
@@ -909,10 +906,10 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	lo->lo_state = Lo_rundown;
 	lo->lo_pending--;
 	if (!lo->lo_pending)
-		up(&lo->lo_bh_mutex);
+		complete(&lo->lo_bh_done);
 	spin_unlock_irq(&lo->lo_lock);
 
-	down(&lo->lo_sem);
+	wait_for_completion(&lo->lo_done);
 
 	lo->lo_backing_file = NULL;
 
@@ -1289,8 +1286,8 @@ static int __init loop_init(void)
 		if (!lo->lo_queue)
 			goto out_mem4;
 		init_MUTEX(&lo->lo_ctl_mutex);
-		init_MUTEX_LOCKED(&lo->lo_sem);
-		init_MUTEX_LOCKED(&lo->lo_bh_mutex);
+		init_completion(&lo->lo_done);
+		init_completion(&lo->lo_bh_done);
 		lo->lo_number = i;
 		spin_lock_init(&lo->lo_lock);
 		disk->major = LOOP_MAJOR;

diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
@@ -27,8 +27,8 @@
 #include <linux/time.h>
 #include <linux/hdreg.h>
 #include <linux/dma-mapping.h>
+#include <linux/completion.h>
 #include <asm/io.h>
-#include <asm/semaphore.h>
 #include <asm/uaccess.h>
 
 #if 0
@@ -303,7 +303,7 @@ struct carm_host {
 
 	struct work_struct		fsm_task;
 
-	struct semaphore		probe_sem;
+	struct completion		probe_comp;
 };
 
 struct carm_response {
@@ -1346,7 +1346,7 @@ static void carm_fsm_task (void *_data)
 	}
 
 	case HST_PROBE_FINISHED:
-		up(&host->probe_sem);
+		complete(&host->probe_comp);
 		break;
 
 	case HST_ERROR:
@@ -1622,7 +1622,7 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	host->flags = pci_dac ? FL_DAC : 0;
 	spin_lock_init(&host->lock);
 	INIT_WORK(&host->fsm_task, carm_fsm_task, host);
-	init_MUTEX_LOCKED(&host->probe_sem);
+	init_completion(&host->probe_comp);
 
 	for (i = 0; i < ARRAY_SIZE(host->req); i++)
 		host->req[i].tag = i;
@@ -1691,8 +1691,8 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto err_out_free_irq;
 
-	DPRINTK("waiting for probe_sem\n");
-	down(&host->probe_sem);
+	DPRINTK("waiting for probe_comp\n");
+	wait_for_completion(&host->probe_comp);
 
 	printk(KERN_INFO "%s: pci %s, ports %d, io %lx, irq %u, major %d\n",
 	       host->name, pci_name(pdev), (int) CARM_MAX_PORTS,