16 changes: 13 additions & 3 deletions Makefile
@@ -1,25 +1,35 @@
 all: build/glib_hash_table build/stl_unordered_map build/boost_unordered_map build/google_sparse_hash_map build/google_dense_hash_map build/qt_qhash build/python_dict build/ruby_hash
 
 build/glib_hash_table: src/glib_hash_table.c Makefile src/template.c
+	if [ ! -d build ]; then mkdir build; fi
 	gcc -ggdb -O2 -lm `pkg-config --cflags --libs glib-2.0` src/glib_hash_table.c -o build/glib_hash_table
 
 build/stl_unordered_map: src/stl_unordered_map.cc Makefile src/template.c
+	if [ ! -d build ]; then mkdir build; fi
 	g++ -O2 -lm src/stl_unordered_map.cc -o build/stl_unordered_map -std=c++0x
 
 build/boost_unordered_map: src/boost_unordered_map.cc Makefile src/template.c
+	if [ ! -d build ]; then mkdir build; fi
 	g++ -O2 -lm src/boost_unordered_map.cc -o build/boost_unordered_map
 
 build/google_sparse_hash_map: src/google_sparse_hash_map.cc Makefile src/template.c
+	if [ ! -d build ]; then mkdir build; fi
 	g++ -O2 -lm src/google_sparse_hash_map.cc -o build/google_sparse_hash_map
 
 build/google_dense_hash_map: src/google_dense_hash_map.cc Makefile src/template.c
+	if [ ! -d build ]; then mkdir build; fi
 	g++ -O2 -lm src/google_dense_hash_map.cc -o build/google_dense_hash_map
 
 build/qt_qhash: src/qt_qhash.cc Makefile src/template.c
-	g++ -O2 -lm `pkg-config --cflags --libs QtCore` src/qt_qhash.cc -o build/qt_qhash
+	if [ ! -d build ]; then mkdir build; fi
+	# Replaced this because it didn't compile
+	# g++ -O2 -lm `pkg-config --cflags --libs QtCore` src/qt_qhash.cc -o build/qt_qhash
+	g++ -O2 -lm -fPIC -I/usr/include/qt5 -I/usr/include/qt5/QtCore -L /usr/lib/x86_64-linux-gnu -lQt5Core src/qt_qhash.cc -o build/qt_qhash
 
 build/python_dict: src/python_dict.c Makefile src/template.c
-	gcc -O2 -lm -I/usr/include/python2.6 -lpython2.6 src/python_dict.c -o build/python_dict
+	if [ ! -d build ]; then mkdir build; fi
+	gcc -O2 -lm -I/usr/include/python2.7 -lpython2.7 src/python_dict.c -o build/python_dict
 
 build/ruby_hash: src/ruby_hash.c Makefile src/template.c
-	gcc -O2 -lm -I/usr/include/ruby-1.9.0 -I /usr/include/ruby-1.9.0/x86_64-linux -lruby1.9 src/ruby_hash.c -o build/ruby_hash
+	if [ ! -d build ]; then mkdir build; fi
+	gcc -O2 -lm -I/usr/include/ruby-1.9.1 -I /usr/include/ruby-1.9.1/x86_64-linux -lruby-1.9.1 src/ruby_hash.c -o build/ruby_hash
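A side note on the Qt rule: Debian/Ubuntu's qtbase5-dev ships a pkg-config file (`Qt5Core.pc`), so the hard-coded include and library paths can likely be replaced with a pkg-config call that mirrors the glib rule. A sketch, assuming `pkg-config --exists Qt5Core` succeeds on your system:

```
build/qt_qhash: src/qt_qhash.cc Makefile src/template.c
	if [ ! -d build ]; then mkdir build; fi
	# Assumption: Qt5Core.pc is installed. -fPIC stays, since Qt5 is built
	# with reduced relocations and requires position-independent code.
	g++ -O2 -lm -fPIC `pkg-config --cflags --libs Qt5Core` src/qt_qhash.cc -o build/qt_qhash
```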
38 changes: 0 additions & 38 deletions README

This file was deleted.

61 changes: 61 additions & 0 deletions README.md
@@ -0,0 +1,61 @@
# Hash Table Shootout

A series of benchmarks that run against several hash table implementations.
Written by Nick Welch in 2010 and updated in 2015 by Jim Belton.

## Release Notes
### 2015-11-26
* Fixed a bug whereby all C++ implementations were hashing pointer values rather than strings.
* Vastly sped up the benchmarks, cutting total run time from hours to minutes.
* Simplified the benchmarks' standard output.
* Added `kjv-bench.py`, which benchmarks each hash table implementation by counting the unique words in the King James Version of the Bible (see the sketch below).
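`kjv-bench.py` itself is not part of this diff. As a rough sketch of the workload it times (building a hash table of word counts), the following runs under both Python 2 and 3; the input path `kjv.txt` and the tokenization rule are assumptions, not the script's actual code:

```
import re

# Read the text and normalize case; kjv.txt is a hypothetical input path.
with open('kjv.txt') as f:
    text = f.read().lower()

counts = {}  # the hash table under test (here, Python's own dict)
for word in re.findall(r"[a-z']+", text):
    counts[word] = counts.get(word, 0) + 1

print("%d unique words" % len(counts))
```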

## How to Run the Benchmarks

First, the prerequisites:

* make (~3.81)
* gcc/g++ (~4.8.2)
* python (hard-coded to 2.7; edit the Makefile to use another version)
* ruby (hard-coded to 1.9.1; edit the Makefile to use another version)

Install the hash libraries:

1. Install glib by running: `sudo apt-get install libglib2.0-dev`
2. Install boost by running: `sudo apt-get install libboost-all-dev`
3. Install google sparsehash by running: `sudo apt-get install libsparsehash-dev`
4. Install qt dev by running: `sudo apt-get install qtbase5-dev`
5. Install python dev by running: `sudo apt-get install python-dev`
6. Install ruby dev by running: `sudo apt-get install ruby-dev`

Now, run:

```
$ make
$ python bench.py # Note: This step takes many minutes.
$ python make_chart_data.py < output | python make_html.py
$ python kjv-bench.py # Runs the new KJV benchmark against all the implementations (fast)
```

Your charts are now in `charts.html`.
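For reference, each line of the `output` file consumed by `make_chart_data.py` is a comma-separated record of the form `benchtype,keys,program,bytes,seconds` (see the `outfile.write` call in the `bench.py` diff below). The values in this sample line are invented for illustration:

```
sequential,2000000,glib_hash_table,81788928,0.726000
```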

## Tweaks and Tips

You can tweak some of the values in `bench.py` to make it run faster at the
cost of less granular data (the knobs are listed below), and you might need
to adjust some of the *tickSize* settings in `charts-template.html`.
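Concretely, the knobs at the top of `bench.py` in this change are the following; the explanatory comments are added here and are not in the source:

```
best_out_of = 2          # attempts per program; raise it for steadier numbers
maxkeys  = 40*1000*1000  # largest number of keys each benchmark reaches
interval = 2*1000*1000   # one measurement is taken every 2 million keys
```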

To run the benchmark at the highest priority possible, do this:

```
$ sudo nice -n-20 ionice -c1 -n0 sudo -u $USER python bench.py
```

You might also want to disable any swap files/partitions so that swapping
doesn't influence performance. The programs will die if they try to
allocate too much memory.
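One minimal way to do that on Linux, assuming root access (remember to re-enable swap afterwards):

```
$ sudo swapoff -a    # disable every swap area listed in /etc/fstab
$ python bench.py
$ sudo swapon -a     # turn swap back on once the run finishes
```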

## Copyright Information

Originally written by Nick Welch in 2010; updated and enhanced by Jim Belton in 2015.
This version is copyright Jim Belton and is licensed under the [MIT License](https://opensource.org/licenses/MIT).
92 changes: 48 additions & 44 deletions bench.py
@@ -1,29 +1,22 @@
-import sys, os, subprocess, signal
+import os
+import re
+import signal
+import subprocess
+import sys
 
-programs = [
-    'glib_hash_table',
-    'stl_unordered_map',
-    'boost_unordered_map',
-    'google_sparse_hash_map',
-    'google_dense_hash_map',
-    'qt_qhash',
-    'python_dict',
-    'ruby_hash',
-]
-
-minkeys  = 2*1000*1000
-maxkeys  = 40*1000*1000
-interval = 2*1000*1000
 best_out_of = 2
+programs = os.listdir("build")
+pattern = re.compile(r'(\d+(?:\.\d+)?) (\d+)')
+maxkeys  = 40*1000*1000
+interval = 2*1000*1000
 
 # for the final run, use this:
 #minkeys = 2*1000*1000
 #maxkeys = 40*1000*1000
 #interval = 2*1000*1000
 #best_out_of = 3
 # and use nice/ionice
 # and shut down to the console
-# and swapoff any swap files/partitions
+# and swapoff any swap files/partitions`
 
 outfile = open('output', 'w')
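The `pattern` regex above encodes the revised protocol: each benchmark binary is now run as `./build/<program> <benchtype> <maxkeys> <interval>` and prints one `<seconds> <bytes>` line per interval. A small demonstration of the parse, with a made-up sample line:

```
import re

pattern = re.compile(r'(\d+(?:\.\d+)?) (\d+)')

line = "0.146000 97288192"  # hypothetical one-interval output: seconds, then bytes
match = pattern.match(line)
print((float(match.group(1)), int(match.group(2))))  # -> (0.146, 97288192)
```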

@@ -33,37 +26,48 @@
 benchtypes = ('sequential', 'random', 'delete', 'sequentialstring', 'randomstring', 'deletestring')
 
 for benchtype in benchtypes:
-    nkeys = minkeys
-    while nkeys <= maxkeys:
-        for program in programs:
-            fastest_attempt = 1000000
-            fastest_attempt_data = ''
+    for program in programs:
+        fastest_attempt = []
 
-            for attempt in range(best_out_of):
-                proc = subprocess.Popen(['./build/'+program, str(nkeys), benchtype], stdout=subprocess.PIPE)
+        for attempt in range(best_out_of):
+            points = []
+            proc = subprocess.Popen(['./build/' + program, benchtype, str(maxkeys), str(interval)], stdout=subprocess.PIPE)
 
-                # wait for the program to fill up memory and spit out its "ready" message
-                try:
-                    runtime = float(proc.stdout.readline().strip())
-                except:
-                    runtime = 0
+            for size in range(interval, maxkeys + 1, interval):
+                # wait for the benchmark to output a time and amount of data memory used
+                line = proc.stdout.readline()
+
+                if not line:
+                    sys.stderr.write("%s: %s %s failed to output all results\n" % (__file__, program, benchtype))
+                    break
+
+                match = pattern.match(line)
+
+                if not match:
+                    sys.stderr.write("%s: %s %s output did not contain time and memory: %s" % (__file__, program, benchtype, line))
+                    break
+
+                points.append((float(match.group(1)), int(match.group(2))))
 
-                ps_proc = subprocess.Popen(['ps up %d | tail -n1' % proc.pid], shell=True, stdout=subprocess.PIPE)
-                nbytes = int(ps_proc.stdout.read().split()[4]) * 1024
-                ps_proc.wait()
-
-                os.kill(proc.pid, signal.SIGKILL)
-                proc.wait()
+            # Shutdown the benchmark if needed
+            ps_proc = subprocess.Popen(['ps up %d | tail -n1' % proc.pid], shell=True, stdout=subprocess.PIPE)
+            nbytes = int(ps_proc.stdout.read().split()[4]) * 1024
+            ps_proc.wait()
+            os.kill(proc.pid, signal.SIGKILL)
+            proc.wait()
 
-                if nbytes and runtime: # otherwise it crashed
-                    line = ','.join(map(str, [benchtype, nkeys, program, nbytes, "%0.6f" % runtime]))
-
-                    if runtime < fastest_attempt:
-                        fastest_attempt = runtime
-                        fastest_attempt_data = line
+            # If there is output and at least as much as any previous attempt
+            if len(points) > 0 and len(points) >= len(fastest_attempt):
+                if len(points) > len(fastest_attempt) or points[len(points) - 1][0] < fastest_attempt[len(fastest_attempt) - 1][0]:
+                    fastest_attempt = points
 
-            if fastest_attempt != 1000000:
-                print >> outfile, fastest_attempt_data
-                print fastest_attempt_data
-
-            nkeys += interval
+        if len(fastest_attempt) > 0:
+            for i in range(len(fastest_attempt)):
+                size = (i + 1) * interval
+                outfile.write("%s,%d,%s,%d,%0.6f\n" % (benchtype, size, program, fastest_attempt[i][1], fastest_attempt[i][0]))
+                print "%s %s: %d keys, %fs, %d Mbytes" % (program, benchtype, size, fastest_attempt[i][0],
+                                                          fastest_attempt[i][1] / 1048576)
+        else:
+            print "No run of %s %s succeeded" % (program, benchtype)