third_party/0001-Avoid-fetching-nullptr-when-use-featrue-filter.patch

From 80bd59cdd6c3c0a3fc9a512e97529a9f2c6883a6 Mon Sep 17 00:00:00 2001
From: lxy268263 <lxy268263@alibaba-inc.com>
Date: Tue, 13 Jul 2021 16:20:45 +0800
Subject: [PATCH] Avoid fetching nullptr when using feature filter.

---
 Makefile                                      |   13 +-
 sparsehash/dense_hash_map_lockless            |  447 ++++
 sparsehash/dense_hash_set_lockless            |  381 ++++
 sparsehash/internal/densehashtable_lockless.h | 2010 +++++++++++++++++
 sparsehash/traits                             |   10 +-
 tests/bench_lockless.cc                       | 1466 ++++++++++++
 tests/dense_hash_map_unittests.cc             |  137 +-
 tests/rwlock.h                                |  224 ++
 8 files changed, 4670 insertions(+), 18 deletions(-)
 create mode 100644 sparsehash/dense_hash_map_lockless
 create mode 100644 sparsehash/dense_hash_set_lockless
 create mode 100644 sparsehash/internal/densehashtable_lockless.h
 create mode 100644 tests/bench_lockless.cc
 create mode 100644 tests/rwlock.h

diff --git a/Makefile b/Makefile
index 8bc9963..3bf2368 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ CPPFLAGS += -I$(TEST_DIR) -I. -isystem $(TEST_DIR)/gtest
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wno-missing-field-initializers -std=c++11 -O3 -D_SPARSEHASH_CI_TESTING_ ${_CXXFLAGS}
 LDFLAGS += -lpthread
 
-all : sparsehash_unittests bench
+all : sparsehash_unittests bench bench_lockless
 
 check : all
 	./sparsehash_unittests
@@ -15,9 +15,15 @@ clean :
 bench.o : $(TEST_DIR)/bench.cc
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(TEST_DIR)/bench.cc
 
+bench_lockless.o : $(TEST_DIR)/bench_lockless.cc
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(TEST_DIR)/bench_lockless.cc
+
 bench: bench.o
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
+bench_lockless: bench_lockless.o
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
 gmock-gtest-all.o :
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(TEST_DIR)/gtest/gmock-gtest-all.cc
 
@@ -39,9 +45,12 @@ hashtable_unittests.o: $(TEST_DIR)/hashtable_unittests.cc
 hashtable_c11_unittests.o: $(TEST_DIR)/hashtable_c11_unittests.cc
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(TEST_DIR)/hashtable_c11_unittests.cc
 
+dense_hash_map_unittests.o: $(TEST_DIR)/dense_hash_map_unittests.cc
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(TEST_DIR)/dense_hash_map_unittests.cc
+
 testmain.o : $(TEST_DIR)/*.cc
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(TEST_DIR)/testmain.cc
 
-sparsehash_unittests : simple_unittests.o sparsetable_unittests.o allocator_unittests.o hashtable_unittests.o hashtable_c11_unittests.o fixture_unittests.o testmain.o gmock-gtest-all.o
+sparsehash_unittests : simple_unittests.o sparsetable_unittests.o allocator_unittests.o hashtable_unittests.o hashtable_c11_unittests.o fixture_unittests.o dense_hash_map_unittests.o testmain.o gmock-gtest-all.o
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
diff --git a/sparsehash/dense_hash_map_lockless b/sparsehash/dense_hash_map_lockless
new file mode 100644
index 0000000..e68891f
--- /dev/null
+++ b/sparsehash/dense_hash_map_lockless
@@ -0,0 +1,447 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ----
+//
+// This is just a very thin wrapper over densehashtable_lockless.h, just
+// like sgi stl's stl_hash_map is a very thin wrapper over
+// stl_hashtable.  The major thing we define is operator[], because
+// we have a concept of a data_type which stl_hashtable doesn't
+// (it only has a key and a value).
+//
+// NOTE: this is exactly like sparse_hash_map.h, with the word
+// "sparse" replaced by "dense", except for the addition of
+// set_empty_key() and the lockless entry points (*_lockless, find_wait_free).
+//
+//   YOU MUST CALL SET_EMPTY_KEY() IMMEDIATELY AFTER CONSTRUCTION.
+//
+// Otherwise your program will die in mysterious ways.  (Note if you
+// use the constructor that takes an InputIterator range, you pass in
+// the empty key in the constructor, rather than after.  As a result,
+// this constructor differs from the standard STL version.)
+//
+// In other respects, we adhere mostly to the STL semantics for
+// hash-map.  One important exception is that insert() may invalidate
+// iterators entirely -- STL semantics are that insert() may reorder
+// iterators, but they all still refer to something valid in the
+// hashtable.  Not so for us.  Likewise, insert() may invalidate
+// pointers into the hashtable.  (Whether insert invalidates iterators
+// and pointers depends on whether it results in a hashtable resize).
+// On the plus side, delete() doesn't invalidate iterators or pointers
+// at all, or even change the ordering of elements.
+//
+// Here are a few "power user" tips:
+//
+//    1) set_deleted_key():
+//         If you want to use erase() you *must* call set_deleted_key(),
+//         in addition to set_empty_key(), after construction.
+//         The deleted and empty keys must differ.
+//
+//    2) resize(0):
+//         When an item is deleted, its memory isn't freed right
+//         away.  This allows you to iterate over a hashtable,
+//         and call erase(), without invalidating the iterator.
+//         To force the memory to be freed, call resize(0).
+//         For tr1 compatibility, this can also be called as rehash(0).
+//
+//    3) min_load_factor(0.0)
+//         Setting the minimum load factor to 0.0 guarantees that
+//         the hash table will never shrink.
+//
+// Roughly speaking:
+//   (1) dense_hash_map: fastest, uses the most memory unless entries are small
+//   (2) sparse_hash_map: slowest, uses the least memory
+//   (3) hash_map / unordered_map (STL): in the middle
+//
+// Typically I use sparse_hash_map when I care about space and/or when
+// I need to save the hashtable on disk.  I use hash_map otherwise.  I
+// don't personally use dense_hash_map ever; some people use it for
+// small maps with lots of lookups.
+//
+// - dense_hash_map has, typically, about 78% memory overhead (if your
+//   data takes up X bytes, the hash_map uses .78X more bytes in overhead).
+// - sparse_hash_map has about 4 bits overhead per entry.
+// - sparse_hash_map can be 3-7 times slower than the others for lookup and,
+//   especially, inserts.  See time_hash_map.cc for details.
+//
+// See /usr/(local/)?doc/sparsehash-*/dense_hash_map.html
+// for information about how to use this class.
+
+#pragma once
+
+#include <algorithm>   // needed by stl_alloc
+#include <functional>  // for equal_to<>, select1st<>, etc
+#include <initializer_list> // for initializer_list
+#include <memory>      // for alloc
+#include <utility>     // for pair<>
+#include <tuple>       // forward_as_tuple
+#include <type_traits> // for enable_if, is_constructible, etc
+#include <sparsehash/internal/densehashtable_lockless.h>  // IWYU pragma: export
+#include <sparsehash/internal/libc_allocator_with_realloc.h>
+
+namespace google {
+
+template <class Key, class T, class HashFcn = std::hash<Key>,
+          class EqualKey = std::equal_to<Key>,
+          class Alloc = libc_allocator_with_realloc<std::pair<const Key, T>>>
+class dense_hash_map_lockless {
+ private:
+  // Key extractor (select1st is not standard C++); only enabled for pairs whose first_type matches Key after std::decay.
+  struct SelectKey {
+    typedef const Key& result_type;
+
+    template <class Type>
+    using decay_t = typename std::decay<Type>::type;
+
+    template <typename Pair, 
+              typename = typename std::enable_if<std::is_same<
+                  decay_t<typename decay_t<Pair>::first_type>,
+                  decay_t<Key>
+                >::value>::type
+             >
+    result_type operator()(Pair&& p) const {
+      return p.first;
+    }
+  };
+  struct SetKey {  // writes new_key into *value: by assignment (2 args) or in-place construction (3 args)
+    void operator()(std::pair<const Key, T>* value, const Key& new_key) const {
+      using NCKey = typename std::remove_cv<Key>::type;
+      *const_cast<NCKey*>(&value->first) = new_key;
+
+      // first is const inside the pair, hence the const_cast above.  The
+      // mapped value is reset too, so it does not keep holding memory;
+      // this assumes T has a zero-arg constructor!
+      value->second = T();
+    }
+    void operator()(std::pair<const Key, T>* value, const Key& new_key, bool) const {  // unnamed bool is only an overload tag
+      new(value) std::pair<const Key, T>(std::piecewise_construct, std::forward_as_tuple(new_key), std::forward_as_tuple());
+    }
+  };
+
+  // The actual data
+  typedef typename sparsehash_internal::key_equal_chosen<HashFcn, EqualKey>::type EqualKeyChosen;
+  typedef dense_hashtable_lockless<std::pair<const Key, T>, Key, HashFcn, SelectKey,
+                          SetKey, EqualKeyChosen, Alloc, T> ht;
+  ht rep;
+
+  static_assert(!sparsehash_internal::has_transparent_key_equal<HashFcn>::value
+                || std::is_same<EqualKey, std::equal_to<Key>>::value
+                || std::is_same<EqualKey, EqualKeyChosen>::value,
+                "Heterogeneous lookup requires key_equal to either be the default container value or the same as the type provided by hash");
+
+ public:
+  typedef typename ht::key_type key_type;
+  typedef T data_type;
+  typedef T mapped_type;
+  typedef typename ht::value_type value_type;
+  typedef typename ht::hasher hasher;
+  typedef typename ht::key_equal key_equal;
+  typedef Alloc allocator_type;
+
+  typedef typename ht::size_type size_type;
+  typedef typename ht::difference_type difference_type;
+  typedef typename ht::pointer pointer;
+  typedef typename ht::const_pointer const_pointer;
+  typedef typename ht::reference reference;
+  typedef typename ht::const_reference const_reference;
+
+  typedef typename ht::iterator iterator;
+  typedef typename ht::const_iterator const_iterator;
+  typedef typename ht::local_iterator local_iterator;
+  typedef typename ht::const_local_iterator const_local_iterator;
+
+  // Iterator functions
+  iterator begin() { return rep.begin(); }
+  iterator end() { return rep.end(); }
+  const_iterator begin() const { return rep.begin(); }
+  const_iterator end() const { return rep.end(); }
+  const_iterator cbegin() const { return rep.begin(); }
+  const_iterator cend() const { return rep.end(); }
+
+  // These come from tr1's unordered_map. For us, a bucket has 0 or 1 elements.
+  local_iterator begin(size_type i) { return rep.begin(i); }
+  local_iterator end(size_type i) { return rep.end(i); }
+  const_local_iterator begin(size_type i) const { return rep.begin(i); }
+  const_local_iterator end(size_type i) const { return rep.end(i); }
+  const_local_iterator cbegin(size_type i) const { return rep.begin(i); }
+  const_local_iterator cend(size_type i) const { return rep.end(i); }
+
+  // Accessor functions
+  allocator_type get_allocator() const { return rep.get_allocator(); }
+  hasher hash_funct() const { return rep.hash_funct(); }
+  hasher hash_function() const { return hash_funct(); }
+  key_equal key_eq() const { return rep.key_eq(); }
+
+  // Constructors
+  explicit dense_hash_map_lockless(size_type expected_max_items_in_table = 0,
+                          const hasher& hf = hasher(),
+                          const key_equal& eql = key_equal(),
+                          const allocator_type& alloc = allocator_type())
+      : rep(expected_max_items_in_table, hf, eql, SelectKey(), SetKey(),
+            alloc) {}
+
+  template <class InputIterator>
+  dense_hash_map_lockless(InputIterator f, InputIterator l,
+                 const key_type& empty_key_val,
+                 size_type expected_max_items_in_table = 0,
+                 const hasher& hf = hasher(),
+                 const key_equal& eql = key_equal(),
+                 const allocator_type& alloc = allocator_type())
+      : rep(expected_max_items_in_table, hf, eql, SelectKey(), SetKey(),
+            alloc) {
+    set_empty_key(empty_key_val);
+    rep.insert(f, l);
+  }
+  // We use the default copy constructor
+  // We use the default operator=()
+  // We use the default destructor
+
+  void clear() { rep.clear(); }
+  // This clears the hash map without resizing it down to the minimum
+  // bucket count, but rather keeps the number of buckets constant
+  void clear_no_resize() { rep.clear_no_resize(); }
+  void swap(dense_hash_map_lockless& hs) { rep.swap(hs.rep); }
+
+  // Functions concerning size
+  size_type size() const { return rep.size(); }
+  size_type max_size() const { return rep.max_size(); }
+  bool empty() const { return rep.empty(); }
+  size_type bucket_count() const { return rep.bucket_count(); }
+  size_type max_bucket_count() const { return rep.max_bucket_count(); }
+  void set_counternum(size_type ncounters) {rep.set_counternum(ncounters);}  // forwarded to rep unchanged
+  long long int size_lockless() const {return rep.size_lockless();}  // signed result, unlike size()
+
+
+  // These are tr1 methods.  bucket() is the bucket the key is or would be in.
+  size_type bucket_size(size_type i) const { return rep.bucket_size(i); }
+  size_type bucket(const key_type& key) const { return rep.bucket(key); }
+  float load_factor() const { return size() * 1.0f / bucket_count(); }
+  float max_load_factor() const {
+    float shrink, grow;
+    rep.get_resizing_parameters(&shrink, &grow);
+    return grow;
+  }
+  void max_load_factor(float new_grow) {
+    float shrink, grow;
+    rep.get_resizing_parameters(&shrink, &grow);
+    rep.set_resizing_parameters(shrink, new_grow);
+  }
+  // These aren't tr1 methods but perhaps ought to be.
+  float min_load_factor() const {
+    float shrink, grow;
+    rep.get_resizing_parameters(&shrink, &grow);
+    return shrink;
+  }
+  void min_load_factor(float new_shrink) {
+    float shrink, grow;
+    rep.get_resizing_parameters(&shrink, &grow);
+    rep.set_resizing_parameters(new_shrink, grow);
+  }
+  // Deprecated; use min_load_factor() or max_load_factor() instead.
+  void set_resizing_parameters(float shrink, float grow) {
+    rep.set_resizing_parameters(shrink, grow);
+  }
+
+  void reserve(size_type size) { rehash(size); } // note: rehash internally treats hint/size as number of elements
+  void resize(size_type hint) { rep.resize(hint); }
+  void rehash(size_type hint) { resize(hint); }  // the tr1 name
+
+  // Lookup routines
+  iterator find(const key_type& key) { return rep.find(key); }
+  const_iterator find(const key_type& key) const { return rep.find(key); }
+  // Lock-free lookup: takes key by non-const reference and returns a (key, value) pair by value.
+  std::pair<key_type, data_type> find_wait_free(key_type& key) {return rep.template find_wait_free<data_type>(key);}
+
+  template <typename K>
+  typename std::enable_if<sparsehash_internal::has_transparent_key_equal<hasher, K>::value, iterator>::type
+  find(const K& key) { return rep.find(key); }
+  template <typename K>
+  typename std::enable_if<sparsehash_internal::has_transparent_key_equal<hasher, K>::value, const_iterator>::type
+  find(const K& key) const { return rep.find(key); }
+
+  data_type& operator[](const key_type& key) {  // This is our value-add!
+    // If key is in the hashtable, returns find(key)->second,
+    // otherwise returns insert(value_type(key, T())).first->second.
+    // Note it does not create an empty T unless the find fails.
+    return rep.template find_or_insert<data_type>(key).second;
+  }
+
+  data_type& operator[](key_type&& key) {
+    return rep.template find_or_insert<data_type>(std::move(key)).second;
+  }
+
+  size_type count(const key_type& key) const { return rep.count(key); }
+
+  template <typename K>
+  typename std::enable_if<sparsehash_internal::has_transparent_key_equal<hasher, K>::value, size_type>::type
+  count(const K& key) const { return rep.count(key); }
+
+  std::pair<iterator, iterator> equal_range(const key_type& key) {
+    return rep.equal_range(key);
+  }
+  std::pair<const_iterator, const_iterator> equal_range(
+      const key_type& key) const {
+    return rep.equal_range(key);
+  }
+
+  template<typename K>
+  typename std::enable_if<sparsehash_internal::has_transparent_key_equal<hasher, K>::value, std::pair<iterator, iterator>>::type
+  equal_range(const K& key) {
+    return rep.equal_range(key);
+  }
+  template<typename K>
+  typename std::enable_if<sparsehash_internal::has_transparent_key_equal<hasher, K>::value, std::pair<const_iterator, const_iterator>>::type
+  equal_range(const K& key) const {
+    return rep.equal_range(key);
+  }
+
+  // Insertion routines
+  std::pair<iterator, bool> insert(const value_type& obj) {
+    return rep.insert(obj);
+  }
+
+  std::pair<iterator, bool> insert_lockless(const value_type& obj) {
+    return rep.insert_lockless(obj);
+  }
+
+std::pair<value_type*, size_type> GetSnapshot(){ return rep.GetSnapShot();}  // note: rep spells it GetSnapShot
+
+
+  template <typename Pair, typename = typename std::enable_if<std::is_constructible<value_type, Pair&&>::value>::type>
+  std::pair<iterator, bool> insert(Pair&& obj) {
+    return rep.insert(std::forward<Pair>(obj));
+  }
+
+  // overload to allow {} syntax: .insert( { {key}, {args} } )
+  std::pair<iterator, bool> insert(value_type&& obj) {
+    return rep.insert(std::move(obj));
+  }
+
+  template <typename... Args>
+  std::pair<iterator, bool> emplace(Args&&... args) {
+    return rep.emplace(std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  std::pair<iterator, bool> emplace_hint(const_iterator hint, Args&&... args) {
+    return rep.emplace_hint(hint, std::forward<Args>(args)...);
+  }
+
+
+  template <class InputIterator>
+  void insert(InputIterator f, InputIterator l) {
+    rep.insert(f, l);
+  }
+  void insert(const_iterator f, const_iterator l) { rep.insert(f, l); }
+  void insert(std::initializer_list<value_type> ilist) { rep.insert(ilist.begin(), ilist.end()); }
+  // Required for std::insert_iterator; the passed-in iterator is ignored.
+  iterator insert(const_iterator, const value_type& obj) { return insert(obj).first; }
+  iterator insert(const_iterator, value_type&& obj) { return insert(std::move(obj)).first; }
+  template <class P, class = typename std::enable_if<
+                                        std::is_constructible<value_type, P&&>::value &&
+                                        !std::is_same<value_type, P>::value
+                                      >::type>
+  iterator insert(const_iterator, P&& obj) { return insert(std::forward<P>(obj)).first; }
+
+  // Deletion and empty routines
+  // THESE ARE NON-STANDARD!  I make you specify an "impossible" key
+  // value to identify deleted and empty buckets.  You can change the
+  // deleted key as time goes on, or get rid of it entirely to be insert-only.
+  // YOU MUST CALL set_empty_key() IMMEDIATELY AFTER CONSTRUCTION!
+  void set_empty_key(const key_type& key) { rep.set_empty_key(key); }
+
+  void set_empty_key_and_value(const key_type& key, T value) {rep.set_empty_key_and_value(key, value);}
+
+  key_type empty_key() const {  return rep.empty_key(); }
+
+  void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); }
+  void clear_deleted_key() { rep.clear_deleted_key(); }
+  key_type deleted_key() const { return rep.deleted_key(); }
+
+  // These are standard
+  size_type erase(const key_type& key) { return rep.erase(key); }
+
+  size_type erase_lockless(const key_type& key) {return rep.erase_lockless(key);}
+
+  iterator erase(const_iterator it) { return rep.erase(it); }
+  iterator erase(const_iterator f, const_iterator l) { return rep.erase(f, l); }
+
+  // Comparison
+  bool operator==(const dense_hash_map_lockless& hs) const { return rep == hs.rep; }
+  bool operator!=(const dense_hash_map_lockless& hs) const { return rep != hs.rep; }
+
+  // I/O -- this is an add-on for writing hash map to disk
+  //
+  // For maximum flexibility, this does not assume a particular
+  // file type (though it will probably be a FILE *).  We just pass
+  // the fp through to rep.
+
+  // If your keys and values are simple enough, you can pass this
+  // serializer to serialize()/unserialize().  "Simple enough" means
+  // value_type is a POD type that contains no pointers.  Note,
+  // however, we don't try to normalize endianness.
+  typedef typename ht::NopointerSerializer NopointerSerializer;
+
+  // serializer: a class providing operator()(OUTPUT*, const value_type&)
+  //    (writing value_type to OUTPUT).  You can specify a
+  //    NopointerSerializer object if appropriate (see above).
+  // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a
+  //    pointer to a class providing size_t Write(const void*, size_t),
+  //    which writes a buffer into a stream (which fp presumably
+  //    owns) and returns the number of bytes successfully written.
+  //    Note basic_ostream<not_char> is not currently supported.
+  template <typename ValueSerializer, typename OUTPUT>
+  bool serialize(ValueSerializer serializer, OUTPUT* fp) {
+    return rep.serialize(serializer, fp);
+  }
+
+  // serializer: a functor providing operator()(INPUT*, value_type*)
+  //    (reading from INPUT and into value_type).  You can specify a
+  //    NopointerSerializer object if appropriate (see above).
+  // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a
+  //    pointer to a class providing size_t Read(void*, size_t),
+  //    which reads into a buffer from a stream (which fp presumably
+  //    owns) and returns the number of bytes successfully read.
+  //    Note basic_istream<not_char> is not currently supported.
+  // NOTE: Since value_type is std::pair<const Key, T>, ValueSerializer
+  // may need to do a const cast in order to fill in the key.
+  template <typename ValueSerializer, typename INPUT>
+  bool unserialize(ValueSerializer serializer, INPUT* fp) {
+    return rep.unserialize(serializer, fp);
+  }
+};
+
+// Non-member swap overload: exchanges the contents of two maps via member swap().
+template <class Key, class T, class HashFcn, class EqualKey, class Alloc>
+inline void swap(dense_hash_map_lockless<Key, T, HashFcn, EqualKey, Alloc>& hm1,
+                 dense_hash_map_lockless<Key, T, HashFcn, EqualKey, Alloc>& hm2) {
+  hm1.swap(hm2);
+}
+
+}  // namespace google
diff --git a/sparsehash/dense_hash_set_lockless b/sparsehash/dense_hash_set_lockless
new file mode 100644
index 0000000..5287d11
--- /dev/null
+++ b/sparsehash/dense_hash_set_lockless
@@ -0,0 +1,381 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+//
+// This is just a very thin wrapper over densehashtable_lockless.h, just
+// like sgi stl's stl_hash_set is a very thin wrapper over
+// stl_hashtable.  The major thing we define is operator[], because
+// we have a concept of a data_type which stl_hashtable doesn't
+// (it only has a key and a value).
+//
+// This is more different from dense_hash_map than you might think,
+// because all iterators for sets are const (you obviously can't
+// change the key, and for sets there is no value).
+//
+// NOTE: this is exactly like sparse_hash_set.h, with the word
+// "sparse" replaced by "dense", except for the addition of
+// set_empty_key() and lockless additions such as size_lockless().
+//
+//   YOU MUST CALL SET_EMPTY_KEY() IMMEDIATELY AFTER CONSTRUCTION.
+//
+// Otherwise your program will die in mysterious ways.  (Note if you
+// use the constructor that takes an InputIterator range, you pass in
+// the empty key in the constructor, rather than after.  As a result,
+// this constructor differs from the standard STL version.)
+//
+// In other respects, we adhere mostly to the STL semantics for
+// hash-map.  One important exception is that insert() may invalidate
+// iterators entirely -- STL semantics are that insert() may reorder
+// iterators, but they all still refer to something valid in the
+// hashtable.  Not so for us.  Likewise, insert() may invalidate
+// pointers into the hashtable.  (Whether insert invalidates iterators
+// and pointers depends on whether it results in a hashtable resize).
+// On the plus side, delete() doesn't invalidate iterators or pointers
+// at all, or even change the ordering of elements.
+//
+// Here are a few "power user" tips:
+//
+//    1) set_deleted_key():
+//         If you want to use erase() you must call set_deleted_key(),
+//         in addition to set_empty_key(), after construction.
+//         The deleted and empty keys must differ.
+//
+//    2) resize(0):
+//         When an item is deleted, its memory isn't freed right
+//         away.  This allows you to iterate over a hashtable,
+//         and call erase(), without invalidating the iterator.
+//         To force the memory to be freed, call resize(0).
+//         For tr1 compatibility, this can also be called as rehash(0).
+//
+//    3) min_load_factor(0.0)
+//         Setting the minimum load factor to 0.0 guarantees that
+//         the hash table will never shrink.
+//
+// Roughly speaking:
+//   (1) dense_hash_set: fastest, uses the most memory unless entries are small
+//   (2) sparse_hash_set: slowest, uses the least memory
+//   (3) hash_set / unordered_set (STL): in the middle
+//
+// Typically I use sparse_hash_set when I care about space and/or when
+// I need to save the hashtable on disk.  I use hash_set otherwise.  I
+// don't personally use dense_hash_set ever; some people use it for
+// small sets with lots of lookups.
+//
+// - dense_hash_set has, typically, about 78% memory overhead (if your
+//   data takes up X bytes, the hash_set uses .78X more bytes in overhead).
+// - sparse_hash_set has about 4 bits overhead per entry.
+// - sparse_hash_set can be 3-7 times slower than the others for lookup and,
+//   especially, inserts.  See time_hash_map.cc for details.
+//
+// See /usr/(local/)?doc/sparsehash-*/dense_hash_set.html
+// for information about how to use this class.
+
+#pragma once
+
+#include <algorithm>   // needed by stl_alloc
+#include <functional>  // for equal_to<>, select1st<>, etc
+#include <initializer_list> // for initializer_list
+#include <memory>      // for alloc
+#include <utility>     // for pair<>
+#include <sparsehash/internal/densehashtable_lockless.h>  // IWYU pragma: export
+#include <sparsehash/internal/libc_allocator_with_realloc.h>
+
+namespace google {
+
+template <class Value, class HashFcn = std::hash<Value>,
+          class EqualKey = std::equal_to<Value>,
+          class Alloc = libc_allocator_with_realloc<Value>>
+class dense_hash_set_lockless {
+ private:
+  // Apparently identity is not stl-standard, so we define our own
+  struct Identity {
+    typedef const Value& result_type;
+    template <typename V>
+    const Value& operator()(V&& v) const { return v; }
+  };
+  struct SetKey {
+    void operator()(Value* value, const Value& new_key) const {
+      *value = new_key;
+    }
+    void operator()(Value* value, const Value& new_key, bool) const {
+        new(value) Value(new_key);
+    }
+  };
+
+  // The actual data
+  typedef typename sparsehash_internal::key_equal_chosen<HashFcn, EqualKey>::type EqualKeyChosen;
+  typedef dense_hashtable_lockless<Value, Value, HashFcn, Identity, SetKey, EqualKeyChosen,
+                          Alloc, Value> ht;
+  ht rep;
+
+  static_assert(!sparsehash_internal::has_transparent_key_equal<HashFcn>::value
+                || std::is_same<EqualKey, std::equal_to<Value>>::value
+                || std::is_same<EqualKey, EqualKeyChosen>::value,
+                "Heterogeneous lookup requires key_equal to either be the default container value or the same as the type provided by hash");
+
+ public:
+  typedef typename ht::key_type key_type;
+  typedef Value data_type;
+  typedef typename ht::value_type value_type;
+  typedef typename ht::hasher hasher;
+  typedef typename ht::key_equal key_equal;
+  typedef Alloc allocator_type;
+
+  typedef typename ht::size_type size_type;
+  typedef typename ht::difference_type difference_type;
+  typedef typename ht::const_pointer pointer;
+  typedef typename ht::const_pointer const_pointer;
+  typedef typename ht::const_reference reference;
+  typedef typename ht::const_reference const_reference;
+
+  typedef typename ht::const_iterator iterator;
+  typedef typename ht::const_iterator const_iterator;
+  typedef typename ht::const_local_iterator local_iterator;
+  typedef typename ht::const_local_iterator const_local_iterator;
+
+  // Iterator functions -- recall all iterators are const
+  iterator begin() const { return rep.begin(); }
+  iterator end() const { return rep.end(); }
+  const_iterator cbegin() const { return rep.begin(); }
+  const_iterator cend() const { return rep.end(); }
+
+  // These come from tr1's unordered_set. For us, a bucket has 0 or 1 elements.
+  local_iterator begin(size_type i) const { return rep.begin(i); }
+  local_iterator end(size_type i) const { return rep.end(i); }
+  local_iterator cbegin(size_type i) const { return rep.begin(i); }
+  local_iterator cend(size_type i) const { return rep.end(i); }
+
+  // Accessor functions
+  allocator_type get_allocator() const { return rep.get_allocator(); }
+  hasher hash_funct() const { return rep.hash_funct(); }
+  hasher hash_function() const { return hash_funct(); }  // tr1 name
+  key_equal key_eq() const { return rep.key_eq(); }
+
+  // Constructors
+  explicit dense_hash_set_lockless(size_type expected_max_items_in_table = 0,
+                          const hasher& hf = hasher(),
+                          const key_equal& eql = key_equal(),
+                          const allocator_type& alloc = allocator_type())
+      : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) {
+  }
+
+  template <class InputIterator>
+  dense_hash_set_lockless(InputIterator f, InputIterator l,
+                 const key_type& empty_key_val,
+                 size_type expected_max_items_in_table = 0,
+                 const hasher& hf = hasher(),
+                 const key_equal& eql = key_equal(),
+                 const allocator_type& alloc = allocator_type())
+      : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) {
+    set_empty_key(empty_key_val);
+    rep.insert(f, l);
+  }
+  // We use the default copy constructor
+  // We use the default operator=()
+  // We use the default destructor
+
+  void clear() { rep.clear(); }
+  // This clears the hash set without resizing it down to the minimum
+  // bucket count, but rather keeps the number of buckets constant
+  void clear_no_resize() { rep.clear_no_resize(); }
+  void swap(dense_hash_set_lockless& hs) { rep.swap(hs.rep); }
+
+  // Functions concerning size
+  size_type size() const { return rep.size(); }
+  size_type max_size() const { return rep.max_size(); }
+  bool empty() const { return rep.empty(); }
+  size_type bucket_count() const { return rep.bucket_count(); }
+  size_type max_bucket_count() const { return rep.max_bucket_count(); }
+  void set_counternum(size_type ncounters) {rep.set_counternum(ncounters);}
+  long long int size_lockless() const {return rep.size_lockless();}
+
+  // These are tr1 methods.  bucket() is the bucket the key is or would be in.
+  size_type bucket_size(size_type i) const { return rep.bucket_size(i); }
+  size_type bucket(const key_type& key) const { return rep.bucket(key); }
+  float load_factor() const { return size() * 1.0f / bucket_count(); }
+  float max_load_factor() const {  // returns the grow value reported by rep.get_resizing_parameters()
+    float shrink_ratio, grow_ratio;
+    rep.get_resizing_parameters(&shrink_ratio, &grow_ratio);
+    return grow_ratio;
+  }
+  void max_load_factor(float new_grow) {  // replaces only the grow value; the current shrink value is read back and re-applied
+    float kept_shrink, old_grow;
+    rep.get_resizing_parameters(&kept_shrink, &old_grow);
+    rep.set_resizing_parameters(kept_shrink, new_grow);
+  }
+  // These aren't tr1 methods but perhaps ought to be.
+  float min_load_factor() const {  // returns the shrink value reported by rep.get_resizing_parameters()
+    float shrink_ratio, grow_ratio;
+    rep.get_resizing_parameters(&shrink_ratio, &grow_ratio);
+    return shrink_ratio;
+  }
+  void min_load_factor(float new_shrink) {  // replaces only the shrink value; the current grow value is read back and re-applied
+    float old_shrink, kept_grow;
+    rep.get_resizing_parameters(&old_shrink, &kept_grow);
+    rep.set_resizing_parameters(new_shrink, kept_grow);
+  }
+  // Deprecated; use min_load_factor() or max_load_factor() instead.
+  void set_resizing_parameters(float shrink, float grow) {
+    rep.set_resizing_parameters(shrink, grow);
+  }
+
+  void reserve(size_type size) { rehash(size); } // note: rehash internally treats hint/size as number of elements
+  void resize(size_type hint) { rep.resize(hint); }
+  void rehash(size_type hint) { resize(hint); }  // the tr1 name
+
+  // Lookup routines
+  iterator find(const key_type& key) const { return rep.find(key); }
+  std::pair<key_type, data_type> find_wait_free(key_type& key) {return rep.template find_wait_free<data_type>(key);}
+
+  template <typename K>
+  typename std::enable_if<sparsehash_internal::has_transparent_key_equal<hasher, K>::value, iterator>::type
+  find(const K& key) const { return rep.find(key); }
+
+  size_type count(const key_type& key) const { return rep.count(key); }
+
+  template <typename K>
+  typename std::enable_if<sparsehash_internal::has_transparent_key_equal<hasher, K>::value, size_type>::type
+  count(const K& key) const { return rep.count(key); }
+
+  std::pair<iterator, iterator> equal_range(const key_type& key) const {
+    return rep.equal_range(key);
+  }
+
+  template<typename K>
+  typename std::enable_if<sparsehash_internal::has_transparent_key_equal<hasher, K>::value, std::pair<iterator, iterator>>::type
+  equal_range(const K& key) const {
+    return rep.equal_range(key);
+  }
+
+  // Insertion routines
+  std::pair<iterator, bool> insert(const value_type& obj) {  // copy-inserting overload; forwards to rep.insert
+    std::pair<typename ht::iterator, bool> placed = rep.insert(obj);
+    return std::pair<iterator, bool>(placed.first, placed.second);  // ht::iterator converts to the set's const iterator
+  }
+
+  std::pair<iterator, bool> insert_lockless(const value_type& obj) {
+    return rep.insert_lockless(obj);
+  }
+
+  std::pair<iterator, bool> insert(value_type&& obj) {  // move-inserting overload; forwards to rep.insert
+    std::pair<typename ht::iterator, bool> placed = rep.insert(std::move(obj));
+    return std::pair<iterator, bool>(placed.first, placed.second);  // ht::iterator converts to the set's const iterator
+  }
+
+  template <typename... Args>
+  std::pair<iterator, bool> emplace(Args&&... args) {
+    return rep.emplace(std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  std::pair<iterator, bool> emplace_hint(const_iterator hint, Args&&... args) {
+    return rep.emplace_hint(hint, std::forward<Args>(args)...);
+  }
+
+  template <class InputIterator>
+  void insert(InputIterator f, InputIterator l) {
+    rep.insert(f, l);
+  }
+  void insert(const_iterator f, const_iterator l) { rep.insert(f, l); }
+  void insert(std::initializer_list<value_type> ilist) { rep.insert(ilist.begin(), ilist.end()); }
+  // Required for std::insert_iterator; the passed-in iterator is ignored.
+  iterator insert(const_iterator, const value_type& obj) { return insert(obj).first; }
+  iterator insert(const_iterator, value_type&& obj) { return insert(std::move(obj)).first; }
+
+  // Deletion and empty routines
+  // THESE ARE NON-STANDARD!  I make you specify an "impossible" key
+  // value to identify deleted and empty buckets.  You can change the
+  // deleted key as time goes on, or get rid of it entirely to be insert-only.
+  void set_empty_key(const key_type& key) { rep.set_empty_key(key); }
+
+  void set_empty_key_and_value(const key_type& key, data_type value) {rep.set_empty_key_and_value(key, value);}
+  
+  key_type empty_key() const { return rep.empty_key(); }
+
+  void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); }
+  void clear_deleted_key() { rep.clear_deleted_key(); }
+  key_type deleted_key() const { return rep.deleted_key(); }
+
+  // These are standard
+  size_type erase(const key_type& key) { return rep.erase(key); }
+  iterator erase(const_iterator it) { return rep.erase(it); }
+  iterator erase(const_iterator f, const_iterator l) { return rep.erase(f, l); }
+  size_type erase_lockless(const key_type& key) {return rep.erase_lockless(key);}
+
+  // Comparison
+  bool operator==(const dense_hash_set_lockless& hs) const { return rep == hs.rep; }
+  bool operator!=(const dense_hash_set_lockless& hs) const { return rep != hs.rep; }
+
+  // I/O -- this is an add-on for writing metainformation to disk
+  //
+  // For maximum flexibility, this does not assume a particular
+  // file type (though it will probably be a FILE *).  We just pass
+  // the fp through to rep.
+
+  // If your keys and values are simple enough, you can pass this
+  // serializer to serialize()/unserialize().  "Simple enough" means
+  // value_type is a POD type that contains no pointers.  Note,
+  // however, we don't try to normalize endianness.
+  typedef typename ht::NopointerSerializer NopointerSerializer;
+
+  // serializer: a class providing operator()(OUTPUT*, const value_type&)
+  //    (writing value_type to OUTPUT).  You can specify a
+  //    NopointerSerializer object if appropriate (see above).
+  // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a
+  //    pointer to a class providing size_t Write(const void*, size_t),
+  //    which writes a buffer into a stream (which fp presumably
+  //    owns) and returns the number of bytes successfully written.
+  //    Note basic_ostream<not_char> is not currently supported.
+  template <typename ValueSerializer, typename OUTPUT>
+  bool serialize(ValueSerializer serializer, OUTPUT* fp) {
+    return rep.serialize(serializer, fp);
+  }
+
+  // serializer: a functor providing operator()(INPUT*, value_type*)
+  //    (reading from INPUT and into value_type).  You can specify a
+  //    NopointerSerializer object if appropriate (see above).
+  // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a
+  //    pointer to a class providing size_t Read(void*, size_t),
+  //    which reads into a buffer from a stream (which fp presumably
+  //    owns) and returns the number of bytes successfully read.
+  //    Note basic_istream<not_char> is not currently supported.
+  template <typename ValueSerializer, typename INPUT>
+  bool unserialize(ValueSerializer serializer, INPUT* fp) {
+    return rep.unserialize(serializer, fp);
+  }
+};
+
+template <class Val, class HashFcn, class EqualKey, class Alloc>
+inline void swap(dense_hash_set_lockless<Val, HashFcn, EqualKey, Alloc>& hs1,
+                 dense_hash_set_lockless<Val, HashFcn, EqualKey, Alloc>& hs2) {
+  hs1.swap(hs2);
+}
+
+}  // namespace google
diff --git a/sparsehash/internal/densehashtable_lockless.h b/sparsehash/internal/densehashtable_lockless.h
new file mode 100644
index 0000000..57529db
--- /dev/null
+++ b/sparsehash/internal/densehashtable_lockless.h
@@ -0,0 +1,2010 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+//
+// A dense hashtable is a particular implementation of
+// a hashtable: one that is meant to minimize memory allocation.
+// It does this by using an array to store all the data.  We
+// steal a value from the key space to indicate "empty" array
+// elements (ie indices where no item lives) and another to indicate
+// "deleted" elements.
+//
+// (Note it is possible to change the value of the delete key
+// on the fly; you can even remove it, though after that point
+// the hashtable is insert_only until you set it again.  The empty
+// value however can't be changed.)
+//
+// To minimize allocation and pointer overhead, we use internal
+// probing, in which the hashtable is a single table, and collisions
+// are resolved by trying to insert again in another bucket.  The
+// most cache-efficient internal probing schemes are linear probing
+// (which suffers, alas, from clumping) and quadratic probing, which
+// is what we implement by default.
+//
+// Type requirements: value_type is required to be Copy Constructible
+// and Default Constructible. It is not required to be (and commonly
+// isn't) Assignable.
+//
+// You probably shouldn't use this code directly.  Use
+// dense_hash_map_lockless<> or dense_hash_set_lockless<> instead.
+
+// You can change the following below:
+// HT_OCCUPANCY_PCT      -- how full before we double size
+// HT_EMPTY_PCT          -- how empty before we halve size
+// HT_MIN_BUCKETS        -- default smallest bucket size
+//
+// You can also change enlarge_factor (which defaults to
+// HT_OCCUPANCY_PCT), and shrink_factor (which defaults to
+// HT_EMPTY_PCT) with set_resizing_parameters().
+//
+// How to decide what values to use?
+// shrink_factor's default of .4 * OCCUPANCY_PCT, is probably good.
+// HT_MIN_BUCKETS is probably unnecessary since you can specify
+// (indirectly) the starting number of buckets at construct-time.
+// For enlarge_factor, you can use this chart to try to trade-off
+// expected lookup time to the space taken up.  By default, this
+// code uses quadratic probing, though you can change it to linear
+// via JUMP_ below if you really want to.
+//
+// From
+// http://www.augustana.ca/~mohrj/courses/1999.fall/csc210/lecture_notes/hashing.html
+// NUMBER OF PROBES / LOOKUP       Successful            Unsuccessful
+// Quadratic collision resolution   1 - ln(1-L) - L/2    1/(1-L) - L - ln(1-L)
+// Linear collision resolution     [1+1/(1-L)]/2         [1+1/(1-L)^2]/2
+//
+// -- enlarge_factor --           0.10  0.50  0.60  0.75  0.80  0.90  0.99
+// QUADRATIC COLLISION RES.
+//    probes/successful lookup    1.05  1.44  1.62  2.01  2.21  2.85  5.11
+//    probes/unsuccessful lookup  1.11  2.19  2.82  4.64  5.81  11.4  103.6
+// LINEAR COLLISION RES.
+//    probes/successful lookup    1.06  1.5   1.75  2.5   3.0   5.5   50.5
+//    probes/unsuccessful lookup  1.12  2.5   3.6   8.5   13.0  50.0  5000.0
+
+#pragma once
+
+#include <assert.h>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <typeinfo>
+#include <stdio.h>    // for FILE, fwrite, fread
+#include <algorithm>  // For swap(), eg
+#include <iterator>   // For iterator tags
+#include <limits>     // for numeric_limits
+#include <memory>     // For uninitialized_fill
+#include <utility>    // for pair
+#include <stdexcept>  // For length_error
+#include <type_traits>
+#include <atomic>
+#include <mutex>
+#include <sparsehash/internal/hashtable-common.h>
+#include <sparsehash/internal/libc_allocator_with_realloc.h>
+#include <immintrin.h>
+
+#include <chrono>
+
+extern double time_for_insert_lockless;
+extern double time_for_insert_noresize_lockless;
+extern double time_for_insert_at_lockless;
+extern double time_for_setvalue_lockless;
+extern double time_for_rebucket_lockless;
+
+
+namespace google {
+
+// The probing method
+// Linear probing
+// #define JUMP_(key, num_probes)    ( 1 )
+// Quadratic probing
+#define JUMP_(key, num_probes) (num_probes)
+
+static thread_local long thread_flag = -1;
+// Hashtable class, used to implement the hashed associative containers
+// hash_set and hash_map.
+
+// Value: what is stored in the table (each bucket is a Value).
+// Key: something in a 1-to-1 correspondence to a Value, that can be used
+//      to search for a Value in the table (find() takes a Key).
+// HashFcn: Takes a Key and returns an integer, the more unique the better.
+// ExtractKey: given a Value, returns the unique Key associated with it.
+//             Must inherit from unary_function, or at least have a
+//             result_type enum indicating the return type of operator().
+// SetKey: given a Value* and a Key, modifies the value such that
+//         ExtractKey(value) == key.  We guarantee this is only called
+//         with key == deleted_key or key == empty_key.
+// EqualKey: Given two Keys, says whether they are the same (that is,
+//           if they are both associated with the same Value).
+// Alloc: STL allocator to use to allocate memory.
+
+template <class Value, class Key, class HashFcn, class ExtractKey, class SetKey,
+          class EqualKey, class Alloc, class Data>
+class dense_hashtable_lockless;
+
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A, class Data>
+struct dense_hashtable_lockless_iterator;
+
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A, class Data>
+struct dense_hashtable_lockless_const_iterator;
+
+// We're just an array, but we need to skip over empty and deleted elements
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A, class Data>
+struct dense_hashtable_lockless_iterator {
+ private:
+  using value_alloc_type =
+      typename std::allocator_traits<A>::template rebind_alloc<V>;
+
+ public:
+  typedef dense_hashtable_lockless_iterator<V, K, HF, ExK, SetK, EqK, A, Data> iterator;
+  typedef dense_hashtable_lockless_const_iterator<V, K, HF, ExK, SetK, EqK, A, Data>
+      const_iterator;
+
+  typedef std::forward_iterator_tag iterator_category;  // very little defined!
+  typedef V value_type;
+  typedef typename value_alloc_type::difference_type difference_type;
+  typedef typename value_alloc_type::size_type size_type;
+  typedef typename value_alloc_type::reference reference;
+  typedef typename value_alloc_type::pointer pointer;
+
+  // "Real" constructor: covers [it, it_end) of table h and, when advance is
+  // set, skips ahead past empty and deleted buckets.
+  dense_hashtable_lockless_iterator(
+      const dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>* h, pointer it,
+      pointer it_end, bool advance)
+      : ht(h), pos(it), end(it_end) {
+    if (advance) advance_past_empty_and_deleted();
+  }
+  dense_hashtable_lockless_iterator() : ht(NULL), pos(pointer()), end(pointer()) {}  // null members, matching the const_iterator default constructor
+  // The default destructor and operator= are fine; we don't define them
+
+  // Happy dereferencer
+  reference operator*() const { return *pos; }
+  pointer operator->() const { return &(operator*()); }
+
+  // Arithmetic.  The only hard part is making sure that
+  // we're not on an empty or marked-deleted array element
+  void advance_past_empty_and_deleted() {
+    while (pos != end && (ht->test_empty(*this) || ht->test_deleted(*this)))
+      ++pos;
+  }
+  iterator& operator++() {
+    assert(pos != end);
+    ++pos;
+    advance_past_empty_and_deleted();
+    return *this;
+  }
+  iterator operator++(int) {
+    iterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+
+  // Comparison.
+  bool operator==(const iterator& it) const { return pos == it.pos; }
+  bool operator!=(const iterator& it) const { return pos != it.pos; }
+
+  // The actual data
+  const dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>* ht;
+  pointer pos, end;
+};
+
+// Now do it all again, but with const-ness!
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A, class Data>
+struct dense_hashtable_lockless_const_iterator {
+ private:
+  using value_alloc_type =
+      typename std::allocator_traits<A>::template rebind_alloc<V>;
+
+ public:
+  typedef dense_hashtable_lockless_iterator<V, K, HF, ExK, SetK, EqK, A, Data> iterator;
+  typedef dense_hashtable_lockless_const_iterator<V, K, HF, ExK, SetK, EqK, A, Data>
+      const_iterator;
+
+  typedef std::forward_iterator_tag iterator_category;  // very little defined!
+  typedef V value_type;
+  typedef typename value_alloc_type::difference_type difference_type;
+  typedef typename value_alloc_type::size_type size_type;
+  typedef typename value_alloc_type::const_reference reference;
+  typedef typename value_alloc_type::const_pointer pointer;
+
+  // "Real" constructor and default constructor
+  dense_hashtable_lockless_const_iterator(
+      const dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>* h, pointer it,
+      pointer it_end, bool advance)
+      : ht(h), pos(it), end(it_end) {
+    if (advance) advance_past_empty_and_deleted();
+  }
+  dense_hashtable_lockless_const_iterator() : ht(NULL), pos(pointer()), end(pointer()) {}
+  // This lets us convert regular iterators to const iterators
+  dense_hashtable_lockless_const_iterator(const iterator& it)
+      : ht(it.ht), pos(it.pos), end(it.end) {}
+  // The default destructor is fine; we don't define one
+  // The default operator= is fine; we don't define one
+
+  // Happy dereferencer
+  reference operator*() const { return *pos; }
+  pointer operator->() const { return &(operator*()); }
+
+  // Arithmetic.  The only hard part is making sure that
+  // we're not on an empty or marked-deleted array element
+  void advance_past_empty_and_deleted() {
+    while (pos != end && (ht->test_empty(*this) || ht->test_deleted(*this)))
+      ++pos;
+  }
+  const_iterator& operator++() {
+    assert(pos != end);
+    ++pos;
+    advance_past_empty_and_deleted();
+    return *this;
+  }
+  const_iterator operator++(int) {
+    const_iterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+
+  // Comparison.
+  bool operator==(const const_iterator& it) const { return pos == it.pos; }
+  bool operator!=(const const_iterator& it) const { return pos != it.pos; }
+
+  // The actual data
+  const dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>* ht;
+  pointer pos, end;
+};
+
+template <class Value, class Key, class HashFcn, class ExtractKey, class SetKey,
+          class EqualKey, class Alloc, class Data>
+class dense_hashtable_lockless {
+ private:
+  using value_alloc_type =
+      typename std::allocator_traits<Alloc>::template rebind_alloc<Value>;
+
+ public:
+  typedef Key key_type;
+  typedef Value value_type;
+  typedef HashFcn hasher;
+  typedef EqualKey key_equal;
+  typedef Alloc allocator_type;
+
+  typedef typename value_alloc_type::size_type size_type;
+  typedef typename value_alloc_type::difference_type difference_type;
+  typedef typename value_alloc_type::reference reference;
+  typedef typename value_alloc_type::const_reference const_reference;
+  typedef typename value_alloc_type::pointer pointer;
+  typedef typename value_alloc_type::const_pointer const_pointer;
+  typedef dense_hashtable_lockless_iterator<Value, Key, HashFcn, ExtractKey, SetKey,
+                                   EqualKey, Alloc, Data> iterator;
+
+  typedef dense_hashtable_lockless_const_iterator<
+      Value, Key, HashFcn, ExtractKey, SetKey, EqualKey, Alloc, Data> const_iterator;
+
+  // These come from tr1.  For us they're the same as regular iterators.
+  typedef iterator local_iterator;
+  typedef const_iterator const_local_iterator;
+ // A struct used for atomic copying for table parameters when using multi-thread
+
+  // How full we let the table get before we resize, by default.
+  // Knuth says .8 is good -- higher causes us to probe too much,
+  // though it saves memory.
+  static const int HT_OCCUPANCY_PCT;  // defined at the bottom of this file
+
+  // How empty we let the table get before we resize lower, by default.
+  // (0.0 means never resize lower.)
+  // It should be less than OCCUPANCY_PCT / 2 or we thrash resizing
+  static const int HT_EMPTY_PCT;  // defined at the bottom of this file
+
+  // Minimum size we're willing to let hashtables be.
+  // Must be a power of two, and at least 4.
+  // Note, however, that for a given hashtable, the initial size is a
+  // function of the first constructor arg, and may be >HT_MIN_BUCKETS.
+  static const size_type HT_MIN_BUCKETS = 4;
+
+  // By default, if you don't specify a hashtable size at
+  // construction-time, we use this size.  Must be a power of two, and
+  // at least HT_MIN_BUCKETS.
+  static const size_type HT_DEFAULT_STARTING_BUCKETS = 32;
+
+  // ITERATOR FUNCTIONS
+
+  iterator begin() { return iterator(this, pnew->table_, pnew->table_ + pnew->num_buckets_, true); }
+
+
+  iterator end() {
+    return iterator(this, pnew->table_ + pnew->num_buckets_, pnew->table_ + pnew->num_buckets_, true);
+  }
+
+  iterator end(void *tmp_pointer){
+    TableInternalParameter* tmp = static_cast<TableInternalParameter*>(tmp_pointer);
+    return iterator(this, tmp->table_ + tmp->num_buckets_, tmp->table_ + tmp->num_buckets_, true);
+  }
+
+  const_iterator begin() const {
+    return const_iterator(this, pnew->table_, pnew->table_ + pnew->num_buckets_, true);
+  }
+  const_iterator end() const {
+    return const_iterator(this, pnew->table_ + pnew->num_buckets_, pnew->table_ + pnew->num_buckets_, true);
+  }
+
+  // These come from tr1 unordered_map.  They iterate over 'bucket' n.
+  // We'll just consider bucket n to be the n-th element of the table.
+  local_iterator begin(size_type i) {  // one-bucket range [i, i+1); no skipping of empty/deleted buckets (advance is false)
+    return local_iterator(this, table + i, table + i + 1, false);  // NOTE(review): reads `table`, while the const overload reads pnew->table_ -- confirm both name the same array
+  }
+  local_iterator end(size_type i) {
+    local_iterator it = begin(i);
+    if (!test_empty(i) && !test_deleted(i)) ++it;
+    return it;
+  }
+
+  const_local_iterator begin(size_type i) const {
+    return const_local_iterator(this, pnew->table_ + i, pnew->table_ + i + 1, false);
+  }
+
+  const_local_iterator end(size_type i) const {
+    const_local_iterator it = begin(i);
+    if (!test_empty(i) && !test_deleted(i)) ++it;
+    return it;
+  }
+
+  // ACCESSOR FUNCTIONS for the things we templatize on, basically
+  hasher hash_funct() const { return settings; }
+  key_equal key_eq() const { return key_info; }
+  allocator_type get_allocator() const { return allocator_type(val_info); }
+
+  // Accessor function for statistics gathering.
+  int num_table_copies() const { return settings.num_ht_copies(); }
+
+ private:
+  // Annoyingly, we can't copy values around, because they might have
+  // const components (they're probably pair<const X, Y>).  We use
+  // explicit destructor invocation and placement new to get around
+  // this.  Arg.
+
+
+  template <typename... Args>
+  void set_value(pointer dst, value_type p) {
+    dst->~value_type();  // delete the old value, if any
+    new (dst) value_type(p);
+  }
+
+  // Lock-free claim of a key-only bucket: atomically replaces *dst with p,
+  // but only if *dst currently holds the empty key or, failing that, the
+  // deleted key.  Returns true when one of the two compare-and-swaps won;
+  // false means neither marker was found and nothing was written.
+  template <typename... Args, typename K>
+  bool set_value_lockless(K* dst, K p){
+    using NCKey = typename std::remove_cv<Key>::type;
+    NCKey* const slot = const_cast<NCKey*>(dst);
+    // Try the empty marker first; the deleted marker is only attempted
+    // when that first swap lost.
+    if (__sync_bool_compare_and_swap(slot, pnew->key_info_.empty_key, p)) {
+      return true;
+    }
+    return __sync_bool_compare_and_swap(slot, pnew->key_info_.delkey, p);
+  }
+
+
+  // Lock-free claim of a key/value bucket.  The mapped value is swapped in
+  // first (only if the slot still holds empty_value), then the key is
+  // swapped in over the empty key or, failing that, the deleted key.
+  // Returns true only if the value swap and one key swap both succeeded.
+  // NOTE(review): when the value swap wins but both key swaps lose, false is
+  // returned with p.second left in the slot -- confirm callers expect this.
+  template <typename... Args, typename K, typename T>
+  bool set_value_lockless(std::pair<K, T>* dst, std::pair<K, T> p){
+    if (!__sync_bool_compare_and_swap(&dst->second, empty_value, p.second)) {
+      return false;
+    }
+    // The value swap succeeded; now attempt the key.  The const_cast strips
+    // any cv-qualifier from the stored key so it can be written.
+    using NCKey = typename std::remove_cv<Key>::type;
+    NCKey* const key_slot = const_cast<NCKey*>(&dst->first);
+    if (__sync_bool_compare_and_swap(key_slot, pnew->key_info_.empty_key, p.first)) {
+      return true;
+    }
+    return __sync_bool_compare_and_swap(key_slot, pnew->key_info_.delkey, p.first);
+  }
+
+
+  template <typename T, typename... Args>
+  void set_value(pointer dst, key_type first, T second) {
+    dst->~value_type();  // delete the old value, if any
+    new (dst) value_type(first, second);
+  }
+
+  void destroy_buckets(size_type first, size_type last) {
+    for (; first != last; ++first) table[first].~value_type();
+  }
+
+  // DELETE HELPER FUNCTIONS
+  // This lets the user describe a key that will indicate deleted
+  // table entries.  This key should be an "impossible" entry --
+  // if you try to insert it for real, you won't be able to retrieve it!
+  // (NB: while you pass in an entire value, only the key part is looked
+  // at.  This is just because I don't know how to assign just a key.)
+ private:
+  void squash_deleted() {          // gets rid of any deleted entries we have
+    if (num_deleted) {             // get rid of deleted before writing
+      size_type resize_to = settings.min_buckets(
+          num_elements, bucket_count());
+      dense_hashtable_lockless tmp(std::move(*this), resize_to);  // copying will get rid of deleted
+      swap(tmp);                   // now we are tmp
+    }
+    assert(num_deleted == 0);
+  }
+
+  // Test if the given key is the deleted indicator.  Requires
+  // num_deleted > 0, for correctness of read(), and because that
+  // guarantees that key_info.delkey is valid.
+
+  bool test_deleted_key(const key_type& key) const {
+    return equals(pnew->key_info_.delkey, key);
+  }
+
+  bool test_deleted_key(const key_type& key, void* tmp_pointer) const {
+    TableInternalParameter* tmp = static_cast<TableInternalParameter*>(tmp_pointer);
+    return equals(tmp->key_info_.delkey, key);
+  }
+
+
+ public:
+  void set_deleted_key(const key_type& key) {
+    // the empty indicator (if specified) and the deleted indicator must differ
+    assert(
+        (!settings.use_empty() || !equals(key, key_info.empty_key)) &&
+        "Passed the empty-key to set_deleted_key");
+    // It's only safe to change what "deleted" means if we purge deleted guys
+    squash_deleted();
+    settings.set_use_deleted(true);
+    pnew->settings_.set_use_deleted(true);  // the pnew/pold parameter blocks carry their own copies
+    pold->settings_.set_use_deleted(true);
+    pnew->key_info_.delkey = key;
+    pold->key_info_.delkey = key;
+    key_info.delkey = key;
+  }
+  // Stops using a deleted marker; the flag is cleared on pnew/pold too because test_deleted() and check_use_deleted() read pnew->settings_.
+  void clear_deleted_key() {
+    squash_deleted();
+    settings.set_use_deleted(false);
+    pnew->settings_.set_use_deleted(false);  // mirrors set_deleted_key(), which dereferences both blocks unconditionally
+    pold->settings_.set_use_deleted(false);
+  }
+  key_type deleted_key() const {
+    assert(settings.use_deleted() &&
+           "Must set deleted key before calling deleted_key");
+    return key_info.delkey;
+  }
+
+  // These are public so the iterators can use them
+  // True if the item at position bucknum is "deleted" marker
+
+  bool test_deleted(size_type bucknum) const {
+    // Invariant: !use_deleted() implies num_deleted is 0.
+    assert(pnew->settings_.use_deleted() || pnew->num_deleted_ == 0);
+    return pnew->num_deleted_ > 0 && test_deleted_key(get_key(pnew->table_[bucknum]));
+  }
+  bool test_deleted(size_type bucknum, void* tmp_pointer) const {
+    // Invariant: !use_deleted() implies num_deleted is 0.
+    TableInternalParameter* tmp = static_cast<TableInternalParameter*>(tmp_pointer);
+    assert(tmp->settings_.use_deleted() || tmp->num_deleted_ == 0);
+    return test_deleted_key(get_key(tmp->table_[bucknum]), tmp_pointer);
+  }
+
+
+
+
+  bool test_deleted(const iterator& it) const {
+    // Invariant: !use_deleted() implies num_deleted is 0.
+    assert(settings.use_deleted() || num_deleted == 0);
+    return test_deleted_key(get_key(*it));
+  }
+
+
+  bool test_deleted(const const_iterator& it) const {
+    // Invariant: !use_deleted() implies num_deleted is 0.
+    assert(pnew->settings_.use_deleted() || pnew->num_deleted_ == 0);
+    return test_deleted_key(get_key(*it));
+  }
+
+
+ private:
+  void check_use_deleted(const char* caller) {
+    (void)caller;  // could log it if the assert failed
+    assert(pnew->settings_.use_deleted());
+
+  }
+
+  // Set it so test_deleted is true.  true if object didn't used to be deleted.
+  bool set_deleted(iterator& it) {
+    check_use_deleted("set_deleted()");
+    bool retval = !test_deleted(it);
+    // &* converts from iterator to value-type.
+    set_key(&(*it), key_info.delkey);
+    return retval;
+  }
+  // Set it so test_deleted is false.  true if object used to be deleted.
+  bool clear_deleted(iterator& it) {
+    check_use_deleted("clear_deleted()");
+    // Happens automatically when we assign something else in its place.
+    return test_deleted(it);
+  }
+
+  // We also allow to set/clear the deleted bit on a const iterator.
+  // We allow a const_iterator for the same reason you can delete a
+  // const pointer: it's convenient, and semantically you can't use
+  // 'it' after it's been deleted anyway, so its const-ness doesn't
+  // really matter.
+  bool set_deleted(const_iterator& it) {
+    check_use_deleted("set_deleted()");
+    bool retval = !test_deleted(it);
+    set_key(const_cast<pointer>(&(*it)), key_info.delkey);
+    return retval;
+  }
+  // Returns whether `it` is currently marked deleted; this function itself stores nothing.
+  bool clear_deleted(const_iterator& it) {
+    check_use_deleted("clear_deleted()");
+    return test_deleted(it);
+  }
+
+  // EMPTY HELPER FUNCTIONS
+  // This lets the user describe a key that will indicate empty (unused)
+  // table entries.  This key should be an "impossible" entry --
+  // if you try to insert it for real, you won't be able to retrieve it!
+  // (NB: while you pass in an entire value, only the key part is looked
+  // at.  This is just because I don't know how to assign just a key.)
+ public:
+  // These are public so the iterators can use them
+  // True if the item at position bucknum is "empty" marker
+
+  bool test_empty(size_type bucknum) const {  // does bucket bucknum of pnew's table hold the empty key?
+    assert(pnew->settings_.use_empty());  // we always need to know what's empty!
+    return equals(pnew->key_info_.empty_key, get_key(pnew->table_[bucknum]));
+  }
+  bool test_empty(size_type bucknum, void* tmp_pointer) const {  // same test on a caller-supplied table
+    TableInternalParameter* tmp = static_cast<TableInternalParameter*>(tmp_pointer);  // tmp_pointer must really be a TableInternalParameter*
+    assert(tmp->settings_.use_empty());  // we always need to know what's empty!
+    return equals(tmp->key_info_.empty_key, get_key(tmp->table_[bucknum]));
+  }
+  bool test_old_empty(size_type bucknum) const {  // does bucket bucknum of pold's table hold the empty key?
+    assert(pold->settings_.use_empty());  // we always need to know what's empty!
+    return equals(pold->key_info_.empty_key, get_key(pold->table_[bucknum]));
+  }
+
+
+
+  bool test_empty(const iterator& it) const {  // does *it carry pnew's empty key?
+    assert(pnew->settings_.use_empty());  // we always need to know what's empty!
+    return equals(pnew->key_info_.empty_key, get_key(*it));
+  }
+  bool test_empty(const const_iterator& it) const {  // const_iterator flavour: does *it carry pnew's empty key?
+    assert(pnew->settings_.use_empty());  // we always need to know what's empty!
+    return equals(pnew->key_info_.empty_key, get_key(*it));
+  }
+
+ private:
+  // Constructs pnew's empty key in each of the `count` slots starting at table_start.
+  void fill_range_with_empty(pointer table_start, size_type count) {
+    pointer const stop = table_start + count;
+    for (pointer slot = table_start; slot != stop; ++slot)
+      construct_key(slot, pnew->key_info_.empty_key);
+  }
+  template <typename K, typename T>
+  void set_bucket(std::pair<K, T>* table_start, T value) {  // pair bucket: store the empty key and `value`
+    using NCKey = typename std::remove_cv<Key>::type;  // Key with const/volatile stripped
+    *const_cast<NCKey*>(&table_start->first) = pnew->key_info_.empty_key;  // write through any const on `first`
+    table_start->second = value;
+  }
+
+  template <typename K, typename T>
+  void set_bucket(K* table_start, T /*value*/) {  // key-only bucket: the value argument is accepted but not stored
+    using NCKey = typename std::remove_cv<Key>::type;  // Key with const/volatile stripped
+    *const_cast<NCKey*>(table_start) = pnew->key_info_.empty_key;
+  }
+
+
+  // Resets `count` buckets starting at table_start via set_bucket(), passing
+  // `value` along for each one.
+  template <typename T>
+  void fill_range_with_empty(pointer table_start, size_type count, T value){
+    for (pointer slot = table_start, stop = table_start + count; slot != stop; ++slot)
+      set_bucket(slot, value);
+  }
+
+
+ public:
+template<typename T>
+  void set_empty_key_and_value(const key_type& key, T value){  // one-time setup: empty key/value, both tables at 128 buckets
+    assert(!pnew->settings_.use_empty() && "Calling set_empty_key multiple times");
+    assert(!pold->settings_.use_empty() && "Calling set_empty_key multiple times");
+    assert(
+        (!pnew->settings_.use_deleted() || !equals(key, pnew->key_info_.delkey)) &&
+        "Setting the empty key the same as the deleted key");
+    assert(
+        (!pold->settings_.use_deleted() || !equals(key, pold->key_info_.delkey)) &&
+        "Setting the empty key the same as the deleted key");
+    pnew->settings_.set_use_empty(true);
+    pnew->key_info_.empty_key = key;
+    empty_value = value;  // returned by find_wait_free() on a miss
+    pold->settings_.set_use_empty(true);
+    pold->key_info_.empty_key = key;
+    assert(!pnew->table_);  // must be called before the tables exist
+    assert(!pold->table_);
+    pnew->num_buckets_ = pold->num_buckets_ = 128;
+    // The constructor derived the thresholds from its own bucket count; re-derive them for 128.
+    pnew->settings_.reset_thresholds(pnew->num_buckets_);
+    pold->settings_.reset_thresholds(pold->num_buckets_);
+    pnew->table_ = pnew->val_info_.allocate(pnew->num_buckets_);
+    assert(pnew->table_);
+    fill_range_with_empty(pnew->table_, pnew->num_buckets_, value);
+    pold->table_ = pnew->val_info_.allocate(pold->num_buckets_);
+    assert(pold->table_);
+    fill_range_with_empty(pold->table_, pold->num_buckets_, value);
+  }
+
+  void set_empty_key(const key_type& key) {
+    // Once you set the empty key, you can't change it
+    assert(!pnew->settings_.use_empty() && "Calling set_empty_key multiple times");
+    assert(!pold->settings_.use_empty() && "Calling set_empty_key multiple times");
+
+    // The deleted indicator (if specified) and the empty indicator
+    // must be different.
+    assert(
+        (!pnew->settings_.use_deleted() || !equals(key, pnew->key_info_.delkey)) &&
+        "Setting the empty key the same as the deleted key");
+    assert(
+        (!pold->settings_.use_deleted() || !equals(key, pold->key_info_.delkey)) &&
+        "Setting the empty key the same as the deleted key");
+    pnew->settings_.set_use_empty(true);
+    pnew->key_info_.empty_key = key;
+    // Mirror the same empty key into pold.
+    pold->settings_.set_use_empty(true);
+    pold->key_info_.empty_key = key;
+
+    assert(!pnew->table_);  // must set before first use
+    // num_buckets was set in constructor even though table was NULL
+    assert(!pold->table_);
+    pnew->table_ = pnew->val_info_.allocate(pnew->num_buckets_);
+    assert(pnew->table_);
+    fill_range_with_empty(pnew->table_, pnew->num_buckets_);
+    pold->table_ = pnew->val_info_.allocate(pold->num_buckets_);  // NOTE(review): allocated through pnew's val_info_, not pold's
+    assert(pold->table_);
+    fill_range_with_empty(pold->table_, pold->num_buckets_);
+  }
+
+  void set_counternum(size_type counternum){  // allocates and zeroes the two per-counter arrays
+    num_counters = counternum;
+    num_elements_array = (size_type*)_mm_malloc(sizeof(size_type)*num_counters,64);  // requested with 64-byte alignment
+    counter_flag = (long*)malloc(sizeof(long)*num_counters);
+    for(size_type i = 0; i < num_counters; i++){  // NOTE(review): neither allocation is checked for NULL before this loop
+      num_elements_array[i] = 0;
+      counter_flag[i] = 0;
+    }
+  }
+
+
+  key_type empty_key() const {  // the empty key stored in pnew by set_empty_key() / set_empty_key_and_value()
+    assert(pnew->settings_.use_empty());
+    return pnew->key_info_.empty_key;
+  }
+
+  // FUNCTIONS CONCERNING SIZE
+ public:
+
+  size_type size() const { return pnew->num_elements_ - pnew->num_deleted_; }  // pnew's element count minus its deleted count
+
+  size_type max_size() const { return val_info.max_size(); }  // forwarded from val_info
+
+  bool empty() const { return size() == 0; }
+
+  size_type bucket_count() const { return pnew->num_buckets_; }  // buckets in pnew
+  size_type old_bucket_count() const { return pold->num_buckets_; }  // buckets in pold
+
+  size_type max_bucket_count() const { return max_size(); }
+  size_type nonempty_bucket_count() const { return pnew->num_elements_; }  // pnew's count, deleted entries included (size() subtracts them)
+  // tr1 method; our notion of 'bucket' holds at most one item, so report 0 or 1.
+  size_type bucket_size(size_type i) const {
+    if (begin(i) == end(i)) return 0;
+    return 1;
+  }
+
+ private:
+  // Because of the above, size_type(-1) is never legal; use it for errors
+  static const size_type ILLEGAL_BUCKET = size_type(-1);
+
+  // Used after a string of deletes.  Returns true if we actually shrunk.
+  // TODO(csilvers): take a delta so we can take into account inserts
+  // done after shrinking.  Maybe make part of the Settings class?
+  bool maybe_shrink() {
+    assert(pnew->num_elements_ >= pnew->num_deleted_);
+    assert((bucket_count() & (bucket_count() - 1)) == 0);  // is a power of two
+    assert(bucket_count() >= HT_MIN_BUCKETS);
+    bool retval = false;
+
+    // If you construct a hashtable with < HT_DEFAULT_STARTING_BUCKETS,
+    // we'll never shrink until you get relatively big, and we'll never
+    // shrink below HT_DEFAULT_STARTING_BUCKETS.  Otherwise, something
+    // like "dense_hash_set<int> x; x.insert(4); x.erase(4);" will
+    // shrink us down to HT_MIN_BUCKETS buckets, which is too small.
+    // num_remain: entries in pnew that are not marked deleted.
+    const size_type num_remain = pnew->num_elements_ - pnew->num_deleted_;
+    const size_type shrink_threshold = pnew->settings_.shrink_threshold();
+    // NOTE(review): the threshold comes from pnew->settings_ but shrink_factor below comes from `settings` -- confirm they agree.
+    if (shrink_threshold > 0 && num_remain < shrink_threshold &&
+        bucket_count() > HT_DEFAULT_STARTING_BUCKETS) {
+      const float shrink_factor = settings.shrink_factor();
+      size_type sz = bucket_count() / 2;  // find how much we should shrink
+      while (sz > HT_DEFAULT_STARTING_BUCKETS &&
+             num_remain < sz * shrink_factor) {
+        sz /= 2;  // stay a power of 2
+      }
+      dense_hashtable_lockless tmp(std::move(*this), sz);  // Do the actual resizing.  NOTE(review): that constructor and swap() only touch settings/key_info/table/..., not pnew/pold -- confirm.
+      swap(tmp);                       // now we are tmp
+      retval = true;
+    }
+    pnew->settings_.set_consider_shrink(false);  // because we just considered it
+
+    return retval;
+  }
+
+  // We'll let you resize a hashtable -- though this makes us copy all!
+  // When you resize, you say, "make it big enough for this many more elements"
+  // Returns true after rebuilding into pold and exchanging pnew/pold; otherwise whatever maybe_shrink() reported.
+  bool resize_delta(size_type delta) {
+    bool did_resize = false;
+    if (pnew->settings_.consider_shrink()) {  // see if lots of deletes happened
+      if (maybe_shrink()) did_resize = true;
+    }
+
+    if (pnew->num_elements_ >= (std::numeric_limits<size_type>::max)() - delta) {
+      // num_elements_ + delta would not fit in size_type.
+      throw std::length_error("resize overflow");
+    }
+    if (bucket_count() >= HT_MIN_BUCKETS &&
+        (pnew->num_elements_ + delta) <= pnew->settings_.enlarge_threshold())
+      return did_resize;  // we're ok as we are
+
+
+    // Sometimes, we need to resize just to get rid of all the
+    // "deleted" buckets that are clogging up the hashtable.  So when
+    // deciding whether to resize, count the deleted buckets (which
+    // are currently taking up room).  But later, when we decide what
+    // size to resize to, *don't* count deleted buckets, since they
+    // get discarded during the resize.
+    size_type needed_size = pnew->settings_.min_buckets(pnew->num_elements_ + delta, 0);
+
+
+    if (needed_size <= bucket_count())  // we have enough buckets
+      return did_resize;
+
+    size_type resize_to = pnew->settings_.min_buckets(
+        pnew->num_elements_ - pnew->num_deleted_ + delta, bucket_count());
+
+
+
+    // When num_deleted is large, we may still grow but we do not want to
+    // over expand.  So we reduce needed_size by a portion of num_deleted
+    // (the exact portion does not matter).  This is especially helpful
+    // when min_load_factor is zero (no shrink at all) to avoid doubling
+    // the bucket count to infinity.  See also test ResizeWithoutShrink.
+    needed_size = pnew->settings_.min_buckets(pnew->num_elements_ - pnew->num_deleted_ / 4 + delta, 0);
+
+
+    if (resize_to < needed_size &&  // may double resize_to
+        resize_to < (std::numeric_limits<size_type>::max)() / 2) {
+      // This situation means that we have enough deleted elements,
+      // that once we purge them, we won't actually have needed to
+      // grow.  But we may want to grow anyway: if we just purge one
+      // element, say, we'll have to grow anyway next time we
+      // insert.  Might as well grow now, since we're already going
+      // through the trouble of copying (in order to purge the
+      // deleted elements).
+
+      const size_type target =
+          static_cast<size_type>(pnew->settings_.shrink_size(resize_to * 2));
+      if (pnew->num_elements_ - pnew->num_deleted_ + delta >= target) {
+        // Good, we won't be below the shrink threshhold even if we double.
+        resize_to *= 2;
+      }
+    }
+    // Rebuild pold as an empty table of resize_to buckets, rehash every live
+    // entry of pnew into it, then exchange the pnew/pold pointers.
+    for (unsigned long i = 0; i < pold->num_buckets_; ++i) pold->table_[i].~value_type();
+    pold->table_ = pold->val_info_.allocate(resize_to);  // NOTE(review): the previous pold->table_ storage is not deallocated here -- confirm this is deliberate
+    assert(pold->table_);
+    fill_range_with_empty(pold->table_, resize_to);
+    pold->num_elements_ = 0;
+    pold->num_deleted_ = 0;
+    pold->num_buckets_ = resize_to;  // our new size
+    pold->settings_.reset_thresholds(resize_to);
+    assert((old_bucket_count() & (old_bucket_count() - 1)) == 0);
+    size_type bucknum;
+    const size_type bucket_count_minus_one = old_bucket_count() - 1;
+    for(unsigned long i = 0; i < bucket_count(); i++){
+      if (!test_empty(i) && !test_deleted(i)) {
+        size_type num_probes = 0;  // how many times we've probed
+        for (bucknum = hash(get_key(pnew->table_[i])) & bucket_count_minus_one;
+           !test_old_empty(bucknum);  // not empty
+           bucknum =
+               (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one) {
+        ++num_probes;
+        assert(num_probes < old_bucket_count() &&
+               "Hashtable is full: an error in key_equal<> or hash<>");
+      }
+      using will_move = std::is_rvalue_reference<dense_hashtable_lockless &&>;
+      using value_t = typename std::conditional<will_move::value, value_type&&, const_reference>::type;
+      // will_move is always true here, so value_t is value_type&& and the entry is forwarded as an rvalue.
+      set_value(&(pold->table_[bucknum]), std::forward<value_t>(pnew->table_[i]));
+      pold->num_elements_++;
+      }
+      // NOTE(review): tmp_atomic below is local to this call; if pnew/pold are plain
+      // pointers, confirm that concurrent readers are synchronized some other way.
+    }
+    std::atomic<TableInternalParameter*> tmp_atomic (0);
+    tmp_atomic.store(pold);
+    TableInternalParameter* tmp = pnew;
+    pnew = tmp_atomic.load();
+    pold = tmp;
+    return true;
+  }
+
+  // We require table be not-NULL and empty before calling this.
+  void resize_table(size_type /*old_size*/, size_type new_size,
+                    std::true_type) {  // overload chosen when the caller passes a true_type tag
+    table = val_info.realloc_or_die(table, new_size);
+  }
+
+  void resize_table(size_type old_size, size_type new_size, std::false_type) {  // false_type tag: deallocate, then allocate anew
+    val_info.deallocate(table, old_size);
+    table = val_info.allocate(new_size);  // nothing is copied from the old storage
+  }
+
+  // Used to actually do the rehashing when we grow/shrink a hashtable
+  template <typename Hashtable>
+  void copy_or_move_from(Hashtable&& ht, size_type min_buckets_wanted) {
+    clear_to_size(settings.min_buckets(ht.size(), min_buckets_wanted));
+    // NOTE(review): test_empty(bucknum) below inspects pnew->table_, while set_value() writes into `table` -- confirm.
+    // We use a normal iterator to get non-deleted bcks from ht
+    // We could use insert() here, but since we know there are
+    // no duplicates and no deleted items, we can be more efficient
+    assert((bucket_count() & (bucket_count() - 1)) == 0);  // a power of two
+    for (auto&& value : ht) {
+      size_type num_probes = 0;  // how many times we've probed
+      size_type bucknum;
+      const size_type bucket_count_minus_one = bucket_count() - 1;
+      for (bucknum = hash(get_key(value)) & bucket_count_minus_one;
+           !test_empty(bucknum);  // not empty
+           bucknum =
+               (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one) {
+        ++num_probes;
+        assert(num_probes < bucket_count() &&
+               "Hashtable is full: an error in key_equal<> or hash<>");
+      }
+      // Forward as an rvalue only when ht itself was passed as an rvalue.
+      using will_move = std::is_rvalue_reference<Hashtable&&>;
+      using value_t = typename std::conditional<will_move::value, value_type&&, const_reference>::type;
+
+      set_value(&table[bucknum], std::forward<value_t>(value));
+      num_elements++;
+    }
+    settings.inc_num_ht_copies();
+  }
+
+  // Required by the spec for hashed associative container
+ public:
+  // Though the docs say this should be num_buckets, I think it's much
+  // more useful as num_elements.  As a special feature, calling with
+  // req_elements==0 will cause us to shrink if we can, saving space.
+  void resize(size_type req_elements) {  // resize to this or larger
+  if (pnew->settings_.consider_shrink() || req_elements == 0) maybe_shrink();  // shrink pass first
+  if (req_elements > pnew->num_elements_) resize_delta(req_elements - pnew->num_elements_);  // then grow by the shortfall
+  }
+
+  // Get and change the value of shrink_factor and enlarge_factor.  The
+  // description at the beginning of this file explains how to choose
+  // the values.  Setting the shrink parameter to 0.0 ensures that the
+  // table never shrinks.
+  void get_resizing_parameters(float* shrink, float* grow) const {  // both out-pointers are written unconditionally
+    *shrink = settings.shrink_factor();
+    *grow = settings.enlarge_factor();
+  }
+  void set_resizing_parameters(float shrink, float grow) {  // NOTE(review): updates `settings` only; resize_delta() consults pnew->settings_ -- confirm
+    settings.set_resizing_parameters(shrink, grow);
+    settings.reset_thresholds(bucket_count());  // thresholds recomputed for pnew's bucket count
+  }
+
+  // CONSTRUCTORS -- as required by the specs, we take a size,
+  // but also let you specify a hashfunction, key comparator,
+  // and key extractor.  We also define a copy constructor and =.
+  // DESTRUCTOR -- needs to free the table
+  explicit dense_hashtable_lockless(size_type expected_max_items_in_table = 0,
+                           const HashFcn& hf = HashFcn(),
+                           const EqualKey& eql = EqualKey(),
+                           const ExtractKey& ext = ExtractKey(),
+                           const SetKey& set = SetKey(),
+                           const Alloc& alloc = Alloc())
+      : settings(hf),
+        key_info(ext, set, eql),
+        num_deleted(0),
+        num_elements(0),
+        num_buckets(expected_max_items_in_table == 0
+                        ? HT_DEFAULT_STARTING_BUCKETS
+                        : settings.min_buckets(expected_max_items_in_table, 0)),
+        val_info(alloc_impl<value_alloc_type>(alloc)),
+        // tp_new and tp_old are built from the same constructor arguments.
+        insert_counter(0),
+        tp_new(expected_max_items_in_table, hf, eql, ext, set, alloc),
+        tp_old(expected_max_items_in_table, hf, eql, ext, set, alloc),
+
+        table(NULL) {
+    // table is NULL until emptyval is set.  However, we set num_buckets
+    // here so we know how much space to allocate once emptyval is set
+    // pnew/pold start out pointing at the two embedded blocks tp_new/tp_old.
+
+    pnew = &tp_new;
+    pold = &tp_old;
+    resizeflag = false;
+    tp_new.settings_.reset_thresholds(bucket_count());
+    tp_old.settings_.reset_thresholds(bucket_count());
+    // bucket_count() is pnew->num_buckets_, so all three use tp_new's bucket count.
+    settings.reset_thresholds(bucket_count());
+  }
+
+  // Copy constructor.  As a convenience for resize(), the optional second argument lets the copy use a different size than ht.
+  // NOTE(review): pnew/pold and tp_new/tp_old are not set up here, yet bucket_count() and the destructor read pnew -- confirm.
+  dense_hashtable_lockless(const dense_hashtable_lockless& ht,
+                  size_type min_buckets_wanted = HT_DEFAULT_STARTING_BUCKETS)
+      : settings(ht.settings),
+        key_info(ht.key_info),
+        num_deleted(0),
+        num_elements(0),
+        num_buckets(0),
+        val_info(ht.val_info),
+        table(NULL) {
+    if (!ht.settings.use_empty()) {
+      // If use_empty isn't set, copy_from will crash, so we do our own copying.
+      assert(ht.empty());
+      num_buckets = settings.min_buckets(ht.size(), min_buckets_wanted);
+      settings.reset_thresholds(bucket_count());
+      return;
+    }
+    settings.reset_thresholds(bucket_count());
+    copy_or_move_from(ht, min_buckets_wanted);  // copy_or_move_from() ignores deleted entries
+  }
+
+  dense_hashtable_lockless(dense_hashtable_lockless&& ht)  // move constructor
+      : dense_hashtable_lockless() {  // default-construct first ...
+    swap(ht);  // ... then exchange whatever swap() exchanges with ht
+  }
+
+  dense_hashtable_lockless(dense_hashtable_lockless&& ht,  // sized move constructor, used by maybe_shrink()
+                  size_type min_buckets_wanted)  // NOTE(review): pnew/pold and tp_new/tp_old are not set up here either -- confirm
+      : settings(ht.settings),
+        key_info(ht.key_info),
+        num_deleted(0),
+        num_elements(0),
+        num_buckets(0),
+        val_info(std::move(ht.val_info)),
+        table(NULL) {
+    if (!ht.settings.use_empty()) {
+      // If use_empty isn't set, copy_or_move_from will crash, so we do our own copying.
+      assert(ht.empty());
+      num_buckets = settings.min_buckets(ht.size(), min_buckets_wanted);
+      settings.reset_thresholds(bucket_count());
+      return;
+    }
+    settings.reset_thresholds(bucket_count());
+    copy_or_move_from(std::move(ht), min_buckets_wanted);  // copy_or_move_from() ignores deleted entries
+  }
+
+  dense_hashtable_lockless& operator=(const dense_hashtable_lockless& ht) {  // copy assignment
+    if (&ht == this) return *this;  // don't copy onto ourselves
+    if (!ht.settings.use_empty()) {  // ht has no empty key in `settings`: it must be empty, so swap in an empty copy
+      assert(ht.empty());
+      dense_hashtable_lockless empty_table(ht);  // empty table with ht's thresholds
+      this->swap(empty_table);
+      return *this;
+    }
+    settings = ht.settings;
+    key_info = ht.key_info;
+    // copy_or_move_from() calls clear and sets num_deleted to 0 too
+    copy_or_move_from(ht, HT_MIN_BUCKETS);
+    // we purposefully don't copy the allocator, which may not be copyable
+    return *this;
+  }
+
+  dense_hashtable_lockless& operator=(dense_hashtable_lockless&& ht) {  // move assignment, implemented as a swap
+    assert(&ht != this); // this should not happen
+    swap(ht);  // ht is left holding this object's previous state
+    return *this;
+  }
+
+  ~dense_hashtable_lockless() {  // releases the legacy `table` and the tables owned by pnew/pold
+    if (table) {
+      destroy_buckets(0, num_buckets);
+      val_info.deallocate(table, num_buckets);
+    }
+    for(size_type i = 0; pnew->table_ && i < pnew->num_buckets_; i++)  // set_empty_key*() expects table_ to be NULL beforehand, so it may still be NULL
+      pnew->table_[i].~value_type();
+    for(size_type i = 0; pold->table_ && i < pold->num_buckets_; i++)  // same guard for pold
+      pold->table_[i].~value_type();
+    free(pnew->table_);  // free(NULL) is a no-op
+    free(pold->table_);  // NOTE(review): num_elements_array / counter_flag from set_counternum() are not released here -- confirm
+  }
+
+  // Many STL algorithms use swap instead of copy constructors
+  void swap(dense_hashtable_lockless& ht) {
+    std::swap(settings, ht.settings);
+    std::swap(key_info, ht.key_info);
+    std::swap(num_deleted, ht.num_deleted);
+    std::swap(num_elements, ht.num_elements);
+    std::swap(num_buckets, ht.num_buckets);
+    std::swap(table, ht.table);  // NOTE(review): pnew/pold and tp_new/tp_old are not exchanged anywhere in this function -- confirm
+    settings.reset_thresholds(bucket_count());  // also resets consider_shrink
+    ht.settings.reset_thresholds(ht.bucket_count());
+    // we purposefully don't swap the allocator, which may not be swap-able
+  }
+
+ private:
+  void clear_to_size(size_type new_num_buckets) {  // leaves `table` holding new_num_buckets empty buckets
+    if (!table) {  // first use: nothing to destroy yet
+      table = val_info.allocate(new_num_buckets);
+    } else {
+      destroy_buckets(0, num_buckets);
+      if (new_num_buckets != num_buckets) {  // resize, if necessary
+        typedef std::integral_constant<
+            bool, std::is_same<value_alloc_type,
+                               libc_allocator_with_realloc<value_type>>::value>
+            realloc_ok;  // true only for libc_allocator_with_realloc; selects the resize_table() overload
+        resize_table(num_buckets, new_num_buckets, realloc_ok());
+      }
+    }
+    assert(table);
+    fill_range_with_empty(table, new_num_buckets);  // fills with pnew's empty key
+    num_elements = 0;
+    num_deleted = 0;
+    num_buckets = new_num_buckets;  // our new size
+    settings.reset_thresholds(bucket_count());
+  }
+
+ public:
+  // Empties the table and brings it to settings.min_buckets(0, 0) buckets.
+  // NOTE(review): reads the legacy num_elements/num_buckets members rather
+  // than pnew/pold -- confirm that is intended.
+  void clear() {
+    const size_type target_buckets = settings.min_buckets(0, 0);
+    const bool already_clear = (num_elements == 0) && (target_buckets == num_buckets);
+    if (!already_clear) {
+      clear_to_size(target_buckets);
+    }
+  }
+
+  // Clear the table without resizing it.
+  // Mimicks the stl_hashtable's behaviour when clear()-ing in that it
+  // does not modify the bucket count
+  void clear_no_resize() {
+    if (num_elements > 0) {  // an already-empty table needs no bucket work
+      assert(table);
+      destroy_buckets(0, num_buckets);
+      fill_range_with_empty(table, num_buckets);
+    }
+    // don't consider to shrink before another erase()
+    settings.reset_thresholds(bucket_count());
+    num_elements = 0;
+    num_deleted = 0;
+  }
+
+  // LOOKUP ROUTINES
+ private:
+  // Returns a pair of positions: 1st where the object is, 2nd where
+  // it would go if you wanted to insert it.  1st is ILLEGAL_BUCKET
+  // if object is not found; 2nd is ILLEGAL_BUCKET if it is.
+  // Note: because of deletions where-to-insert is not trivial: it's the
+  // first deleted bucket we see, as long as we don't find the key later
+  template <typename K>
+  std::pair<size_type, size_type> find_position_for_insert(const K& key, void* tmp){
+    size_type num_probes = 0;
+    TableInternalParameter* tmp_pointer = static_cast<TableInternalParameter*>(tmp);  // tmp must really be a TableInternalParameter*
+    const size_type bucket_count_minus_one = tmp_pointer->num_buckets_ - 1;  // used as a bit mask
+    size_type bucknum = hash(key) & bucket_count_minus_one;
+    size_type insert_pos = ILLEGAL_BUCKET;
+    while(1){
+      if(test_empty(bucknum,tmp_pointer)){  // an empty bucket ends the probe: key is absent
+        if (insert_pos == ILLEGAL_BUCKET)
+          return std::pair<size_type, size_type>(ILLEGAL_BUCKET, bucknum);
+        else
+          return std::pair<size_type, size_type>(ILLEGAL_BUCKET, insert_pos);
+      }else if(test_deleted(bucknum, tmp_pointer)){  // remember the first deleted bucket seen
+        if (insert_pos == ILLEGAL_BUCKET) insert_pos = bucknum;
+      }else if(equals(key, get_key(tmp_pointer->table_[bucknum]))) {
+        return std::pair<size_type, size_type>(bucknum, ILLEGAL_BUCKET);
+      }
+      ++num_probes;  // we're doing another probe
+      bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one;
+      if(num_probes == tmp_pointer->num_buckets_)  // gave up after num_buckets_ probes
+        return std::pair<size_type, size_type>(ILLEGAL_BUCKET, bucknum);  // NOTE(review): bucknum was not tested for emptiness and insert_pos is ignored here -- confirm
+    }
+  }
+  template <typename K>
+  std::pair<size_type, size_type> find_position(const K& key, void* tmp){  // probes the table passed in `tmp`
+    size_type num_probes = 0;
+    TableInternalParameter* tmp_pointer = static_cast<TableInternalParameter*>(tmp);  // tmp must really be a TableInternalParameter*
+    const size_type bucket_count_minus_one = tmp_pointer->num_buckets_ - 1;  // used as a bit mask
+    size_type bucknum = hash(key) & bucket_count_minus_one;
+    size_type insert_pos = ILLEGAL_BUCKET;  // where we would insert
+    while(1){
+      if(test_empty(bucknum,tmp_pointer)){
+        if (insert_pos == ILLEGAL_BUCKET)   // found no prior place to insert
+          return std::pair<size_type, size_type>(ILLEGAL_BUCKET, bucknum);
+        else
+          return std::pair<size_type, size_type>(ILLEGAL_BUCKET, insert_pos);
+      }else if(test_deleted(bucknum, tmp_pointer)){  // remember the first deleted bucket seen
+        if (insert_pos == ILLEGAL_BUCKET) insert_pos = bucknum;
+      }else if(equals(key, get_key(tmp_pointer->table_[bucknum]))) {
+        return std::pair<size_type, size_type>(bucknum, ILLEGAL_BUCKET);
+      }
+      ++num_probes;  // we're doing another probe
+      bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one;
+      if(num_probes == tmp_pointer->num_buckets_)  // gave up after num_buckets_ probes
+        return std::pair<size_type, size_type>(ILLEGAL_BUCKET, bucknum);
+    }
+  }
+
+  template <typename K>
+  std::pair<size_type, size_type> find_position(const K& key) const {
+    size_type num_probes = 0;  // how many times we've probed
+    TableInternalParameter* tmp_pointer = pnew;  // pnew is read once; the whole probe uses this pointer
+    const size_type bucket_count_minus_one = tmp_pointer->num_buckets_ - 1;
+    size_type bucknum = hash(key) & bucket_count_minus_one;
+    size_type insert_pos = ILLEGAL_BUCKET;  // where we would insert
+    while (1) {                             // probe until something happens     
+      if (test_empty(bucknum, tmp_pointer)) {            // bucket is empty
+        // An empty bucket ends the probe: the key is not present.
+        if (insert_pos == ILLEGAL_BUCKET)   // found no prior place to insert
+          return std::pair<size_type, size_type>(ILLEGAL_BUCKET, bucknum);
+        else
+          return std::pair<size_type, size_type>(ILLEGAL_BUCKET, insert_pos);
+      } else if (test_deleted(bucknum, tmp_pointer)) {  // keep searching, but mark to insert
+        if (insert_pos == ILLEGAL_BUCKET) insert_pos = bucknum;
+      } else if (equals(key, get_key(tmp_pointer->table_[bucknum]))) {
+        return std::pair<size_type, size_type>(bucknum, ILLEGAL_BUCKET);
+      }
+      ++num_probes;  // we're doing another probe
+      bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one;
+      assert(num_probes < bucket_count() &&
+             "Hashtable is full: an error in key_equal<> or hash<>");
+    }
+  }
+
+
+ public:
+
+
+
+long long int size_lockless() const {  // sum of the num_counters entries of num_elements_array
+  long long int sum1 = 0;
+  for(size_type i = 0; i < num_counters; i++)
+    __sync_add_and_fetch(&sum1, num_elements_array[i]);  // NOTE(review): sum1 is a local, so the atomic add is not needed for the sum itself
+  return sum1;
+}
+
+template <typename T,typename K>
+  std::pair<K, T> find_wait_free(K& key){  // returns (key, value) on a hit, (empty_key, empty_value) on a miss
+    size_type num_probes = 0;
+    TableInternalParameter* tmp_pointer = pnew;  // pnew is read once; the whole probe uses this pointer
+    const size_type bucket_count_minus_one = tmp_pointer->num_buckets_ - 1;
+    size_type bucknum = hash(key) & bucket_count_minus_one;
+    size_type insert_pos = ILLEGAL_BUCKET;  // where we would insert (tracked but never returned here)
+    while(1){
+      if (test_empty(bucknum,tmp_pointer)) {            // bucket is empty
+        return std::pair<K, T>(tmp_pointer->key_info_.empty_key, empty_value);
+      }else if(test_deleted(bucknum, tmp_pointer)) {
+        if(insert_pos == ILLEGAL_BUCKET) insert_pos = bucknum;
+      }else if (equals(key, get_key(tmp_pointer->table_[bucknum]))) {
+        std::pair<K, T> tmp(key, tmp_pointer->table_[bucknum].second);
+        if(tmp.first == key){  // NOTE(review): tmp.first was just copied from key; if a re-check of the bucket's key was intended, this does not do it
+          return tmp;
+        }else{
+          return std::pair<K,T>(tmp_pointer->key_info_.empty_key, empty_value);
+        }
+      }
+      ++num_probes;
+      bucknum = (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one;
+      if(num_probes == tmp_pointer->num_buckets_)  // gave up after num_buckets_ probes: report a miss
+        return std::pair<K, T>(tmp_pointer->key_info_.empty_key, empty_value);
+
+    } 
+  }
+
+
+
+  template <typename K>
+  iterator find(const K& key) {  // iterator to the entry holding key, or end()
+    std::pair<size_type, size_type> pos = find_position(key);
+    if (pos.first == ILLEGAL_BUCKET)  // alas, not there
+      return end();
+    else
+      return iterator(this, pnew->table_ + pos.first, pnew->table_ + pnew->num_buckets_, false);  // pnew is read again here, after the lookup
+  }
+
+  template <typename K>
+  const_iterator find(const K& key) const {  // const flavour: const_iterator to the entry, or end()
+    if (size() == 0) return end();  // skip the probe entirely when size() reports no entries
+    std::pair<size_type, size_type> pos = find_position(key);
+    if (pos.first == ILLEGAL_BUCKET)  // alas, not there
+      return end();
+    else
+      return const_iterator(this, pnew->table_ + pos.first, pnew->table_ + pnew->num_buckets_,
+                            false);
+  }
+
+  // tr1 method: the bucket that holds `key`, or, when the key is absent,
+  // the bucket find_position() reports as its insertion slot.
+  size_type bucket(const key_type& key) const {
+    const std::pair<size_type, size_type> where = find_position(key);
+    return where.first != ILLEGAL_BUCKET ? where.first : where.second;
+  }
+
+  // Number of stored elements whose key equals `key`: either 0 or 1.
+  template <typename K>
+  size_type count(const K& key) const {
+    const size_type found_at = find_position(key).first;
+    return found_at != ILLEGAL_BUCKET ? 1 : 0;
+  }
+
+  // equal_range for a unique-key table: (end, end) when the key is absent,
+  // otherwise the range from the match to the iterator after it.
+  template <typename K>
+  std::pair<iterator, iterator> equal_range(const K& key) {
+    iterator last = find(key);  // the match, or end()
+    if (last == end()) {
+      return std::pair<iterator, iterator>(last, last);
+    }
+    const iterator first = last++;  // `last` now sits one step past the match
+    return std::pair<iterator, iterator>(first, last);
+  }
+  template <typename K>
+  std::pair<const_iterator, const_iterator> equal_range(
+      const K& key) const {
+    // const flavour: (end, end) when absent, else [match, match + 1).
+    const_iterator last = find(key);  // the match, or end()
+    if (last == end()) {
+      return std::pair<const_iterator, const_iterator>(last, last);
+    }
+    const const_iterator first = last++;  // `last` now sits one step past the match
+    return std::pair<const_iterator, const_iterator>(first, last);
+  }
+
+  // INSERTION ROUTINES
+ private:
+  // Private method used by insert_noresize and find_or_insert.
+  template <typename... Args>
+  iterator insert_at(size_type pos, Args&&... args) {
+    if (size() >= max_size()) {
+      throw std::length_error("insert overflow");
+    }
+    if (test_deleted(pos)) {  // just replace if it's been del.
+      // shrug: shouldn't need to be const.
+      const_iterator delpos(this, pnew->table_ + pos, pnew->table_ + pnew->num_buckets_, false);
+      clear_deleted(delpos);
+      assert(pnew->num_deleted_ > 0);
+      --pnew->num_deleted_;  // used to be, now it isn't
+    } else {
+      ++pnew->num_elements_;  // a previously unused bucket becomes occupied
+    }
+    set_value(&(pnew->table_[pos]), std::forward<Args>(args)...);
+    return iterator(this, pnew->table_ + pos, pnew->table_ + pnew->num_buckets_, false);
+    // The returned iterator points at bucket pos of pnew's table.
+  }
+
+
+ template <typename... Args>
+  iterator insert_at_lockless(size_type pos, void* tmp, Args&&... args) {
+    // Stores args... into bucket `pos` of the table snapshot `tmp` (a
+    // TableInternalParameter*).  Returns an iterator to that bucket when
+    // set_value_lockless() reports success, otherwise the snapshot's end.
+    TableInternalParameter* tmp_pointer =  static_cast<TableInternalParameter*>(tmp);
+    // `pos` was computed against this snapshot, so the deleted check must
+    // look at the same table rather than whatever pnew points to now.
+    if (test_deleted(pos, tmp_pointer)) {
+      const_iterator delpos(this, tmp_pointer->table_ + pos, tmp_pointer->table_ + tmp_pointer->num_buckets_, false);
+      clear_deleted(delpos);
+    }
+    bool flag = set_value_lockless(&tmp_pointer->table_[pos], std::forward<Args>(args)...);
+    if (flag){
+      return iterator(this, tmp_pointer->table_ + pos, tmp_pointer->table_ + tmp_pointer->num_buckets_, false);
+    } else{
+      return iterator(this, tmp_pointer->table_ + tmp_pointer->num_buckets_, tmp_pointer->table_ + tmp_pointer->num_buckets_, false);
+    }
+  }
+
+  // Single lock-free insertion attempt; insert_lockless() calls it in a loop.
+  // Returns {iterator, true} when key is already present or was stored by this call;
+  // a false second member makes insert_lockless() try again.
+  // NOTE(review): unlike insert_noresize(), an already-present key is reported as true.
+  template <typename K, typename... Args>
+  std::pair<iterator, bool> insert_noresize_lockless(K&& key, Args&&... args) {
+    assert(pnew->settings_.use_empty() && "Inserting without empty key");
+    assert(!equals(std::forward<K>(key), pnew->key_info_.empty_key) && "Inserting the empty key");
+    assert((!pnew->settings_.use_deleted() || !equals(key, pnew->key_info_.delkey)) && "Inserting the deleted key");  
+    TableInternalParameter* tmp_pointer = pnew;
+    size_type tmp_old_bucket_count = tmp_pointer->num_buckets_;
+    // Spin until no resize is in progress (resizeflag is cleared).
+    while (resizeflag) { }
+    std::pair<size_type, size_type> pos = find_position_for_insert(key, tmp_pointer);
+    long size_sum = 0;
+    // Sum the per-slot element counters to get the current element count.
+    for(int i = 0; i < num_counters; i++) {
+      size_sum += num_elements_array[i];
+    }
+    // More than half full: discard the probe result so the resize branch below runs.
+    if (size_sum > bucket_count() / 2){
+      pos.first = ILLEGAL_BUCKET;
+      pos.second = ILLEGAL_BUCKET;
+    }
+    if (pos.first != ILLEGAL_BUCKET){
+      return std::pair<iterator, bool>(
+        iterator(this, tmp_pointer->table_ + pos.first, tmp_pointer->table_ + tmp_pointer->num_buckets_, false),
+        true);
+    } else {
+      if (pos.second != ILLEGAL_BUCKET) {
+        auto it =  std::pair<iterator, bool>(insert_at_lockless(pos.second, tmp_pointer, std::forward<Args>(args)...), true);
+        // Wait out any resize, then look the key up again in the table that is current now.
+        while (resizeflag) {}
+        auto pos = find_position(key, pnew);
+        if (pos.first == ILLEGAL_BUCKET)
+          return std::pair<iterator, bool>(end(), false);
+        else{
+          if (it.first.pos == it.first.end){
+            it.second = false;
+            return it;
+          } else{
+            __sync_add_and_fetch(&num_elements_array[thread_flag], 1);
+            return it;
+          }
+        }
+      } else {
+        // No usable bucket: rebuild into a table twice as large while holding table_mutex.
+        std::lock_guard<std::mutex> mlock(table_mutex);
+        // Skip the rebuild when the bucket count changed since this call started.
+        if (tmp_old_bucket_count == bucket_count()) { 
+          /* The hash table has not been rebucketed by another thread yet. */
+          __sync_bool_compare_and_swap(&resizeflag, false, true);
+          for (unsigned long i = 0; i < pold->num_buckets_; ++i) {
+            pold->table_[i].~value_type();
+          }
+          pointer temp = pold->val_info_.allocate(bucket_count()*2);
+          pold->val_info_.deallocate(pold->table_, old_bucket_count());
+          pold->table_ = temp;
+          assert(pold->table_);
+          // Initialise the replacement table through fill_range_with_empty().
+          fill_range_with_empty(pold->table_, bucket_count()*2, empty_value);
+
+          pold->num_elements_ = 0;
+          pold->num_deleted_ = 0;
+          pold->num_buckets_ = bucket_count() * 2;  // our new size
+          pold->settings_.reset_thresholds(bucket_count()*2);
+          size_type bucknum;
+          // Probe mask for the replacement table -- assumes old_bucket_count() reads pold->num_buckets_; TODO confirm.
+          const size_type bucket_count_minus_one = old_bucket_count() - 1;
+          // Copy every entry that is neither empty nor deleted into the replacement table.
+          for (unsigned long i = 0; i < bucket_count(); i++) {
+            if (!test_empty(i,tmp_pointer) && !test_deleted(i,tmp_pointer)) {
+              size_type num_probes = 0;  // how many times we've probed
+              for (bucknum = hash(get_key(pnew->table_[i])) & bucket_count_minus_one;
+                  !test_old_empty(bucknum);  // not empty
+                  bucknum =
+                      (bucknum + JUMP_(key, num_probes)) & bucket_count_minus_one) {
+                  ++num_probes;
+              }
+              using will_move = std::is_rvalue_reference<dense_hashtable_lockless &&>;
+              using value_t = typename std::conditional<will_move::value, value_type&&, const_reference>::type;
+              set_value(&(pold->table_[bucknum]), std::forward<value_t>(pnew->table_[i]));
+              pold->num_elements_++;
+            }
+          }
+          // Exchange the roles of pnew and pold, then clear resizeflag to release spinning threads.
+          std::atomic<TableInternalParameter*> tmp_atomic(0);
+          tmp_atomic.store(pold);
+          TableInternalParameter* tmp = pnew;
+          pnew = tmp_atomic.load();
+          pold = tmp;
+          __sync_bool_compare_and_swap(&resizeflag, true, false);
+        }
+        return std::pair<iterator, bool>(end(), false);
+      }
+    }
+  }
+
+
+  // Insert routine for callers that already know *this has room for one more element.
+  // Returns {iterator to the existing element, false} when key is present, otherwise
+  // {iterator to the newly stored element, true}.
+  template <typename K, typename... Args>
+  std::pair<iterator, bool> insert_noresize(K&& key, Args&&... args) {
+    // The reserved empty and deleted keys must never be inserted.
+    assert(pnew->settings_.use_empty() && "Inserting without empty key");
+    assert(!equals(std::forward<K>(key), pnew->key_info_.empty_key) && "Inserting the empty key");
+    assert((!pnew->settings_.use_deleted() || !equals(key, pnew->key_info_.delkey)) && "Inserting the deleted key");
+
+    const std::pair<size_type, size_type> probe = find_position(key);
+    if (probe.first == ILLEGAL_BUCKET) {
+      // Not present: probe.second names the bucket to fill.
+      return std::pair<iterator, bool>(insert_at(probe.second, std::forward<Args>(args)...), true);
+    }
+    // Already present: hand back the existing element, nothing was inserted.
+    iterator existing(this, pnew->table_ + probe.first, pnew->table_ + pnew->num_buckets_, false);
+    return std::pair<iterator, bool>(existing, false);
+  }
+
+  // Range-insert overloads, selected by the power of the iterator:
+  // (1) Forward iterators: the length is known, so make room once before inserting.
+  template <class ForwardIterator>
+  void insert(ForwardIterator f, ForwardIterator l, std::forward_iterator_tag) {
+    const size_t count = std::distance(f, l);
+    if (count >= (std::numeric_limits<size_type>::max)()) {
+      throw std::length_error("insert-range overflow");
+    }
+    resize_delta(static_cast<size_type>(count));
+    for (size_t remaining = count; remaining > 0; --remaining, ++f) {
+      insert_noresize(get_key(*f), *f);
+    }
+  }
+
+  // (2) Any other iterator: the length is unknown, so go through the resizing insert per element.
+  template <class InputIterator>
+  void insert(InputIterator f, InputIterator l, std::input_iterator_tag) {
+    while (f != l) { insert(*f); ++f; }
+  }
+
+ public:
+  // This is the normal insert routine, used by the outside world
+  template <typename Arg>
+  std::pair<iterator, bool> insert(Arg&& obj) {
+    resize_delta(1);  // adding an object, grow if need be; whether it resized is not needed here
+    return insert_noresize(get_key(std::forward<Arg>(obj)), std::forward<Arg>(obj));
+  }
+
+  // Lock-free insert. If the calling thread has no counter slot yet it claims one (thread_flag),
+  // then insert_noresize_lockless() is retried until it reports the element as stored or present.
+  template <typename Arg>
+  std::pair<iterator, bool> insert_lockless(Arg&& obj) {
+    while (thread_flag == -1) {  // presumably -1 means "no slot claimed yet" -- declaration not visible here
+      long min_counter = counter_flag[0];
+      int min_index = 0;
+      for (size_type i = 0; i < num_counters; i++) {  // pick a slot with the smallest claim count
+        if (counter_flag[i] <= min_counter) {
+          min_counter = counter_flag[i];
+          min_index = i;
+        }
+      }
+      bool flag = __sync_bool_compare_and_swap(&counter_flag[min_index], min_counter, min_counter+1);
+      if (flag) {
+        thread_flag = min_index;
+        break;
+      }
+    }
+    do {
+      auto it = insert_noresize_lockless(get_key(std::forward<Arg>(obj)), std::forward<Arg>(obj));
+      if (it.first != end() && it.second == true && get_key(std::forward<Arg>(*it.first)) != pnew->key_info_.empty_key){
+        return it;
+      }
+    } while (true);  // NOTE(review): obj is forwarded anew on each retry -- confirm a failed attempt never moves from it
+  }
+
+  template <typename K, typename... Args>
+  std::pair<iterator, bool> emplace(K&& key, Args&&... args) {
+    resize_delta(1);  // make room first: insert_noresize() expects the table to be big enough
+    // key is passed twice: once for the indexing, and once (followed by args) for the emplace itself
+    return insert_noresize(std::forward<K>(key), std::forward<K>(key), std::forward<Args>(args)...);
+  }
+
+  /* Overload for maps: Here, K != V, and we need to pass hint->first to the equal() function. */
+  template <typename K, typename... Args, typename KeyCopy = Key>
+  typename std::enable_if<!std::is_same<KeyCopy, Value>::value,
+                          std::pair<iterator, bool>>::type
+  emplace_hint(const_iterator hint, K&& key, Args&&... args) {
+    // Test the hint before resize_delta(): a rehash there would invalidate hint.
+    if ((hint != this->end()) && (equals(key, hint->first))) {
+        return {iterator(this, const_cast<pointer>(hint.pos), const_cast<pointer>(hint.end), false), false};
+    }
+
+    resize_delta(1);
+    // here we push key twice as we need it once for the indexing, and the rest of the params are for the emplace itself
+    return insert_noresize(std::forward<K>(key), std::forward<K>(key), std::forward<Args>(args)...);
+  }
+
+  /* Overload for sets: Here, K == V, and we need to pass *hint to the equal() function. */
+  template <typename K, typename... Args, typename KeyCopy = Key>
+  typename std::enable_if<std::is_same<KeyCopy, Value>::value,
+                          std::pair<iterator, bool>>::type
+  emplace_hint(const_iterator hint, K&& key, Args&&... args) {
+    // Test the hint before resize_delta(): a rehash there would invalidate hint.
+    if ((hint != this->end()) && (equals(key, *hint))) {
+      return {iterator(this, const_cast<pointer>(hint.pos), const_cast<pointer>(hint.end), false), false};
+    }
+
+    resize_delta(1);
+    // here we push key twice as we need it once for the indexing, and the rest of the params are for the emplace itself
+    return insert_noresize(std::forward<K>(key), std::forward<K>(key), std::forward<Args>(args)...);
+  }
+
+  // Range insert: picks the overload matching the iterator category, so forward
+  // iterators resize once up front while other iterators insert element by element.
+  template <class InputIterator>
+  void insert(InputIterator f, InputIterator l) {
+    typedef typename std::iterator_traits<InputIterator>::iterator_category category;
+    insert(f, l, category());
+  }
+
+  // Returns a reference to the element stored under key. When the key is absent an
+  // element built from (key, T()) is inserted first; T() supplies the default value.
+  template <class T, class K>
+  value_type& find_or_insert(K&& key) {
+    // First, double-check we're not inserting emptykey or delkey
+    assert(
+        (!(pnew->settings_.use_empty()) || !equals(key, pnew->key_info_.empty_key)) &&
+        "Inserting the empty key");
+    assert((!(pnew->settings_.use_deleted()) || !equals(key, pnew->key_info_.delkey)) &&
+           "Inserting the deleted key");
+    const std::pair<size_type, size_type> pos = find_position(key);
+    if (pos.first != ILLEGAL_BUCKET) {  // object was already there
+      return pnew->table_[pos.first];
+    } else if (resize_delta(1)) {  // needed to rehash to make room
+      // Since we resized, we can't use pos, so recalculate where to insert.
+      return *insert_noresize(std::forward<K>(key), std::forward<K>(key), T()).first;
+    } else {  // no need to rehash, insert right here
+      return *insert_at(pos.second, std::forward<K>(key), T());
+    }
+  }
+  // Overload for std::pair buckets: atomically swaps key for delkey in ptr->first, then tries
+  // to swap ptr->second to empty_value. Returns false when ptr->first no longer equals key.
+  template<typename K, typename V>
+  bool delete_bucket_lockless(std::pair<K, V>* ptr, const key_type& key, void* tmp_pointer) {
+    using NCKey = typename std::remove_cv<Key>::type;
+    TableInternalParameter* params = static_cast<TableInternalParameter*>(tmp_pointer);
+    NCKey* key_slot = const_cast<NCKey*>(&(ptr->first));
+    if (!__sync_bool_compare_and_swap(key_slot, key, params->key_info_.delkey)) {
+      return false;
+    }
+    __sync_bool_compare_and_swap(&ptr->second, ptr->second, empty_value);
+    return true;
+  }
+
+  // Overload for buckets that are a bare key: atomically swaps key for delkey; true if this call did the swap.
+  template<typename K>
+  bool delete_bucket_lockless(K* ptr, const key_type& key, void* tmp_pointer) {
+    using NCKey = typename std::remove_cv<Key>::type;
+    TableInternalParameter* params = static_cast<TableInternalParameter*>(tmp_pointer);
+    NCKey* key_slot = const_cast<NCKey*>(ptr);
+    return __sync_bool_compare_and_swap(key_slot, key, params->key_info_.delkey);
+  }
+
+  // Lock-free erase of key. Returns 1 when the key was seen in the table and is absent on
+  // return, 0 when it was never found. Only the thread whose compare-and-swap replaced the
+  // key with delkey decrements its slot of num_elements_array.
+  size_type erase_lockless(const key_type& key){
+    assert(
+        (!pnew->settings_.use_empty() || !equals(key, pnew->key_info_.empty_key)) &&
+        "Erasing the empty key");
+    assert((!pnew->settings_.use_deleted() || !equals(key, pnew->key_info_.delkey)) &&
+          "Erasing the deleted key");
+    bool find_flag = false;  // key was seen at least once
+    bool flag = false;       // outcome of this thread's latest delete attempt
+    // Claim a counter slot for this thread if it has none yet (same scheme as insert_lockless).
+    while (thread_flag == -1) {
+      long min_counter = counter_flag[0];
+      int min_index = 0;
+      for (size_type i = 0; i < num_counters; i++) {
+        if (counter_flag[i] <= min_counter) {
+          min_counter = counter_flag[i];
+          min_index = i;
+        }
+      }
+      // Local to the slot-claim loop; deliberately not named like the outer flag.
+      const bool claimed = __sync_bool_compare_and_swap(&counter_flag[min_index], min_counter, min_counter+1);
+      if (claimed) {
+        thread_flag = min_index;
+        break;
+      }
+    }
+    // Wait for any resize in progress to finish.
+    while(resizeflag){}
+    while(1){
+      TableInternalParameter* tmp_pointer = pnew;
+      std::pair<size_type, size_type> pos = find_position(key,tmp_pointer);
+      if(pos.first == ILLEGAL_BUCKET){
+        // Key is absent. If it was seen earlier, it has been deleted by us or by another thread.
+        if(!find_flag)
+          return 0;
+        if(flag)
+          __sync_sub_and_fetch(&num_elements_array[thread_flag], 1);
+        return 1;
+      }
+      // Key is present: try to overwrite it with the deleted-key marker.
+      find_flag = true;
+      flag = delete_bucket_lockless(&tmp_pointer->table_[pos.first], key, tmp_pointer);
+      if(!flag || resizeflag) {
+        continue;  // lost the race or a resize began: re-examine the current table
+      }
+      // Confirm the key did not survive in the table that is current now.
+      pos = find_position(key, pnew);
+      if (pos.first != ILLEGAL_BUCKET) {
+        continue;
+      }
+      __sync_sub_and_fetch(&num_elements_array[thread_flag], 1);
+      return 1;
+    }
+  }
+
+// Copies the current table into a malloc'ed buffer under table_mutex; returns {buffer, bucket count}, or {NULL, 0} if malloc fails.
+std::pair<pointer, size_type> GetSnapShot(){
+  std::lock_guard<std::mutex> snapshot_lock(table_mutex);  // released on every exit path
+  const size_type snapshot_size = bucket_count();  // read once so the returned size matches the buffer
+  table_for_dump = (pointer)malloc(sizeof(value_type)*snapshot_size);  // NOTE(review): never freed here -- presumably the caller owns it
+  if (table_for_dump == NULL) return std::pair<pointer, size_type>(table_for_dump, 0);
+  using NCKey = typename std::remove_cv<Key>::type;
+  for(size_type i = 0; i < snapshot_size; i++){
+        *const_cast<NCKey*>(&table_for_dump[i].first) = pnew->table_[i].first;
+        table_for_dump[i].second = pnew->table_[i].second;
+  }
+  return std::pair<pointer, size_type>(table_for_dump, snapshot_size);
+}
+
+  // DELETION ROUTINES
+  size_type erase(const key_type& key) {
+    // Erases key if present; returns the number removed (0 or 1). First check it is not delkey or emptyval.
+    assert(
+        (!settings.use_empty() || !equals(key, key_info.empty_key)) &&
+        "Erasing the empty key");
+    assert((!settings.use_deleted() || !equals(key, key_info.delkey)) &&
+           "Erasing the deleted key");
+    const_iterator pos = find(key);  // shrug: shouldn't need to be const
+    if (pos != end()) {
+      assert(!test_deleted(pos));  // or find() shouldn't have returned it
+      set_deleted(pos);
+      ++num_deleted;  // NOTE(review): insert_at() decrements pnew->num_deleted_, not this member -- confirm which counter is live
+      settings.set_consider_shrink(
+          true);  // will think about shrink after next insert
+      return 1;   // because we deleted one thing
+    } else {
+      return 0;  // because we deleted nothing
+    }
+  }
+
+  // Marks the element at pos deleted. We return the iterator past the deleted item.
+  iterator erase(const_iterator pos) {
+    if (pos == end()) return end();  // sanity check
+    if (set_deleted(pos)) {    // true if object has been newly deleted
+      ++num_deleted;  // NOTE(review): top-level counter; insert_at() works on pnew->num_deleted_ -- confirm
+      settings.set_consider_shrink(
+          true);  // will think about shrink after next insert
+    }
+    return iterator(this, const_cast<pointer>(pos.pos), const_cast<pointer>(pos.end), true);  // last argument true: presumably "advance past deleted buckets" -- confirm
+  }
+
+  // Marks every element in [f, l) deleted and returns an iterator positioned at l.
+  iterator erase(const_iterator f, const_iterator l) {
+    while (f != l) {
+      if (set_deleted(f)) ++num_deleted;  // should always be true
+      ++f;
+    }
+    settings.set_consider_shrink(true);  // will think about shrink after next insert
+    return iterator(this, const_cast<pointer>(f.pos), const_cast<pointer>(f.end), false);
+  }
+
+  // COMPARISON
+  // Two tables are equal when they hold the same number of elements and every
+  // element of *this has an equal counterpart, looked up by key, in ht.
+  bool operator==(const dense_hashtable_lockless& ht) const {
+    if (size() != ht.size()) return false;
+    if (this == &ht) return true;
+    for (const_iterator mine = begin(); mine != end(); ++mine) {
+      const_iterator theirs = ht.find(get_key(*mine));
+      const bool missing = (theirs == ht.end());
+      if (missing || (*mine != *theirs)) {
+        return false;
+      }
+    }
+    return true;
+  }
+  // Negation of operator==.
+  bool operator!=(const dense_hashtable_lockless& ht) const {
+    return !(*this == ht);
+  }
+
+  // I/O
+  // We support reading and writing hashtables to disk.  Alas, since
+  // I don't know how to write a hasher or key_equal, you have to make
+  // sure everything but the table is the same.  We compact before writing.
+ private:
+  // Every time the disk format changes, this should probably change too
+  typedef unsigned long MagicNumberType;
+  static const MagicNumberType MAGIC_NUMBER = 0x13578642;
+
+ public:
+  // I/O -- this is an add-on for writing hash table to disk
+  //
+  // INPUT and OUTPUT must be either a FILE, *or* a C++ stream
+  //    (istream, ostream, etc) *or* a class providing
+  //    Read(void*, size_t) and Write(const void*, size_t)
+  //    (respectively), which writes a buffer into a stream
+  //    (which the INPUT/OUTPUT instance presumably owns).
+
+  typedef sparsehash_internal::pod_serializer<value_type> NopointerSerializer;
+
+  // Writes magic number, bucket count, element count, then bitmap + values per 8 buckets. ValueSerializer: a functor.  operator()(OUTPUT*, const value_type&)
+  template <typename ValueSerializer, typename OUTPUT>
+  bool serialize(ValueSerializer serializer, OUTPUT* fp) {
+    squash_deleted();  // so we don't have to worry about delkey
+    if (!sparsehash_internal::write_bigendian_number(fp, MAGIC_NUMBER, 4))
+      return false;
+    if (!sparsehash_internal::write_bigendian_number(fp, num_buckets, 8))
+      return false;  // NOTE(review): num_buckets/num_elements/table are the top-level members; the insert path updates pnew->... -- confirm they stay in sync
+    if (!sparsehash_internal::write_bigendian_number(fp, num_elements, 8))
+      return false;
+    // Now write a bitmap of non-empty buckets.
+    for (size_type i = 0; i < num_buckets; i += 8) {
+      unsigned char bits = 0;
+      for (int bit = 0; bit < 8; ++bit) {
+        if (i + bit < num_buckets && !test_empty(i + bit)) bits |= (1 << bit);
+      }
+      if (!sparsehash_internal::write_data(fp, &bits, sizeof(bits)))
+        return false;
+      for (int bit = 0; bit < 8; ++bit) {  // then the values of the buckets flagged in this byte
+        if (bits & (1 << bit)) {
+          if (!serializer(fp, table[i + bit])) return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  // Rebuilds the table from data in the layout serialize() writes; false on a read error or magic mismatch.
+  // INPUT: anything we've written an overload of read_data() for.
+  // ValueSerializer: a functor.  operator()(INPUT*, value_type*)
+  template <typename ValueSerializer, typename INPUT>
+  bool unserialize(ValueSerializer serializer, INPUT* fp) {
+    assert(settings.use_empty() && "empty_key not set for read");
+    clear();  // just to be consistent
+    MagicNumberType magic_read;
+    if (!sparsehash_internal::read_bigendian_number(fp, &magic_read, 4))
+      return false;
+    if (magic_read != MAGIC_NUMBER) {
+      return false;
+    }
+    size_type new_num_buckets;
+    if (!sparsehash_internal::read_bigendian_number(fp, &new_num_buckets, 8))
+      return false;
+    clear_to_size(new_num_buckets);
+    if (!sparsehash_internal::read_bigendian_number(fp, &num_elements, 8))
+      return false;
+    // NOTE(review): the loop uses the top-level num_buckets and table, not pnew->... -- confirm clear_to_size() updates them.
+    // Read the bitmap of non-empty buckets.
+    for (size_type i = 0; i < num_buckets; i += 8) {
+      unsigned char bits;
+      if (!sparsehash_internal::read_data(fp, &bits, sizeof(bits)))
+        return false;
+      for (int bit = 0; bit < 8; ++bit) {
+        if (i + bit < num_buckets && (bits & (1 << bit))) {  // not empty
+          if (!serializer(fp, &table[i + bit])) return false;
+        }
+      }
+    }
+    return true;
+  }
+
+ private:
+  // Wraps allocator A and adds realloc_or_die(); this generic version cannot reallocate and exits if it is called.
+  template <class A>
+  class alloc_impl : public A {
+   public:
+    typedef typename A::pointer pointer;
+    typedef typename A::size_type size_type;
+    // Convert a normal allocator to one that has realloc_or_die()
+    alloc_impl(const A& a) : A(a) {}
+
+    // realloc_or_die should only be used when using the default
+    // allocator (libc_allocator_with_realloc).
+    pointer realloc_or_die(pointer /*ptr*/, size_type /*n*/) {
+      fprintf(stderr,
+              "realloc_or_die is only supported for "
+              "libc_allocator_with_realloc\n");
+      exit(1);
+      return NULL;  // not reached: exit() does not return
+    }
+  };
+
+  // A template specialization of alloc_impl for
+  // libc_allocator_with_realloc that can handle realloc_or_die.
+  template <class A>
+  class alloc_impl<libc_allocator_with_realloc<A>>
+      : public libc_allocator_with_realloc<A> {
+   public:
+    typedef typename libc_allocator_with_realloc<A>::pointer pointer;
+    typedef typename libc_allocator_with_realloc<A>::size_type size_type;
+
+    alloc_impl(const libc_allocator_with_realloc<A>& a)
+        : libc_allocator_with_realloc<A>(a) {}
+    // Reallocates ptr to hold n elements; on failure prints a diagnostic line and exits.
+    pointer realloc_or_die(pointer ptr, size_type n) {
+      pointer retval = this->reallocate(ptr, n);
+      if (retval == NULL) {
+        fprintf(stderr,
+                "sparsehash: FATAL ERROR: failed to reallocate "
+                "%lu elements for ptr %p\n",
+                static_cast<unsigned long>(n), static_cast<void*>(ptr));
+        exit(1);
+      }
+      return retval;
+    }
+  };
+
+  // Thin wrapper around the allocator. It declares no data members of its own here;
+  // the empty value is kept in the hashtable's separate empty_value member.
+  // If new fields are added to this class, we should add them to
+  // operator= and swap.
+  class ValInfo : public alloc_impl<value_alloc_type> {
+   public:
+    typedef typename alloc_impl<value_alloc_type>::value_type value_type;
+
+    ValInfo(const alloc_impl<value_alloc_type>& a)
+        : alloc_impl<value_alloc_type>(a) {}
+  };
+
+  // Package functors with another class to eliminate memory needed for
+  // zero-size functors.  Since ExtractKey and hasher's operator() might
+  // have the same function signature, they must be packaged in
+  // different classes.  Settings carries the hasher and the two resize percentages.
+  struct Settings
+      : sparsehash_internal::sh_hashtable_settings<key_type, hasher, size_type,
+                                                   HT_MIN_BUCKETS> {
+    explicit Settings(const hasher& hf)
+        : sparsehash_internal::sh_hashtable_settings<key_type, hasher,
+                                                     size_type, HT_MIN_BUCKETS>(
+              hf, HT_OCCUPANCY_PCT / 100.0f, HT_EMPTY_PCT / 100.0f) {}  // percentages passed on as fractions
+  };
+
+  // Packages the ExtractKey, SetKey and EqualKey functors together with the two sentinel keys.
+  class KeyInfo : public ExtractKey, public SetKey, public EqualKey {
+   public:
+    KeyInfo(const ExtractKey& ek, const SetKey& sk, const EqualKey& eq)
+        : ExtractKey(ek), SetKey(sk), EqualKey(eq) {}  // delkey / empty_key are not explicitly initialised here
+
+    // We want to return the exact same type as ExtractKey: Key or const Key&
+    template <typename V>
+    typename ExtractKey::result_type get_key(V&& v) const {
+      return ExtractKey::operator()(std::forward<V>(v));
+    }
+    void set_key(pointer v, const key_type& k) const {
+      SetKey::operator()(v, k);
+    }
+    void construct_key(pointer v, const key_type& k) const {
+      SetKey::operator()(v, k, true);
+    }
+    template <typename K1, typename K2>
+    bool equals(const K1& a, const K2& b) const {
+      return EqualKey::operator()(a, b);
+    }
+
+    // The keys that mark deleted entries (delkey) and empty buckets (empty_key).
+    // TODO(csilvers): make a pointer, and get rid of use_deleted (benchmark!)
+    typename std::remove_const<key_type>::type delkey;
+    typename std::remove_const<key_type>::type empty_key;
+  };
+
+  // State of one bucket array: settings, key functors, counters, allocator and the array itself.
+  // The hashtable holds two instances (tp_old, tp_new) and reaches them through pold and pnew.
+  class TableInternalParameter {
+    public:
+      TableInternalParameter(size_type expected_max_items_in_table = 0, const HashFcn& hf = HashFcn(), const EqualKey& eql = EqualKey(),const ExtractKey& ext = ExtractKey(),
+                            const SetKey& set = SetKey(),const Alloc& alloc = Alloc())
+                            : settings_(hf), key_info_(ext,set,eql),num_deleted_(0), num_elements_(0),
+                            num_buckets_(expected_max_items_in_table == 0
+                            ? HT_DEFAULT_STARTING_BUCKETS
+                            : settings_.min_buckets(expected_max_items_in_table, 0)),
+                            val_info_(alloc_impl<value_alloc_type>(alloc)),
+                            table_(NULL) {
+      }
+      Settings settings_;
+      KeyInfo key_info_;
+      size_type num_deleted_;  // adjusted by insert_at() when a deleted bucket is reused
+      size_type num_elements_;
+      size_type num_buckets_;  // number of buckets in table_
+      ValInfo val_info_;  // allocator used for table_
+      pointer table_;  // bucket array; the constructor leaves it NULL
+  };
+
+  // Utility functions to access the templated operators. They use the top-level settings/key_info, not pnew->settings_/key_info_.
+  template <typename K>
+  size_type hash(const K& v) const { return settings.hash(v); }
+  template <typename K1, typename K2>
+  bool equals(const K1& a, const K2& b) const {
+    return key_info.equals(a, b);
+  }
+  template <typename V>
+  typename ExtractKey::result_type get_key(V&& v) const {
+    return key_info.get_key(std::forward<V>(v));
+  }
+  void set_key(pointer v, const key_type& k) const { key_info.set_key(v, k); }
+  void construct_key(pointer v, const key_type& k) const { key_info.construct_key(v, k); }
+
+ private:
+  // Actual data. NOTE(review): several fields below have twins inside TableInternalParameter -- confirm which copy is authoritative.
+  Settings settings;
+  KeyInfo key_info;
+
+  size_type num_deleted;  // how many occupied buckets are marked deleted
+  size_type num_elements;
+  size_type num_counters;  // loop bound used for num_elements_array and counter_flag
+  size_type *num_elements_array;  // element counters, indexed by thread_flag
+  size_type num_buckets;
+  ValInfo val_info;  // holds the allocator
+  pointer table;
+  pointer table_for_dump;  // buffer most recently allocated by GetSnapShot()
+  long insert_counter;
+  long *counter_flag;  // per-slot claim counts used when handing out thread_flag values
+
+  size_type thread_num;
+  Data empty_value;  // passed to fill_range_with_empty(); also written into erased pair buckets
+  volatile bool resizeflag;  // true while a rebuild runs. NOTE(review): volatile gives no inter-thread ordering guarantees
+  TableInternalParameter tp_old,tp_new;
+  TableInternalParameter *pold, *pnew;  // pnew: table in use; pold: spare filled by the next rebuild, then the two are swapped
+  std::mutex table_mutex;  // held during rebuilds and GetSnapShot()
+};
+
+// We need a global swap as well; it simply forwards to the member swap().
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A, class Data>
+inline void swap(dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>& x,
+                 dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>& y) {
+  x.swap(y);
+}
+
+#undef JUMP_
+
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A, class Data>
+const typename dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>::size_type
+    dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>::ILLEGAL_BUCKET;
+
+// How full we let the table get before we resize.  Knuth says .8 is
+// good -- higher causes us to probe too much, though saves memory.
+// However, we go with .5, getting better performance at the cost of
+// more space (a trade-off densehashtable explicitly chooses to make).
+// Feel free to play around with different values, though, via
+// max_load_factor() and/or set_resizing_parameters().
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A, class Data>
+const int dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>::HT_OCCUPANCY_PCT = 50;
+
+// How empty we let the table get before we resize lower.
+// It should be less than OCCUPANCY_PCT / 2 or we thrash resizing.
+template <class V, class K, class HF, class ExK, class SetK, class EqK, class A, class Data>
+const int dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>::HT_EMPTY_PCT =
+    static_cast<int>(
+        0.4 * dense_hashtable_lockless<V, K, HF, ExK, SetK, EqK, A, Data>::HT_OCCUPANCY_PCT);
+
+}  // namespace google
diff --git a/sparsehash/traits b/sparsehash/traits
index 65135a6..8e0a0d6 100644
--- a/sparsehash/traits
+++ b/sparsehash/traits
@@ -40,11 +40,19 @@ namespace google {
 // struct is_relocatable<MyType> : std::true_type {};
 // }
 
+#if (defined __GNUG__) && (__GNUC__ < 5) && !(defined __clang__)  // real GCC 4.x only: clang also reports __GNUC__ == 4
+template <class T>
+struct is_relocatable
+    : std::integral_constant<bool,
+                             (__has_trivial_copy(T) &&
+                              __has_trivial_destructor(T))> {};
+#else
 template <class T>
 struct is_relocatable
     : std::integral_constant<bool,
                              (std::is_trivially_copy_constructible<T>::value &&
                               std::is_trivially_destructible<T>::value)> {};
+#endif  // __GNUG__ && __GNUC__ < 5 && !__clang__
 template <class T, class U>
 struct is_relocatable<std::pair<T, U>>
     : std::integral_constant<bool, (is_relocatable<T>::value &&
@@ -52,4 +60,4 @@ struct is_relocatable<std::pair<T, U>>
 
 template <class T>
 struct is_relocatable<const T> : is_relocatable<T> {};
-}
\ No newline at end of file
+}
diff --git a/tests/bench_lockless.cc b/tests/bench_lockless.cc
new file mode 100644
index 0000000..ac94adf
--- /dev/null
+++ b/tests/bench_lockless.cc
@@ -0,0 +1,1466 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Authors: Sanjay Ghemawat and Craig Silverstein
+
+// Time various hash map implementations
+//
+// Below, times are per-call.  "Memory use" is "bytes in use by
+// application" as reported by tcmalloc, compared before and after the
+// function call.  This does not really report fragmentation, which is
+// not bad for the sparse* routines but bad for the dense* ones.
+//
+// The tests generally yield best-case performance because the
+// code uses sequential keys; on the other hand, "map_fetch_random" does
+// lookups in a pseudorandom order.  Also, "stresshashfunction" is
+// a stress test of sorts.  It uses keys from an arithmetic sequence, which,
+// if combined with a quick-and-dirty hash function, will yield worse
+// performance than the otherwise similar "map_predict/grow."
+//
+// Consider doing the following to get good numbers:
+//
+// 1. Run the tests on a machine with no X service. Make sure no other
+//    processes are running.
+// 2. Minimize compiled-code differences. Compare results from the same
+//    binary, if possible, instead of comparing results from two different
+//    binaries.
+//
+// See PERFORMANCE for the output of one example run.
+
+#include <cstdint>  // for uintptr_t
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <thread>
+
+extern "C" {
+#include <time.h>
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+#ifdef HAVE_SYS_UTSNAME_H
+#include <sys/utsname.h>
+#endif  // for uname()
+}
+
+// The functions that we call on each map, that differ for different types.
+// By default each is a noop, but we redefine them for types that need them.
+
+#include <map>
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <set>
+#include <chrono>
+#include <type_traits>
+#include <sparsehash/dense_hash_map>
+#include <sparsehash/sparse_hash_map>
+#include <sparsehash/dense_hash_map_lockless>
+#include "rwlock.h"
+
+using std::map;
+using std::unordered_map;
+using std::swap;
+using std::vector;
+using std::chrono::steady_clock;
+using std::chrono::duration_cast;
+using std::chrono::time_point;
+using std::chrono::nanoseconds;
+using google::dense_hash_map;
+using google::sparse_hash_map;
+using google::dense_hash_map_lockless;
+using std::hash;  // or __gnu_cxx::hash, or maybe tr1::hash, depending on your OS
+
+
+static bool FLAGS_test_sparse_hash_map = true;
+static bool FLAGS_test_dense_hash_map = true;
+static bool FLAGS_test_hash_map = true;
+static bool FLAGS_test_map = true;
+
+static bool FLAGS_test_4_bytes = true;
+static bool FLAGS_test_8_bytes = true;
+static bool FLAGS_test_16_bytes = true;
+static bool FLAGS_test_256_bytes = true;
+static bool FLAGS_test_parallel = true;
+
+static const int kDefaultIters = 10000000;
+easy_spinrwlock_t mu = EASY_SPINRWLOCK_INITIALIZER;
+easy_spinrwlock_t mu_list[1000];
+
+double time_for_insert_lockless = 0.0;
+double time_for_insert_noresize_lockless = 0.0;
+double time_for_insert_at_lockless = 0.0;
+double time_for_setvalue_lockless = 0.0;
+double time_for_rebucket_lockless = 0.0;
+//double time_for_insert_noresize[16] = {0.0};
+//double time_for_rebucket[16] = {0.0};
+
+
+struct eqstr
+{
+	bool operator()(long  s1, long s2) const
+	{
+		return (s1 == s2);
+	}
+};
+
+// A version of each of the hashtable classes we test, that has been
+// augmented to provide a common interface.  For instance, the
+// sparse_hash_map and dense_hash_map versions set empty-key and
+// deleted-key (we can do this because all our tests use int-like
+// keys), so the users don't have to.  The hash_map version adds
+// resize(), so users can just call resize() for all tests without
+// worrying about whether the map-type supports it or not.
+
+template <typename K, typename V, typename H>
+class EasyUseSparseHashMap : public sparse_hash_map<K, V, H> {
+ public:
+  EasyUseSparseHashMap() { this->set_deleted_key(-1); }
+};
+
+template <typename K, typename V, typename H>
+class EasyUseDenseHashMap : public dense_hash_map<K, V, H> {
+ public:
+  EasyUseDenseHashMap() {
+    this->set_empty_key(-1);
+    this->set_deleted_key(-2);
+  }
+};
+
+// For pointers, we only set the empty key.
+template <typename K, typename V, typename H>
+class EasyUseSparseHashMap<K*, V, H> : public sparse_hash_map<K*, V, H> {
+ public:
+  EasyUseSparseHashMap() {}
+};
+
+template <typename K, typename V, typename H>
+class EasyUseDenseHashMap<K*, V, H> : public dense_hash_map<K*, V, H> {
+ public:
+  EasyUseDenseHashMap() { this->set_empty_key((K*)(~0)); }
+};
+
+template <typename K, typename V, typename H>
+class EasyUseHashMap : public unordered_map<K, V, H> {
+ public:
+  // resize() is called rehash() in tr1
+  void resize(size_t r) { this->rehash(r); }
+};
+
+template <typename K, typename V>
+class EasyUseMap : public map<K, V> {
+ public:
+  void resize(size_t) {}  // map<> doesn't support resize
+};
+
+// Returns the number of hashes that have been done since the last
+// call to NumHashesSinceLastCall().  This is shared across all
+// HashObject instances, which isn't super-OO, but avoids two issues:
+// (1) making HashObject bigger than it ought to be (this is very
+// important for our testing), and (2) having to pass around
+// HashObject objects everywhere, which is annoying.
+static int g_num_hashes;
+static int g_num_copies;
+
+int NumHashesSinceLastCall() {
+  int retval = g_num_hashes;
+  g_num_hashes = 0;
+  return retval;
+}
+int NumCopiesSinceLastCall() {
+  int retval = g_num_copies;
+  g_num_copies = 0;
+  return retval;
+}
+
+/*
+ * These are the objects we hash.  Size is the size of the object
+ * (must be > sizeof(int)).  Hashsize is how many of these bytes we
+ * use when hashing (must be > sizeof(int) and < Size).
+ */
+template <int Size, int Hashsize>
+class HashObject {
+ public:
+  typedef HashObject<Size, Hashsize> class_type;
+  HashObject() {}
+  HashObject(int i) : i_(i) {
+    memset(buffer_, i & 255, sizeof(buffer_));  // a "random" char
+  }
+  HashObject(const HashObject& that) { operator=(that); }
+  void operator=(const HashObject& that) {
+    g_num_copies++;
+    this->i_ = that.i_;
+    memcpy(this->buffer_, that.buffer_, sizeof(this->buffer_));
+  }
+
+  size_t Hash() const {
+    g_num_hashes++;
+    int hashval = i_;
+    for (size_t i = 0; i < Hashsize - sizeof(i_); ++i) {
+      hashval += buffer_[i];
+    }
+    return std::hash<int>()(hashval);
+  }
+
+  bool operator==(const class_type& that) const { return this->i_ == that.i_; }
+  bool operator<(const class_type& that) const { return this->i_ < that.i_; }
+  bool operator<=(const class_type& that) const { return this->i_ <= that.i_; }
+
+ private:
+  int i_;  // the key used for hashing
+  char buffer_[Size - sizeof(int)];
+};
+
+// A specialization for the case sizeof(buffer_) == 0
+template <>
+class HashObject<sizeof(int), sizeof(int)> {
+ public:
+  typedef HashObject<sizeof(int), sizeof(int)> class_type;
+  HashObject() {}
+  HashObject(int i) : i_(i) {}
+  HashObject(const HashObject& that) { operator=(that); }
+  void operator=(const HashObject& that) {
+    g_num_copies++;
+    this->i_ = that.i_;
+  }
+
+  size_t Hash() const {
+    g_num_hashes++;
+    return std::hash<int>()(i_);
+  }
+
+  bool operator==(const class_type& that) const { return this->i_ == that.i_; }
+  bool operator<(const class_type& that) const { return this->i_ < that.i_; }
+  bool operator<=(const class_type& that) const { return this->i_ <= that.i_; }
+
+ private:
+  int i_;  // the key used for hashing
+};
+
+namespace google {
+// Let the hashtable implementations know they can relocate HashObject with an
+// optimized memcpy: its copy constructor is a plain byte copy plus a counter bump.
+template <int Size, int Hashsize>
+struct is_relocatable<HashObject<Size, Hashsize>> : std::true_type {};
+}
+
+class HashFn {
+ public:
+  template <int Size, int Hashsize>
+  size_t operator()(const HashObject<Size, Hashsize>& obj) const {
+    return obj.Hash();
+  }
+  // Do the identity hash for pointers.
+  template <int Size, int Hashsize>
+  size_t operator()(const HashObject<Size, Hashsize>* obj) const {
+    return reinterpret_cast<uintptr_t>(obj);
+  }
+
+  // Less operator for MSVC's hash containers.
+  template <int Size, int Hashsize>
+  bool operator()(const HashObject<Size, Hashsize>& a,
+                  const HashObject<Size, Hashsize>& b) const {
+    return a < b;
+  }
+  template <int Size, int Hashsize>
+  bool operator()(const HashObject<Size, Hashsize>* a,
+                  const HashObject<Size, Hashsize>* b) const {
+    return a < b;
+  }
+  // These two public members are required by msvc.  4 and 8 are defaults.
+  static const size_t bucket_size = 4;
+  static const size_t min_buckets = 8;
+};
+
+/*
+ * Measure resource usage.
+ */
+
+class Rusage {
+ public:
+  /* Start collecting usage */
+  Rusage() { Reset(); }
+
+  /* Reset collection */
+  void Reset();
+
+  /* Show usage, in nanoseconds */
+  double UserTime();
+
+ private:
+  steady_clock::time_point start_;
+};
+
+inline void Rusage::Reset() { 
+  g_num_copies = 0;
+  g_num_hashes = 0;  
+  start_ = steady_clock::now(); 
+}
+
+inline double Rusage::UserTime() {
+  auto diff = steady_clock::now() - start_;
+  return duration_cast<nanoseconds>(diff).count();
+}
+
+static void print_uname() {
+#ifdef HAVE_SYS_UTSNAME_H
+  struct utsname u;
+  if (uname(&u) == 0) {
+    printf("%s %s %s %s %s\n", u.sysname, u.nodename, u.release, u.version,
+           u.machine);
+  }
+#endif
+}
+
+// Generate stamp for this run
+static void stamp_run(int iters, int read_factor) {
+  time_t now = time(0);
+  printf("======\n");
+  fflush(stdout);
+  print_uname();
+  printf("Average over %d iterations\n", iters);
+  printf("read factor = %d\n", read_factor);
+  fflush(stdout);
+  // don't need asctime_r/gmtime_r: we're not threaded
+  printf("Current time (GMT): %s", asctime(gmtime(&now)));
+}
+
+// This depends on the malloc implementation for exactly what it does
+// -- and thus requires work after the fact to make sense of the
+// numbers -- and also is likely thrown off by the memory management
+// STL tries to do on its own.
+
+#ifdef HAVE_GOOGLE_MALLOC_EXTENSION_H
+#include <google/malloc_extension.h>
+
+static size_t CurrentMemoryUsage() {
+  size_t result;
+  if (MallocExtension::instance()->GetNumericProperty(
+          "generic.current_allocated_bytes", &result)) {
+    return result;
+  } else {
+    return 0;
+  }
+}
+
+#else /* not HAVE_GOOGLE_MALLOC_EXTENSION_H */
+static size_t CurrentMemoryUsage() { return 0; }
+
+#endif
+
+static void report(char const* title, double t, int iters, size_t start_memory,
+                   size_t end_memory) {
+  // Construct heap growth report text if applicable
+  char heap[100] = "";
+  if (end_memory > start_memory) {
+    snprintf(heap, sizeof(heap), "%7.1f MB",
+             (end_memory - start_memory) / 1048576.0);
+  }
+
+  printf("%-20s %6.1f ns  (%8d hashes, %8d copies)%s\n", title, (t / iters),
+         NumHashesSinceLastCall(), NumCopiesSinceLastCall(), heap);
+  fflush(stdout);
+}
+
+template <class MapType>
+static void time_map_grow(int iters) {
+  MapType set;
+  Rusage t;
+
+  const size_t start = CurrentMemoryUsage();
+  t.Reset();
+  for (int i = 0; i < iters; i++) {
+    set[i] = i + 1;
+  }
+  double ut = t.UserTime();
+  const size_t finish = CurrentMemoryUsage();
+  report("map_grow", ut, iters, start, finish);
+}
+
+template <class MapType>
+static void time_map_grow_predicted(int iters) {
+  MapType set;
+  Rusage t;
+
+  const size_t start = CurrentMemoryUsage();
+  set.resize(iters);
+  t.Reset();
+  for (int i = 0; i < iters; i++) {
+    set[i] = i + 1;
+  }
+  double ut = t.UserTime();
+  const size_t finish = CurrentMemoryUsage();
+  report("map_predict/grow", ut, iters, start, finish);
+}
+
+template <class MapType>
+static void time_map_replace(int iters) {
+  MapType set;
+  Rusage t;
+  int i;
+
+  for (i = 0; i < iters; i++) {
+    set[i] = i + 1;
+  }
+
+  t.Reset();
+  for (i = 0; i < iters; i++) {
+    set[i] = i + 1;
+  }
+  double ut = t.UserTime();
+
+  report("map_replace", ut, iters, 0, 0);
+}
+
+template <class MapType>
+static void time_map_fetch(int iters, const vector<int>& indices,
+                           char const* title) {
+  MapType set;
+  Rusage t;
+  int r;
+  int i;
+
+  for (i = 0; i < iters; i++) {
+    set[i] = i + 1;
+  }
+
+  r = 1;
+  t.Reset();
+  for (i = 0; i < iters; i++) {
+    r ^= static_cast<int>(set.find(indices[i]) != set.end());
+  }
+  double ut = t.UserTime();
+
+  srand(r);  // keep compiler from optimizing away r (we never call rand())
+  report(title, ut, iters, 0, 0);
+}
+
+template <class MapType>
+static void time_map_fetch_sequential(int iters) {
+  vector<int> v(iters);
+  for (int i = 0; i < iters; i++) {
+    v[i] = i;
+  }
+  time_map_fetch<MapType>(iters, v, "map_fetch_sequential");
+}
+
+// Apply a pseudorandom permutation to the given vector.
+static void shuffle(vector<int>* v) {
+  srand(9);
+  for (int n = v->size(); n >= 2; n--) {
+    swap((*v)[n - 1], (*v)[static_cast<unsigned>(rand()) % n]);
+  }
+}
+
+template <class MapType>
+static void time_map_fetch_random(int iters) {
+  vector<int> v(iters);
+  for (int i = 0; i < iters; i++) {
+    v[i] = i;
+  }
+  shuffle(&v);
+  time_map_fetch<MapType>(iters, v, "map_fetch_random");
+}
+
+template <class MapType>
+static void time_map_fetch_empty(int iters) {
+  MapType set;
+  Rusage t;
+  int r;
+  int i;
+
+  r = 1;
+  t.Reset();
+  for (i = 0; i < iters; i++) {
+    r ^= static_cast<int>(set.find(i) != set.end());
+  }
+  double ut = t.UserTime();
+
+  srand(r);  // keep compiler from optimizing away r (we never call rand())
+  report("map_fetch_empty", ut, iters, 0, 0);
+}
+
+template <class MapType>
+static void time_map_remove(int iters) {
+  MapType set;
+  Rusage t;
+  int i;
+
+  for (i = 0; i < iters; i++) {
+    set[i] = i + 1;
+  }
+
+  t.Reset();
+  for (i = 0; i < iters; i++) {
+    set.erase(i);
+  }
+  double ut = t.UserTime();
+
+  report("map_remove", ut, iters, 0, 0);
+}
+
+template <class MapType>
+static void time_map_toggle(int iters) {
+  MapType set;
+  Rusage t;
+  int i;
+
+  const size_t start = CurrentMemoryUsage();
+  t.Reset();
+  for (i = 0; i < iters; i++) {
+    set[i] = i + 1;
+    set.erase(i);
+  }
+
+  double ut = t.UserTime();
+  const size_t finish = CurrentMemoryUsage();
+
+  report("map_toggle", ut, iters, start, finish);
+}
+
+template <class MapType>
+static void time_map_iterate(int iters) {
+  MapType set;
+  Rusage t;
+  int r;
+  int i;
+
+  for (i = 0; i < iters; i++) {
+    set[i] = i + 1;
+  }
+
+  r = 1;
+  t.Reset();
+  for (typename MapType::const_iterator it = set.begin(), it_end = set.end();
+       it != it_end; ++it) {
+    r ^= it->second;
+  }
+
+  double ut = t.UserTime();
+
+  srand(r);  // keep compiler from optimizing away r (we never call rand())
+  report("map_iterate", ut, iters, 0, 0);
+}
+
+template <class MapType>
+static void stresshashfunction(int desired_insertions, int map_size,
+                               int stride) {
+  Rusage t;
+  int num_insertions = 0;
+  // One measurement of user time (in nanoseconds) is done for each iteration of
+  // the outer loop.  The times are summed.
+  double total_nanoseconds = 0;
+  const int k = desired_insertions / map_size;
+  MapType set;
+  for (int o = 0; o < k; o++) {
+    set.clear();
+    set.resize(map_size);
+    t.Reset();
+    const int maxint = (1ull << (sizeof(int) * 8 - 1)) - 1;
+    // Use n arithmetic sequences.  Using just one may lead to overflow
+    // if stride * map_size > maxint.  Compute n by requiring
+    // stride * map_size/n < maxint, i.e., map_size/(maxint/stride) < n
+    char* key;  // something we can do math on
+    const int n = map_size / (maxint / stride) + 1;
+    for (int i = 0; i < n; i++) {
+      key = NULL;
+      key += i;
+      for (int j = 0; j < map_size / n; j++) {
+        key += stride;
+        set[reinterpret_cast<typename MapType::key_type>(key)] =
+            ++num_insertions;
+      }
+    }
+    total_nanoseconds += t.UserTime();
+  }
+  printf("stresshashfunction map_size=%d stride=%d: %.1fns/insertion\n",
+         map_size, stride, total_nanoseconds / num_insertions);
+}
+
+template <class MapType>
+static void stresshashfunction(int num_inserts) {
+  static const int kMapSizes[] = {256, 1024};
+  for (unsigned i = 0; i < sizeof(kMapSizes) / sizeof(kMapSizes[0]); i++) {
+    const int map_size = kMapSizes[i];
+    for (int stride = 1; stride <= map_size; stride *= map_size) {
+      stresshashfunction<MapType>(num_inserts, map_size, stride);
+    }
+  }
+}
+
+template <class MapType, class StressMapType>
+static void measure_map(const char* label, int obj_size, int iters,
+                        bool stress_hash_function) {
+  printf("\n%s (%d byte objects, %d iterations):\n", label, obj_size, iters);
+  if (1) time_map_grow<MapType>(iters);
+  if (1) time_map_grow_predicted<MapType>(iters);
+  if (1) time_map_replace<MapType>(iters);
+  if (1) time_map_fetch_random<MapType>(iters);
+  if (1) time_map_fetch_sequential<MapType>(iters);
+  if (1) time_map_fetch_empty<MapType>(iters);
+  if (1) time_map_remove<MapType>(iters);
+  if (1) time_map_toggle<MapType>(iters);
+  if (1) time_map_iterate<MapType>(iters);
+  // This last test is useful only if the map type uses hashing.
+  // And it's slow, so use fewer iterations.
+  if (stress_hash_function) {
+    // Blank line in the output makes clear that what follows isn't part of the
+    // table of results that we just printed.
+    puts("");
+    stresshashfunction<StressMapType>(iters / 4);
+  }
+}
+
+template <class ObjType>
+static void test_all_maps(int obj_size, int iters) {
+  const bool stress_hash_function = obj_size <= 8;
+
+  if (FLAGS_test_sparse_hash_map)
+    measure_map<EasyUseSparseHashMap<ObjType, int, HashFn>,
+                EasyUseSparseHashMap<ObjType*, int, HashFn>>(
+        "SPARSE_HASH_MAP", obj_size, iters, stress_hash_function);
+
+  if (FLAGS_test_dense_hash_map)
+    measure_map<EasyUseDenseHashMap<ObjType, int, HashFn>,
+                EasyUseDenseHashMap<ObjType*, int, HashFn>>(
+        "DENSE_HASH_MAP", obj_size, iters, stress_hash_function);
+
+  if (FLAGS_test_hash_map)
+    measure_map<EasyUseHashMap<ObjType, int, HashFn>,
+                EasyUseHashMap<ObjType*, int, HashFn>>(
+        "STANDARD HASH_MAP", obj_size, iters, stress_hash_function);
+
+  if (FLAGS_test_map)
+    measure_map<EasyUseMap<ObjType, int>, EasyUseMap<ObjType*, int>>(
+        "STANDARD MAP", obj_size, iters, false);
+}
+
+void thread_lookup(dense_hash_map_lockless<long, long, hash<long>, eqstr>& ht, int iter, int offset, int threadnum){
+  for (long j = offset*iter/threadnum; j < (offset+1)*iter/threadnum; j++) {
+    ht.find_wait_free(j);
+  }
+}
+
+void thread_lookup_rwlock(dense_hash_map<long, long, hash<long>, eqstr>& ht, int iter, int offset, int threadnum){
+  for (long j = offset*iter/threadnum; j < (offset+1)*iter/threadnum; j++) {
+    spin_rd_lock l(mu);
+    ht.find(j);
+  }
+}
+
+void thread_lookup_rwlock_and_shaders(dense_hash_map<long, long, hash<long>, eqstr>* ht, int iter, int offset, int threadnum){
+  for (long j = offset*iter/threadnum; j < (offset+1)*iter/threadnum; j++){
+    int bucket = j % 1000;
+    spin_rd_lock l(mu_list[bucket]);
+    ht[bucket].find(j);
+  }
+}
+
+void thread_insert(dense_hash_map_lockless<long, long, hash<long>, eqstr>& ht, int iter, int offset, int threadnum){
+  for (long j = offset*iter/threadnum; j < (offset+1)*iter/threadnum; j++) {
+    ht.insert_lockless(std::move(std::pair<long, long>(j, j+10)));
+  }
+}
+
+void thread_insert_rwlock(dense_hash_map<long, long, hash<long>, eqstr>& ht, int iter, int offset, int threadnum){
+  for (long j = offset*iter/threadnum; j < (offset+1)*iter/threadnum; j++) {
+    spin_wr_lock l(mu);
+    ht.insert(std::move(std::pair<long, long>(j, j+10)));
+  }
+}
+
+void thread_insert_rwlock_and_shaders(dense_hash_map<long, long, hash<long>, eqstr>* ht, int iter, int offset, int threadnum){
+  for (long j = offset*iter/threadnum; j < (offset+1)*iter/threadnum; j++) {
+    int bucket = j % 1000;
+    spin_wr_lock l(mu_list[bucket]);
+    ht[bucket].insert(std::move(std::pair<long, long>(j, j+10)));
+ }
+}
+
+void thread_find_insert(dense_hash_map_lockless<long, long, hash<long>, eqstr>& ht, int iter, int offset, int threadnum){
+  for  (long j = offset*iter/threadnum; j < (offset+1)*iter/threadnum; j++) {
+    ht.insert_lockless(std::move(std::pair<long, long>(j, j+10)));
+    ht.find_wait_free(j);
+  }
+}
+
+void thread_erase(dense_hash_map_lockless<long, long, hash<long>, eqstr>& ht, int iter, int offset, int threadnum){
+  for (long j = offset*iter/threadnum; j < (offset+1)*iter/threadnum; j++) {
+    ht.erase_lockless(j);
+  }
+}
+
+void test_parallel_find(int threadnum, int iter){
+  dense_hash_map_lockless< long,  long, hash<long>, eqstr> ht;
+  ht.set_empty_key_and_value(-1, 2147483647);
+  ht.set_deleted_key(-2);
+  ht.set_counternum(16);
+  
+  for (long i = 0; i < iter; i++)
+    ht.insert_lockless(std::move(std::pair<long ,long>(i, i + 10)));
+  std::vector<std::thread> insert_threads(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (long i = 0 ; i < threadnum; i++) {
+    insert_threads[i] = std::thread(thread_lookup, std::ref(ht), iter, i, threadnum);
+  }
+  for (auto &t : insert_threads) {
+    t.join();
+  }
+  auto ed_time = std::chrono::high_resolution_clock::now();
+  auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+  std::cout<<"parallel_find\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<iter/seconds_elapsed<<std::endl;
+}
+
+void test_parallel_insert(int threadnum, int iter){
+  dense_hash_map_lockless< long,  long, hash<long>, eqstr> ht;
+  ht.set_empty_key_and_value(-1, 2147483647);
+  ht.set_deleted_key(-2);
+  ht.set_counternum(16);
+  std::vector<std::thread> insert_threads(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (long i = 0 ; i < threadnum; i++) {
+    insert_threads[i] = std::thread(thread_insert, std::ref(ht), iter, i, threadnum);
+  }
+  for (auto &t : insert_threads) {
+    t.join();
+  }
+  auto ed_time = std::chrono::high_resolution_clock::now();
+  auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+  std::cout<<"parallel_insert\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<iter/seconds_elapsed<<std::endl;
+}
+
+void test_parallel_find_and_insert(int threadnum, int iter){
+  dense_hash_map_lockless< long,  long, hash<long>, eqstr> ht;
+  ht.set_empty_key_and_value(-1, 2147483647);
+  ht.set_deleted_key(-2);
+  ht.set_counternum(16);
+  std::vector<std::thread> insert_threads(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (long i = 0 ; i < threadnum; i++){
+    insert_threads[i] = std::thread(thread_find_insert, std::ref(ht), iter, i, threadnum);
+  }
+  for (auto &t : insert_threads) {
+    t.join();
+  }
+  auto ed_time = std::chrono::high_resolution_clock::now();
+  auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+  std::cout<<"parallel_find_and_insert\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<2*iter/seconds_elapsed<<std::endl;
+}
+
+void test_parallel_erase(int threadnum, int iter){
+  dense_hash_map_lockless< long,  long, hash<long>, eqstr> ht;
+  ht.set_empty_key_and_value(-1, 2147483647);
+  ht.set_deleted_key(-2);
+  ht.set_counternum(16);
+  for (long i = 0; i < iter; i++)
+    ht.insert(std::move(std::pair<long ,long>(i, i + 10)));
+  std::vector<std::thread> insert_threads(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (long i = 0 ; i < threadnum; i++) {
+    insert_threads[i] = std::thread(thread_erase, std::ref(ht), iter, i, threadnum);
+  }
+  for (auto &t : insert_threads) {
+    t.join();
+  }
+  auto ed_time = std::chrono::high_resolution_clock::now();
+  auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+  std::cout<<"parallel_erase\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<iter/seconds_elapsed<<std::endl;
+}
+
+int lookup(dense_hash_map_lockless<long, long, hash<long>, eqstr>& ht, long key){
+  auto it = ht.find(key);
+  if (it == ht.end()) {
+    return 0;
+  }else{
+    return 1;
+  }
+}
+
+int lookup_rwlock(dense_hash_map<long, long, hash<long>, eqstr>& ht, long key){
+  spin_rd_lock l(mu);
+  auto it = ht.find(key);
+  if(it == ht.end()){
+    return 0;
+  }else{
+    return 1;
+ }
+}
+
+int lookup_rwlock_and_shaders(dense_hash_map<long, long, hash<long>, eqstr>* ht, long key){
+  int bucket = key % 1000;
+  spin_rd_lock l(mu_list[bucket]);
+	auto it = ht[bucket].find(key);
+	if(it == ht[bucket].end()){
+		return 0;
+	}else{
+		return 1;
+	}
+}
+
+void find_or_insert(dense_hash_map_lockless<long, long, hash<long>, eqstr>& ht,long *keys, long ReadLoops, int i){
+  for (long j = 0; j < ReadLoops; j++) {
+    if (!lookup(std::ref(ht), keys[j])) {
+      auto it1 = ht.insert_lockless(std::move(std::pair<long, long >(keys[j], keys[j]+10)));
+    }
+  }
+}
+
+void find_or_insert_rwlock(dense_hash_map<long, long, hash<long>, eqstr>& ht,long *keys, long ReadLoops, int i){
+  for(long j = 0; j < ReadLoops; j++){
+    if (!lookup_rwlock(std::ref(ht), keys[j])) {
+      spin_wr_lock l(mu);
+      auto it1 = ht.insert(std::move(std::pair<long, long >(keys[j], keys[j]+10)));
+    }
+  }
+}
+
+void find_or_insert_rwlock_and_shaders(dense_hash_map<long, long, hash<long>, eqstr>* ht,long *keys, long ReadLoops, int i){
+  for(long j = 0; j < ReadLoops; j++){
+    if(!lookup_rwlock_and_shaders(ht, keys[j])){
+      int bucket = keys[j] % 1000;
+      spin_wr_lock l(mu_list[bucket]);
+      auto it1 = ht[bucket].insert(std::move(std::pair<long, long >(keys[j], keys[j]+10)));
+    }
+  }
+}
+
+void find_or_insert_with_shaders(dense_hash_map_lockless<long, long, hash<long>, eqstr>* ht,long *keys, long ReadLoops, int i){
+  for(long j = 0; j < ReadLoops; j++){
+    if (!lookup(std::ref(ht[0]), keys[j])) {
+      auto it1 = ht[0].insert_lockless(std::move(std::pair<long, long >(keys[j], keys[j]+10)));
+    }
+  }
+}
+
+// Fills *segs with pseudo-random keys until it holds seg_num entries; each key
+// is rand() % (10 * seg_num) scaled by (1 + thdid).
+// NOTE(review): rand() need not be thread-safe, yet test_parallel_hybrid runs this on several threads at once.
+void gen_thd(std::set<long>* segs, int thdid, long seg_num) {
+  // A non-positive target is trivially met (a negative one used to loop forever).
+  if (seg_num <= 0) return;
+  const size_t target = static_cast<size_t>(seg_num);
+  while (segs->size() < target) {
+    // std::set::insert ignores duplicates, so no find() is needed first.
+    segs->insert((rand() % (10 * seg_num)) * (1 + thdid));
+  }
+}
+
+void insert_thd2(dense_hash_map_lockless<long, long, hash<long>, eqstr>& ht, const std::set<long>& keys , int thdid) {
+  auto it = keys.begin();
+  for (; it !=keys.end(); it++) {
+    long id = *it;
+    if(!lookup(std::ref(ht), id)){
+      auto it1 = ht.insert_lockless(std::move(std::pair<long, long >(id, id+10)));
+    }
+  } 
+}
+
+// Prepares one thread's key lists for the hybrid benchmark: every key already
+// in *segs is appended to *lookup_keys, then fresh keys are generated until
+// *segs holds seg_num entries, each new key also being appended to *insert_keys.
+// NOTE(review): rand() need not be thread-safe, yet test_parallel_hybrid runs this on several threads at once.
+void gen_hybrid_thd(std::set<long>* segs, std::vector<long>* insert_keys, std::vector<long>* lookup_keys, int thdid, long seg_num) {
+  lookup_keys->insert(lookup_keys->end(), segs->begin(), segs->end());
+  // Nothing to generate for a non-positive target (previously a negative one
+  // looped forever and 0 with a non-empty set computed rand() % 0).
+  if (seg_num <= 0) return;
+  const size_t target = static_cast<size_t>(seg_num);
+  // `<` instead of `!=` so a set already larger than the target cannot spin forever.
+  while (segs->size() < target) {
+    const long gen_key = (rand() % (1024 * seg_num) + 1) * (1 + thdid);
+    // insert().second is true only for keys that were not already present.
+    if (segs->insert(gen_key).second) {
+      insert_keys->push_back(gen_key);
+    }
+  }
+}
+
+// Worker for the hybrid benchmark: performs `ops` operations on `ht`,
+// alternating between the lookup-key stream (even i) and the insert-key stream
+// (odd i); both streams wrap around.  A key that is not found is inserted with
+// value key + 10.  Note the split is 50/50 (i % 2), not 80/20.
+void hybrid_thd_f(dense_hash_map_lockless<long, long, hash<long>, eqstr>& ht, std::vector<long>* insert_keys, std::vector<long>* lookup_keys,  long ops, int thdid) {
+  auto insert_it = insert_keys->begin();
+  auto lookup_it = lookup_keys->begin();
+  for (long i = 0; i < ops; i++) {
+    // Pick this operation's key stream and its cursor.
+    const bool use_lookup = (i % 2 == 0);
+    std::vector<long>* keys = use_lookup ? lookup_keys : insert_keys;
+    auto& it = use_lookup ? lookup_it : insert_it;
+    if (keys->empty()) continue;  // nothing to dereference in this stream
+    if (it == keys->end()) it = keys->begin();
+    // The insert branch used to read *lookup_it here, which ignored
+    // insert_keys entirely and could dereference lookup_keys->end().
+    const long key = *it;
+    if (!lookup(ht, key)) {
+      ht.insert_lockless(std::move(std::pair<long, long>(key, key + 10)));
+    }
+    ++it;
+  }
+}
+
+// Benchmarks a mixed lookup/insert workload on one dense_hash_map_lockless
+// shared by thread_num threads.  Only the final phase is timed; the result is
+// printed as elapsed seconds plus operations per second.
+// seg_num = read_factor * iter / thread_num is both the number of keys each
+// thread pre-inserts and the number of operations each thread then performs.
+void test_parallel_hybrid(int thread_num, int iter, int read_factor){
+  if (thread_num <= 0) return;  // nothing to run; also avoids dividing by zero
+  dense_hash_map_lockless<long, long, hash<long>, eqstr> ht;
+  ht.set_empty_key_and_value(-1, 2147483647);
+  ht.set_deleted_key(-2);
+  ht.set_counternum(16);
+  std::vector<std::set<long>> seg_keys(thread_num);
+  // Widen before multiplying so read_factor * iter cannot overflow int.
+  const long total_ops = static_cast<long>(read_factor) * iter;
+  const long seg_num = total_ops / thread_num;
+
+  // Phase 1 (untimed): each thread generates its own seg_num keys.
+  std::vector<std::thread> gen_threads(thread_num);
+  for (int i = 0; i < thread_num; i++) {
+    gen_threads[i] = std::thread(gen_thd, &seg_keys[i], i, seg_num);
+  }
+  for (auto &t : gen_threads) t.join();
+
+  // Phase 2 (untimed): pre-populate the table with those keys.
+  std::vector<std::thread> insert_threads(thread_num);
+  for (int i = 0; i < thread_num; ++i) {
+    // std::cref: std::thread would otherwise copy the whole key set.
+    insert_threads[i] = std::thread(insert_thd2, std::ref(ht), std::cref(seg_keys[i]), i);
+  }
+  for (auto &t : insert_threads) t.join();
+
+  // Phase 3 (untimed): grow each key set to 2 * seg_num, recording lookup and insert keys.
+  std::vector<std::vector<long> > lookup_keys(thread_num);
+  std::vector<std::vector<long> > insert_keys(thread_num);
+  std::vector<std::thread> hybrid_gen_threads(thread_num);
+  for (int i = 0; i < thread_num; i++) {
+    hybrid_gen_threads[i] =
+      std::thread(gen_hybrid_thd, &seg_keys[i], &insert_keys[i], &lookup_keys[i], i, 2 * seg_num);
+  }
+  for (auto &t : hybrid_gen_threads) t.join();
+
+  // Phase 4 (timed): run the mixed workload on every thread.
+  std::vector<std::thread> hybrid_threads(thread_num);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < thread_num; ++i) {
+    hybrid_threads[i] =
+      std::thread(hybrid_thd_f, std::ref(ht), &(insert_keys[i]), &(lookup_keys[i]), seg_num, i);
+  }
+  for (auto &t : hybrid_threads) t.join();
+  auto ed_time = std::chrono::high_resolution_clock::now();
+  auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+  // Throughput is reported against the nominal total of read_factor * iter operations.
+  std::cout<<"parallel_hybrid\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<total_ops/seconds_elapsed<<std::endl;
+}
+
+// Hot-spot find-or-insert benchmark on one lockless map: about 80% of every thread's keys are drawn
+// from the first fifth ("hot") of iter distinct random keys and the rest from the other four fifths.
+// Prints elapsed time and throughput; exits the process if a buffer cannot be allocated.
+void test_parallel_8find_and_2insert_hotspot(int threadnum, int iter, int read_factor){
+  if (threadnum <= 0 || read_factor <= 0 || iter < 5) return;  // keeps / and % below away from zero
+  dense_hash_map_lockless< long,  long, hash<long>, eqstr> ht;
+  ht.set_empty_key_and_value(-1, 2147483647);
+  ht.set_deleted_key(-2);
+  ht.set_counternum(16);
+  srand((unsigned)time(NULL));
+  // One flag per candidate key value; calloc returns it zeroed, so no explicit clearing pass.
+  bool *flag = (bool *)calloc(2147483647, sizeof(bool));
+  long *keys = (long *)malloc(sizeof(long) * iter);
+  long **lookup_keys = (long **)calloc(threadnum, sizeof(long *));
+  if (flag == NULL || keys == NULL || lookup_keys == NULL) {
+    perror("test_parallel_8find_and_2insert_hotspot");
+    exit(EXIT_FAILURE);
+  }
+  long *hotkeys = keys;
+  long *coldkeys = keys + iter/5;
+  int index = 0;
+  while (index < iter) {  // rejection-sample until iter distinct keys are collected
+    long j = rand() % 2147483647;
+    if (flag[j]) continue;  // the number is already set as a key
+    keys[index++] = j;
+    flag[j] = true;
+  }
+  free(flag);
+  const long ops = (long)read_factor * iter / threadnum;  // operations per thread
+  for (int k = 0; k < threadnum; k++) {
+    lookup_keys[k] = (long *)malloc(sizeof(long) * ops);
+    if (lookup_keys[k] == NULL && ops > 0) {
+      perror("test_parallel_8find_and_2insert_hotspot");
+      exit(EXIT_FAILURE);
+    }
+    for (long i = 0; i < ops; i++) {
+      if (rand()%10 < 8) {
+        lookup_keys[k][i] = hotkeys[rand()%(iter/5)];
+      } else {
+        lookup_keys[k][i] = coldkeys[rand()%(iter * 4 / 5)];
+      }
+    }
+  }
+  std::vector<std::thread> lookup_threads(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < threadnum; i++) {
+    lookup_threads[i] = std::thread(find_or_insert, std::ref(ht), lookup_keys[i], ops, i);
+  }
+  for (auto &t : lookup_threads) {
+    t.join();
+  }
+  auto ed_time = std::chrono::high_resolution_clock::now();
+  auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+  std::cout<<"parallel_find_or_create\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<(double)read_factor*iter/seconds_elapsed<<std::endl;
+  // Release the key buffers; they were leaked before.
+  for (int k = 0; k < threadnum; k++) free(lookup_keys[k]);
+  free(lookup_keys);
+  free(keys);
+}
+
+// Hot-spot find-or-insert benchmark over 1000 lockless maps: about 80% of every thread's keys come
+// from the first fifth ("hot") of iter distinct random keys and the rest from the other four fifths.
+// Prints elapsed time and throughput; exits the process if a buffer cannot be allocated.
+void test_parallel_find_or_create_with_shaders(int threadnum, int iter, int read_factor){
+  if (threadnum <= 0 || read_factor <= 0 || iter < 5) return;  // keeps / and % below away from zero
+  dense_hash_map_lockless< long,  long, hash<long>, eqstr> ht[1000];
+  for (int i = 0; i < 1000 ; i++){
+    ht[i].set_empty_key_and_value(-1, 2147483647);
+    ht[i].set_deleted_key(-2);
+    ht[i].set_counternum(16);
+  }
+  srand((unsigned)time(NULL));
+  // One flag per candidate key value; calloc returns it zeroed, so no explicit clearing pass.
+  bool *flag = (bool *)calloc(2147483647, sizeof(bool));
+  long *keys = (long *)malloc(sizeof(long) * iter);
+  long **lookup_keys = (long **)calloc(threadnum, sizeof(long *));
+  if (flag == NULL || keys == NULL || lookup_keys == NULL) {
+    perror("test_parallel_find_or_create_with_shaders");
+    exit(EXIT_FAILURE);
+  }
+  long *hotkeys = keys;
+  long *coldkeys = keys + iter/5;
+  int index = 0;
+  while (index < iter) {  // rejection-sample until iter distinct keys are collected
+    long j = rand() % 2147483647;
+    if (flag[j]) continue;  // the number is already set as a key
+    keys[index++] = j;
+    flag[j] = true;
+  }
+  free(flag);
+  const long ops = (long)read_factor * iter / threadnum;  // operations per thread
+  for (int k = 0; k < threadnum; k++) {
+    lookup_keys[k] = (long *)malloc(sizeof(long) * ops);
+    if (lookup_keys[k] == NULL && ops > 0) {
+      perror("test_parallel_find_or_create_with_shaders");
+      exit(EXIT_FAILURE);
+    }
+    for (long i = 0; i < ops; i++) {
+      if (rand()%10 < 8) {
+        lookup_keys[k][i] = hotkeys[rand()%(iter/5)];
+      } else {
+        lookup_keys[k][i] = coldkeys[rand()%(iter * 4 / 5)];
+      }
+    }
+  }
+  std::vector<std::thread> lookup_threads(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < threadnum; i++) {
+    lookup_threads[i] = std::thread(find_or_insert_with_shaders, ht, lookup_keys[i], ops, i);
+  }
+  for (auto &t : lookup_threads) {
+    t.join();
+  }
+  auto ed_time = std::chrono::high_resolution_clock::now();
+  auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+  std::cout<<"parallel_find_or_create_with_shaders\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<(double)read_factor*iter/seconds_elapsed<<std::endl;
+  // Release the key buffers; they were leaked before.
+  for (int k = 0; k < threadnum; k++) free(lookup_keys[k]);
+  free(lookup_keys);
+  free(keys);
+}
+
+// Runs the find, insert and hybrid benchmarks, in that order, with the given parameters.
+void test_dense_hash_map_parallel(int threadnum, int iter, int read_factor){
+  test_parallel_find(threadnum, iter);
+  test_parallel_insert(threadnum, iter);
+  test_parallel_hybrid(threadnum, iter, read_factor);
+}
+
+
+// Fills one dense_hash_map with keys 0..iter-1 (value key+10), then times threadnum threads running
+// thread_lookup_rwlock against it and prints the elapsed seconds and iter/seconds.
+void test_parallel_find_rwlock(int threadnum, int iter){
+  dense_hash_map< long,  long, hash<long>, eqstr> ht;
+  ht.set_empty_key(-1);
+  ht.set_deleted_key(-2);
+  for (long key = 0; key < iter; ++key) {
+    ht.insert(std::pair<long, long>(key, key + 10));
+  }
+  std::vector<std::thread> workers(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (long tid = 0; tid < threadnum; ++tid) {
+    workers[tid] = std::thread(thread_lookup_rwlock, std::ref(ht), iter, tid, threadnum);
+  }
+  for (std::thread &worker : workers) worker.join();
+  const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - st_time;
+  std::cout<<"parallel_find\t"<<"Time: "<<elapsed.count()<<"\t"<<"Throughput: "<<iter/elapsed.count()<<std::endl;
+}
+
+// Times threadnum threads running thread_insert_rwlock on one initially empty dense_hash_map.
+void test_parallel_insert_rwlock(int threadnum, int iter){
+  dense_hash_map< long,  long, hash<long>, eqstr> ht;
+  ht.set_empty_key(-1);
+  ht.set_deleted_key(-2);
+  std::vector<std::thread> workers(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (long tid = 0; tid < threadnum; ++tid) {
+    workers[tid] = std::thread(thread_insert_rwlock, std::ref(ht), iter, tid, threadnum);
+  }
+  for (std::thread &worker : workers) {
+    worker.join();
+  }
+  const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - st_time;
+  std::cout<<"parallel_insert\t"<<"Time: "<<elapsed.count()<<"\t"<<"Throughput: "<<iter/elapsed.count()<<std::endl;
+}
+
+// Hot-spot find-or-insert benchmark on one dense_hash_map: about 80% of every thread's keys are
+// drawn from the first fifth ("hot") of iter distinct random keys, the rest from the other four
+// fifths; each thread performs 5*iter/threadnum operations. Exits if a buffer cannot be allocated.
+void test_parallel_8find_and_2insert_hotspot_rwlock(int threadnum, int iter){
+  if (threadnum <= 0 || iter < 5) return;  // keeps / and % below away from zero
+  dense_hash_map< long,  long, hash<long>, eqstr> ht;
+  ht.set_empty_key(-1);
+  ht.set_deleted_key(-2);
+  srand((unsigned)time(NULL));
+  // One flag per candidate key value; calloc returns it zeroed, so no explicit clearing pass.
+  bool *flag = (bool *)calloc(2147483647, sizeof(bool));
+  long *keys = (long *)malloc(sizeof(long) * iter);
+  long **lookup_keys = (long **)calloc(threadnum, sizeof(long *));
+  if (flag == NULL || keys == NULL || lookup_keys == NULL) {
+    perror("test_parallel_8find_and_2insert_hotspot_rwlock");
+    exit(EXIT_FAILURE);
+  }
+  long *hotkeys = keys;
+  long *coldkeys = keys + iter/5;
+  int index = 0;
+  while (index < iter) {  // rejection-sample until iter distinct keys are collected
+    long j = rand() % 2147483647;
+    if (flag[j]) continue;  // the number is already set as a key
+    keys[index++] = j;
+    flag[j] = true;
+  }
+  free(flag);
+  const long ops = 5L * iter / threadnum;  // operations per thread
+  for (int k = 0; k < threadnum; k++) {
+    lookup_keys[k] = (long *)malloc(sizeof(long) * ops);
+    if (lookup_keys[k] == NULL && ops > 0) {
+      perror("test_parallel_8find_and_2insert_hotspot_rwlock");
+      exit(EXIT_FAILURE);
+    }
+    for (long i = 0; i < ops; i++) {
+      if (rand()%10 < 8) {
+        lookup_keys[k][i] = hotkeys[rand()%(iter/5)];
+      } else {
+        lookup_keys[k][i] = coldkeys[rand()%(iter * 4 / 5)];
+      }
+    }
+  }
+  std::vector<std::thread> lookup_threads(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < threadnum; i++) {
+    lookup_threads[i] = std::thread(find_or_insert_rwlock, std::ref(ht), lookup_keys[i], ops, i);
+  }
+  for (auto &t : lookup_threads) {
+    t.join();
+  }
+  auto ed_time = std::chrono::high_resolution_clock::now();
+  auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+  std::cout<<"parallel_find_or_create\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<5.0*iter/seconds_elapsed<<std::endl;
+  // Release the key buffers; they were leaked before.
+  for (int k = 0; k < threadnum; k++) free(lookup_keys[k]);
+  free(lookup_keys);
+  free(keys);
+}
+// Runs the rwlock find and insert benchmarks; the rwlock hot-spot benchmark is not invoked here.
+void test_dense_hash_map_with_rwlock(int threadnum, int iter){
+  test_parallel_find_rwlock(threadnum, iter);
+  test_parallel_insert_rwlock(threadnum, iter);
+}
+
+// Spreads keys 0..iter-1 (value key+10) over 1000 dense_hash_maps by key % 1000, then times
+// threadnum threads running thread_lookup_rwlock_and_shaders and prints seconds and iter/seconds.
+void test_parallel_find_rwlock_and_shaders(int threadnum, int iter){
+  dense_hash_map< long,  long, hash<long>, eqstr> ht[1000];
+  for (auto &shard : ht) {
+    shard.set_empty_key(-1);
+    shard.set_deleted_key(-2);
+  }
+  for (long key = 0; key < iter; ++key) {
+    ht[key % 1000].insert(std::pair<long, long>(key, key + 10));
+  }
+  std::vector<std::thread> workers(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (long tid = 0; tid < threadnum; ++tid) {
+    workers[tid] = std::thread(thread_lookup_rwlock_and_shaders, ht, iter, tid, threadnum);
+  }
+  for (std::thread &worker : workers) {
+    worker.join();
+  }
+  const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - st_time;
+  std::cout<<"parallel_find\t"<<"Time: "<<elapsed.count()<<"\t"<<"Throughput: "<<iter/elapsed.count()<<std::endl;
+}
+
+// Times threadnum threads running thread_insert_rwlock_and_shaders over 1000 empty dense_hash_maps.
+void test_parallel_insert_rwlock_and_shaders(int threadnum, int iter){
+  dense_hash_map< long,  long, hash<long>, eqstr> ht[1000];
+  for (auto &shard : ht) {
+    shard.set_empty_key(-1);
+    shard.set_deleted_key(-2);
+  }
+  std::vector<std::thread> workers(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (long tid = 0; tid < threadnum; ++tid) {
+    workers[tid] = std::thread(thread_insert_rwlock_and_shaders, ht, iter, tid, threadnum);
+  }
+  for (std::thread &worker : workers) {
+    worker.join();
+  }
+  const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - st_time;
+  std::cout<<"parallel_insert\t"<<"Time: "<<elapsed.count()<<"\t"<<"Throughput: "<<iter/elapsed.count()<<std::endl;
+}
+
+// Hot-spot find-or-insert benchmark over 1000 dense_hash_maps: about 80% of every thread's keys come
+// from the first fifth ("hot") of iter distinct random keys and the rest from the other four fifths.
+// Prints elapsed time and throughput; exits the process if a buffer cannot be allocated.
+void test_parallel_8find_and_2insert_hotspot_rwlock_and_shaders(int threadnum, int iter, int read_factor){
+  if (threadnum <= 0 || read_factor <= 0 || iter < 5) return;  // keeps / and % below away from zero
+  dense_hash_map< long,  long, hash<long>, eqstr> ht[1000];
+  for (int i = 0; i < 1000 ; i++) {
+    ht[i].set_empty_key(-1);
+    ht[i].set_deleted_key(-2);
+  }
+  srand((unsigned)time(NULL));
+  // One flag per candidate key value; calloc returns it zeroed, so no explicit clearing pass.
+  bool *flag = (bool *)calloc(2147483647, sizeof(bool));
+  long *keys = (long *)malloc(sizeof(long) * iter);
+  long **lookup_keys = (long **)calloc(threadnum, sizeof(long *));
+  if (flag == NULL || keys == NULL || lookup_keys == NULL) {
+    perror("test_parallel_8find_and_2insert_hotspot_rwlock_and_shaders");
+    exit(EXIT_FAILURE);
+  }
+  long *hotkeys = keys;
+  long *coldkeys = keys + iter/5;
+  int index = 0;
+  while (index < iter) {  // rejection-sample until iter distinct keys are collected
+    long j = rand() % 2147483647;
+    if (flag[j]) continue;  // the number is already set as a key
+    keys[index++] = j;
+    flag[j] = true;
+  }
+  free(flag);
+  const long ops = (long)read_factor * iter / threadnum;  // operations per thread
+  for (int k = 0; k < threadnum; k++) {
+    lookup_keys[k] = (long *)malloc(sizeof(long) * ops);
+    if (lookup_keys[k] == NULL && ops > 0) {
+      perror("test_parallel_8find_and_2insert_hotspot_rwlock_and_shaders");
+      exit(EXIT_FAILURE);
+    }
+    for (long i = 0; i < ops; i++) {
+      if (rand()%10 < 8) {
+        lookup_keys[k][i] = hotkeys[rand()%(iter/5)];
+      } else {
+        lookup_keys[k][i] = coldkeys[rand()%(iter * 4 / 5)];
+      }
+    }
+  }
+  std::vector<std::thread> lookup_threads(threadnum);
+  auto st_time = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < threadnum; i++) {
+    lookup_threads[i] = std::thread(find_or_insert_rwlock_and_shaders, ht, lookup_keys[i], ops, i);
+  }
+  for (auto &t : lookup_threads) {
+    t.join();
+  }
+  auto ed_time = std::chrono::high_resolution_clock::now();
+  auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+  std::cout<<"parallel_find_or_create\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<(double)read_factor*iter/seconds_elapsed<<std::endl;
+  // Release the key buffers; they were leaked before.
+  for (int k = 0; k < threadnum; k++) free(lookup_keys[k]);
+  free(lookup_keys);
+  free(keys);
+}
+
+// For every id in keys that lookup_rwlock_and_shaders does not report as present, takes the write
+// lock of shard id % 1000 and inserts id -> id+10 into that shard. thdid is not used.
+void insert_thd2_rwlock_and_shaders(dense_hash_map<long, long, hash<long>, eqstr>* ht, const std::set<long>& keys , int thdid) {
+  for (long id : keys) {
+    if (lookup_rwlock_and_shaders(ht, id)) continue;
+    // The guard holds the shard's write lock until the end of this iteration.
+    const int bucket = id % 1000;
+    spin_wr_lock guard(mu_list[bucket]);
+    ht[bucket].insert(std::pair<long, long>(id, id + 10));
+  }
+}
+
+
+// Worker for the sharded rwlock hybrid benchmark. Runs ops steps that alternate between the lookup
+// stream (even steps) and the insert stream (odd steps); each step inserts key -> key+10 into shard
+// key % 1000 when the key is absent. Streams wrap around when exhausted. thdid is not used.
+void hybrid_thd_f_rwlock_and_shaders(dense_hash_map<long, long, hash<long>, eqstr>* ht, std::vector<long>* insert_keys, std::vector<long>* lookup_keys,  long ops, int thdid) {
+  auto insert_it = insert_keys->begin();
+  auto lookup_it = lookup_keys->begin();
+  // Shared by both branches: look the key up, and add it under the shard's write lock if missing.
+  // NOTE(review): key % 1000 is negative for a negative key; assumes the generators emit keys >= 0.
+  auto insert_if_absent = [ht](long key) {
+    if (lookup_rwlock_and_shaders(ht, key)) return;
+    int bucket = key % 1000;
+    spin_wr_lock l(mu_list[bucket]);
+    ht[bucket].insert(std::pair<long, long>(key, key + 10));
+  };
+  for (long i = 0; i < ops; i++) {
+    if (i % 2 == 0) {
+      // Lookup stream: every even step (half of all steps, not 80%).
+      if (lookup_keys->empty()) continue;
+      if (lookup_it == lookup_keys->end()) lookup_it = lookup_keys->begin();
+      insert_if_absent(*lookup_it++);
+    } else {
+      // Insert stream: every odd step. The key has to come from insert_it; the previous code read
+      // *lookup_it here, which ignored insert_keys and could dereference lookup_keys->end().
+      if (insert_keys->empty()) continue;
+      if (insert_it == insert_keys->end()) insert_it = insert_keys->begin();
+      insert_if_absent(*insert_it++);
+    }
+  }
+}
+
+// Hybrid benchmark for 1000 rwlock-protected maps: builds one key set per thread, pre-loads them via
+// insert_thd2_rwlock_and_shaders, then times thread_num threads running the hybrid worker.
+void test_parallel_hybrid_rwlock_and_shaders(int thread_num, int iter, int read_factor){
+  if (thread_num <= 0) return;  // the per-thread split below divides by thread_num
+  dense_hash_map<long, long, hash<long>, eqstr> ht[1000];
+  for (int i = 0; i < 1000; i++) {
+    ht[i].set_empty_key(-1);
+    ht[i].set_deleted_key(-2);
+  }
+  std::vector<std::set<long>> seg_keys(thread_num);
+  // Widen before multiplying so read_factor * iter is not evaluated in int.
+  const long seg_num = static_cast<long>(read_factor) * iter / thread_num;
+  std::vector<std::thread> gen_threads(thread_num);
+  for (int i = 0; i < thread_num; i++) {
+    gen_threads[i] = std::thread(gen_thd, &seg_keys[i], i, seg_num);
+  }
+  for (auto &t : gen_threads) {
+    t.join();
+  }
+
+  std::vector<std::thread> insert_threads(thread_num);
+  for (int i = 0; i < thread_num; ++i) {
+    insert_threads[i] = std::thread(insert_thd2_rwlock_and_shaders, ht, seg_keys[i], i);
+  }
+  for (auto &t : insert_threads) {
+    t.join();
+  }
+
+  {
+    std::vector<std::vector<long> > lookup_keys(thread_num);
+    std::vector<std::vector<long> > insert_keys(thread_num);
+
+    std::vector<std::thread> stream_gen_threads(thread_num);
+    for (int i = 0; i < thread_num; i++) {
+      stream_gen_threads[i] =
+      std::thread(gen_hybrid_thd, &seg_keys[i], &insert_keys[i], &lookup_keys[i], i, 2 * seg_num);
+    }
+    for (auto &t : stream_gen_threads) {
+      t.join();
+    }
+
+    std::vector<std::thread> hybrid_threads(thread_num);
+    auto st_time = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < thread_num; ++i) {
+      hybrid_threads[i] =
+        std::thread(hybrid_thd_f_rwlock_and_shaders, ht, &(insert_keys[i]), &(lookup_keys[i]), seg_num, i);
+    }
+    for (auto &t : hybrid_threads) {
+      t.join();
+    }
+    auto ed_time = std::chrono::high_resolution_clock::now();
+    auto seconds_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(ed_time - st_time).count();
+    std::cout<<"parallel_hybrid\t"<<"Time: "<<seconds_elapsed<<"\t"<<"Throughput: "<<static_cast<double>(read_factor)*iter/seconds_elapsed<<std::endl;
+  }
+}
+// Resets mu_list[0..999] to the lock initializer, then runs the sharded find, insert and hybrid benchmarks.
+void test_dense_hash_map_with_rwlock_and_shaders(int threadnum, int iter, int read_factor){
+  for (int i = 0; i < 1000; i++)
+    mu_list[i] = EASY_SPINRWLOCK_INITIALIZER;
+  test_parallel_find_rwlock_and_shaders(threadnum, iter);
+  test_parallel_insert_rwlock_and_shaders(threadnum, iter);
+  test_parallel_hybrid_rwlock_and_shaders(threadnum, iter, read_factor);
+}
+
+// Usage: bench_lockless [threadnum [read_factor [iters]]]
+// Defaults: threadnum 16, read_factor 5, iters kDefaultIters. Returns 1 on a non-positive argument.
+int main(int argc, char** argv) {
+  int iters = kDefaultIters;
+  int threadnum = 16;
+  int read_factor = 5;
+  if (argc > 1) threadnum = atoi(argv[1]);    // first arg is # of threads
+  if (argc > 2) read_factor = atoi(argv[2]);  // second arg is the read factor
+  if (argc > 3) iters = atoi(argv[3]);        // third arg is # of iterations
+  // atoi yields 0 for non-numeric text, and the benchmarks divide by threadnum: reject such input.
+  if (threadnum <= 0 || read_factor <= 0 || iters <= 0) {
+    fprintf(stderr, "usage: %s [threadnum>0 [read_factor>0 [iters>0]]]\n", argv[0]);
+    return 1;
+  }
+
+  stamp_run(iters, read_factor);
+
+#ifndef HAVE_SYS_RESOURCE_H
+  printf(
+      "\n*** WARNING ***: sys/resources.h was not found, so all times\n"
+      "                 reported are wall-clock time, not user time\n");
+#endif
+
+  // Three suites follow: one map behind an rwlock, 1000 maps each behind an rwlock, and the
+  // lockless map. Every suite is run with half of the requested iteration count.
+  std::cout<<"Benchmark for dense hash map with rwlock\n";
+  std::cout<<"**************************************************\n";
+  test_dense_hash_map_with_rwlock(threadnum, iters/2);
+  std::cout<<"**************************************************\n";
+  std::cout<<"Benchmark for dense hash map with rwlock and shaders\n";
+  std::cout<<"**************************************************\n";
+  test_dense_hash_map_with_rwlock_and_shaders(threadnum, iters/2, read_factor);
+  std::cout<<"**************************************************\n";
+  std::cout<<"Benchmark for lockless dense hash map\n";
+  std::cout<<"**************************************************\n";
+  test_dense_hash_map_parallel(threadnum, iters/2, read_factor);
+  std::cout<<"**************************************************\n";
+
+  return 0;
+}
diff --git a/tests/dense_hash_map_unittests.cc b/tests/dense_hash_map_unittests.cc
index 7c4b3c1..37560c1 100644
--- a/tests/dense_hash_map_unittests.cc
+++ b/tests/dense_hash_map_unittests.cc
@@ -1,21 +1,128 @@
-//
-// Created by Lukas Barth on 17.04.18.
-//
-
 #include "gtest/gtest.h"
-#include "sparsehash/dense_hash_map"
+#include <iostream>
+#include <sparsehash/dense_hash_map_lockless>
+#include <cstring>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <chrono>
+#include <thread>
+#include <vector>
+#include <random>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+using google::dense_hash_map_lockless;      // namespace where class lives by default
+using std::cout;
+using std::endl;
+using std::hash;  // or __gnu_cxx::hash, or maybe tr1::hash, depending on your OS
+
+#define THREADNUM 16
+
+long InsertLoops = 1000;
+long ReadLoops = 5000;
+long min = 0;
+long max = 2147483647;
+dense_hash_map_lockless<long, long> ht;
+dense_hash_map_lockless<long, long> ht_insert;
+
+// Returns 1 when find() locates key in the global map ht, and 0 when it yields ht.end().
+int lookup(long key){
+  const bool present = (ht.find(key) != ht.end());
+  if (present) {
+    return 1;
+  }
+  return 0;
+}
+// Thread body: for keys[0..ReadLoops) inserts key -> key+10 into the global ht, checks the pair read back, and erases every even-indexed key.
+void hybrid_process(long *keys, long ReadLoops){  // NOTE(review): the parameter shadows the global ReadLoops
+  for (long j = 0; j < ReadLoops; j++) {
+    ht.insert_lockless(std::move(std::pair<long, long >(keys[j], keys[j]+10)));
+    auto it = ht.find_wait_free(keys[j]);
+    ASSERT_EQ(it.first + 10 , it.second);  // the stored value must be key + 10; ASSERT_* returns from this function on failure
+    if (j%2 == 0) {
+      ht.erase_lockless(keys[j]);
+      it = ht.find_wait_free(keys[j]);
+      ASSERT_EQ(it.first, -1);  // -1 is the empty key set by the test; presumably what a miss reports, confirm against find_wait_free
+    }
+  }
+}
+// Thread body: inserts keys 0..4 (value key+10) into the global ht_insert; every thread inserts the same five keys.
+void multi_insertion(){
+  for (long j = 0; j < 5; j++) {
+    ht_insert.insert_lockless(std::move(std::pair<long, long >(j, j+10)));
+  }
+}
 
-using google::dense_hash_map;
 
-TEST(DenseHashMap, TestEmplaceHint) {
-	dense_hash_map<int, const char *> map;
-	map.set_empty_key(0);
+TEST(DenseHashMap, Testconcurrent) {  // concurrent insert/find/erase on ht, then concurrent duplicate inserts on ht_insert
+  bool* flag = (bool *)malloc(sizeof(bool)*max);  // NOTE(review): roughly 2 GiB (max is 2147483647) and the result is not checked for NULL
+  srand((unsigned)time(NULL)); 
+  long *keys = (long *)malloc(sizeof(long)*InsertLoops);  // NOTE(review): never freed in this test
+  long *counter = (long *)malloc(sizeof(long)*InsertLoops);  // NOTE(review): only written below; never read or freed
+  ht.set_empty_key_and_value(-1, max);
+  ht.set_deleted_key(-2);
+  ht.set_counternum(16);
 
-	const char * str1 = "Hello";
+  for (long i = 0; i < max; i++) {
+    flag[i] = 0;
+  }
+  for (long i = 0; i < InsertLoops; i++) {
+    counter[i] = 1;
+  }
+  int index = 0;
+  while (index < InsertLoops) {  // draw random values until InsertLoops distinct keys are collected
+    long j = rand() % max;
+    if (flag[j] == 1) // the number is already set as a key
+      continue;
+    else { // the number is not selected as a key
+      keys[index] = j;
+      index++;
+      flag[j] = 1;
+    }
+  }
+  free(flag);
+  std::vector<std::thread> insert_threads(THREADNUM);
+  for (size_t i = 0 ; i < THREADNUM; i++) {  // each thread gets its own InsertLoops/THREADNUM-sized slice of keys
+    insert_threads[i] = std::thread(hybrid_process, &keys[i*InsertLoops/THREADNUM], InsertLoops/THREADNUM);
+  }
+  for (auto &t : insert_threads) {
+    t.join();
+  }
+  {
+    long sum = 0;
+    std::pair<std::pair<const long, long>*, long> snapshot = ht.GetSnapshot();  // NOTE(review): ownership of the returned buffer is not visible here; confirm whether it must be released
+    std::pair<const long, long>* ht_dump = snapshot.first;
+    long bucket_cnt_dump = snapshot.second;
+    for (long i = 0; i < bucket_cnt_dump; i++) {
+      if (ht_dump[i].first != -1 &&  ht_dump[i].first != -2) {  // skip the empty (-1) and deleted (-2) marker keys
+        sum++;
+      }
+    }
+    ASSERT_EQ(ht.size_lockless(), sum);  // the reported size must equal the number of occupied buckets
+  }
 
-	map.insert({42, str1});
-	auto it = map.begin();
-	map.emplace_hint(it, 1701, "World");
+  ht_insert.set_empty_key_and_value(-1, max);
+  ht_insert.set_deleted_key(-2);
+  ht_insert.set_counternum(16);
 
-	ASSERT_EQ(map.size(), 2);
-}
\ No newline at end of file
+  for (size_t i = 0 ; i < THREADNUM; i++) {  // all threads insert the same keys 0..4
+    insert_threads[i] = std::thread(multi_insertion);
+  }
+  for (auto &t : insert_threads) {
+    t.join();
+  }
+  {
+    long sum = 0;
+    std::pair<std::pair<const long, long>*, long> snapshot = ht_insert.GetSnapshot();
+    std::pair<const long, long>* ht_dump = snapshot.first;
+    long bucket_cnt_dump = snapshot.second;
+    for (long i = 0; i < bucket_cnt_dump; i++) {
+      if (ht_dump[i].first != -1 &&  ht_dump[i].first != -2) {
+        sum++;
+      }
+    }
+    ASSERT_EQ(ht_insert.size_lockless(), 5);  // duplicate inserts from the 16 threads must collapse to five entries
+    ASSERT_EQ(ht_insert.size_lockless(), sum);
+  }
+}
diff --git a/tests/rwlock.h b/tests/rwlock.h
new file mode 100644
index 0000000..59ef9b2
--- /dev/null
+++ b/tests/rwlock.h
@@ -0,0 +1,224 @@
+#define EASY_SMP_LOCK               "lock;"  // x86 LOCK prefix prepended to the instructions in the asm below
+#define easy_atomic_set(v,i)        ((v) = (i))  // plain assignment, no LOCK-prefixed instruction
+
+typedef volatile int64_t easy_atomic_t;  // NOTE(review): this header has no #include lines; int64_t, NULL, EAGAIN and sched_yield must come from the including file
+static __inline__ void easy_atomic_add(easy_atomic_t *v, int64_t i)  // adds i to *v with a LOCK-prefixed addq; returns nothing
+{
+    __asm__ __volatile__(
+        EASY_SMP_LOCK "addq %1,%0"  // %0 is the memory operand *v, %1 the register holding i
+        : "=m" ((*v)) : "r" (i), "m" ((*v)));  // *v is listed both as output and as input
+}
+// Atomically adds i to *value and returns the resulting value (previous *value + i).
+static __inline__ int64_t easy_atomic_add_return(easy_atomic_t *value, int64_t i)
+{
+    const int64_t addend = i;
+    // xadd leaves the previous *value in i and stores the sum to memory, so the memory operand is
+    // declared read-write ("+m") instead of input-only; flags and memory are listed as clobbered.
+    __asm__ __volatile__(EASY_SMP_LOCK "xaddq %0, %1;" : "+r"(i), "+m"(*value) : : "memory", "cc");
+    return i + addend;
+}
+static __inline__ int64_t easy_atomic_cmp_set(easy_atomic_t *lock, int64_t old, int64_t set)  // compare-and-set: returns 1 if *lock equalled old and was replaced by set, else 0
+{
+    uint8_t                 res;  // receives the ZF result via sete
+    __asm__ volatile (
+        EASY_SMP_LOCK "cmpxchgq %3, %1; sete %0"  // cmpxchg compares rax (old) with *lock and stores set on a match
+        : "=a" (res) : "m" (*lock), "a" (old), "r" (set) : "cc", "memory");  // *lock is an input operand; the "memory" clobber covers the store
+    return res;
+}
+static __inline__ void easy_atomic_inc(easy_atomic_t *v)  // increments *v by one with a LOCK-prefixed incq
+{
+    __asm__ __volatile__(EASY_SMP_LOCK "incq %0" : "=m" (*v) :"m" (*v));  // *v is listed both as output and as input
+}
+static __inline__ void easy_atomic_dec(easy_atomic_t *v)  // decrements *v by one with a LOCK-prefixed decq
+{
+    __asm__ __volatile__(EASY_SMP_LOCK "decq %0" : "=m" (*v) :"m" (*v));  // *v is listed both as output and as input
+}
+
+#define EASY_OK                     0
+#define EASY_ERROR                  (-1)
+#define EASY_ABORT                  (-2)
+#define EASY_ASYNC                  (-3)
+#define EASY_BREAK                  (-4)
+#define EASY_ENCODE                 (-5)
+#define EASY_QUEUE_FULL             (-6)
+#define EASY_AGAIN                  (-EAGAIN)
+
+typedef struct easy_spinrwlock_t {  // spinning reader/writer lock state
+    easy_atomic_t ref_cnt;     // 0 = free, > 0 = number of readers, -1 = held by a writer (as set by the lock functions below)
+    easy_atomic_t wait_write;  // number of writers currently inside easy_spinrwlock_wrlock; readers back off while it is non-zero
+} easy_spinrwlock_t;
+#define EASY_SPINRWLOCK_INITIALIZER {0, 0}  // unlocked state: no holders, no waiting writers
+static __inline__ int easy_spinrwlock_rdlock(easy_spinrwlock_t *lock)  // blocks (spinning) until a read lock is held; EASY_ERROR only for a NULL lock
+{
+    int ret = EASY_OK;
+
+    if (NULL == lock) {
+        ret = EASY_ERROR;
+    } else {
+        int cond = 1;  // never cleared: the loop is left only through the return below
+
+        while (cond) {
+            int loop = 1;
+
+            do {  // up to 10 attempts (loop doubles from 1 to 1024) before yielding the CPU
+                easy_atomic_t oldv = lock->ref_cnt;
+
+                if (0 <= oldv && 0 == lock->wait_write) {  // no writer holds the lock and none is waiting
+                    if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, oldv + 1)) {  // register one more reader
+                        return ret;
+                    }
+                }
+
+                asm("pause");
+                loop <<= 1;
+            } while (loop < 1024);
+
+            sched_yield();
+        }
+    }
+
+    return ret;
+}
+static __inline__ int easy_spinrwlock_wrlock(easy_spinrwlock_t *lock)  // blocks (spinning) until the write lock is held; EASY_ERROR only for a NULL lock
+{
+    int ret = EASY_OK;
+
+    if (NULL == lock) {
+        ret = EASY_ERROR;
+    } else {
+        int cond = 1;
+        easy_atomic_inc(&lock->wait_write);  // announce a waiting writer so new readers back off
+
+        while (cond) {
+            int loop = 1;
+
+            do {  // up to 10 attempts (loop doubles from 1 to 1024) before yielding the CPU
+                easy_atomic_t oldv = lock->ref_cnt;
+
+                if (0 == oldv) {  // lock is free: no readers and no writer
+                    if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, -1)) {  // -1 marks the lock as write-held
+                        cond = 0;
+                        break;
+                    }
+                }
+
+                asm("pause");
+                loop <<= 1;
+            } while (loop < 1024);
+
+            if (cond) sched_yield();
+        }
+
+        easy_atomic_dec(&lock->wait_write);  // no longer waiting: the lock is now held
+    }
+
+    return ret;
+}
+// Single, non-blocking attempt to take the lock for reading.
+// Returns EASY_ERROR for a NULL lock, EASY_OK when the reader count was raised, and EASY_AGAIN
+// when a writer holds or awaits the lock or the compare-and-set lost a race.
+static __inline__ int easy_spinrwlock_try_rdlock(easy_spinrwlock_t *lock)
+{
+    if (NULL == lock) {
+        return EASY_ERROR;
+    }
+
+    const int64_t readers = lock->ref_cnt;  // one snapshot of the counter
+
+    // A negative count means a writer holds the lock; a non-zero wait_write means one is queued.
+    if (readers < 0 || lock->wait_write != 0) {
+        return EASY_AGAIN;
+    }
+
+    if (!easy_atomic_cmp_set(&lock->ref_cnt, readers, readers + 1)) {
+        return EASY_AGAIN;
+    }
+
+    return EASY_OK;
+}
+// Single, non-blocking attempt to take the lock for writing. Returns EASY_ERROR for a NULL lock,
+// EASY_OK when ref_cnt went from 0 to -1, and EASY_AGAIN when the lock is held or the swap failed.
+static __inline__ int easy_spinrwlock_try_wrlock(easy_spinrwlock_t *lock)
+{
+    if (NULL == lock) {
+        return EASY_ERROR;
+    }
+
+    const int64_t holders = lock->ref_cnt;  // one snapshot of the counter
+
+    // Only a completely free lock (no readers, no writer) can be taken for writing.
+    if (holders != 0) {
+        return EASY_AGAIN;
+    }
+
+    if (!easy_atomic_cmp_set(&lock->ref_cnt, holders, -1)) {
+        return EASY_AGAIN;
+    }
+
+    return EASY_OK;
+}
+static __inline__ int easy_spinrwlock_unlock(easy_spinrwlock_t *lock)  // releases a read or write hold; EASY_ERROR for a NULL lock or a lock that is not held
+{
+    int ret = EASY_OK;
+
+    if (NULL == lock) {
+        ret = EASY_ERROR;
+    } else {
+        while (1) {  // retry until one compare-and-set succeeds or the state is found invalid
+            easy_atomic_t oldv = lock->ref_cnt;
+
+            if (-1 == oldv) {  // write-held: release back to free
+                easy_atomic_t newv = 0;
+
+                if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) {
+                    break;
+                }
+            } else if (0 < oldv) {  // read-held: drop one reader
+                easy_atomic_t newv = oldv - 1;
+
+                if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) {
+                    break;
+                }
+            } else {  // 0 or below -1: nothing to release
+                ret = EASY_ERROR;
+                break;
+            }
+        }
+    }
+
+    return ret;
+}
+
+// Scoped read lock: takes the lock for reading on construction and releases it on destruction.
+class spin_rd_lock {
+public:
+    typedef easy_spinrwlock_t lock_type;
+
+    explicit spin_rd_lock(lock_type* lock) : lock_(lock) { easy_spinrwlock_rdlock(lock_); }
+    explicit spin_rd_lock(lock_type& lock) : lock_(&lock) { easy_spinrwlock_rdlock(lock_); }
+    ~spin_rd_lock() { easy_spinrwlock_unlock(lock_); }
+
+    // Not copyable: a copy would call easy_spinrwlock_unlock a second time for one acquisition.
+    spin_rd_lock(const spin_rd_lock&) = delete;
+    spin_rd_lock& operator=(const spin_rd_lock&) = delete;
+
+private:
+    lock_type* lock_;
+};
+
+// Scoped write lock: takes the lock for writing on construction and releases it on destruction.
+class spin_wr_lock {
+public:
+    typedef easy_spinrwlock_t lock_type;
+
+    explicit spin_wr_lock(lock_type* lock) : lock_(lock) { easy_spinrwlock_wrlock(lock_); }
+    explicit spin_wr_lock(lock_type& lock) : lock_(&lock) { easy_spinrwlock_wrlock(lock_); }
+    ~spin_wr_lock() { easy_spinrwlock_unlock(lock_); }
+
+    // Not copyable: a copy would call easy_spinrwlock_unlock a second time for one acquisition.
+    spin_wr_lock(const spin_wr_lock&) = delete;
+    spin_wr_lock& operator=(const spin_wr_lock&) = delete;
+
+private:
+    lock_type* lock_;
+};
\ No newline at end of file
-- 
2.23.0