Skip to content

0.6.0 release #348

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Dec 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c0faac1
Updated the CMakeLists to simply use as a fetch'd library.
LTLA Jul 10, 2021
085b165
Added the missing header.
LTLA Jul 10, 2021
c41ee3b
Made the data getter const.
LTLA Jul 10, 2021
ed7c92a
Migrate from travis to github actions
Jul 12, 2021
8f1a044
Merge pull request #325 from dyashuni/migration_to_actions
yurymalkov Jul 14, 2021
4833abe
Parallel indexing
alonre24 Jul 20, 2021
a6af73d
Added python bindings for BF index for recall testing
alonre24 Jul 21, 2021
d4c881d
Add recall test for hnsw via python bindings
alonre24 Jul 22, 2021
079c71e
Add load and store index to the bindings, update test recall
alonre24 Jul 25, 2021
9c2dc7c
Adding documentation
alonre24 Jul 25, 2021
cb399cf
Added AVX512 support for space_l2 and space_ip.
slice4e Sep 1, 2021
677700f
fixed missing #endif.
slice4e Sep 10, 2021
d6c8e3a
Lowered minimum CMake version back to 2.6.
LTLA Sep 16, 2021
e7935b7
Corrently check for the presense of AVX512F (Foundation set) at compi…
slice4e Sep 16, 2021
d7bec60
Fixed a bug where we are aligning the TmpRes[16] variable to 32 bytes…
slice4e Sep 16, 2021
05de244
changed float * to void * in L2SqrSIMD16Ext
slice4e Sep 29, 2021
290f3e2
fixed errors with const void* conversion
slice4e Oct 1, 2021
ff10e88
Merge pull request #339 from slice4e/develop
yurymalkov Oct 3, 2021
7c63a15
Merge pull request #340 from LTLA/master
yurymalkov Oct 3, 2021
ac43973
add a test for distance computation correctness
yurymalkov Oct 4, 2021
a29c3dd
fix : a sufficient results may occur(has marked_deleted node)
Oct 25, 2021
c2bc2ad
and => &&
dorosy-yeong Oct 26, 2021
8c6960b
fix a performance degradation.
dorosy-yeong Nov 8, 2021
67a7a1b
Merge pull request #344 from dorosy-yeong/markeddeleted-sufficient-re…
yurymalkov Nov 8, 2021
342257e
make test faster, fix few bugs
yurymalkov Nov 7, 2021
79d5d74
Merge pull request #332 from RedisAI/add_bindings_to_bf_index
yurymalkov Nov 16, 2021
36d00bf
Fix failing windows build (#346)
yurymalkov Nov 16, 2021
9006b32
Unmark deleted
Jul 29, 2021
47bb1a1
Merge pull request #334 from dyashuni/unmark_deleted
yurymalkov Nov 23, 2021
cca297a
update documents for the release
yurymalkov Nov 25, 2021
bcbcb5d
add performance test for commits
yurymalkov Dec 9, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# GitHub Actions workflow: installs the package with pip and runs the Python unit tests.
name: HNSW CI

# Triggered by every push and every pull request.
on: [push, pull_request]

jobs:
test:
# The runner image is taken from the `os` axis of the matrix below.
runs-on: ${{matrix.os}}
strategy:
matrix:
# Two axes: operating system and Python version.
os: [ubuntu-latest, windows-latest]
python-version: ['3.6', '3.7', '3.8', '3.9']
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}

# Installs the project found in the checkout root.
- name: Build and install
run: python -m pip install .

# Runs unittest discovery over python_bindings/tests for files matching "*_test*.py".
- name: Test
run: python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ python_bindings/tests/__pycache__/
*.pyd
hnswlib.cpython*.so
var/
.idea/
.vscode/

63 changes: 0 additions & 63 deletions .travis.yml

This file was deleted.

2 changes: 2 additions & 0 deletions ALGO_PARAMS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ The ```knn_query``` function returns two numpy arrays, containing labels and dis
elements for the queries. Note that in case the algorithm is not able to find ```k``` neighbors to all of the queries,
(this can be due to problems with graph or ```k```>size of the dataset) an exception is thrown.

An example of tuning the parameters can be found in [TESTING_RECALL.md](TESTING_RECALL.md)

## Construction parameters:
* ```M``` - the number of bi-directional links created for every new element during construction. Reasonable range for ```M```
is 2-100. Higher ```M``` work better on datasets with high intrinsic dimensionality and/or high recall, while low ```M``` work
Expand Down
39 changes: 19 additions & 20 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,28 +1,27 @@
# NOTE(review): this span is a rendered diff of CMakeLists.txt (19 additions & 20 deletions).
# Removed and added lines are interleaved without +/- markers, so the text below is not a
# buildable file as shown (duplicate project()/add_executable() calls, unbalanced if/endif).
#
# NOTE(review): `project(... LANGUAGES ...)` and `add_library(... INTERFACE)` are, as far as
# I know, CMake >= 3.0 features - confirm whether VERSION 2.6 is an accurate minimum.
cmake_minimum_required (VERSION 2.6)
project (hnsw_lib)
project(hnsw_lib
LANGUAGES CXX)

include_directories("${PROJECT_BINARY_DIR}")
# Declares `hnswlib` as an INTERFACE target (no sources of its own) and exposes this
# directory as an include path to anything that links against it.
add_library(hnswlib INTERFACE)
target_include_directories(hnswlib INTERFACE .)

# True only when hnsw_lib is the top-level project, i.e. not pulled in as a subproject.
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
set(CMAKE_CXX_STANDARD 11)

# Per-compiler flag strings; each branch overwrites CMAKE_CXX_FLAGS rather than appending to it.
# NOTE(review): the MSVC branch uses GCC-style flags (-Ofast, -march=native, -fpic) and
# `-lrt` is a linker option placed in compile flags - presumably ignored or warned about; confirm.
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
endif()

set(SOURCE_EXE main.cpp)
# Example/test executables; each one links the `hnswlib` interface target.
add_executable(test_updates examples/updates_test.cpp)
target_link_libraries(test_updates hnswlib)

set(SOURCE_LIB sift_1b.cpp)
add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
target_link_libraries(searchKnnCloserFirst_test hnswlib)

add_library(sift_test STATIC ${SOURCE_LIB})


add_executable(main ${SOURCE_EXE})
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
# `main` is built from main.cpp and sift_1b.cpp directly and links `hnswlib`.
add_executable(main main.cpp sift_1b.cpp)
target_link_libraries(main hnswlib)
endif()

add_executable(test_updates examples/updates_test.cpp)

add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)

target_link_libraries(main sift_test)
34 changes: 22 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,20 @@ Header-only C++ HNSW implementation with python bindings.

**NEWS:**

* **Hnswlib is now 0.5.2**. Bugfixes - thanks [@marekhanus](https://github.com/marekhanus) for fixing the missing arguments, adding support for python 3.8, 3.9 in Travis, improving python wrapper and fixing typos/code style; [@apoorv-sharma](https://github.com/apoorv-sharma) for fixing the bug int the insertion/deletion logic; [@shengjun1985](https://github.com/shengjun1985) for simplifying the memory reallocation logic; [@TakaakiFuruse](https://github.com/TakaakiFuruse) for improved description of `add_items`; [@psobot ](https://github.com/psobot) for improving error handling; [@ShuAiii](https://github.com/ShuAiii) for reporting the bug in the python interface
**version 0.6**
* Thanks to ([@dyashuni](https://github.com/dyashuni)) hnswlib now uses github actions for CI, there is a search speedup in some scenarios with deletions. `unmark_deleted(label)` is now also a part of the python interface (note now it throws an exception for double deletions).
* Thanks to ([@slice4e](https://github.com/slice4e)) we now support AVX512; thanks to ([@LTLA](https://github.com/LTLA)) the cmake interface for the lib is now updated.
* Thanks to ([@alonre24](https://github.com/alonre24)) we now have python bindings for a brute-force index (and examples for recall tuning: [TESTING_RECALL.md](TESTING_RECALL.md)).
* Thanks to ([@dorosy-yeong](https://github.com/dorosy-yeong)) a bug in the handling of large quantities of deleted elements and large K is now fixed.

* **Hnswlib is now 0.5.0**. Added support for pickling indices, support for PEP-517 and PEP-518 building, small speedups, bug and documentation fixes. Many thanks to [@dbespalov](https://github.com/dbespalov), [@dyashuni](https://github.com/dyashuni), [@groodt](https://github.com/groodt),[@uestc-lfs](https://github.com/uestc-lfs), [@vinnitu](https://github.com/vinnitu), [@fabiencastan](https://github.com/fabiencastan), [@JinHai-CN](https://github.com/JinHai-CN), [@js1010](https://github.com/js1010)!

* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but the performance/memory should not degrade as you update the element embeddings).**

* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not multiple of 4**


* **Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib can now be installed via pip!**

Highlights:
1) Lightweight, header-only, no dependencies other than C++ 11.
2) Interfaces for C++, python and R (https://github.com/jlmelville/rcpphnsw).
### Highlights:
1) Lightweight, header-only, no dependencies other than C++ 11
2) Interfaces for C++, Java, Python and R (https://github.com/jlmelville/rcpphnsw).
3) Has full support for incremental index construction. Has support for element deletions
(currently, without actual freeing of the memory).
(by marking them in index). Index is picklable.
4) Can work with custom user defined distances (C++).
5) Significantly less memory footprint and faster build time compared to current nmslib's implementation.

Expand Down Expand Up @@ -53,7 +52,9 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.
- If index already has the elements with the same labels, their features will be updated. Note that update procedure is slower than insertion of a new element, but more memory- and query-efficient.
* Thread-safe with other `add_items` calls, but not with `knn_query`.

* `mark_deleted(label)` - marks the element as deleted, so it will be omitted from search results.
* `mark_deleted(label)` - marks the element as deleted, so it will be omitted from search results. Throws an exception if it is already deleted.
* `unmark_deleted(label)` - unmarks the element as deleted, so it will not be omitted from search results.

* `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`.

Expand Down Expand Up @@ -225,6 +226,15 @@ pip install .
or you can install via pip:
`pip install hnswlib`


### For developers

When making changes please run tests (and please add a test to `python_bindings/tests` in case there is new functionality):
```bash
python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
```


### Other implementations
* Non-metric space library (nmslib) - main library(python, C++), supports exotic distances: https://github.com/nmslib/nmslib
* Faiss library by facebook, uses own HNSW implementation for coarse quantization (python, C++):
Expand Down
91 changes: 91 additions & 0 deletions TESTING_RECALL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Testing recall

Selecting HNSW parameters for a specific use case highly impacts the search quality. One way to test the quality of the constructed index is to compare the HNSW search results to the actual results (i.e., the actual `k` nearest neighbors).
For that cause, the API enables creating a simple "brute-force" index in which vectors are stored as is, and searching for the `k` nearest neighbors to a query vector requires going over the entire index.
Comparing between HNSW and brute-force results may help with finding the desired HNSW parameters for achieving a satisfying recall, based on the index size and data dimension.

### Brute force index API
`hnswlib.BFIndex(space, dim)` creates a non-initialized index in space `space` with integer dimension `dim`.

`hnswlib.BFIndex` methods:

`init_index(max_elements)` initializes the index with no elements.

max_elements defines the maximum number of elements that can be stored in the structure.

`add_items(data, ids)` inserts the data (numpy array of vectors, shape:`N*dim`) into the structure.
`ids` are optional N-size numpy array of integer labels for all elements in data.

`delete_vector(label)` delete the element associated with the given `label` so it will be omitted from search results.

`knn_query(data, k = 1)` makes a batch query for the `k` closest elements for each element of the
`data` (shape:`N*dim`). Returns the labels and the distances as numpy arrays (shape:`N*k`).

`load_index(path_to_index, max_elements = 0)` loads the index from persistence to the uninitialized index.

`save_index(path_to_index)` saves the index to persistence.

### measuring recall example

```
import hnswlib
import numpy as np

dim = 32
num_elements = 100000
k = 10
num_queries = 10

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
hnsw_index = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
bf_index = hnswlib.BFIndex(space='l2', dim=dim)

# Initing both hnsw and brute force indices
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index.
#
# hnsw construction params:
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction

hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)
bf_index.init_index(max_elements=num_elements)

# Controlling the recall for hnsw by setting ef:
# higher ef leads to better accuracy, but slower search
hnsw_index.set_ef(200)

# Set number of threads used during batch search/construction in hnsw
# By default using all available cores
hnsw_index.set_num_threads(1)

print("Adding batch of %d elements" % (len(data)))
hnsw_index.add_items(data)
bf_index.add_items(data)

print("Indices built")

# Generating query data
query_data = np.float32(np.random.random((num_queries, dim)))

# Query the elements and measure recall:
labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k)
labels_bf, distances_bf = bf_index.knn_query(query_data, k)

# Measure recall: for every query, count how many of the labels returned by HNSW
# also appear in the brute-force result for the same query.
correct = 0
for i in range(num_queries):
    correct += int(np.isin(labels_hnsw[i], labels_bf[i]).sum())

print("recall is :", float(correct)/(k*num_queries))
```
16 changes: 16 additions & 0 deletions examples/git_tester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pydriller import Repository
import os
import datetime
import shutil
import subprocess
import sys

# (dimension, thread count) pairs benchmarked for every commit.
BENCH_CONFIGS = [(4, 1), (64, 1), (128, 1), (4, 24), (128, 24)]


def run(cmd):
    """Run `cmd` (an argv list) without a shell.

    A non-zero exit status is reported but does not stop the script, so one
    commit that fails to build does not abort the whole sweep.
    Returns the exit status of the command.
    """
    status = subprocess.run(cmd).returncode
    if status != 0:
        print(f"warning: {' '.join(cmd[:4])} ... exited with status {status}")
    return status


# Benchmark from a copy of the script; presumably so that the same benchmark code
# is used no matter which commit is checked out - confirm.
shutil.copy("examples/speedtest.py", "examples/speedtest2.py")

# Walk every commit reachable from tag v0.5.2 onwards, install it and benchmark it.
for commit in Repository('.', from_tag="v0.5.2").traverse_commits():
    print(commit.hash)
    print(commit.msg)

    run(["git", "checkout", commit.hash])
    # Drop stale build artifacts left by the previously installed commit.
    shutil.rmtree("build", ignore_errors=True)
    # sys.executable keeps the install and the benchmark in the interpreter running this script.
    run([sys.executable, "-m", "pip", "install", "."])
    for dim, threads in BENCH_CONFIGS:
        # The commit message is passed as its own argv element, so quotes, `$` or
        # backticks inside it can neither break the command nor be executed.
        run([sys.executable, "examples/speedtest2.py",
             "-n", commit.msg, "-d", str(dim), "-t", str(threads)])

62 changes: 62 additions & 0 deletions examples/speedtest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import hnswlib
import numpy as np
import os.path
import time
import argparse

# Command-line options:
#   -d  vector dimensionality (positive integer, required)
#   -n  free-form label appended to the logged result line (optional)
#   -t  number of threads used while querying (integer, required)
ap = argparse.ArgumentParser()
ap.add_argument('-d', type=int, required=True)
ap.add_argument('-n')
ap.add_argument('-t', type=int, required=True)
args = ap.parse_args()
dim = args.d
name = args.n
threads = args.t
if dim <= 0:
    # Fail with a usage message instead of a ZeroDivisionError / numpy shape error below.
    ap.error("-d must be a positive integer")

# The element count shrinks as the dimension grows: 1,000,000 elements at dim=4.
num_elements = 1000000 * 4 // dim

# Generating sample data (fixed seed, so every run sees the same vectors)
np.random.seed(1)
data = np.float32(np.random.random((num_elements, dim)))


index_path = f'speed_index{dim}.bin'
# Declaring index
p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip

# Build the index only when no saved index exists for this dimension.
if not os.path.isfile(index_path):

    p.init_index(max_elements=num_elements, ef_construction=100, M=16)

    # Controlling the recall by setting ef:
    # higher ef leads to better accuracy, but slower search
    p.set_ef(10)

    # Set number of threads used during batch search/construction
    # By default using all available cores
    p.set_num_threads(12)

    p.add_items(data)

    # Serializing the index:
    print("Saving index to '%s'" % index_path)
    p.save_index(index_path)

p.set_num_threads(threads)
times = []
# NOTE(review): presumably lets the machine settle after index construction - confirm.
time.sleep(10)
# NOTE(review): ef is set before load_index(); confirm the loaded index keeps this value.
p.set_ef(100)
for _ in range(3):
    p.load_index(index_path)
    for _ in range(10):
        # perf_counter is monotonic, so clock adjustments cannot distort a sample.
        t0 = time.perf_counter()
        labels, distances = p.knn_query(data, k=1)
        tt = time.perf_counter() - t0
        times.append(tt)
        print(f"{tt} seconds")

str_out = f"mean time:{np.mean(times)}, median time:{np.median(times)}, std time {np.std(times)} {name}"
print(str_out)
# Append, so results of successive runs accumulate in one log per (dim, threads) pair.
with open(f"log_{dim}_t{threads}.txt", "a") as f:
    f.write(str_out + "\n")

Loading