\pdfoutput=1
\documentclass[runningheads,a4paper]{llncs}
\usepackage{amssymb}
\usepackage{amsmath}
\setcounter{tocdepth}{3}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{hyperref}
\usepackage{todonotes}
\usepackage{datetime}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{wrapfig}
\usepackage{subfig}
\usepackage{booktabs}
\usepackage{url}
\usepackage{float}
\usepackage{upquote}
\usepackage{placeins}
\renewcommand{\arraystretch}{1.3}
\newcommand{\todonoteinline}[1]{\todo[color=red!40,inline,caption={TODO}]{#1}}
\newcommand{\todonote}[1]{\todo[color=red!40,caption={TODO}]{#1}}
% replocdir and replocfile should end on '/'
\newcommand{\replocdir}{https://github.com/searchivarius/nmslib/tree/v1.5/}
\newcommand{\replocfile}{https://github.com/searchivarius/nmslib/blob/v1.5/}
%\newcommand{\replocdir}{https://github.com/searchivarius/NonMetricSpaceLib/tree/pserv}
%\newcommand{\replocfile}{https://github.com/searchivarius/NonMetricSpaceLib/blob/pserv/}
\newcommand{\ttt}[1]{\texttt{#1}}
\newcommand{\knn}{$k$-NN }
\newcommand{\knnns}{$k$-NN}
\newcommand{\ED}{\mbox{\rm ED}}
\newcommand{\pos}{\mbox{\rm pos}}
\newcommand{\rev}{\mbox{\rm rev}}
\newcommand{\DEL}[2]{\Delta_{#1}(#2)}
\newcommand{\LibVersion}{1.5}
\setcounter{secnumdepth}{3}
\begin{document}
\mainmatter % start of an individual contribution
% first the title is needed
\title{Non-Metric Space Library (NMSLIB) Manual}
% a short form should be given in case it is too long for the running head
\titlerunning{NMSLIB Manual}
% the name(s) of the author(s) follow(s) next
%
% NB: Chinese authors should write their first names(s) in front of
% their surnames. This ensures that the names appear correctly in
% the running heads and the author index.
%
\author{Bilegsaikhan Naidan\inst{1} \and Leonid Boytsov\inst{2}}
%
\authorrunning{Bilegsaikhan Naidan and Leonid Boytsov}
% (feature abused for this document to repeat the title also on left hand pages)
% the affiliations are given next; don't give your e-mail address
% unless you accept that it will be published
\institute{
Department of Computer and Information Science,\\
Norwegian University of Science and Technology,\\
Trondheim, Norway\\
{\hspace{1em}}\\
\and
Language Technologies Institute, \\
Carnegie Mellon University,\\
Pittsburgh, PA, USA\\
\email{srchvrs@cs.cmu.edu}\\
}
%
% NB: a more complex sample for affiliations and the mapping to the
% corresponding authors can be found in the file "llncs.dem"
% (search for the string "\mainmatter" where a contribution starts).
% "llncs.dem" accompanies the document class "llncs.cls".
%
%\toctitle{Lecture Notes in Computer Science}
%\tocauthor{Authors' Instructions}
\maketitle
{\begin{center}{\small \textbf{Maintainer}: Leonid Boytsov}\end{center}}
{\begin{center}{Version \LibVersion}\end{center}}
{\begin{center}{{\today}}\end{center}}
\begin{abstract}
This document describes a library for similarity searching.
Even though the library contains a variety of metric-space access methods,
our main focus is on search methods for non-metric spaces.
Because there are fewer exact solutions for non-metric spaces,
many of our methods give only approximate answers.
Thus, the methods
are evaluated in terms of efficiency-effectiveness trade-offs
rather than merely in terms of their efficiency.
Our goal is, therefore, to provide
not only state-of-the-art approximate search methods for
both non-metric and metric spaces,
but also the tools to measure search quality.
We concentrate on technical details, i.e.,
how to compile the code, run the benchmarks, evaluate results,
and use our code in other applications.
Additionally, we explain how to extend the code by adding
new search methods and spaces.
\end{abstract}
\section{Introduction}
\subsection{History, Objectives, and Principles}
Non-Metric Space Library (NMSLIB) is an \textbf{efficient} cross-platform similarity search library and a toolkit for evaluation of similarity search methods. The goal of the project is to create an effective and \textbf{comprehensive} toolkit for searching in \textbf{generic non-metric} spaces.
Being comprehensive is important, because no single method is likely to be sufficient in all cases.
Because exact solutions are hardly efficient in high dimensions and/or non-metric spaces, the main focus is on \textbf{approximate} methods.
NMSLIB is an extendible library, which means that it is possible to add new search methods and distance functions.
NMSLIB can be used directly in C++ and Python (via Python bindings, see \S~\ref{SectionPythonBind}). In addition, it is also possible to build a query server (see \S~\ref{SectionQueryServer}), which can be used from Java (or other languages supported by Apache Thrift). The Java client is native, i.e., it works on many platforms without requiring a C++ library to be installed.
Even though our methods are generic, they often outperform specialized methods for the Euclidean and/or angular distance
(i.e., for the cosine similarity).
Figures \ref{FigGlove} and \ref{FigSIFT}
contain results (as of May 2016) of NMSLIB compared to the best implementations that participated in \href{https://github.com/erikbern/ann-benchmarks}{a public evaluation code-named \ttt{ann-benchmarks}}. Our main competitors are:
\begin{itemize}
\item A popular library \href{https://github.com/spotify/annoy}{Annoy}, which uses a forest of random-projection KD-trees \cite{SilpaAnan2008OptimisedKF}.
\item A new library \href{https://github.com/FALCONN-LIB/FALCONN}{FALCONN}, which is a highly-optimized implementation of the multiprobe LSH \cite{andoni2015practical}. It uses a novel type of random projections based on the fast Hadamard transform.
\end{itemize}
The benchmarks were run on a c4.2xlarge instance on EC2 (using a previously unseen subset of 5K queries). The benchmarks employ two data sets:
\begin{itemize}
\item \href{http://nlp.stanford.edu/projects/glove/}{GloVe}: 1.2M 100-dimensional word embeddings trained on Tweets;
\item 1M of 128-dimensional \href{http://corpus-texmex.irisa.fr/}{SIFT features}.
\end{itemize}
\begin{figure}[!htb]
\centering
\caption{\label{FigGlove}1.19M vectors from GloVe (100 dimensions, trained from tweets), cosine similarity.}
\includegraphics[width=0.8\textwidth]{figures/glove.png}
%\end{figure}
%\begin{figure}[!htb]
\centering
\caption{\label{FigSIFT}1M SIFT features (128 dimensions), Euclidean distance.}
\includegraphics[width=0.8\textwidth]{figures/sift.png}
\end{figure}
Most search methods were implemented by Bileg(saikhan) Naidan and Leo(nid) Boytsov.\footnote{Leo(nid) Boytsov is a maintainer.}
Additional contributors are listed on the \href{https://github.com/searchivarius/NonMetricSpaceLib}{GitHub
page}.
Bileg and Leo gratefully acknowledge support by the iAd Center~\footnote{\url{https://web.archive.org/web/20160306011711/http://www.iad-center.com/}}
and the Open Advancement of Question Answering Systems (OAQA) group~\footnote{\url{http://oaqa.github.io/}}.
The code written by Bileg and Leo is distributed under the business-friendly \href{http://apache.org/licenses/LICENSE-2.0}{Apache License}.
However, some third-party contributions are licensed differently.
For more information regarding licensing and acknowledging the use of the library
resource, please refer to \S~\ref{SectionCredits}.
The design of the library was influenced by
and superficially resembles the design of the Metric Spaces Library \cite{LibMetricSpace}.
Yet our approach is different in many ways:
\begin{itemize}
\item We focus on approximate\footnote{
An approximate method may not
return a true nearest-neighbor or
all the points within a given query ball.}
search methods and non-metric spaces.
\item We simplify experimentation, in particular,
through automatically measuring and aggregating important parameters
related to speed, accuracy, index size, and index creation time.
In addition, we provide capabilities for testing in both single- and multi-threaded modes
to ensure that implemented solutions scale well with the number of available CPUs.
\item We care about overall efficiency and
aim to implement methods that have runtime comparable to an optimized production system.
\end{itemize}
Search methods for non-metric spaces are especially interesting.
In this domain, there are few sufficiently generic \emph{exact} search methods.
We may know very little about analytical properties of the distance
or the analytical representation may not be available at all (e.g., if the
distance is computed by a black-box device \cite{Skopal:2007}).
In many cases it is not possible to search exactly
and instead one has to resort to approximate search procedures.
This is why methods
are evaluated in terms of efficiency-effectiveness trade-offs
rather than merely in terms of their efficiency.
As mentioned previously, we believe that there is no ``one-size-fits-all'' search method.
Therefore, it is important to provide a variety of methods
each of which may work best for some specific classes of data.
Our commitment to efficiency affected several design decisions:
\begin{itemize}
\item The library is implemented in C++;
\item We focus on in-memory indices
and, thus, do not require all methods to materialize a disk-based version of an index
(this reduces programming effort).
\item
We provide efficient implementations of many distance functions,
which rely on Single Instruction Multiple Data (SIMD) CPU commands and/or
approximation of computationally intensive mathematical operations (see \S~\ref{SectionEfficiency}).
\end{itemize}
It is often possible to demonstrate a substantial reduction in the number of distance computations
compared to sequential searching.
However, such reductions may entail additional computations (i.e., extra book-keeping)
and do not always lead to improved overall performance \cite{Boytsov_and_Bilegsaikhan:sisap2013}.
To eliminate situations where book-keeping costs are ``masked''
by inefficiencies of the distance function,
we pay special attention to distance function efficiency.
\subsection{Problem Formulation}
Similarity search is an essential part of many applications,
which include, among others,
content-based retrieval of multimedia and statistical machine learning.
The search is carried out in a finite database of objects $\{o_i\}$,
using a search query $q$ and a dissimilarity measure (the term data point, or simply a point, is often
used as a synonym to denote either a data object or a query).
The dissimilarity measure is typically represented by a distance function $d(o_i, q)$.
The ultimate goal is to answer a query by retrieving a subset of database objects sufficiently similar to the query $q$.
These objects will be called \emph{answers}.
Note that we use the terms \emph{distance} and the \emph{distance function} in a broader sense than
some of the textbooks:
We do not assume that the distance is a true metric distance.
The distance function can disobey the triangle inequality and/or even be non-symmetric.
Two retrieval tasks are typically considered: a nearest neighbor and a range search.
The nearest neighbor search aims to find the least dissimilar object,
i.e., the object at the smallest distance from the query.
Its direct generalization is the $k$-nearest neighbor search (the \knn search),
which looks for the $k$ closest objects.
Given a radius $r$,
the range query retrieves all objects within a query ball (centered at the query object $q$) with the radius $r$,
or, formally, all the objects~$\lbrace o_i \rbrace$ such that $d(o_i, q) \le r$.
In generic spaces, the distance is not necessarily symmetric.
Thus, two types of queries can be considered.
In a \emph{left} query, the object is the left argument of the distance function,
while the query is the right argument.
In a \emph{right} query, $q$ is the first argument and the object is the second (i.e., the right) argument.
The queries can be answered either exactly,
i.e., by returning a complete result set that does not contain erroneous elements, or,
approximately, e.g., by finding only some answers.
Thus, the methods are evaluated in terms of efficiency-effectiveness trade-offs
rather than merely in terms of their efficiency.
One common effectiveness metric is recall. In the case
of the nearest neighbor search, it is computed as
the average fraction of true neighbors returned by the method.
If ground-truth judgements (produced by humans) are available,
it is also possible to compute the accuracy of \knn based classification
(see \S~\ref{SectionEffect}).
%In the current release, we focus on vector-space implementations,
%i.e., all the distance functions are defined over real-valued vectors.
%Note that this is not a principal limitation,
%because most methods do not access data objects
%directly.
%Instead, they rely only on distance values.
%In the future, we plan to add more complex spaces, in particular, string-based.
\section{Getting Started}
\subsection{What's new in version \LibVersion{} (major changes)}
\begin{itemize}
\item We have adopted a new method: a hierarchical (navigable) small-world graph (HNSW),
contributed by Yury Malkov \cite{Malkov2016}, see \S~\ref{SectionHNSW}.
\item We have improved performance of two core methods: SW-graph (\S~\ref{SectionSWGraph}) and NAPP (\S~\ref{SectionNAPP}).
\item We have written basic tuning guidelines for SW-graph, HNSW, and NAPP (see \S~\ref{SectionTuningGuide}).
\item We have modified the workflow of our benchmarking utility \ttt{experiment}
and improved handling of the gold standard data, see \S~\ref{SectionBenchEfficiency};
\item We have updated the API so that methods can save and restore indices, see \S~\ref{SectionCreateMethod}.
\item We have implemented a server, which can have clients in C++, Java, Python, and other languages supported by Apache Thrift, see \S~\ref{SectionQueryServer}.
\item We have implemented generic Python bindings that work for non-vector spaces, see \S~\ref{SectionPythonBind}.
\item Last, we retired older methods \ttt{permutation}, \ttt{permutation\_incsort}, and \ttt{permutation\_vptree}.
The latter two methods are superseded by \ttt{proj\_incsort} and \ttt{proj\_vptree}, respectively.
\end{itemize}
\subsection{Prerequisites}
NMSLIB was developed and tested on 64-bit Linux.
Yet, almost all the code can be built and run on 64-bit Windows (two notable exceptions are: LSHKIT and NN-Descent).
%It should also be possible to build the library on MAC OS.
Building the code requires a modern C++ compiler that supports \mbox{C++11}.
Currently, we support GNU C++ ($\ge4.7$), Intel compiler ($\ge14$),
Clang ($\ge3.4$),
and Visual Studio ($\ge12$)\footnote{One can use the free express version.}.
Under Linux, the build process relies on CMake.
Under Windows, one could use Visual Studio projects stored in the repository.
These projects are for Visual Studio 14 (2015).
However, they can be downgraded to work with Visual Studio 12 (see \S~\ref{SectionBuildWindows}).
%Note, however, that we do not have a portable code to measure memory consumption:
%This part will work only for Linux (with \href{https://en.wikipedia.org/wiki/Procfs}{PROCFS})
%and Windows.
More specifically, for Linux we require:
\begin{enumerate}
\item A \textbf{64-bit} distribution (Ubuntu \textbf{LTS} is recommended)
\item GNU C++ ($\ge4.7$), Intel Compiler ($\ge14$), Clang ($\ge3.4$)
\item CMake (GNU make is also required)
\item Boost (dev version $\ge48$, Ubuntu package \ttt{libboost1.48-all-dev} or newer \ttt{libboost1.54-all-dev})
\item GNU scientific library (dev version, Ubuntu package \ttt{libgsl0-dev})
\item Eigen (dev version, Ubuntu package \ttt{libeigen3-dev})
\end{enumerate}
For Windows, we require:
\begin{enumerate}
\item A \textbf{64-bit} distribution (we tested on Windows 8);
\item Visual Studio Express (or Professional) version 12 or later;
\item Boost is not required to build the core library and test utilities,
but it is necessary to build some applications,
including the main testing binary \ttt{experiment.exe} (see \S~\ref{SectionBuildWindows}).
\end{enumerate}
Efficient implementations of many distance functions (see \S~\ref{SectionEfficiency})
rely on SIMD instructions,
which operate on small vectors of integer or floating point numbers.
These instructions are available on most modern processors,
but we support only SIMD instructions available on recent Intel and AMD processors.
Each distance function has a pure C++ implementation,
which can be less efficient than an optimized SIMD-based implementation.
On Linux, SIMD-based implementations are activated automatically
for all sufficiently recent CPUs.
On Windows, only SSE2 is enabled by default\footnote{Because SSE2 is available on all 64-bit computers.}.
Yet, it is necessary to manually update project settings
to enable more recent SIMD extensions (see \S\ref{SectionBuildWindows}).
Scripts to generate and process data sets are written in Python.
We also provide a sample Python script to plot performance graphs: \href{\replocfile scripts/genplot_configurable.py}{genplot\_configurable.py} (see \S~\ref{SectionGenPlot}).
In addition to Python, this plotting script requires LaTeX and PGF.
\subsection{Installing C++11 Compilers}
Installing C++11 compilers can be tricky,
because they are not always provided as a standard package.
This is why we briefly review the installation process here.
In addition, installing compilers does
not necessarily make them default compilers.
One way to fix this on Linux is to set the environment variables \ttt{CXX} and \ttt{CC}.
For the GNU 4.7 compiler:
\begin{verbatim}
export CXX=g++-4.7 CC=gcc-4.7
\end{verbatim}
For the Clang compiler:
\begin{verbatim}
export CXX=clang++-3.4 CC=clang-3.4
\end{verbatim}
For the Intel compiler:
\begin{verbatim}
export CXX=icc CC=icc
\end{verbatim}
Perhaps the easiest way to obtain Visual Studio is to download
it from the \href{https://www.microsoft.com/en-us/download/default.aspx}{Microsoft web-site}.
We were able to build and run the 64-bit code using the free distribution of Visual Studio Express 14 (also called
\ttt{Express 2015}) and Visual Studio 12 (Express 2013). The professional (and expensive) version of Visual Studio is not required.
To install GNU C++ version 4.7 on newer Linux distributions (in particular, on Ubuntu 14) with the Debian package management system,
one can simply type:
\begin{verbatim}
sudo apt-get install gcc-4.7 g++-4.7
\end{verbatim}
At the time of this writing, GNU C++ 4.8 was available as a standard package as well.
However, this would not work on older Linux distributions.
One may need to use an experimental repository as follows:
\begin{verbatim}
sudo add-apt-repository ppa:ubuntu-toolchain-r/test
sudo apt-get update
sudo apt-get install gcc-4.7 g++-4.7
\end{verbatim}
If the script \ttt{add-apt-repository} is missing, it can be installed as follows:
\begin{verbatim}
sudo apt-get install python-software-properties
\end{verbatim}
More details can be found on the \href{http://askubuntu.com/questions/113291/how-do-i-install-gcc-4-7}{AskUbuntu web-site.}
Similarly to the GNU C++ compiler, to install a C++11 version of Clang on newer Debian-based distributions one can simply type:
\begin{verbatim}
sudo apt-get install clang-3.6
\end{verbatim}
However, for older distributions one may need to add a non-standard repository.
For Debian and Ubuntu distributions, it is easiest to add repositories from \href{http://llvm.org/apt/}{the LLVM web-site}.
For example, if you have Ubuntu 12 (Precise), you need to add repositories
as follows:\footnote{Do not forget to remove \ttt{deb-src} for source repositories.
See the discussion \href{http://askubuntu.com/questions/160511/why-does-add-apt-repository-fail-to-add-source-repositories}{here for more details.}}
\begin{verbatim}
sudo add-apt-repository \
"deb http://llvm.org/apt/precise/ llvm-toolchain-precise main"
sudo add-apt-repository \
"http://llvm.org/apt/precise/ llvm-toolchain-precise main"
sudo add-apt-repository \
"deb http://llvm.org/apt/precise/ llvm-toolchain-precise-3.4 main"
sudo add-apt-repository \
"http://llvm.org/apt/precise/ llvm-toolchain-precise-3.4 main"
sudo add-apt-repository \
"deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu \
precise main"
\end{verbatim}
Then, Clang 3.4 (and LLDB debugger) can be installed by typing:
\begin{verbatim}
sudo apt-get install clang-3.4 lldb-3.4
\end{verbatim}
The Intel compiler can be freely used for non-commercial purposes by some categories of users (students, academics,
and active open-source contributors).
It is a part of C++ Composer XE for Linux and can
be obtained \href{http://software.intel.com/en-us/non-commercial-software-development}{from the Intel web site}.
After downloading and running an installation script, one needs to set environment variables.
If the compiler is installed to the folder \ttt{/opt/intel}, environment variables
are set by a script as follows:
\begin{verbatim}
/opt/intel/bin/compilervars.sh intel64
\end{verbatim}
\subsection{Quick Start on Linux}\label{QuickStartLinux}
To build the project, go to the directory \href{\replocdir similarity_search}{similarity\_search} and type:
\begin{verbatim}
cmake .
make
\end{verbatim}
This creates several binaries in the directory \ttt{similarity\_search/release},
most importantly,
a benchmarking utility \ttt{experiment}, which carries out experiments,
and testing utilities \ttt{bunit}, \ttt{test\_integr}, and \ttt{bench\_distfunc}.
A more detailed description of the build process on Linux is given in \S~\ref{SectionBuildLinux}.
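For example, after a successful build one can quickly check that the core library works by running the unit-test binary from the directory \ttt{similarity\_search} (the single argument is an optional log-file name; the name used below is arbitrary):
\begin{verbatim}
release/bunit bunit.log
\end{verbatim}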
\subsection{Query Server (Linux-only)}\label{SectionQueryServer}
The query server requires Apache Thrift. We used Apache Thrift 0.9.2, but, perhaps, newer versions will work as well.
To install Apache Thrift, you need to \href{https://thrift.apache.org/docs/BuildingFromSource}{build it from source}.
This may require additional libraries. On Ubuntu they can be installed as follows:
\begin{verbatim}
sudo apt-get install libboost-dev libboost-test-dev \
libboost-program-options-dev libboost-system-dev \
libboost-filesystem-dev libevent-dev \
automake libtool flex bison pkg-config \
g++ libssl-dev libboost-thread-dev make
\end{verbatim}
To simplify the building process of Apache Thrift, one can disable unnecessary languages:
\begin{verbatim}
./configure --without-erlang --without-nodejs --without-lua \
--without-php --without-ruby --without-haskell \
--without-go --without-d
\end{verbatim}
After Apache Thrift is installed, you need to build the library itself. Then, change the directory
to \href{\replocfile query_server/cpp_client_server}{query\_server/cpp\_client\_server} and type \ttt{make} (the makefile
may need to be modified, if Apache Thrift is installed to a non-standard location).
The query server has a similar set of parameters to the benchmarking utility \ttt{experiment}. For example,
you can start the server as follows:
\begin{verbatim}
./query_server -i ../../sample_data/final8_10K.txt -s l2 -p 10000 \
-m sw-graph -c NN=10,efConstruction=200,initIndexAttempts=1
\end{verbatim}
There are also three sample clients implemented in \href{\replocfile query_server/cpp_client_server}{C++}, \href{\replocfile query_server/python_client/}{Python},
and \href{\replocfile query_server/java_client/}{Java}.
A client reads a string representation of the query object from the standard stream.
The format is the same as the format of objects in a data file.
Here is an example of searching for ten vectors closest to the first data set vector (stored in row one) of a provided sample data file:
\begin{verbatim}
export DATA_FILE=../../sample_data/final8_10K.txt
head -1 $DATA_FILE | ./query_client -p 10000 -a localhost -k 10
\end{verbatim}
It is also possible to generate client classes for other languages supported by Thrift from
\href{\replocfile query_server/protocol.thrift}{the interface definition file}, e.g., for C\#. To this end, one should invoke the thrift compiler as follows:
\begin{verbatim}
thrift --gen csharp protocol.thrift
\end{verbatim}
For instructions on using generated code, please consult the \href{https://thrift.apache.org/tutorial/}{Apache Thrift tutorial}.
\subsection{Python bindings (Linux-only)}\label{SectionPythonBind}
We provide basic Python bindings (for Linux and Python 2.7).
%\href{https://github.com/erikbern/ann-benchmarks}{One public evaluation of our library using these bindings was carried out by Erik Bernhardsson}.
To build bindings for dense vector spaces (see \S~\ref{SectionSpaces} for the description of spaces), build the library first.
Then, change the directory to \ttt{python\_vect\_bindings} and type:
\begin{verbatim}
make
sudo make install
\end{verbatim}
For an example of using our library in Python, see the script \href{\replocfile python_vect_bindings/test_nmslib_vect.py}{test\_nmslib\_vect.py}.
Generic vector spaces are supported as well (see \href{\replocfile python_gen_bindings}{python\_gen\_bindings}). However, they work only for spaces
that properly define serialization and de-serialization (see a brief description in \S~\ref{SectionCreateSpace}).
\subsection{Quick Start on Windows}\label{QuickStartWindows}
Building on Windows is straightforward.
Download \href{https://www.visualstudio.com/en-us/downloads/download-visual-studio-vs.aspx}{Visual Studio 2015 Express for Desktop}.
Download and install respective \href{http://sourceforge.net/projects/boost/files/boost-binaries/1.59.0/boost_1_59_0-msvc-14.0-64.exe/download}{Boost binaries}. Please, use the \textbf{default} installation directory on disk \ttt{c:} (otherwise, it will be necessary to update project files).
Afterwards, one can simply use the provided \href{\replocfile similarity_search/NonMetricSpaceLib.sln}{Visual Studio
solution file}.
The solution file references several project (*.vcxproj) files:
\href{\replocfile similarity_search/src/NonMetricSpaceLib.vcxproj}{NonMetricSpaceLib.vcxproj}
is the main project file that is used to build the library itself.
The output is stored in the folder \ttt{similarity\_search\textbackslash x64}.
A more detailed description of the build process on Windows is given in \S~\ref{SectionBuildWindows}.
Note that the core library, the test utilities,
as well as examples of the standalone applications (projects \ttt{sample\_standalone\_app1}
and \ttt{sample\_standalone\_app2})
can be built without installing Boost.
\section{Building and running the code (in detail)}
The build process creates several important binaries, which include:
\begin{itemize}
\item NMSLIB library (on Linux \ttt{libNonMetricSpaceLib.a}),
which can be used in external applications;
\item The main benchmarking utility \ttt{experiment} (\ttt{experiment.exe} on Windows)
that carries out experiments and saves evaluation results;
\item A tuning utility \ttt{tune\_vptree} (\ttt{tune\_vptree.exe} on Windows)
that finds optimal VP-tree parameters (see \S~\ref{SectionVPtree} and our paper for details \cite{Boytsov_and_Bilegsaikhan:nips2013});
\item A semi unit test utility \ttt{bunit} (\ttt{bunit.exe} on Windows);
\item A utility \ttt{bench\_distfunc} that carries out integration tests (\ttt{bench\_distfunc.exe} on Windows);
\end{itemize}
The build process is different under Linux and Windows.
In the following sections, we consider these differences in more detail.
\subsection{Building under Linux}\label{SectionBuildLinux}
Implementation of similarity search methods is in the directory \ttt{similarity\_search}.
The code is built using \ttt{cmake}, which works on top of GNU make.
Before creating the makefiles, we need to ensure that the right compiler is used.
This is done by setting two environment variables: \ttt{CXX} and \ttt{CC}.
In the case of GNU C++ (version 4.7), you may need to type:
\begin{verbatim}
export CXX=g++-4.7 CC=gcc-4.7
\end{verbatim}
In the case of the Intel compiler, you may need to type:
\begin{verbatim}
export CXX=icc CC=icc
\end{verbatim}
If you do not set variables \ttt{CXX} and \ttt{CC},
the default C++ compiler is used (which can be fine, if it is the right compiler already).
To create makefiles for a release version of the code, type:
\begin{verbatim}
cmake -DCMAKE_BUILD_TYPE=Release .
\end{verbatim}
If you did not create any makefiles before, you can use a shortcut by typing:
\begin{verbatim}
cmake .
\end{verbatim}
To create makefiles for a debug version of the code, type:
\begin{verbatim}
cmake -DCMAKE_BUILD_TYPE=Debug .
\end{verbatim}
When makefiles are created, just type:
\begin{verbatim}
make
\end{verbatim}
If \ttt{cmake} complains about the wrong version of GCC,
it is most likely that you forgot to set the environment variables \ttt{CXX} and \ttt{CC} (as described above).
If this is the case, make these variables point to the correct version of the compiler.
\textbf{Important note:}
do not forget to delete the \ttt{cmake} cache and makefiles before recreating the makefiles.
For example, you can do the following (assuming the current directory is \ttt{similarity\_search}):
\begin{verbatim}
rm -rf `find . -name CMakeFiles` CMakeCache.txt
\end{verbatim}
Also note that, for some reason, \ttt{cmake} might sometimes ignore the environment variables \ttt{CXX} and \ttt{CC}.
In this unlikely case, you can specify the compiler directly through \ttt{cmake} arguments.
For example, in the case of the GNU C++ and the \ttt{Release} build,
this can be done as follows:
\begin{verbatim}
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=g++-4.7 \
      -DCMAKE_C_COMPILER=gcc-4.7 .
\end{verbatim}
The build process creates several binaries. Most importantly,
the main benchmarking utility \ttt{experiment}.
The directory \ttt{similarity\_search/release} contains release versions of
these binaries. Debug versions are placed into the folder \ttt{similarity\_search/debug}.
\textbf{Important note:} a shortcut command:
\begin{verbatim}
cmake .
\end{verbatim}
(re)-creates makefiles for the previously
created build. When you type \ttt{cmake .} for the first time,
it creates release makefiles. However, if you create debug
makefiles and then type \ttt{cmake .},
this will not lead to creation of release makefiles!
If the user cannot install necessary libraries to a standard location,
it is still possible to build a project.
First, download Boost to some local directory.
Assume it is \ttt{\$HOME/boost\_download\_dir}.
Then, set the corresponding environment variable, which will inform \ttt{cmake}
about the location of the Boost files:
\begin{verbatim}
export BOOST_ROOT=$HOME/boost_download_dir
\end{verbatim}
Second, the user needs to install the additional libraries. Assume that the
lib-files are installed to \ttt{\$HOME/local\_lib}, while corresponding include
files are installed to \ttt{\$HOME/local\_include}. Then, the user
needs to invoke \ttt{cmake} with the following arguments (after possibly deleting previously
created cache and makefiles):
\begin{verbatim}
cmake . -DCMAKE_LIBRARY_PATH=$HOME/local_lib \
-DCMAKE_INCLUDE_PATH=$HOME/local_include \
-DBoost_NO_SYSTEM_PATHS=true
\end{verbatim}
Note the last option. Sometimes, an old version of Boost is installed.
Setting the variable \ttt{Boost\_NO\_SYSTEM\_PATHS} to true
tells \ttt{cmake} to ignore such an installation.
To use the library in external applications, which do not belong to the library repository,
one needs to install the library first.
Assume that an installation location is the folder \ttt{NonMetrLibRelease} in the
home directory. Then, the following commands do the trick:
\begin{verbatim}
cmake \
-DCMAKE_INSTALL_PREFIX=$HOME/NonMetrLibRelease \
-DCMAKE_BUILD_TYPE=Release .
make install
\end{verbatim}
A directory \href{\replocfile sample_standalone_app}{sample\_standalone\_app}
contains two sample programs (see files
\href{\replocfile sample_standalone_app/sample_standalone_app1.cc}{sample\_standalone\_app1.cc}
and
\href{\replocfile sample_standalone_app/sample_standalone_app2.cc}{sample\_standalone\_app2.cc})
that use NMSLIB binaries installed in the folder \ttt{\$HOME/NonMetrLibRelease}.
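If one prefers to compile a standalone program manually rather than via the provided build files, the command below is a minimal sketch. It assumes (hypothetically) that the installation prefix \ttt{\$HOME/NonMetrLibRelease} contains \ttt{include} and \ttt{lib} subdirectories and that the application source is a file \ttt{my\_app.cc}; depending on the configuration, additional libraries (e.g., the GNU scientific library) may also have to be linked:
\begin{verbatim}
g++ -std=c++11 -O3 my_app.cc \
    -I$HOME/NonMetrLibRelease/include \
    -L$HOME/NonMetrLibRelease/lib \
    -lNonMetricSpaceLib -o my_app
\end{verbatim}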
\subsubsection{Developing and Debugging on Linux}
There are several debuggers that can be employed.
Among them, some of the most popular are \ttt{gdb} (a command line tool)
and \ttt{ddd} (a GUI wrapper for gdb).
For users who prefer IDEs, one good and free option is Eclipse IDE for C/C++ developers.
It is not the same as Eclipse for Java and one needs
to \href{http://www.eclipse.org/ide/}{download
this version of Eclipse separately}.
An even better option is, perhaps, \href{https://www.jetbrains.com/clion/}{CLion}. However, it is not free.\footnote{There are
a few categories of people, including students, who can ask for a free license, though.}
\begin{figure}
\centering
\caption{\label{FigEclipse1}Selecting an existing project to import}
\includegraphics[width=0.9\textwidth]{figures/Eclipse1.pdf}
\end{figure}
After downloading and decompressing, e.g. as follows:
\begin{verbatim}
tar -zxvf eclipse-cpp-europa-winter-linux-gtk-x86_64.tar.gz
\end{verbatim}
one can simply run the binary \ttt{eclipse} (in a
newly created directory \ttt{eclipse}).
On the first start, Eclipse will ask you to select a workspace location.
This would be the place to store the project metadata and (optionally)
actual project source files.
The following description is given for Eclipse Europa.
It may be a bit different with newer versions of Eclipse.
\begin{figure}
\centering
\caption{\label{FigEclipse2}Importing an existing project}
\includegraphics[width=0.9\textwidth]{figures/Eclipse2.pdf}
\end{figure}
After selecting the workspace, the user can import the Eclipse project
stored in the GitHub repository.
Go to the menu \ttt{File}, sub-menu \ttt{Import}, category \ttt{General}
and choose to import
an existing project into the workspace as shown in Fig.~\ref{FigEclipse1}.
After that select a root directory. To this end,
go to the directory where you checked out the contents
of the GitHub repository and enter a sub-directory \ttt{similarity\_search}.
You should now be able to see the project \ttt{Non-Metric-Space-Library}
as shown in Fig~\ref{FigEclipse2}.
You can now finalize the import by pressing the button \ttt{Finish}.
\begin{figure}
\centering
\caption{\label{FigEclipse3}Enabling indexing of the source code}
\includegraphics[width=0.9\textwidth]{figures/Eclipse3.pdf}
\end{figure}
Next, we need to set some useful settings.
Most importantly, we need to enable indexing of source files.
This would allow us to browse class hierarchies, as well as find declarations
of variables or classes.
To this end, go to the menu \ttt{Window}, sub-menu \ttt{Preferences}
and select a category \ttt{C++/indexing} (see Fig.~\ref{FigEclipse3}).
Then, check the box \ttt{Index all files}.
Eclipse will start indexing your files
with the progress being shown in the status bar (lower right corner).
The user can also change the editor settings.
We strongly encourage the user to disable the use of tabs.
Again, go to the menu \ttt{Window}, sub-menu \ttt{Preferences},
and select the category \ttt{General/Editors/Text Editors}.
Then, check the box \ttt{Insert spaces for tabs}.
In the same menu, you can also change the fonts (use the
category \ttt{General/Appearance/Colors and Fonts}).
In a newer Eclipse version, disabling tabs is done differently.
To this end, go to the menu \ttt{Window}, sub-menu \ttt{Preferences},
and select a category \ttt{C++/Code Style/Formatter}.
Then, you need to create a new profile and make this profile active.
In the profile, change the tab policy to \ttt{Spaces only}.
It is possible to build the project from Eclipse (see the menu \ttt{Project}).
However, one first needs to generate makefiles as described in \S~\ref{SectionBuildLinux}.
The current limitation is that you can build either the release
or the debug version at a time.
Moreover, to switch from one version to another, you need to recreate
the makefiles from the command line.
\begin{figure}
\centering
\caption{\label{FigDebugConf}Creating a debug/run configuration}
\includegraphics[width=0.9\textwidth]{figures/EclipseDebugConf.pdf}
\end{figure}
After building you can debug the project.
To do this, you need to create a debug configuration.
As an example, one configuration can be found in the
project folder \ttt{launches}.
Right click on the item \ttt{sample.launch},
choose the option \ttt{Debug as} (in the drop-down menu),
and click on \ttt{sample} (in the pop-up menu).
Do not forget to edit command line arguments before you actually debug the application!
After switching to a debug perspective,
Eclipse may stop the debugger in
the file \ttt{dl-debug.c} as shown in Fig.~\ref{FigDebug}.
If this happens, simply press the continue icon a couple of
times until the debugger enters the code belonging to the library.
Additional configurations can be created by right clicking
on the project name (left pane), selecting \ttt{Properties}
in the pop-up menu and clicking on \ttt{Run/Debug settings}.
The respective screenshot is shown in Fig.~\ref{FigDebugConf}.
\begin{figure}
\centering
\caption{\label{FigDebug}Starting a debugger}
\includegraphics[width=0.9\textwidth]{figures/EclipseDebug.pdf}
\end{figure}
Note that this manual contains only a basic introduction to Eclipse.
If the user is new to Eclipse, we recommend reading
additional \href{http://www.eclipse.org/ide/}{documentation available online}.
\subsection{Building under Windows}\label{SectionBuildWindows}
\begin{figure}
\centering
\caption{\label{FigAVXSet}Enabling advanced SIMD Instructions in the Visual Studio}
\includegraphics[width=0.9\textwidth]{figures/SettingAVXinVS2012.pdf}
\end{figure}
Download \href{https://www.visualstudio.com/en-us/downloads/download-visual-studio-vs.aspx}{Visual Studio 2015 Express for Desktop}.
Download and install respective \href{http://sourceforge.net/projects/boost/files/boost-binaries/1.59.0/boost_1_59_0-msvc-14.0-64.exe/download}{Boost binaries}. Please, use the \textbf{default} installation directory on disk \ttt{c:}.
At the end of this section, we explain how to select a different location of the Boost files,
as well as how to downgrade the project to build it with Visual Studio 2013 (if this is really necessary).
After downloading Visual Studio and installing Boost (version 59, 64-bit binaries, for MSVC-14), it is straightforward to build the project
using the provided \href{\replocfile similarity_search/NonMetricSpaceLib.sln}{Visual Studio
solution file}.
The solution file references several (sub)-project (\ttt{*.vcxproj}) files,
which can be built either separately or all together.
The main sub-project is \ttt{NonMetricSpaceLib}, which is built before any other sub-projects.
Sub-projects:
\ttt{sample\_standalone\_app1},
\ttt{sample\_standalone\_app2}
are examples of using the library in a standalone mode.
Unlike building under Linux, we provide no installation procedure yet.
In a nutshell, the installation consists of copying the library binary
as well as \href{\replocdir similarity_search/include}{the directory} with header files.
There are three possible configurations for the binaries:
\ttt{Release}, \ttt{Debug}, and \ttt{RelWithDebInfo} (release with debug information).
The corresponding output files are placed into the subdirectories:
\begin{verbatim}
similarity_search\x64\Release,
similarity_search\x64\Debug,
similarity_search\x64\RelWithDebInfo.
\end{verbatim}
Unlike other compilers, there seems to be no way to detect the CPU type in Visual Studio automatically.\footnote{It is also not possible to opt for using \href{http://en.wikipedia.org/wiki/SSE4}{only SSE4}.}
And, by default, only SSE2 is enabled (because it is supported by all 64-bit CPUs).
Therefore, if the user's CPU supports \href{https://en.wikipedia.org/wiki/Advanced_Vector_Extensions}{AVX extensions}, it is recommended to modify code generation settings as shown in the screenshot in Fig.~\ref{FigAVXSet}.
This should be done for \textbf{all} sub-projects and \textbf{all} binary configurations.
Note that you can set a property for all projects at once, if you select all the sub-projects, right-click, and then choose \ttt{Properties}
in the pop-up menu.
\begin{figure}
\centering
\caption{\label{FigBoostSet}Specifying Location of Boost libraries}
\includegraphics[width=0.9\textwidth]{figures/SettingBoostLocation.pdf}
\end{figure}
The core library, the semi unit test binary as well as examples of the standalone applications
can be built without installing Boost.
However, Boost libraries are required for the binaries \ttt{experiment.exe}, \ttt{tune\_vptree.exe}, and \ttt{test\_integr.exe}.
We reiterate that one needs 64-bit Boost binaries compiled with the same version of Visual Studio as the NMSLIB binaries.
If you \href{http://sourceforge.net/projects/boost/files/boost-binaries/1.59.0/boost_1_59_0-msvc-14.0-64.exe/download}{download the installer for Boost 59} and install it to a default location, then you do not have to change project files.
Should you install Boost into a different folder,
the location of Boost binaries and header files needs to be specified in the
project settings for all three build configurations (\ttt{Release}, \ttt{Debug}, \ttt{RelWithDebInfo}). An example of specifying the location of Boost libraries (binaries) is given in Fig.~\ref{FigBoostSet}.
In the unlikely case that the user has to use the older Visual Studio 12, the project files need to be downgraded.
To do so, one has to manually edit every \ttt{*.vcxproj} file by replacing
each occurrence of \ttt{<PlatformToolset>v140</PlatformToolset>} with \ttt{<PlatformToolset>v120</PlatformToolset>}.
Additionally, one has to download Boost binaries compatible with the older Visual studio and modify the project files
accordingly.
In particular, one may need to modify options \ttt{Additional Include Directories} and \ttt{Additional Library Directories}.
\subsection{Testing the Correctness of Implementations}
We have two main testing utilities \ttt{bunit} and \ttt{test\_integr} (\ttt{bunit.exe} and
\ttt{test\_integr.exe} on Windows).
Both utilities accept a single optional argument: the name of a log file.
If the log file is not specified, a lot of informational messages are printed to the screen.
The utility \ttt{bunit} verifies some basic functionality akin to unit testing.
In particular, it checks that an optimized version of a distance function (e.g., the Euclidean distance)
returns results that are very similar to the results returned by the unoptimized, simpler version.
The utility \ttt{bunit} is expected to always run without errors.
The utility \ttt{test\_integr} runs complete implementations of many methods
and checks if several effectiveness and efficiency characteristics
meet the expectations.
The expectations are encoded as an array of instances of the class \ttt{MethodTestCase}
(see \href{\replocdir similarity_search/test/test_integr.cc#L65}{the code here}).
For example, we expect that the recall (see \S~\ref{SectionEffect})
falls in a certain pre-recorded range.
Because almost all our methods are randomized, there is a great deal of variance
in the observed performance characteristics. Thus, some tests
may fail infrequently, if e.g., the actual recall value is slightly lower or higher
than an expected minimum or maximum.
From an error message, it should be clear if the discrepancy is substantial, i.e.,
something went wrong, or not, i.e., we observe an unlikely outcome due to randomization.
The exact search method, however, should always have an almost perfect recall.
Variance is partly due to using low-dimensional test sets. In the future, we plan to change this.
For high-dimensional data sets, the outcomes are much more stable despite the randomized nature
of most implementations.
Finally, one is encouraged to run a small-scale test using the script
\href{\replocfile scripts/test_run.sh}{test\_run.sh} on Linux (\href{\replocfile scripts/test_run.bat}{test\_run.bat} on Windows).
The test includes several important methods, which can be very efficient.
Both the Linux and the Windows scripts expect some dense vector space file (see \S~\ref{SectionSpaces} and \S~\ref{SectionDatasets}) and the number of threads
as the first two parameters. The Linux script has an additional third parameter.
If the user specifies 1, the software creates a plot (see \S~\ref{SectionGenPlot}). On Linux, the user can verify
that for the Colors data set \cite{LibMetricSpace} the plot looks as follows:
\begin{figure}[h]
\centering
\includegraphics[width=0.75\textwidth]{figures/test_run.pdf}
\end{figure}
On Windows, the script \ttt{test\_run.bat} creates only the data (\ttt{output\_file\_K\=10.dat})
and the report file (\ttt{output\_file\_K\=10.rep}). The report file can be checked manually.
The data file can be used to create a plot via Excel or Google Docs.
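For instance, assuming the command is issued from the repository root and the sample data file is used, the Linux test script could be invoked as follows (the thread count is illustrative; the trailing \ttt{1} requests plot generation):
\begin{verbatim}
scripts/test_run.sh sample_data/final8_10K.txt 4 1
\end{verbatim}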
\subsection{Running Benchmarks}\label{SectionRunBenchmark}
There are no principal differences in benchmarking on Linux and Windows.
There is a \emph{single} benchmarking utility
\ttt{experiment} (\ttt{experiment.exe} on Windows) that includes implementation of all methods.
It has multiple options, which specify, among others,
a space, a data set, a type of search, a method to test, and method parameters.
These options and their use cases are described in the following subsections.
Note that, unlike older versions, it is not possible to test multiple methods in a single run.
However, it is possible to test a single method with different values of query-time parameters.
\subsubsection{Space and distance value type}
A distance function can return an integer (\ttt{int}), a single-precision (\ttt{float}),
or a double-precision (\ttt{double}) real value.
The space type and the distance value type are specified as follows:
\begin{verbatim}
-s [ --spaceType ] arg space type, e.g., l1, l2, lp:p=0.25
--distType arg (=float) distance value type:
int, float, double
\end{verbatim}
A description of a space may contain parameters (parameters may not contain whitespaces).
In this case, there is a colon after the space mnemonic name, followed by a
comma-separated (no spaces) list of parameters in the format:
\ttt{<parameter name>=<parameter value>}.
Currently, this is used only for $L_p$ spaces. For instance,
\ttt{lp:p=0.5} denotes the space $L_{0.5}$.
A detailed list of possible spaces and respective
distance functions is given in Table~\ref{TableSpaces} in \S~\ref{SectionSpaces}.
For real-valued distance functions, one can use either the single- or the double-precision
type. Single precision is the recommended default.
One example of an integer-valued distance function is the Levenshtein distance.
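To illustrate, the following combination of options (the parameter value is merely illustrative) selects the space $L_{0.5}$ together with the single-precision distance value type:
\begin{verbatim}
-s lp:p=0.5 --distType float
\end{verbatim}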
\subsubsection{Input Data/Test Set}
There are two options that define the data to be indexed:
\begin{verbatim}
-i [ --dataFile ] arg input data file
-D [ --maxNumData ] arg (=0) if non-zero, only the first
maxNumData elements are used
\end{verbatim}
The input file can be indexed either completely, or partially.
In the latter case, the user can create the index using only
the first \ttt{--maxNumData} elements.
For testing, the user can use a separate query set.
It is, again, possible to limit the number of queries:
\begin{verbatim}
-q [ --queryFile ] arg query file
-Q [ --maxNumQuery ] arg (=0) if non-zero, use maxNumQuery query
elements(required in the case
of bootstrapping)
\end{verbatim}
If a separate query set is not available, it can be simulated by bootstrapping.
To this end, the \ttt{--maxNumData} elements of the original data set
are randomly divided into testing and indexable sets.
The number of queries in this case is defined by the option \ttt{--maxNumQuery}.
The number of bootstrap iterations is specified through the option:
\begin{verbatim}
-b [ --testSetQty ] arg (=0) # of sets created by bootstrapping;
\end{verbatim}
Benchmarking can be carried out in either a single- or a multi-threaded
mode. The number of test threads is specified as follows:
\begin{verbatim}
--threadTestQty arg (=1) # of threads
\end{verbatim}
\subsubsection{Query Type}
Our framework supports the \knn and the range search.
The user can request to run both types of queries:
\begin{verbatim}
-k [ --knn ] arg comma-separated values of k
for the k-NN search
-r [ --range ] arg comma-separated radii for range search
\end{verbatim}
For example, by specifying the options
\begin{verbatim}
--knn 1,10 --range 0.01,0.1,1
\end{verbatim}
the user requests to run queries of five different types: $1$-NN, $10$-NN,
as well as three range queries with radii 0.01, 0.1, and 1.
\subsubsection{Method Specification}
Unlike older versions, it is possible to test only a single method at a time.
To specify a method's name, use the following option:
\begin{verbatim}
-m [ --method ] arg method/index name
\end{verbatim}
A method can have a single set of index-time parameters, which is specified
via:
\begin{verbatim}
-c [ --createIndex ] arg index-time method(s) parameters
\end{verbatim}
In addition to the set of index-time parameters,
the method can have multiple sets of query-time parameters, which are specified
using the following (possibly repeating) option:
\begin{verbatim}
-t [ --queryTimeParams ] arg query-time method(s) parameters
\end{verbatim}
For each set of query-time parameters, i.e., for each occurrence of the option \ttt{--queryTimeParams},
the benchmarking utility \ttt{experiment}
carries out an evaluation using the specified set of queries and a query type (e.g., a 10-NN search with
queries from a specified file).
If the user does not specify any query-time parameters, there is only one evaluation to be carried out.
This evaluation uses default query-time parameters.
In general, we ensure that \emph{whenever a query-time parameter is not specified, the default value is used}.
Similarly to the space parameters,
a set of method parameters is a comma-separated list (no spaces)
of parameter-value pairs in the format:
\ttt{<parameter name>=<parameter value>}.
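Putting the above options together, a complete invocation of the benchmarking utility might look as follows. It assumes that the current directory is \ttt{similarity\_search} and that the sample data file resides in \ttt{sample\_data} at the repository root; the method and its index-time parameters are taken from the query-server example in \S~\ref{SectionQueryServer}, while the numbers of bootstrap sets and queries, as well as the values of $k$, are illustrative:
\begin{verbatim}
release/experiment -i ../sample_data/final8_10K.txt -s l2 \
    -b 1 -Q 500 -k 1,10 \
    -m sw-graph -c NN=10,efConstruction=200,initIndexAttempts=1
\end{verbatim}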