Skip to content

Commit

Permalink
complete cloriSearch skeleton
Browse files Browse the repository at this point in the history
  • Loading branch information
shpilu committed Dec 11, 2018
1 parent 2fc5ce9 commit 2532d93
Show file tree
Hide file tree
Showing 90 changed files with 17,297 additions and 506 deletions.
30 changes: 30 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#
# cloriSearch cmake
# require cmake >= 2.8.10 && gcc >= 4.8
# Copyright (C) 2018 James Wei (weijianlhp@163.com). All rights reserved
#
# Policy baseline for the whole build; must come before project().
cmake_minimum_required(VERSION 2.8.10)
project(clorisearch CXX)

# Library version. CLORISEARCH_VERSION is the name substituted into
# cmake/clorisearch.pc.in; CLORICONF_VERSION is kept as an alias so any
# existing reference to the old name keeps working.
set(CLORISEARCH_VERSION 1.0.0)
set(CLORICONF_VERSION ${CLORISEARCH_VERSION})

if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  # The sources are built as C++11, which needs at least GCC 4.8.
  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
    message(FATAL_ERROR "GCC is too old, please install a newer version supporting C++11")
  endif()
else()
  # Other compilers are allowed, but the user is told they are on their own.
  message(WARNING "You are using an unsupported compiler! Compilation has only been tested with GCC")
endif()

# Build options (override with -D<NAME>=ON|OFF on the cmake command line).
option(DEBUG "Print debug logs" OFF)
option(WITH_DEBUG_SYMBOLS "With debug symbols" ON)

# Install prefix: only replace CMake's built-in default, never a prefix the
# user passed explicitly with -DCMAKE_INSTALL_PREFIX=...
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
  set(CMAKE_INSTALL_PREFIX "/usr/local/clorisearch" CACHE PATH "clorisearch install prefix" FORCE)
endif()

# All targets are defined in src/CMakeLists.txt.
add_subdirectory(src)

11 changes: 11 additions & 0 deletions cmake/clorisearch.pc.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# pkg-config metadata template for clorisearch, filled in by CMake.
# Every substituted value uses an at-sign placeholder, so the output is the
# same whether or not the template is configured in at-only mode.
prefix=@CMAKE_INSTALL_PREFIX@
includedir=@CMAKE_INSTALL_PREFIX@/include
libdir=@CMAKE_INSTALL_PREFIX@/lib

Name: clorisearch
Description: An ad retrieval engine
Version: @CLORISEARCH_VERSION@
Cflags: -I@CMAKE_INSTALL_PREFIX@/include
Libs: -L@CMAKE_INSTALL_PREFIX@/lib -lclorisearch
Libs.private: @CLORISEARCH_PRIVATE_LIBS@

6 changes: 6 additions & 0 deletions cmake/def.h.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Build-time configuration header for cloriSearch.
// This file is a template: the build copies it through CMake's
// configure_file() to <build>/output/include/internal/def.h.
#ifndef CLORIS_CLORISEARCH_DEF_H_
#define CLORIS_CLORISEARCH_DEF_H_

// Placeholder for an optional ENABLE_JSON feature switch; currently disabled.
// #cmakedefine ENABLE_JSON

#endif // CLORIS_CLORISEARCH_DEF_H_
133 changes: 133 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#
# cloriSearch cmake
# require cmake >= 2.8.10 && gcc >= 4.8
# Copyright (C) 2018 James Wei (weijianlhp@163.com). All rights reserved
#

# Emit debug information when the WITH_DEBUG_SYMBOLS option is on.
if(WITH_DEBUG_SYMBOLS)
  set(DEBUG_SYMBOL "-g")
endif()

# Generate the build-configuration header into the staged include tree.
configure_file(${PROJECT_SOURCE_DIR}/cmake/def.h.in ${PROJECT_BINARY_DIR}/output/include/internal/def.h @ONLY)

# Make the project's own cmake/ directory searchable without discarding any
# module path supplied by the user or a parent project.
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

# TODO(ENABLE_JSON): optional RapidJSON support is not wired up yet. When it
# is, do find_package(RAPIDJSON REQUIRED) here and honor a custom search path
# passed as -DRAPIDJSON_PATH=<dir> via include_directories.

# Header search path. PROJECT_SOURCE_DIR (rather than CMAKE_SOURCE_DIR) keeps
# the paths correct when this project is pulled in with add_subdirectory().
include_directories(
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${PROJECT_SOURCE_DIR}/src
  ${PROJECT_SOURCE_DIR}/src/third_party
  ${PROJECT_BINARY_DIR}/output/include)

# Compiler flags. CMAKE_CPP_FLAGS is a project-level variable (it may be
# passed in with -DCMAKE_CPP_FLAGS=...). Flags already present in
# CMAKE_CXX_FLAGS are preserved and placed last so they can override the
# project defaults instead of being thrown away.
set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEBUG_SYMBOL}")
set(CMAKE_CXX_FLAGS "${CMAKE_CPP_FLAGS} -O2 -pipe -Wall -W -fPIC -fstrict-aliasing -Wno-invalid-offsetof -Wno-unused-parameter -fno-omit-frame-pointer ${CMAKE_CXX_FLAGS}")

# use_cxx11()
#
# Turns on C++11 for everything configured after the call. Because this is a
# macro, the variables are set directly in the caller's scope.
#   - CMake >= 3.1.3: request the standard through CMAKE_CXX_STANDARD and
#     make it mandatory.
#   - older CMake: append -std=c++11 to CMAKE_CXX_FLAGS by hand.
macro(use_cxx11)
  if(NOT CMAKE_VERSION VERSION_LESS "3.1.3")
    set(CMAKE_CXX_STANDARD_REQUIRED ON)
    set(CMAKE_CXX_STANDARD 11)
  else()
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
  endif()
endmacro()

# System libraries the shared library is linked against.
set(DYNAMIC_LIB pthread)
use_cxx11()

# TODO(ENABLE_JSON): once JSON support is enabled, add the RapidJSON include
# path (RAPIDJSON_INCLUDE_PATH) to the header search path here.

# Stage build products under <build>/output/lib.
# for *.so output
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/output/lib)
# for *.a output (the variable name was previously misspelled, so the static
# archive was not redirected)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/output/lib)

# Source tree locations used when collecting sources below.
set(PROJECT_CC_DIR ${PROJECT_SOURCE_DIR}/src)
set(PROJECT_INDEXER_DIR ${PROJECT_SOURCE_DIR}/src/indexer)
set(PROJECT_INTERNAL_DIR ${PROJECT_SOURCE_DIR}/src/internal)
set(PROJECT_TPARTY_DIR ${PROJECT_SOURCE_DIR}/src/third_party)

# Protocol Buffers: fail at configure time if protobuf/protoc is missing, and
# expose its headers to the generated *.pb.cc files.
find_package(Protobuf REQUIRED)
include_directories(${PROTOBUF_INCLUDE_DIRS})

# Generate C++ sources/headers from the .proto schemas. The outputs are
# written to the current binary directory.
protobuf_generate_cpp(PROTO_SRC PROTO_HEADER
  ${PROJECT_CC_DIR}/proto/index_schema.proto
  ${PROJECT_CC_DIR}/proto/inverted_index.proto)

# Collect the library sources. NOTE: file(GLOB) is evaluated at configure
# time only, so re-run cmake after adding or removing source files.
file(GLOB BASIC_SOURCES
  ${PROJECT_CC_DIR}/*.cc
  ${PROJECT_INDEXER_DIR}/*.cc
  ${PROJECT_INTERNAL_DIR}/*.cc
  ${PROJECT_TPARTY_DIR}/json2pb/*.cc
  ${PROJECT_TPARTY_DIR}/butil/*.cc
  ${PROJECT_TPARTY_DIR}/butil/*.cpp
  ${PROJECT_TPARTY_DIR}/butil/strings/*.cpp
  ${PROJECT_TPARTY_DIR}/butil/third_party/modp_b64/*.cc
)
set(PACKAGE_SOURCES ${BASIC_SOURCES} ${PROTO_SRC})

# The generated *.pb.h headers live in the current binary directory.
include_directories(${CMAKE_CURRENT_BINARY_DIR})

# Compile every source once into an object library, then build both the
# shared and the static flavor from the same objects.
add_library(OBJ_LIB OBJECT ${PACKAGE_SOURCES})
# The objects end up in a shared library, so they must be position independent.
set_property(TARGET OBJ_LIB PROPERTY POSITION_INDEPENDENT_CODE ON)
add_library(clorisearch-shared SHARED $<TARGET_OBJECTS:OBJ_LIB>)
add_library(clorisearch-static STATIC $<TARGET_OBJECTS:OBJ_LIB>)

# Plain (keyword-less) signature is kept for compatibility with the declared
# minimum CMake version.
target_link_libraries(clorisearch-shared ${PROTOBUF_LIBRARIES} ${DYNAMIC_LIB})

# Both flavors are emitted under the same base name "clorisearch".
set_target_properties(clorisearch-shared PROPERTIES OUTPUT_NAME clorisearch CLEAN_DIRECT_OUTPUT 1)
set_target_properties(clorisearch-static PROPERTIES OUTPUT_NAME clorisearch CLEAN_DIRECT_OUTPUT 1)

# Executables (and the copied helper scripts) are staged in <build>/output/bin.
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/output/bin)
# TODO: the tutorial example (src/example/tutorial.cc, linked against
# clorisearch-shared) is not built yet.

# Copy the contents of bin/ and conf/ into the staged output tree. These
# copies happen at configure time.
file(COPY ${PROJECT_SOURCE_DIR}/bin/
  DESTINATION ${EXECUTABLE_OUTPUT_PATH})

file(COPY ${PROJECT_SOURCE_DIR}/conf/
  DESTINATION ${PROJECT_BINARY_DIR}/output/conf)

# Stage the internal headers next to the generated def.h.
file(COPY ${PROJECT_SOURCE_DIR}/src/internal/
  DESTINATION ${PROJECT_BINARY_DIR}/output/include/internal/
  FILES_MATCHING
  PATTERN "*.h"
)

# Values substituted into clorisearch.pc. Make sure they are defined so the
# generated file does not end up with empty Version / Libs.private fields.
if(NOT CLORISEARCH_VERSION)
  set(CLORISEARCH_VERSION "${CLORICONF_VERSION}")
endif()
if(NOT DEFINED CLORISEARCH_PRIVATE_LIBS)
  # Needed by consumers that link the static archive.
  set(CLORISEARCH_PRIVATE_LIBS "-lprotobuf -lpthread")
endif()
configure_file(${PROJECT_SOURCE_DIR}/cmake/clorisearch.pc.in ${PROJECT_BINARY_DIR}/output/clorisearch.pc)

# Install rules. Destinations are relative, so CMake resolves them against
# the install prefix in effect at install time; this keeps install-time
# prefix overrides and relocatable packages working.
install(TARGETS clorisearch-shared clorisearch-static
  RUNTIME DESTINATION bin
  LIBRARY DESTINATION lib
  ARCHIVE DESTINATION lib)

# Public headers: everything staged under <build>/output/include.
install(DIRECTORY ${PROJECT_BINARY_DIR}/output/include/
  DESTINATION include/clorisearch
  FILES_MATCHING
  PATTERN "*.h"
)

# pkg-config metadata.
install(FILES ${PROJECT_BINARY_DIR}/output/clorisearch.pc
  DESTINATION lib/pkgconfig)
11 changes: 6 additions & 5 deletions src/clorisearch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// Copyright (C) 2018 James Wei (weijianlhp@163.com). All rights reserved
//
#include "internal/log.h"
#include "internal/json2pb.h"
#include "json2pb/json2pb.h"
#include "clorisearch.h"

namespace cloris {
Expand All @@ -16,8 +16,8 @@ CloriSearch::~CloriSearch() {
}

bool CloriSearch::Init(const std::string& source, IndexSchemaFormat format, SourceType source_type) {
if (src_type != DIRECT) {
cLog(ERROR, "cloriSearch init failed: unsupport src_type");
if (source_type != DIRECT) {
cLog(ERROR, "cloriSearch init failed: unsupport source_type");
return false;
}
if (format != ISF_JSON) {
Expand All @@ -43,7 +43,7 @@ bool CloriSearch::Init(const std::string& source, IndexSchemaFormat format, Sour
return true;
}

bool CloriSearch::Load(const std::string& source, IndexSchemaFormat format, bool is_incremental) {
bool CloriSearch::Add(const std::string& source, IndexSchemaFormat format, bool is_incremental) {
if (format != ISF_JSON) {
cLog(ERROR, "unsupport format-style");
return false;
Expand All @@ -54,7 +54,8 @@ bool CloriSearch::Load(const std::string& source, IndexSchemaFormat format, bool
cLog(ERROR, "CloriSearch load failed:" + err_msg);
return false;
}
inverted_index()->Add(dnf);
inverted_index()->Add(dnf, false);
return true;
}

std::vector<int> CloriSearch::Search(const Query& query, int limit) {
Expand Down
10 changes: 5 additions & 5 deletions src/clorisearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ enum SourceType {

class CloriSearch {
public:
ClorisSearch();
~ClorisSearch();
CloriSearch();
~CloriSearch();

bool Init(const std::string& source, IndexSchemaFormat format, SourceType source_type = DIRECT);
bool Load(const std::string& source, int docid, IndexSchemaFormat format, bool is_incremental = false);
bool Add(const std::string& source, IndexSchemaFormat format, bool is_incremental = false);
std::vector<int> Search(const Query& query, int limit = -1);

inline InvertedIndex* inverted_index() const { return iidx_; }
inline ForwardIndex* forward_index() const { return fidx_; }
inline InvertedIndex* inverted_index() { return &iidx_; }
inline ForwardIndex* forward_index() { return &fidx_; }
private:
InvertedIndex iidx_;
ForwardIndex fidx_;
Expand Down
47 changes: 9 additions & 38 deletions src/indexer/conjunction_scorer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,48 +4,18 @@
// Copyright (C) 2018 James Wei (weijianlhp@163.com). All rights reserved
//

#include <algorithm>
#include "posting_list.h"
#include "conjunction_scorer.h"

namespace cloris {

class PostingList {
public:
PostingList(std::list<int>* pl);
~PostingList() { }

static EOL = -1,

int CurrentEntry();
void SkipTo(int docid);
private:
std::list<int>* doc_list_;
std::list<int>::iterator iter_;
};

PostingList::PostingList(std::list<int>* pl) : doc_list_(pl) {
iter_ = doc_list_->begin();
}

int PostingList::CurrentEntry() {
if (iter_ == doc_list->end()) {
return EOL;
} else {
return *iter_;
}
}

int PostingList::SkipTo(int docid) {
while ((iter_ != doc_list_->end()) && (*iter < docid)) {
++iter_;
}
}

void ConjunctionScorer::AddPostingList(std::list<int>* doc_list) {
PostingList pl(doc_list_);
PostingList pl(doc_list);
plists_.push_back(pl);
}

std::vector<int> ConjunctionScorer::GetMatchedDocid(int k) {
std::vector<int> ConjunctionScorer::GetMatchedDocid(size_t k) {
std::vector<int> ret;
if (k == 0) {
k = 1;
Expand All @@ -55,7 +25,7 @@ std::vector<int> ConjunctionScorer::GetMatchedDocid(int k) {
}
int next_id;
while (plists_[k - 1].CurrentEntry() != PostingList::EOL) {
sort(plists_.begin(), plists_.end());
std::sort(plists_.begin(), plists_.end());
}
if (plists_[0].CurrentEntry() == plists_[k - 1].CurrentEntry()) {
// TODO support NOT IN
Expand All @@ -64,8 +34,8 @@ std::vector<int> ConjunctionScorer::GetMatchedDocid(int k) {
} else {
ret.push_back(plists_[k - 1].CurrentEntry());
// K=2, 2,2,2,2
for (int L = k; L < plists_.size(); ++L) {
if (plists_[L] < next_id) {
for (size_t L = k; L < plists_.size(); ++L) {
if (plists_[L].CurrentEntry() < next_id) {
plists_[L].SkipTo(next_id);
} else {
break;
Expand All @@ -75,9 +45,10 @@ std::vector<int> ConjunctionScorer::GetMatchedDocid(int k) {
} else {
next_id = plists_[k - 1].CurrentEntry();
}
for (int L = 0; L < k; ++L) {
for (size_t L = 0; L < k; ++L) {
plists_[L].SkipTo(next_id);
}
return ret;
}

} // namespace cloris
9 changes: 7 additions & 2 deletions src/indexer/conjunction_scorer.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,23 @@
#ifndef CLORIS_CONJUNCTION_SCORER_H_
#define CLORIS_CONJUNCTION_SCORER_H_

#include <unistd.h>
#include <vector>
#include <list>
#include "posting_list.h"

namespace cloris {

// Modeled on the Lucene implementation.
// Conjunction algorithm taken from the paper "Indexing Boolean Expressions".
// Each inverted list is called a posting list.
class ConjunctionScorer {
public:
std::vector<int> GetMatchedDocid(int k);
std::vector<int> GetMatchedDocid(size_t k);
void AddPostingList(std::list<int>* doc_list);
private:
std::vector<PostingList> plists_;
}
};

} // namespace cloris

Expand Down
10 changes: 7 additions & 3 deletions src/indexer/indexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,19 @@
#ifndef CLORIS_INDEXER_H_
#define CLORIS_INDEXER_H_

#include <list>
#include "inverted_index.pb.h"
#include "term.h"

namespace cloris {

class Indexer {
public:
virtual ~Indexer() { }
virtual bool Add(const TermValue& value, int docid) { }
virtual std::list<int>* GetPostingLists(const TermValue&);
virtual bool Add(const ConjValue& value, int docid) = 0;
virtual std::list<int>* GetPostingLists(const Term& term) = 0;
protected:
AttributeTyle type_;
int type_;

private:

Expand Down
Loading

0 comments on commit 2532d93

Please sign in to comment.