Add more prebuilt indexes features (#235)
+ prebuilt indexes for BERTserini
qguo96 authored Sep 29, 2020
1 parent 9ce3b00 commit 2ed2acc
Showing 5 changed files with 143 additions and 26 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -128,6 +128,8 @@ It currently supports:
+ robust04 (TREC Disks 4 & 5)
+ ms-marco-passage (MS MARCO Passage)
+ ms-marco-doc (MS MARCO Doc)
+ enwiki-paragraphs (English Wikipedia)
+ zhwiki-paragraphs (Chinese Wikipedia)

## How Do I Fetch a Document?

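The two new Wikipedia entries are the paragraph indexes used by BERTserini. A minimal usage sketch with the SimpleSearcher API extended later in this commit (the query string is illustrative, and the English index is roughly a 17 GB download):

from pyserini.search import SimpleSearcher

# Download (on first use) and open the English Wikipedia paragraph index by name.
searcher = SimpleSearcher.from_prebuilt_index('enwiki-paragraphs')

# Standard BM25 retrieval over the prebuilt index.
hits = searcher.search('where was Alan Turing born', k=3)
for hit in hits:
    print(hit.docid, round(hit.score, 4))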
23 changes: 23 additions & 0 deletions pyserini/index/_base.py
@@ -27,6 +27,7 @@
from ..analysis import get_lucene_analyzer, JAnalyzer, JAnalyzerUtils
from ..pyclass import autoclass, JString
from ..search import Document
from pyserini.util import download_prebuilt_index, get_indexes_info

logger = logging.getLogger(__name__)

@@ -150,6 +151,28 @@ def __init__(self, index_dir):
        self.object = JIndexReader()
        self.reader = self.object.getReader(JString(index_dir))

    @classmethod
    def from_prebuilt_index(cls, prebuilt_index_name: str):
        """Build an index reader from a prebuilt index, downloading the index if necessary.

        Parameters
        ----------
        prebuilt_index_name : str
            Prebuilt index name.

        Returns
        -------
        IndexReader
            Index reader built from the prebuilt index.
        """
        index_dir = download_prebuilt_index(prebuilt_index_name)
        return cls(index_dir)

    @staticmethod
    def list_prebuilt_indexes():
        """Display information about the available prebuilt indexes."""
        get_indexes_info()

    def analyze(self, text: str, analyzer=None) -> List[str]:
        """Analyze a piece of text. Applies Anserini's default Lucene analyzer if no analyzer is specified.
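A minimal sketch of the new IndexReader helpers (assuming IndexReader is the class defined in this file and is exported from pyserini.index; 'robust04' is one of the names registered in pyserini/indexInfo.py):

from pyserini.index import IndexReader

# Print the table built by get_indexes_info(), including the 'downloaded' column.
IndexReader.list_prebuilt_indexes()

# Download (if needed) and open the prebuilt index by name.
reader = IndexReader.from_prebuilt_index('robust04')

# The existing analyze() method works on the downloaded index as before.
print(reader.analyze('prebuilt index readers'))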
74 changes: 74 additions & 0 deletions pyserini/indexInfo.py
@@ -0,0 +1,74 @@
INDEX_INFO = {
    "robust04": {
        "name": "robust04",
        "description": "TREC Disks 4 & 5 (TREC 2004 Robust Track)",
        "url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz"},
        "md5": "15f3d001489c97849a010b0a4734d018",
        "downloaded": False,
        "size compressed": "1821814915 bytes",
        "size uncompressed": "2172142080 bytes",
        "total_terms": 174540872,
        "documents": 528030,
        "non_empty_documents": 528030,
        "unique_terms": 923436},
    "trec45": {
        "name": "trec45",
        "description": "TREC Disks 4 & 5 (TREC 2004 Robust Track)",
        "url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz"},
        "md5": "15f3d001489c97849a010b0a4734d018",
        "downloaded": False,
        "size compressed": "1821814915 bytes",
        "size uncompressed": "2172142080 bytes",
        "total_terms": 174540872,
        "documents": 528030,
        "non_empty_documents": 528030,
        "unique_terms": 923436},
    "ms-marco-passage": {
        "name": "ms-marco-passage",
        "description": "MS MARCO Passage Dataset",
        "url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz"},
        "md5": "3c2ef64ee6d0ee8e317adcb341b92e28",
        "downloaded": False,
        "size compressed": "2153209812 bytes",
        "size uncompressed": "2675783168 bytes",
        "total_terms": 352316036,
        "documents": 8841823,
        "non_empty_documents": 8841823,
        "unique_terms": -1},
    "ms-marco-doc": {
        "name": "ms-marco-doc",
        "description": "MS MARCO Doc Dataset",
        "url": {"dropbox": "https://www.dropbox.com/s/awukuo8c0tkl9sc/index-msmarco-doc-20200527-a1ecfa.tar.gz?dl=1"},
        "md5": "72b1a0f9a9094a86d15c6f4babf8967a",
        "downloaded": False,
        "size compressed": "13661943256 bytes",
        "size uncompressed": "16769683456 bytes",
        "total_terms": 2748636047,
        "documents": 3213835,
        "non_empty_documents": 3213835,
        "unique_terms": -1},
    "enwiki-paragraphs": {
        "name": "lucene-index.enwiki-20180701-paragraphs",
        "description": "English Wikipedia",
        "url": {"dropbox": "https://www.dropbox.com/s/b7qqaos9ot3atlp/lucene-index.enwiki-20180701-paragraphs.tar.gz?dl=1"},
        "md5": "77d1cd530579905dad2ee3c2bda1b73d",
        "downloaded": False,
        "size compressed": "17725958785 bytes",
        "size uncompressed": "21854924288 bytes",
        "total_terms": 1498980668,
        "documents": 39880064,
        "non_empty_documents": 39879903,
        "unique_terms": -1},
    "zhwiki-paragraphs": {
        "name": "lucene-index.zhwiki-20181201-paragraphs",
        "description": "Chinese Wikipedia",
        "url": {"dropbox": "https://www.dropbox.com/s/6zn16mombt0wirs/lucene-index.zhwiki-20181201-paragraphs.tar.gz?dl=1"},
        "md5": "c005af4036296972831288c894918a92",
        "downloaded": False,
        "size compressed": "3284531213 bytes",
        "size uncompressed": "3893332992 bytes",
        "total_terms": 320776789,
        "documents": 4170312,
        "non_empty_documents": 4170301,
        "unique_terms": -1}
}
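Each entry is keyed by the user-facing index name; the 'url' and 'md5' fields drive downloading and cache lookup in pyserini/util.py below. A small sketch of how an entry resolves to its on-disk cache path, mirroring the logic of check_downloaded (the cache root shown is an assumption for illustration; the real value comes from get_cache_home()):

import os
import re

from pyserini.indexInfo import INDEX_INFO

entry = INDEX_INFO['zhwiki-paragraphs']
mirror = next(iter(entry['url']))                      # 'dropbox'
tarball = entry['url'][mirror].split('/')[-1]          # '...-paragraphs.tar.gz?dl=1'
stem = re.sub('.tar.gz.*$', '', tarball)               # 'lucene-index.zhwiki-20181201-paragraphs'
cache_root = os.path.expanduser('~/.cache/pyserini/indexes')  # assumed location; see get_cache_home()
print(os.path.join(cache_root, f"{stem}.{entry['md5']}"))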
19 changes: 18 additions & 1 deletion pyserini/search/_searcher.py
@@ -26,7 +26,7 @@
from pyserini.pyclass import autoclass, JString, JArrayList
from pyserini.trectools import TrecRun
from pyserini.fusion import FusionMethod, reciprocal_rank_fusion
-from pyserini.util import download_prebuilt_index
+from pyserini.util import download_prebuilt_index, get_indexes_info

logger = logging.getLogger(__name__)

@@ -51,9 +51,26 @@ def __init__(self, index_dir: str):

    @classmethod
    def from_prebuilt_index(cls, prebuilt_index_name: str):
        """Build a searcher from a prebuilt index, downloading the index if necessary.

        Parameters
        ----------
        prebuilt_index_name : str
            Prebuilt index name.

        Returns
        -------
        SimpleSearcher
            Searcher built from the prebuilt index.
        """
        index_dir = download_prebuilt_index(prebuilt_index_name)
        return cls(index_dir)

    @staticmethod
    def list_prebuilt_indexes():
        """Display information about the available prebuilt indexes."""
        get_indexes_info()

    def search(self, q: Union[str, JQuery], k: int = 10, query_generator: JQueryGenerator = None, strip_segment_id=False, remove_dups=False) -> List[JSimpleSearcherResult]:
        """Search the collection.
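The classmethod is a convenience wrapper: it is equivalent to downloading the tarball first and then using the existing path-based constructor. A small sketch of that equivalence (the query text is illustrative):

from pyserini.search import SimpleSearcher
from pyserini.util import download_prebuilt_index

# Two-step form: resolve the name to a local directory, then construct as before.
index_dir = download_prebuilt_index('ms-marco-passage')
searcher = SimpleSearcher(index_dir)

# One-step form added in this commit.
searcher = SimpleSearcher.from_prebuilt_index('ms-marco-passage')
hits = searcher.search('how long is the great wall of china')
print(hits[0].docid, round(hits[0].score, 4))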
51 changes: 26 additions & 25 deletions pyserini/util.py
@@ -21,25 +21,8 @@
import tarfile
from tqdm import tqdm
from urllib.request import urlretrieve

-INDEX_INFO = {
-    'index-marco-passage': {
-        'urls': {'uwaterloo': 'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz'},
-        'md5': '3c2ef64ee6d0ee8e317adcb341b92e28'},
-    'index-marco-doc': {
-        'urls': {'dropbox': 'https://www.dropbox.com/s/awukuo8c0tkl9sc/index-msmarco-doc-20200527-a1ecfa.tar.gz?dl=1'},
-        'md5': '72b1a0f9a9094a86d15c6f4babf8967a'},
-    'index-robust04': {
-        'urls': {'uwaterloo': 'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz'},
-        'md5': '15f3d001489c97849a010b0a4734d018'}
-}
-
-INDEX_MAPPING = {
-    'ms-marco-passage': INDEX_INFO['index-marco-passage'],
-    'ms-marco-doc': INDEX_INFO['index-marco-doc'],
-    'trec45': INDEX_INFO['index-robust04'],
-    'robust04': INDEX_INFO['index-robust04']
-}
+import pandas as pd
+from pyserini.indexInfo import INDEX_INFO


# https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5
@@ -99,7 +82,7 @@ def download_and_unpack_index(url, index_directory='indexes', force=False, verbo

    if prebuilt:
        index_directory = os.path.join(get_cache_home(), 'indexes')
-        index_path = os.path.join(index_directory, f'{index_name}{md5}')
+        index_path = os.path.join(index_directory, f'{index_name}.{md5}')
        local_tarball = os.path.join(index_directory, f'{index_name}.tar.gz')
        if not os.path.exists(index_directory):
            os.makedirs(index_directory)
@@ -139,15 +122,33 @@ def download_and_unpack_index(url, index_directory='indexes', force=False, verbo
os.rename(os.path.join(index_directory, f'{index_name}'), index_path)
return index_path

def check_downloaded(index_name):
    mirror = next(iter(INDEX_INFO[index_name]["url"]))
    index_url = INDEX_INFO[index_name]["url"][mirror]
    index_md5 = INDEX_INFO[index_name]["md5"]
    index_name = index_url.split('/')[-1]
    index_name = re.sub('''.tar.gz.*$''', '', index_name)
    index_directory = os.path.join(get_cache_home(), 'indexes')
    index_path = os.path.join(index_directory, f'{index_name}.{index_md5}')
    return os.path.exists(index_path)

def get_indexes_info():
    indexDf = pd.DataFrame.from_dict(INDEX_INFO)
    for index in indexDf.keys():
        indexDf[index]['downloaded'] = check_downloaded(index)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None,
                           'display.max_colwidth', -1, 'display.colheader_justify', 'left'):
        print(indexDf)

def download_prebuilt_index(index_name, force=False, verbose=True, mirror=None):
-    if index_name in INDEX_MAPPING:
+    if index_name in INDEX_INFO:
        if not mirror:
-            mirror = next(iter(INDEX_MAPPING[index_name]["urls"]))
-        elif mirror not in INDEX_MAPPING[index_name]["urls"]:
+            mirror = next(iter(INDEX_INFO[index_name]["url"]))
+        elif mirror not in INDEX_INFO[index_name]["url"]:
            raise ValueError("unrecognized mirror name {}".format(mirror))
-        index_url = INDEX_MAPPING[index_name]["urls"][mirror]
-        index_md5 = INDEX_MAPPING[index_name]["md5"]
+        index_url = INDEX_INFO[index_name]["url"][mirror]
+        index_md5 = INDEX_INFO[index_name]["md5"]
        return download_and_unpack_index(index_url, prebuilt=True, md5=index_md5)
    else:
        raise ValueError("unrecognized index name {}".format(index_name))
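The utilities can also be called directly, for example to inspect what is available and to pick a specific mirror. A minimal sketch (note that 'robust04' and 'trec45' are registered with the same tarball and MD5, so either name resolves to the same index):

from pyserini.util import download_prebuilt_index, get_indexes_info

# Pandas table of all INDEX_INFO entries, with 'downloaded' filled in per entry.
get_indexes_info()

# Fetch by name, explicitly selecting the registered 'uwaterloo' mirror;
# the return value is the path of the unpacked index in the local cache.
index_dir = download_prebuilt_index('trec45', mirror='uwaterloo')
print(index_dir)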
