Add more prebuilt indexes features (#235)
+ prebuilt indexes for BERTserini
qguo96 authored Sep 29, 2020
1 parent 9ce3b00 commit 2ed2acc
Showing 5 changed files with 143 additions and 26 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -128,6 +128,8 @@ It currently supports:
+ robust04 (TREC Disks 4 & 5)
+ ms-marco-passage (MS MARCO Passage)
+ ms-marco-doc (MS MARCO Doc)
+ enwiki-paragraphs (English Wikipedia)
+ zhwiki-paragraphs (Chinese Wikipedia)

## How Do I Fetch a Document?

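The two new Wikipedia entries are the paragraph indexes used by BERTserini. A minimal usage sketch with the SimpleSearcher API extended later in this commit (the query string is illustrative, and the English index is roughly a 17 GB download):

from pyserini.search import SimpleSearcher

# Download (on first use) and open the English Wikipedia paragraph index by name.
searcher = SimpleSearcher.from_prebuilt_index('enwiki-paragraphs')

# Standard BM25 retrieval over the prebuilt index.
hits = searcher.search('where was Alan Turing born', k=3)
for hit in hits:
    print(hit.docid, round(hit.score, 4))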
23 changes: 23 additions & 0 deletions pyserini/index/_base.py
@@ -27,6 +27,7 @@
from ..analysis import get_lucene_analyzer, JAnalyzer, JAnalyzerUtils
from ..pyclass import autoclass, JString
from ..search import Document
from pyserini.util import download_prebuilt_index, get_indexes_info

logger = logging.getLogger(__name__)

@@ -150,6 +151,28 @@ def __init__(self, index_dir):
        self.object = JIndexReader()
        self.reader = self.object.getReader(JString(index_dir))

    @classmethod
    def from_prebuilt_index(cls, prebuilt_index_name: str):
        """Build an index reader from a prebuilt index, downloading the index if necessary.

        Parameters
        ----------
        prebuilt_index_name : str
            Prebuilt index name.

        Returns
        -------
        IndexReader
            Index reader built from the prebuilt index.
        """
        index_dir = download_prebuilt_index(prebuilt_index_name)
        return cls(index_dir)

    @staticmethod
    def list_prebuilt_indexes():
        """Display information about the available prebuilt indexes."""
        get_indexes_info()

    def analyze(self, text: str, analyzer=None) -> List[str]:
        """Analyze a piece of text. Applies Anserini's default Lucene analyzer if no analyzer is specified.
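A minimal sketch of the new IndexReader helpers (assuming IndexReader is the class defined in this file and is exported from pyserini.index; 'robust04' is one of the names registered in pyserini/indexInfo.py):

from pyserini.index import IndexReader

# Print the table built by get_indexes_info(), including the 'downloaded' column.
IndexReader.list_prebuilt_indexes()

# Download (if needed) and open the prebuilt index by name.
reader = IndexReader.from_prebuilt_index('robust04')

# The existing analyze() method works on the downloaded index as before.
print(reader.analyze('prebuilt index readers'))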
74 changes: 74 additions & 0 deletions pyserini/indexInfo.py
@@ -0,0 +1,74 @@
INDEX_INFO = {
    "robust04": {
        "name": "robust04",
        "description": "TREC Disks 4 & 5 (TREC 2004 Robust Track)",
        "url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz"},
        "md5": "15f3d001489c97849a010b0a4734d018",
        "downloaded": False,
        "size compressed": "1821814915 bytes",
        "size uncompressed": "2172142080 bytes",
        "total_terms": 174540872,
        "documents": 528030,
        "non_empty_documents": 528030,
        "unique_terms": 923436},
    "trec45": {
        "name": "trec45",
        "description": "TREC Disks 4 & 5 (TREC 2004 Robust Track)",
        "url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz"},
        "md5": "15f3d001489c97849a010b0a4734d018",
        "downloaded": False,
        "size compressed": "1821814915 bytes",
        "size uncompressed": "2172142080 bytes",
        "total_terms": 174540872,
        "documents": 528030,
        "non_empty_documents": 528030,
        "unique_terms": 923436},
    "ms-marco-passage": {
        "name": "ms-marco-passage",
        "description": "MS MARCO Passage Dataset",
        "url": {"uwaterloo": "https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz"},
        "md5": "3c2ef64ee6d0ee8e317adcb341b92e28",
        "downloaded": False,
        "size compressed": "2153209812 bytes",
        "size uncompressed": "2675783168 bytes",
        "total_terms": 352316036,
        "documents": 8841823,
        "non_empty_documents": 8841823,
        "unique_terms": -1},
    "ms-marco-doc": {
        "name": "ms-marco-doc",
        "description": "MS MARCO Doc Dataset",
        "url": {"dropbox": "https://www.dropbox.com/s/awukuo8c0tkl9sc/index-msmarco-doc-20200527-a1ecfa.tar.gz?dl=1"},
        "md5": "72b1a0f9a9094a86d15c6f4babf8967a",
        "downloaded": False,
        "size compressed": "13661943256 bytes",
        "size uncompressed": "16769683456 bytes",
        "total_terms": 2748636047,
        "documents": 3213835,
        "non_empty_documents": 3213835,
        "unique_terms": -1},
    "enwiki-paragraphs": {
        "name": "lucene-index.enwiki-20180701-paragraphs",
        "description": "English Wikipedia",
        "url": {"dropbox": "https://www.dropbox.com/s/b7qqaos9ot3atlp/lucene-index.enwiki-20180701-paragraphs.tar.gz?dl=1"},
        "md5": "77d1cd530579905dad2ee3c2bda1b73d",
        "downloaded": False,
        "size compressed": "17725958785 bytes",
        "size uncompressed": "21854924288 bytes",
        "total_terms": 1498980668,
        "documents": 39880064,
        "non_empty_documents": 39879903,
        "unique_terms": -1},
    "zhwiki-paragraphs": {
        "name": "lucene-index.zhwiki-20181201-paragraphs",
        "description": "Chinese Wikipedia",
        "url": {"dropbox": "https://www.dropbox.com/s/6zn16mombt0wirs/lucene-index.zhwiki-20181201-paragraphs.tar.gz?dl=1"},
        "md5": "c005af4036296972831288c894918a92",
        "downloaded": False,
        "size compressed": "3284531213 bytes",
        "size uncompressed": "3893332992 bytes",
        "total_terms": 320776789,
        "documents": 4170312,
        "non_empty_documents": 4170301,
        "unique_terms": -1}
}
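Each entry is keyed by the user-facing index name; the 'url' and 'md5' fields drive downloading and cache lookup in pyserini/util.py below. A small sketch of how an entry resolves to its on-disk cache path, mirroring the logic of check_downloaded (the cache root shown is an assumption for illustration; the real value comes from get_cache_home()):

import os
import re

from pyserini.indexInfo import INDEX_INFO

entry = INDEX_INFO['zhwiki-paragraphs']
mirror = next(iter(entry['url']))                      # 'dropbox'
tarball = entry['url'][mirror].split('/')[-1]          # '...-paragraphs.tar.gz?dl=1'
stem = re.sub('.tar.gz.*$', '', tarball)               # 'lucene-index.zhwiki-20181201-paragraphs'
cache_root = os.path.expanduser('~/.cache/pyserini/indexes')  # assumed location; see get_cache_home()
print(os.path.join(cache_root, f"{stem}.{entry['md5']}"))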
19 changes: 18 additions & 1 deletion pyserini/search/_searcher.py
@@ -26,7 +26,7 @@
from pyserini.pyclass import autoclass, JString, JArrayList
from pyserini.trectools import TrecRun
from pyserini.fusion import FusionMethod, reciprocal_rank_fusion
-from pyserini.util import download_prebuilt_index
+from pyserini.util import download_prebuilt_index, get_indexes_info

logger = logging.getLogger(__name__)

@@ -51,9 +51,26 @@ def __init__(self, index_dir: str):

    @classmethod
    def from_prebuilt_index(cls, prebuilt_index_name: str):
        """Build a searcher from a prebuilt index, downloading the index if necessary.

        Parameters
        ----------
        prebuilt_index_name : str
            Prebuilt index name.

        Returns
        -------
        SimpleSearcher
            Searcher built from the prebuilt index.
        """
        index_dir = download_prebuilt_index(prebuilt_index_name)
        return cls(index_dir)

    @staticmethod
    def list_prebuilt_indexes():
        """Display information about the available prebuilt indexes."""
        get_indexes_info()

    def search(self, q: Union[str, JQuery], k: int = 10, query_generator: JQueryGenerator = None, strip_segment_id=False, remove_dups=False) -> List[JSimpleSearcherResult]:
        """Search the collection.
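The classmethod is a convenience wrapper: it is equivalent to downloading the tarball first and then using the existing path-based constructor. A small sketch of that equivalence (the query text is illustrative):

from pyserini.search import SimpleSearcher
from pyserini.util import download_prebuilt_index

# Two-step form: resolve the name to a local directory, then construct as before.
index_dir = download_prebuilt_index('ms-marco-passage')
searcher = SimpleSearcher(index_dir)

# One-step form added in this commit.
searcher = SimpleSearcher.from_prebuilt_index('ms-marco-passage')
hits = searcher.search('how long is the great wall of china')
print(hits[0].docid, round(hits[0].score, 4))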
51 changes: 26 additions & 25 deletions pyserini/util.py
@@ -21,25 +21,8 @@
import tarfile
from tqdm import tqdm
from urllib.request import urlretrieve

-INDEX_INFO = {
-    'index-marco-passage': {
-        'urls': {'uwaterloo': 'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-msmarco-passage-20191117-0ed488.tar.gz'},
-        'md5': '3c2ef64ee6d0ee8e317adcb341b92e28'},
-    'index-marco-doc': {
-        'urls': {'dropbox': 'https://www.dropbox.com/s/awukuo8c0tkl9sc/index-msmarco-doc-20200527-a1ecfa.tar.gz?dl=1'},
-        'md5': '72b1a0f9a9094a86d15c6f4babf8967a'},
-    'index-robust04': {
-        'urls': {'uwaterloo': 'https://git.uwaterloo.ca/jimmylin/anserini-indexes/raw/master/index-robust04-20191213.tar.gz'},
-        'md5': '15f3d001489c97849a010b0a4734d018'}
-}
-
-INDEX_MAPPING = {
-    'ms-marco-passage': INDEX_INFO['index-marco-passage'],
-    'ms-marco-doc': INDEX_INFO['index-marco-doc'],
-    'trec45': INDEX_INFO['index-robust04'],
-    'robust04': INDEX_INFO['index-robust04']
-}
+import pandas as pd
+from pyserini.indexInfo import INDEX_INFO


# https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5
@@ -99,7 +82,7 @@ def download_and_unpack_index(url, index_directory='indexes', force=False, verbo

    if prebuilt:
        index_directory = os.path.join(get_cache_home(), 'indexes')
-        index_path = os.path.join(index_directory, f'{index_name}{md5}')
+        index_path = os.path.join(index_directory, f'{index_name}.{md5}')
        local_tarball = os.path.join(index_directory, f'{index_name}.tar.gz')
        if not os.path.exists(index_directory):
            os.makedirs(index_directory)
@@ -139,15 +122,33 @@ def download_and_unpack_index(url, index_directory='indexes', force=False, verbo
os.rename(os.path.join(index_directory, f'{index_name}'), index_path)
return index_path

def check_downloaded(index_name):
    mirror = next(iter(INDEX_INFO[index_name]["url"]))
    index_url = INDEX_INFO[index_name]["url"][mirror]
    index_md5 = INDEX_INFO[index_name]["md5"]
    index_name = index_url.split('/')[-1]
    index_name = re.sub('''.tar.gz.*$''', '', index_name)
    index_directory = os.path.join(get_cache_home(), 'indexes')
    index_path = os.path.join(index_directory, f'{index_name}.{index_md5}')
    return os.path.exists(index_path)

def get_indexes_info():
    indexDf = pd.DataFrame.from_dict(INDEX_INFO)
    for index in indexDf.keys():
        indexDf[index]['downloaded'] = check_downloaded(index)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None,
                           'display.max_colwidth', -1, 'display.colheader_justify', 'left'):
        print(indexDf)

def download_prebuilt_index(index_name, force=False, verbose=True, mirror=None):
-    if index_name in INDEX_MAPPING:
+    if index_name in INDEX_INFO:
        if not mirror:
-            mirror = next(iter(INDEX_MAPPING[index_name]["urls"]))
-        elif mirror not in INDEX_MAPPING[index_name]["urls"]:
+            mirror = next(iter(INDEX_INFO[index_name]["url"]))
+        elif mirror not in INDEX_INFO[index_name]["url"]:
            raise ValueError("unrecognized mirror name {}".format(mirror))
-        index_url = INDEX_MAPPING[index_name]["urls"][mirror]
-        index_md5 = INDEX_MAPPING[index_name]["md5"]
+        index_url = INDEX_INFO[index_name]["url"][mirror]
+        index_md5 = INDEX_INFO[index_name]["md5"]
        return download_and_unpack_index(index_url, prebuilt=True, md5=index_md5)
    else:
        raise ValueError("unrecognized index name {}".format(index_name))
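The utilities can also be called directly, for example to inspect what is available and to pick a specific mirror. A minimal sketch (note that 'robust04' and 'trec45' are registered with the same tarball and MD5, so either name resolves to the same index):

from pyserini.util import download_prebuilt_index, get_indexes_info

# Pandas table of all INDEX_INFO entries, with 'downloaded' filled in per entry.
get_indexes_info()

# Fetch by name, explicitly selecting the registered 'uwaterloo' mirror;
# the return value is the path of the unpacked index in the local cache.
index_dir = download_prebuilt_index('trec45', mirror='uwaterloo')
print(index_dir)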
