-
Notifications
You must be signed in to change notification settings - Fork 477
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial support for MS MARCO V2 passage and document collections (#1571)
- Loading branch information
Showing
12 changed files
with
581 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Anserini: Guide to Working with the MS MARCO V2 Collections | ||
|
||
This guide presents information for working with V2 of the MS MARCO passage and document test collections. | ||
|
||
Indexing the passage collection, which is 20 GB compressed: | ||
|
||
``` | ||
sh target/appassembler/bin/IndexCollection -collection MsMarcoPassageV2Collection \ | ||
-generator DefaultLuceneDocumentGenerator -threads 18 \ | ||
-input collections/msmarco_v2_passage \ | ||
-index indexes/msmarco-passage-v2 \ | ||
-storePositions -storeDocvectors -storeRaw | ||
``` | ||
|
||
Adjust `-threads` as appropriate. | ||
The above configuration, on a 2017 iMac Pro with SSD, takes around 30min. | ||
|
||
The complete index occupies 72 GB (138,364,198 passages). | ||
It's big because it includes positions (for phrase queries), document vectors (for relevance feedback), and a complete copy of the collection itself. | ||
The index size can be reduced by removing the options `-storePositions`, `-storeDocvectors`, `-storeRaw` as appropriate. | ||
For reference: | ||
|
||
+ Without any of the three above options, index size reduces to 12 GB. | ||
+ With just `-storeRaw`, index size reduces to 47 GB. This setting contains the raw JSON document, which makes it suitable for use as first-stage retrieval to support downstream rerankers. Bloat compared to compressed size of raw collection is due to support for per-document random access. | ||
|
||
Indexing the document collection, which is 32 GB compressed: | ||
|
||
``` | ||
sh target/appassembler/bin/IndexCollection -collection MsMarcoDocV2Collection \ | ||
-generator DefaultLuceneDocumentGenerator -threads 18 \ | ||
-input collections/msmarco_v2_doc \ | ||
-index indexes/msmarco-doc-v2 \ | ||
-storePositions -storeDocvectors -storeRaw | ||
``` | ||
|
||
Same instructions as above. | ||
On the same machine, indexing takes around 40min. | ||
Complete index occupies 134 GB (11,959,635 documents). | ||
Index size can be reduced by removing the options `-storePositions`, `-storeDocvectors`, `-storeRaw` as appropriate. | ||
For reference: | ||
|
||
+ Without any of the three above options, index size reduces to 9.4 GB. | ||
+ With just `-storeRaw`, index size reduces to 73 GB. This setting contains the raw JSON document, which makes it suitable for use as first-stage retrieval to support downstream rerankers. Bloat compared to compressed size of raw collection is due to support for per-document random access; evidently, the JSON docs don't compress well. | ||
|
||
Perform a run on the dev queries: | ||
|
||
``` | ||
target/appassembler/bin/SearchCollection -index indexes/msmarco-doc-v2 \ | ||
-topicreader TsvInt -topics collections/docv2_dev_queries.tsv \ | ||
-output runs/run.msmarco-doc-v2.dev.txt \ | ||
-bm25 -hits 100 | ||
``` | ||
|
||
Evaluation: | ||
|
||
```bash | ||
$ tools/eval/trec_eval.9.0.4/trec_eval -c -m map -m recall.100 -m recip_rank collections/docv2_dev_qrels.uniq.tsv runs/run.msmarco-doc-v2.dev.txt | ||
map all 0.1552 | ||
recip_rank all 0.1572 | ||
recall_100 all 0.5956 | ||
``` | ||
|
||
|
||
## Reproduction Log[*](reproducibility.md) | ||
|
152 changes: 152 additions & 0 deletions
152
src/main/java/io/anserini/collection/MsMarcoDocV2Collection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for reproducible information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.collection; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.MappingIterator; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.StandardOpenOption; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.Map; | ||
import java.util.NoSuchElementException; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
public class MsMarcoDocV2Collection extends DocumentCollection<MsMarcoDocV2Collection.Document> { | ||
private static final Logger LOG = LogManager.getLogger(JsonCollection.class); | ||
|
||
public MsMarcoDocV2Collection(Path path){ | ||
this.path = path; | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public FileSegment<MsMarcoDocV2Collection.Document> createFileSegment(Path p) throws IOException { | ||
return new Segment(p); | ||
} | ||
|
||
/** | ||
* A file in a JSON collection, typically containing multiple documents. | ||
*/ | ||
public static class Segment<T extends Document> extends FileSegment<T> { | ||
private JsonNode node = null; | ||
private Iterator<JsonNode> iter = null; // iterator for JSON document array | ||
private MappingIterator<JsonNode> iterator; // iterator for JSON line objects | ||
|
||
public Segment(Path path) throws IOException { | ||
super(path); | ||
|
||
if (path.toString().endsWith(".gz")) { | ||
InputStream stream = new GZIPInputStream(Files.newInputStream(path, StandardOpenOption.READ), BUFFER_SIZE); | ||
bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); | ||
} else { | ||
bufferedReader = new BufferedReader(new FileReader(path.toString())); | ||
} | ||
|
||
ObjectMapper mapper = new ObjectMapper(); | ||
iterator = mapper.readerFor(JsonNode.class).readValues(bufferedReader); | ||
if (iterator.hasNext()) { | ||
node = iterator.next(); | ||
} | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public void readNext() throws NoSuchElementException { | ||
if (node == null) { | ||
throw new NoSuchElementException("JsonNode is empty"); | ||
} else if (node.isObject()) { | ||
bufferedRecord = (T) createNewDocument(node); | ||
if (iterator.hasNext()) { // if bufferedReader contains JSON line objects, we parse the next JSON into node | ||
node = iterator.next(); | ||
} else { | ||
atEOF = true; // there is no more JSON object in the bufferedReader | ||
} | ||
} else { | ||
LOG.error("Error: invalid JsonNode type"); | ||
throw new NoSuchElementException("Invalid JsonNode type"); | ||
} | ||
} | ||
|
||
protected Document createNewDocument(JsonNode json) { | ||
return new Document(node); | ||
} | ||
} | ||
|
||
/** | ||
* A document in a JSON collection. | ||
*/ | ||
public static class Document implements SourceDocument { | ||
private String id; | ||
private String raw; | ||
private Map<String, String> fields; | ||
|
||
public Document(JsonNode json) { | ||
this.raw = json.toPrettyString(); | ||
this.fields = new HashMap<>(); | ||
|
||
json.fields().forEachRemaining( e -> { | ||
if ("docid".equals(e.getKey())) { | ||
this.id = json.get("docid").asText(); | ||
} else { | ||
this.fields.put(e.getKey(), e.getValue().asText()); | ||
} | ||
}); | ||
} | ||
|
||
@Override | ||
public String id() { | ||
if (id == null) { | ||
throw new RuntimeException("Document does not have the required \"docid\" field!"); | ||
} | ||
return id; | ||
} | ||
|
||
@Override | ||
public String contents() { | ||
if (!fields.containsKey("url") || !fields.containsKey("title") || | ||
!fields.containsKey("headings") || !fields.containsKey("body")) { | ||
throw new RuntimeException("Document is missing required fields!"); | ||
} | ||
|
||
return fields.get("url") + " " + fields.get("title") + " " + fields.get("headings") + " " + fields.get("body"); | ||
} | ||
|
||
@Override | ||
public String raw() { | ||
return raw; | ||
} | ||
|
||
@Override | ||
public boolean indexable() { | ||
return true; | ||
} | ||
} | ||
} |
143 changes: 143 additions & 0 deletions
143
src/main/java/io/anserini/collection/MsMarcoPassageV2Collection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for reproducible information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.collection; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.MappingIterator; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.StandardOpenOption; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.Map; | ||
import java.util.NoSuchElementException; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
public class MsMarcoPassageV2Collection extends DocumentCollection<MsMarcoPassageV2Collection.Document> { | ||
private static final Logger LOG = LogManager.getLogger(JsonCollection.class); | ||
|
||
public MsMarcoPassageV2Collection(Path path){ | ||
this.path = path; | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public FileSegment<MsMarcoPassageV2Collection.Document> createFileSegment(Path p) throws IOException { | ||
return new Segment(p); | ||
} | ||
|
||
/** | ||
* A file in a JSON collection, typically containing multiple documents. | ||
*/ | ||
public static class Segment<T extends Document> extends FileSegment<T>{ | ||
private JsonNode node = null; | ||
private Iterator<JsonNode> iter = null; // iterator for JSON document array | ||
private MappingIterator<JsonNode> iterator; // iterator for JSON line objects | ||
|
||
public Segment(Path path) throws IOException { | ||
super(path); | ||
|
||
if (path.toString().endsWith(".gz")) { | ||
InputStream stream = new GZIPInputStream(Files.newInputStream(path, StandardOpenOption.READ), BUFFER_SIZE); | ||
bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); | ||
} else { | ||
bufferedReader = new BufferedReader(new FileReader(path.toString())); | ||
} | ||
|
||
ObjectMapper mapper = new ObjectMapper(); | ||
iterator = mapper.readerFor(JsonNode.class).readValues(bufferedReader); | ||
if (iterator.hasNext()) { | ||
node = iterator.next(); | ||
} | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public void readNext() throws NoSuchElementException { | ||
if (node == null) { | ||
throw new NoSuchElementException("JsonNode is empty"); | ||
} else if (node.isObject()) { | ||
bufferedRecord = (T) createNewDocument(node); | ||
if (iterator.hasNext()) { // if bufferedReader contains JSON line objects, we parse the next JSON into node | ||
node = iterator.next(); | ||
} else { | ||
atEOF = true; // there is no more JSON object in the bufferedReader | ||
} | ||
} else { | ||
LOG.error("Error: invalid JsonNode type"); | ||
throw new NoSuchElementException("Invalid JsonNode type"); | ||
} | ||
} | ||
|
||
protected Document createNewDocument(JsonNode json) { | ||
return new Document(node); | ||
} | ||
} | ||
|
||
/** | ||
* A document in a JSON collection. | ||
*/ | ||
public static class Document implements SourceDocument { | ||
private String id; | ||
private String contents; | ||
private String raw; | ||
|
||
public Document(JsonNode json) { | ||
this.raw = json.toPrettyString(); | ||
this.id = json.get("pid").asText(); | ||
this.contents = json.get("passage").asText(); | ||
} | ||
|
||
@Override | ||
public String id() { | ||
if (id == null) { | ||
throw new RuntimeException("Passage does not have the required \"pid\" field!"); | ||
} | ||
return id; | ||
} | ||
|
||
@Override | ||
public String contents() { | ||
if (contents == null) { | ||
throw new RuntimeException("Passage does not have the required \"passage\" field!"); | ||
} | ||
return contents; | ||
} | ||
|
||
@Override | ||
public String raw() { | ||
return raw; | ||
} | ||
|
||
@Override | ||
public boolean indexable() { | ||
return true; | ||
} | ||
} | ||
} |
Oops, something went wrong.