-
Notifications
You must be signed in to change notification settings - Fork 477
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial support for MS MARCO V2 passage and document collections (#1571)
- Loading branch information
Showing
12 changed files
with
581 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Anserini: Guide to Working with the MS MARCO V2 Collections | ||
|
||
This guide presents information for working with V2 of the MS MARCO passage and document test collections. | ||
|
||
Indexing the passage collection, which is 20 GB compressed: | ||
|
||
``` | ||
sh target/appassembler/bin/IndexCollection -collection MsMarcoPassageV2Collection \ | ||
-generator DefaultLuceneDocumentGenerator -threads 18 \ | ||
-input collections/msmarco_v2_passage \ | ||
-index indexes/msmarco-passage-v2 \ | ||
-storePositions -storeDocvectors -storeRaw | ||
``` | ||
|
||
Adjust `-threads` as appropriate. | ||
The above configuration, on a 2017 iMac Pro with SSD, takes around 30min. | ||
|
||
The complete index occupies 72 GB (138,364,198 passages). | ||
It's big because it includes positions (for phrase queries), document vectors (for relevance feedback), and a complete copy of the collection itself. | ||
The index size can be reduced by removing the options `-storePositions`, `-storeDocvectors`, `-storeRaw` as appropriate. | ||
For reference: | ||
|
||
+ Without any of the three above options, index size reduces to 12 GB. | ||
+ With just `-storeRaw`, index size reduces to 47 GB. This setting contains the raw JSON document, which makes it suitable for use as first-stage retrieval to support downstream rerankers. Bloat compared to compressed size of raw collection is due to support for per-document random access. | ||
|
||
Indexing the document collection, which is 32 GB compressed: | ||
|
||
``` | ||
sh target/appassembler/bin/IndexCollection -collection MsMarcoDocV2Collection \ | ||
-generator DefaultLuceneDocumentGenerator -threads 18 \ | ||
-input collections/msmarco_v2_doc \ | ||
-index indexes/msmarco-doc-v2 \ | ||
-storePositions -storeDocvectors -storeRaw | ||
``` | ||
|
||
Same instructions as above. | ||
On the same machine, indexing takes around 40min. | ||
Complete index occupies 134 GB (11,959,635 documents). | ||
Index size can be reduced by removing the options `-storePositions`, `-storeDocvectors`, `-storeRaw` as appropriate. | ||
For reference: | ||
|
||
+ Without any of the three above options, index size reduces to 9.4 GB. | ||
+ With just `-storeRaw`, index size reduces to 73 GB. This setting contains the raw JSON document, which makes it suitable for use as first-stage retrieval to support downstream rerankers. Bloat compared to compressed size of raw collection is due to support for per-document random access; evidently, the JSON docs don't compress well. | ||
|
||
Perform a run on the dev queries: | ||
|
||
``` | ||
target/appassembler/bin/SearchCollection -index indexes/msmarco-doc-v2 \ | ||
-topicreader TsvInt -topics collections/docv2_dev_queries.tsv \ | ||
-output runs/run.msmarco-doc-v2.dev.txt \ | ||
-bm25 -hits 100 | ||
``` | ||
|
||
Evaluation: | ||
|
||
```bash | ||
$ tools/eval/trec_eval.9.0.4/trec_eval -c -m map -m recall.100 -m recip_rank collections/docv2_dev_qrels.uniq.tsv runs/run.msmarco-doc-v2.dev.txt | ||
map all 0.1552 | ||
recip_rank all 0.1572 | ||
recall_100 all 0.5956 | ||
``` | ||
|
||
|
||
## Reproduction Log[*](reproducibility.md) | ||
|
152 changes: 152 additions & 0 deletions
152
src/main/java/io/anserini/collection/MsMarcoDocV2Collection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for reproducible information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.collection; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.MappingIterator; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.StandardOpenOption; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.Map; | ||
import java.util.NoSuchElementException; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
public class MsMarcoDocV2Collection extends DocumentCollection<MsMarcoDocV2Collection.Document> { | ||
private static final Logger LOG = LogManager.getLogger(JsonCollection.class); | ||
|
||
public MsMarcoDocV2Collection(Path path){ | ||
this.path = path; | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public FileSegment<MsMarcoDocV2Collection.Document> createFileSegment(Path p) throws IOException { | ||
return new Segment(p); | ||
} | ||
|
||
/** | ||
* A file in a JSON collection, typically containing multiple documents. | ||
*/ | ||
public static class Segment<T extends Document> extends FileSegment<T> { | ||
private JsonNode node = null; | ||
private Iterator<JsonNode> iter = null; // iterator for JSON document array | ||
private MappingIterator<JsonNode> iterator; // iterator for JSON line objects | ||
|
||
public Segment(Path path) throws IOException { | ||
super(path); | ||
|
||
if (path.toString().endsWith(".gz")) { | ||
InputStream stream = new GZIPInputStream(Files.newInputStream(path, StandardOpenOption.READ), BUFFER_SIZE); | ||
bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); | ||
} else { | ||
bufferedReader = new BufferedReader(new FileReader(path.toString())); | ||
} | ||
|
||
ObjectMapper mapper = new ObjectMapper(); | ||
iterator = mapper.readerFor(JsonNode.class).readValues(bufferedReader); | ||
if (iterator.hasNext()) { | ||
node = iterator.next(); | ||
} | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public void readNext() throws NoSuchElementException { | ||
if (node == null) { | ||
throw new NoSuchElementException("JsonNode is empty"); | ||
} else if (node.isObject()) { | ||
bufferedRecord = (T) createNewDocument(node); | ||
if (iterator.hasNext()) { // if bufferedReader contains JSON line objects, we parse the next JSON into node | ||
node = iterator.next(); | ||
} else { | ||
atEOF = true; // there is no more JSON object in the bufferedReader | ||
} | ||
} else { | ||
LOG.error("Error: invalid JsonNode type"); | ||
throw new NoSuchElementException("Invalid JsonNode type"); | ||
} | ||
} | ||
|
||
protected Document createNewDocument(JsonNode json) { | ||
return new Document(node); | ||
} | ||
} | ||
|
||
/** | ||
* A document in a JSON collection. | ||
*/ | ||
public static class Document implements SourceDocument { | ||
private String id; | ||
private String raw; | ||
private Map<String, String> fields; | ||
|
||
public Document(JsonNode json) { | ||
this.raw = json.toPrettyString(); | ||
this.fields = new HashMap<>(); | ||
|
||
json.fields().forEachRemaining( e -> { | ||
if ("docid".equals(e.getKey())) { | ||
this.id = json.get("docid").asText(); | ||
} else { | ||
this.fields.put(e.getKey(), e.getValue().asText()); | ||
} | ||
}); | ||
} | ||
|
||
@Override | ||
public String id() { | ||
if (id == null) { | ||
throw new RuntimeException("Document does not have the required \"docid\" field!"); | ||
} | ||
return id; | ||
} | ||
|
||
@Override | ||
public String contents() { | ||
if (!fields.containsKey("url") || !fields.containsKey("title") || | ||
!fields.containsKey("headings") || !fields.containsKey("body")) { | ||
throw new RuntimeException("Document is missing required fields!"); | ||
} | ||
|
||
return fields.get("url") + " " + fields.get("title") + " " + fields.get("headings") + " " + fields.get("body"); | ||
} | ||
|
||
@Override | ||
public String raw() { | ||
return raw; | ||
} | ||
|
||
@Override | ||
public boolean indexable() { | ||
return true; | ||
} | ||
} | ||
} |
143 changes: 143 additions & 0 deletions
143
src/main/java/io/anserini/collection/MsMarcoPassageV2Collection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for reproducible information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.collection; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.MappingIterator; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.StandardOpenOption; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.Map; | ||
import java.util.NoSuchElementException; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
public class MsMarcoPassageV2Collection extends DocumentCollection<MsMarcoPassageV2Collection.Document> { | ||
private static final Logger LOG = LogManager.getLogger(JsonCollection.class); | ||
|
||
public MsMarcoPassageV2Collection(Path path){ | ||
this.path = path; | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public FileSegment<MsMarcoPassageV2Collection.Document> createFileSegment(Path p) throws IOException { | ||
return new Segment(p); | ||
} | ||
|
||
/** | ||
* A file in a JSON collection, typically containing multiple documents. | ||
*/ | ||
public static class Segment<T extends Document> extends FileSegment<T>{ | ||
private JsonNode node = null; | ||
private Iterator<JsonNode> iter = null; // iterator for JSON document array | ||
private MappingIterator<JsonNode> iterator; // iterator for JSON line objects | ||
|
||
public Segment(Path path) throws IOException { | ||
super(path); | ||
|
||
if (path.toString().endsWith(".gz")) { | ||
InputStream stream = new GZIPInputStream(Files.newInputStream(path, StandardOpenOption.READ), BUFFER_SIZE); | ||
bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)); | ||
} else { | ||
bufferedReader = new BufferedReader(new FileReader(path.toString())); | ||
} | ||
|
||
ObjectMapper mapper = new ObjectMapper(); | ||
iterator = mapper.readerFor(JsonNode.class).readValues(bufferedReader); | ||
if (iterator.hasNext()) { | ||
node = iterator.next(); | ||
} | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public void readNext() throws NoSuchElementException { | ||
if (node == null) { | ||
throw new NoSuchElementException("JsonNode is empty"); | ||
} else if (node.isObject()) { | ||
bufferedRecord = (T) createNewDocument(node); | ||
if (iterator.hasNext()) { // if bufferedReader contains JSON line objects, we parse the next JSON into node | ||
node = iterator.next(); | ||
} else { | ||
atEOF = true; // there is no more JSON object in the bufferedReader | ||
} | ||
} else { | ||
LOG.error("Error: invalid JsonNode type"); | ||
throw new NoSuchElementException("Invalid JsonNode type"); | ||
} | ||
} | ||
|
||
protected Document createNewDocument(JsonNode json) { | ||
return new Document(node); | ||
} | ||
} | ||
|
||
/** | ||
* A document in a JSON collection. | ||
*/ | ||
public static class Document implements SourceDocument { | ||
private String id; | ||
private String contents; | ||
private String raw; | ||
|
||
public Document(JsonNode json) { | ||
this.raw = json.toPrettyString(); | ||
this.id = json.get("pid").asText(); | ||
this.contents = json.get("passage").asText(); | ||
} | ||
|
||
@Override | ||
public String id() { | ||
if (id == null) { | ||
throw new RuntimeException("Passage does not have the required \"pid\" field!"); | ||
} | ||
return id; | ||
} | ||
|
||
@Override | ||
public String contents() { | ||
if (contents == null) { | ||
throw new RuntimeException("Passage does not have the required \"passage\" field!"); | ||
} | ||
return contents; | ||
} | ||
|
||
@Override | ||
public String raw() { | ||
return raw; | ||
} | ||
|
||
@Override | ||
public boolean indexable() { | ||
return true; | ||
} | ||
} | ||
} |
Oops, something went wrong.