Skip to content

Commit

Permalink
Add JsonStringTopicReader (#2032)
Browse files Browse the repository at this point in the history
* Add JsonStringTopicReader
  • Loading branch information
justram authored Dec 7, 2022
1 parent 2effd3c commit 6872c87
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


package io.anserini.search.topicreader;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * Topic reader for JSON Lines files whose topics are keyed by a string identifier.
 *
 * <p>Each non-blank line is parsed as one JSON value. The value of its {@code "id"} field becomes
 * the topic id; every other field is stored under its own name, with the value converted to text
 * via {@code JsonNode.asText()}.
 */
public class JsonStringTopicReader extends TopicReader<String> {

  // Shared mapper so that a new ObjectMapper is not constructed on every read() call.
  private static final ObjectMapper MAPPER = new ObjectMapper();

  /**
   * Creates a reader for the given topics file.
   *
   * @param topicFile path to the JSON Lines topics file, handed to the superclass constructor
   */
  public JsonStringTopicReader(Path topicFile) {
    super(topicFile);
  }

  /**
   * Reads all topics from {@code reader}, one JSON value per line.
   *
   * <p>Blank lines (including a trailing empty line) are skipped. If the same id occurs on more
   * than one line, the fields of the later line replace those of the earlier one.
   *
   * @param reader source of the JSON Lines content
   * @return topics sorted by id; each topic maps field name to field text, excluding {@code "id"}
   * @throws IOException if reading fails, a line is not valid JSON, or a line has no {@code "id"}
   *     field
   */
  @Override
  public SortedMap<String, Map<String, String>> read(BufferedReader reader) throws IOException {
    SortedMap<String, Map<String, String>> map = new TreeMap<>();
    String line;
    int lineNumber = 0;
    while ((line = reader.readLine()) != null) {
      lineNumber++;
      line = line.trim();
      if (line.isEmpty()) {
        // Nothing to parse; previously this fell through and failed with a NullPointerException.
        continue;
      }

      JsonNode lineNode = MAPPER.readerFor(JsonNode.class).readTree(line);
      JsonNode idNode = (lineNode == null) ? null : lineNode.get("id");
      if (idNode == null) {
        // Covers both objects lacking "id" and non-object values such as arrays or scalars.
        throw new IOException("Missing \"id\" field in topic at line " + lineNumber);
      }
      String topicID = idNode.asText();

      Map<String, String> fields = new HashMap<>();
      lineNode.fields().forEachRemaining(e -> {
        // The id is the map key already, so it is not repeated among the topic's fields.
        if (!"id".equals(e.getKey())) {
          fields.put(e.getKey(), e.getValue().asText());
        }
      });
      map.put(topicID, fields);
    }
    return map;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search.topicreader;

import org.junit.Test;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.Map;
import java.util.SortedMap;

import static org.junit.Assert.assertEquals;

/**
 * Checks that {@link JsonStringTopicReader} parses the two-topic sample file into the expected
 * ids and per-topic fields.
 */
public class JsonStringTopicReaderTest {

  @Test
  public void test() throws IOException {
    TopicReader<String> topicReader = new JsonStringTopicReader(
        Paths.get("src/test/resources/sample_topics/stringID_topics.jsonl"));
    SortedMap<String, Map<String, String>> parsed = topicReader.read();

    // Exactly two topics, ordered by their string ids.
    assertEquals(2, parsed.size());
    assertEquals("topic1", parsed.firstKey());
    assertEquals("topic2", parsed.lastKey());

    // Fields of the first topic.
    Map<String, String> firstTopic = parsed.get(parsed.firstKey());
    assertEquals("this is the contents 1.", firstTopic.get("contents"));
    assertEquals("topic1 field1 content", firstTopic.get("field1"));
    assertEquals("topic1 field2 content", firstTopic.get("field2"));

    // Fields of the last topic.
    Map<String, String> lastTopic = parsed.get(parsed.lastKey());
    assertEquals("this is the contents 2.", lastTopic.get("contents"));
    assertEquals("topic2 field1 content", lastTopic.get("field1"));
    assertEquals("topic2 field2 content", lastTopic.get("field2"));
  }
}
2 changes: 2 additions & 0 deletions src/test/resources/sample_topics/stringID_topics.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "topic1", "contents": "this is the contents 1.", "field1": "topic1 field1 content", "field2": "topic1 field2 content"}
{"id": "topic2", "contents": "this is the contents 2.", "field1": "topic2 field1 content", "field2": "topic2 field2 content"}

0 comments on commit 6872c87

Please sign in to comment.