Skip to content

Commit b50f00e

Browse files
committed
add BuildExternalIndex to the App #17
1 parent b13a7b2 commit b50f00e

File tree

1 file changed

+17
-0
lines changed

1 file changed

+17
-0
lines changed

src/NaturalLanguage.App/Program.cs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class Program
2626
static readonly string wordCountsPath = Path.Combine(basePath, "word_counts.json");
2727
static readonly string indexPath = Path.Combine(basePath, "index.bin");
2828
static readonly string dawgIndexPath = Path.Combine(basePath, "dawg_index.bin");
29+
static readonly string externalIndexPath = Path.Combine(basePath, "external_index");
2930

3031
static readonly IDocumentDataSerializer<string> stringDataSerializer = new StringDocumentDataSerializer();
3132
static readonly IDocumentDataSerializer<IEnumerable<string>> tokenizedDataSerializer = new TokenizedDocumentDataSerializer();
@@ -54,6 +55,8 @@ static void Main(string[] args)
5455
HashWikipedia,
5556
IndexWikipedia,
5657
ProcessAndIndexWikipedia,
58+
PrintIndexStats,
59+
BuildExternalIndex,
5760
};
5861

5962
foreach (var action in actions)
@@ -87,6 +90,20 @@ static void ProcessAndIndexWikipedia()
8790
index.Serialize(file);
8891
}
8992

93+
/// <summary>
/// Builds an external index from the Wikipedia corpus and prints how many
/// search results the built index returns for the hash code of "the".
/// </summary>
/// <remarks>
/// Output goes under <c>externalIndexPath</c> (basePath + "external_index").
/// <c>wikiPath</c>, <c>charDataSerializer</c> and <c>PrepareOutputDirectory</c>
/// are declared outside this view.
/// NOTE(review): presumably PrepareOutputDirectory creates or clears the
/// target directory before the build — confirm against its definition.
/// </remarks>
static void BuildExternalIndex()
94+
{
95+
PrepareOutputDirectory(externalIndexPath);
96+
97+
// Corpus documents are read as character lists from the archive at wikiPath.
var reader = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);
98+
// The buildable index is given the same path that was prepared above.
var buildableIndex = new BlockedExternalBuildableIndex<int>(externalIndexPath);
99+
var indexBuilder = new IndexBuilder<int, IEnumerable<int>>(buildableIndex);
100+
var processor = new WikitextProcessor();
101+
// The processor's Transform output over the read corpus is what gets indexed.
indexBuilder.IndexCorpus(processor.Transform(reader.Read()));
102+
103+
var index = buildableIndex.Build();
104+
// Sanity check: look up the hash code of "the" and print the number of results.
Console.WriteLine($"The: {index.Search(TextHasher.CalculateHashCode("the".AsSpan())).Count()}");
105+
}
106+
90107
static void BuildDawgIndex()
91108
{
92109
var reader = new CorpusZipReader<IEnumerable<string>>(tokenizedPath, tokenizedDataSerializer);

0 commit comments

Comments
 (0)