@@ -26,6 +26,7 @@ class Program
26
26
// Output locations used by the actions in this class; each is a fixed name joined onto basePath.
static readonly string wordCountsPath = Path . Combine ( basePath , "word_counts.json" ) ;
27
27
static readonly string indexPath = Path . Combine ( basePath , "index.bin" ) ;
28
28
static readonly string dawgIndexPath = Path . Combine ( basePath , "dawg_index.bin" ) ;
29
// Unlike the names above, this one has no file extension. BuildExternalIndex passes it to
// PrepareOutputDirectory and to the BlockedExternalBuildableIndex constructor, so it is
// presumably a directory rather than a single file (confirm against those helpers).
+ static readonly string externalIndexPath = Path . Combine ( basePath , "external_index" ) ;
29
30
30
31
// Shared serializer instances, one per document payload type (string, and IEnumerable<string>).
static readonly IDocumentDataSerializer < string > stringDataSerializer = new StringDocumentDataSerializer ( ) ;
31
32
// BuildDawgIndex hands this one to CorpusZipReader together with tokenizedPath.
static readonly IDocumentDataSerializer < IEnumerable < string > > tokenizedDataSerializer = new TokenizedDocumentDataSerializer ( ) ;
@@ -54,6 +55,8 @@ static void Main(string[] args)
54
55
HashWikipedia ,
55
56
IndexWikipedia ,
56
57
ProcessAndIndexWikipedia ,
58
+ PrintIndexStats ,
59
+ BuildExternalIndex ,
57
60
} ;
58
61
59
62
foreach ( var action in actions )
@@ -87,6 +90,20 @@ static void ProcessAndIndexWikipedia()
87
90
index . Serialize ( file ) ;
88
91
}
89
92
93
// Indexes the corpus read from wikiPath into a BlockedExternalBuildableIndex rooted at
// externalIndexPath, then prints how many results a lookup of the term "the" returns.
// Takes no arguments and returns nothing; the only visible output is the console line.
+ static void BuildExternalIndex ( )
94
+ {
95
// Runs before the index is constructed on the same path. Presumably creates or clears
// the target directory; PrepareOutputDirectory is defined outside this view.
+ PrepareOutputDirectory ( externalIndexPath ) ;
96
+
97
// Documents are read as IList<char> using charDataSerializer.
+ var reader = new CorpusZipReader < IList < char > > ( wikiPath , charDataSerializer ) ;
98
// The same buildableIndex instance is passed to the IndexBuilder and later asked to Build().
+ var buildableIndex = new BlockedExternalBuildableIndex < int > ( externalIndexPath ) ;
99
+ var indexBuilder = new IndexBuilder < int , IEnumerable < int > > ( buildableIndex ) ;
100
+ var processor = new WikitextProcessor ( ) ;
101
// The reader output goes through WikitextProcessor.Transform before it is indexed.
// NOTE(review): the IndexBuilder type arguments suggest the transformed documents are
// sequences of int term hashes; confirm against WikitextProcessor.
+ indexBuilder . IndexCorpus ( processor . Transform ( reader . Read ( ) ) ) ;
102
+
103
+ var index = buildableIndex . Build ( ) ;
104
// The lookup key is the hash of "the" from TextHasher.CalculateHashCode, not the string itself.
// Count() enumerates the whole search result just to report its size.
+ Console . WriteLine ( $ "The: { index . Search ( TextHasher . CalculateHashCode ( "the" . AsSpan ( ) ) ) . Count ( ) } ") ;
105
+ }
106
+
90
107
static void BuildDawgIndex ( )
91
108
{
92
109
var reader = new CorpusZipReader < IEnumerable < string > > ( tokenizedPath , tokenizedDataSerializer ) ;
0 commit comments