Skip to content

Commit 61f1466

Browse files
committed
implement ExternalIndexSerializer #17
1 parent b50f00e commit 61f1466

File tree

4 files changed

+94
-7
lines changed

4 files changed

+94
-7
lines changed

src/NaturalLanguage.App/Program.cs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ static void Main(string[] args)
5757
ProcessAndIndexWikipedia,
5858
PrintIndexStats,
5959
BuildExternalIndex,
60+
PrintExternalIndexStats,
6061
};
6162

6263
foreach (var action in actions)
@@ -100,7 +101,17 @@ static void BuildExternalIndex()
100101
var processor = new WikitextProcessor();
101102
indexBuilder.IndexCorpus(processor.Transform(reader.Read()));
102103

103-
var index = buildableIndex.Build();
104+
using var index = buildableIndex.Build();
105+
106+
var serializer = new ExternalIndexSerializer<int>();
107+
serializer.Serialize(externalIndexPath, index);
108+
}
109+
110+
/// <summary>
/// Loads the external index stored under <c>externalIndexPath</c> and prints how many
/// results a search for the hash of the term "the" yields.
/// </summary>
static void PrintExternalIndexStats()
{
    // The deserialized index is disposed when the method returns.
    using var index = new ExternalIndexSerializer<int>().Deserialize(externalIndexPath);

    var termHash = TextHasher.CalculateHashCode("the".AsSpan());
    var hitCount = index.Search(termHash).Count();

    Console.WriteLine($"The: {hitCount}");
}
106117

src/NaturalLanguage.WPF/MainWindow.xaml.cs

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,13 @@ public partial class MainWindow : Window
3030
static readonly string basePath = @"F:\wikipedia";
3131
static readonly string wikiPath = IO::Path.Combine(basePath, "enwiki");
3232
static readonly string indexPath = IO::Path.Combine(basePath, "index.bin");
33+
static readonly string externalIndexPath = IO::Path.Combine(basePath, "external_index");
3334

3435
private readonly Lazy<BooleanSearchEngine<int>> engine;
3536
private readonly Lazy<CorpusMetadata> metadata;
3637
private readonly CorpusZipReader<string> reader;
38+
39+
private bool useExternalIndex = true;
3740

3841
Stopwatch timer = new Stopwatch();
3942

@@ -102,12 +105,35 @@ public DocumentIdTemplate(DocumentId id, CorpusMetadata metadata)
102105
/// <summary>Returns the value of the <c>title</c> field as the display text.</summary>
public override string ToString()
{
    return title;
}
103106
}
104107

105-
private BooleanSearchEngine<int> LoadSearchEngine()
108+
/// <summary>
/// Opens the file at <c>indexPath</c> for reading and deserializes a
/// <c>DictionaryIndex&lt;int&gt;</c> from it. The file is closed before returning.
/// </summary>
private ISearchableIndex<int> LoadInMemoryIndex()
{
    using (var indexFile = IO::File.OpenRead(indexPath))
    {
        return DictionaryIndex<int>.Deserialize(indexFile);
    }
}
113+
114+
/// <summary>
/// Deserializes the external index located under <c>externalIndexPath</c> using a
/// freshly created <c>ExternalIndexSerializer&lt;int&gt;</c>.
/// </summary>
private ISearchableIndex<int> LoadExternalIndex() =>
    new ExternalIndexSerializer<int>().Deserialize(externalIndexPath);
119+
120+
/// <summary>
/// Loads the search index, choosing the external index when <c>useExternalIndex</c>
/// is set and the in-memory index otherwise.
/// </summary>
private ISearchableIndex<int> LoadIndex()
{
    return useExternalIndex
        ? LoadExternalIndex()
        : LoadInMemoryIndex();
}
131+
132+
private BooleanSearchEngine<int> LoadSearchEngine()
133+
{
108134
var timer = new Stopwatch();
109135
timer.Start();
110-
var index = DictionaryIndex<int>.Deserialize(file);
136+
var index = LoadIndex();
111137
timer.Stop();
112138
Log($"Index loaded in {timer.Elapsed:g}");
113139

src/NaturalLanguageTools/Indexing/BlockedExternalBuildableIndex.cs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,14 @@ public class BlockedExternalBuildableIndex<T> : IBuildableIndex<T>, IDisposable
1717
where T : IComparable<T>
1818
{
1919
private readonly BuildableIndexManager indexManager;
20-
private readonly IFileSystem fileSystem;
2120
private readonly Stream stream;
2221

2322
/// <summary>
/// Creates an index builder for <paramref name="basePath"/>, delegating to the
/// two-argument constructor with a newly created <c>FileSystem</c>.
/// </summary>
public BlockedExternalBuildableIndex(string basePath)
    : this(basePath, new FileSystem())
{
}
2423

2524
/// <summary>
/// Creates an index builder for <paramref name="basePath"/>: constructs a
/// <c>BuildableIndexManager</c> with the same base path and file system, then opens the
/// index file inside <paramref name="basePath"/> for reading and writing through
/// <paramref name="fileSystem"/>, creating the file when it does not exist.
/// </summary>
/// <param name="basePath">Directory that holds the index file.</param>
/// <param name="fileSystem">File-system abstraction used to open the index file.</param>
// NOTE(review): FileMode.OpenOrCreate keeps the content of an existing file; confirm that
// rebuilding over a previously written index is expected to overwrite it in place.
public BlockedExternalBuildableIndex(string basePath, IFileSystem fileSystem)
2625
{
2726
this.indexManager = new BuildableIndexManager(basePath, fileSystem);
28-
this.fileSystem = fileSystem;
29-
string indexPath = Path.Combine(basePath, "corpus.index");
27+
string indexPath = Path.Combine(basePath, ExternalIndexSerializer<T>.IndexFileName);
3028
this.stream = fileSystem.File.Open(indexPath, FileMode.OpenOrCreate, FileAccess.ReadWrite);
3129
}
3230

@@ -36,7 +34,7 @@ public void IndexTerm(DocumentId id, T term, int position)
3634
index.IndexTerm(id, term, position);
3735
}
3836

39-
public ISearchableIndex<T> Build()
37+
public ExternalIndex<T> Build()
4038
{
4139
var composer = new ExternalIndexComposer<T>(stream);
4240

@@ -132,6 +130,11 @@ public void Dispose()
132130
indexManager.Dispose();
133131
}
134132

133+
/// <summary>
/// Explicit <c>IBuildableIndex&lt;T&gt;</c> implementation that forwards to the public
/// <c>Build()</c> method and returns its result through the interface type.
/// </summary>
ISearchableIndex<T> IBuildableIndex<T>.Build() => Build();
137+
135138
private struct IndexInfo
136139
{
137140
public string Path;
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
using System.Collections.Generic;
2+
using System.IO;
3+
using System.IO.Abstractions;
4+
5+
using ProtoBuf;
6+
7+
namespace NaturalLanguageTools.Indexing
8+
{
9+
/// <summary>
/// Reads and writes the files that make up an <see cref="ExternalIndex{T}"/> inside a
/// base directory: a postings file (<see cref="IndexFileName"/>) and a term-to-offset
/// table (<see cref="OffsetsFileName"/>) encoded with protobuf-net.
/// </summary>
/// <typeparam name="T">Term key type used by the offsets table.</typeparam>
public class ExternalIndexSerializer<T>
{
    /// <summary>Name of the postings file inside the base directory.</summary>
    public const string IndexFileName = "corpus.index";

    /// <summary>Name of the offsets file inside the base directory.</summary>
    public const string OffsetsFileName = "corpus.offsets";

    // Held as the abstraction (not the concrete FileSystem) so a substitute file system
    // can be supplied, matching how BlockedExternalBuildableIndex accepts IFileSystem.
    private readonly IFileSystem fileSystem;

    /// <summary>Creates a serializer that works against a new <see cref="FileSystem"/>.</summary>
    public ExternalIndexSerializer() : this(new FileSystem()) { }

    /// <summary>Creates a serializer that performs all file access through <paramref name="fileSystem"/>.</summary>
    /// <param name="fileSystem">File-system abstraction used to open the index files.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="fileSystem"/> is null.</exception>
    public ExternalIndexSerializer(IFileSystem fileSystem)
    {
        this.fileSystem = fileSystem ?? throw new System.ArgumentNullException(nameof(fileSystem));
    }

    /// <summary>
    /// Opens the postings file and loads the offsets table found under
    /// <paramref name="basePath"/>, and returns an index built from both.
    /// </summary>
    /// <param name="basePath">Directory containing the postings and offsets files.</param>
    /// <returns>
    /// An <see cref="ExternalIndex{T}"/> constructed with the loaded offsets and the open
    /// postings stream. The stream is left open for the returned index; presumably the
    /// index closes it when disposed (verify against ExternalIndex).
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="basePath"/> is null.</exception>
    public ExternalIndex<T> Deserialize(string basePath)
    {
        if (basePath is null)
        {
            throw new System.ArgumentNullException(nameof(basePath));
        }

        var postingStream = fileSystem.File.OpenRead(Path.Combine(basePath, IndexFileName));
        try
        {
            using var offsetsStream = fileSystem.File.OpenRead(Path.Combine(basePath, OffsetsFileName));
            var offsets = DeserializeOffsets(offsetsStream);
            return new ExternalIndex<T>(offsets, postingStream);
        }
        catch
        {
            // Nothing else references the postings stream yet, so release it here when
            // loading the offsets or constructing the index fails.
            postingStream.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Writes the offsets table of <paramref name="index"/> to the offsets file under
    /// <paramref name="basePath"/>. Only the offsets are written by this method; the
    /// postings file is not touched.
    /// </summary>
    /// <param name="basePath">Directory that receives the offsets file.</param>
    /// <param name="index">Index whose <c>Offsets</c> are persisted.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="basePath"/> or <paramref name="index"/> is null.
    /// </exception>
    public void Serialize(string basePath, ExternalIndex<T> index)
    {
        if (basePath is null)
        {
            throw new System.ArgumentNullException(nameof(basePath));
        }

        // Validate before touching the disk so a bad call cannot leave an empty file behind.
        if (index is null)
        {
            throw new System.ArgumentNullException(nameof(index));
        }

        // Create (rather than OpenWrite) truncates an existing file; OpenWrite would keep
        // the tail of a longer previous payload and corrupt the data read back later.
        using var offsetsStream = fileSystem.File.Create(Path.Combine(basePath, OffsetsFileName));
        SerializeOffsets(offsetsStream, index.Offsets);
    }

    /// <summary>Writes <paramref name="offsets"/> to <paramref name="stream"/> using protobuf-net.</summary>
    /// <param name="stream">Destination stream.</param>
    /// <param name="offsets">Term-to-offset table to write.</param>
    public static void SerializeOffsets(Stream stream, IDictionary<T, long> offsets)
    {
        Serializer.Serialize(stream, offsets);
    }

    /// <summary>Reads a term-to-offset table from <paramref name="stream"/> using protobuf-net.</summary>
    /// <param name="stream">Source stream positioned at the start of the payload.</param>
    /// <returns>The decoded table; an empty dictionary if the serializer produces no instance.</returns>
    public static IDictionary<T, long> DeserializeOffsets(Stream stream)
    {
        // Guard in case the serializer yields null (e.g. for an empty payload), so callers
        // always receive a usable dictionary.
        return Serializer.Deserialize<Dictionary<T, long>>(stream) ?? new Dictionary<T, long>();
    }
}
47+
}

0 commit comments

Comments
 (0)