Skip to content

Commit 6e585f8

Browse files
authored
Support postings list compression using varint #19 (#24)
* implement PostingsListWriter support for varint postings lists (#19) * add IPostingsListBuilder to represent postings list builder functionality * fix an issue where postings lists were reused between indices * implement VarintPostingsListBuilder to support varint postings lists * minor fix
1 parent a7a3818 commit 6e585f8

13 files changed

+247
-28
lines changed

src/InformationRetrieval.App/Program.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ static void BuildExternalIndex()
100100

101101
var reader = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);
102102
using var buildableIndex = new BlockedExternalBuildableIndex<int>(
103-
DictonaryBasedExternalBuildableIndex<int>.GetCreateMethod(RangeThreshold),
103+
DictonaryBasedExternalBuildableIndex<int>.GetCreateMethodWithVarintPostingsLists(),
104104
externalIndexPath);
105105
var indexBuilder = new IndexBuilder<int, IEnumerable<int>>(buildableIndex);
106106
var processor = new WikitextProcessor();

src/InformationRetrieval.Test/Indexing/External/BlockedExternalBuildableIndexTest.cs

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
using System.IO.Abstractions.TestingHelpers;
1+
using System;
2+
using System.IO;
3+
using System.IO.Abstractions.TestingHelpers;
24
using System.Linq;
35
using Xunit;
46

@@ -12,12 +14,29 @@ public class BlockedExternalBuildableIndexTest
1214
private const string path = @"C:\path";
1315

1416
[Fact]
15-
public void BuildBlockedExternalIndexTest()
// Runs the shared BuildBlockedExternalIndexTest scenario with the sort-based index factory.
public void BuildBlockedExternalIndexTest_SortBased()
    => BuildBlockedExternalIndexTest(SortBasedExternalBuildableIndex<string>.CreateMethod);
21+
/// <summary>
/// Runs the shared BuildBlockedExternalIndexTest scenario with a dictionary-based index
/// that uses mixed postings lists and a range threshold of 3.
/// </summary>
[Fact]
public void BuildBlockedExternalIndexTest_DictionaryBased_MixedPostingsList()
    => BuildBlockedExternalIndexTest(
        DictonaryBasedExternalBuildableIndex<string>.GetCreateMethodWithMixedPostingsLists(3));
27+
/// <summary>
/// Runs the shared BuildBlockedExternalIndexTest scenario with a dictionary-based index
/// that uses varint postings lists.
/// </summary>
[Fact]
public void BuildBlockedExternalIndexTest_DictionaryBased_VarintPostingsList()
    => BuildBlockedExternalIndexTest(
        DictonaryBasedExternalBuildableIndex<string>.GetCreateMethodWithVarintPostingsLists());
33+
34+
private void BuildBlockedExternalIndexTest(Func<Stream, IExternalBuildableIndex<string>> createIndex)
1635
{
1736
var docs = new (DocumentId Id, string[] Text)[]
1837
{
19-
(new DocumentId(0, 1), "d e f a".Split()),
2038
(new DocumentId(0, 0), "a b c d".Split()),
39+
(new DocumentId(0, 1), "d e f a".Split()),
2140
(new DocumentId(0, 2), "e e f d".Split()),
2241

2342
(new DocumentId(1, 0), "a b".Split()),
@@ -30,9 +49,7 @@ public void BuildBlockedExternalIndexTest()
3049
var fileSystem = new MockFileSystem();
3150
fileSystem.Directory.CreateDirectory(path);
3251

33-
var buildableIndex = new BlockedExternalBuildableIndex<string>(
34-
SortBasedExternalBuildableIndex<string>.CreateMethod,
35-
path, fileSystem);
52+
var buildableIndex = new BlockedExternalBuildableIndex<string>(createIndex, path, fileSystem);
3653

3754
foreach (var doc in docs)
3855
{

src/InformationRetrieval.Test/Indexing/External/DictionaryBasedExternalBuildableIndexTest.cs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33

44
using Xunit;
55

6-
using InformationRetrieval.Indexing.External;
76
using Corpus;
7+
using InformationRetrieval.Indexing.External;
8+
using InformationRetrieval.Indexing.PostingsList;
89

910
namespace InformationRetrieval.Test.Indexing.External
1011
{
@@ -16,7 +17,8 @@ public class DictionaryBasedExternalBuildableIndexTests : IndexUnitTestsBase<Ext
1617
public void ExternalIndexBuildTest()
1718
{
1819
var stream = new MemoryStream();
19-
var buildableIndex = new DictonaryBasedExternalBuildableIndex<string>(RangeThreshold, stream);
20+
var postingsListBuilder = new MixedPostingsListBuilder<string>(RangeThreshold);
21+
var buildableIndex = new DictonaryBasedExternalBuildableIndex<string>(postingsListBuilder, stream);
2022

2123
var docs = new (DocumentId Id, string[] Text)[]
2224
{
@@ -54,7 +56,8 @@ public void ExternalIndexBuildTest()
5456
protected override ExternalIndex<string> CreateIndex(string[][] corpus)
5557
{
5658
var stream = new MemoryStream();
57-
var buildableIndex = new DictonaryBasedExternalBuildableIndex<string>(RangeThreshold, stream);
59+
var postingsListBuilder = new MixedPostingsListBuilder<string>(RangeThreshold);
60+
var buildableIndex = new DictonaryBasedExternalBuildableIndex<string>(postingsListBuilder, stream);
5861
IndexHelper.BuildIndex(buildableIndex, corpus);
5962
return buildableIndex.BuildExternalIndex();
6063
}

src/InformationRetrieval.Test/Indexing/PostingsList/MixedPostingsListBuilderTest.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ public int GetCount()
5454

5555
public void IndexTerm(DocumentId id, string term, int position)
5656
{
57-
builder.Add(id, term);
57+
builder.AddTerm(id, term);
5858
}
5959

6060
public IReadOnlyCollection<DocumentId> Search(string word)

src/InformationRetrieval.Test/Indexing/PostingsList/PostingsListTest.cs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,38 @@ public void ChainedUncompressedReadWriteTest()
141141
Assert.Equal(chain, deserialized);
142142
}
143143

/// <summary>
/// Writes a chain made of two varint postings lists and one plain document-id array,
/// reads it back from the same stream, and checks that the result is a
/// <see cref="VarintPostingsList"/> holding the same ids in the same order.
/// </summary>
[Fact]
public void ChainedVarintReadWriteTest()
{
    var stream = new MemoryStream();

    var chain = new ListChain<DocumentId>()
    {
        new VarintPostingsList() { 0, 1, 2 },
        GetDocIds(10, 11),
        new VarintPostingsList() { 12, 13, 14, 15, 100, 111 },
    };

    using var writer = new PostingsListWriter(stream);
    writer.Write(chain);

    // Rewind so the reader starts at the data that was just written.
    stream.Seek(0, SeekOrigin.Begin);

    using var reader = new PostingsListReader(stream, leaveOpen: false);
    var count = reader.ReadCount(0);
    var deserialized = reader.Read(0);

    Assert.Equal(chain.Count, count);
    // IsAssignableFrom reports the actual runtime type on failure, unlike Assert.True(x is T).
    Assert.IsAssignableFrom<VarintPostingsList>(deserialized);
    Assert.Equal(chain.ToArray(), deserialized.ToArray());
}
175+
144176
public static DocumentId[] GetDocIds(params uint[] ids)
145177
{
146178
return ids.Select(id => new DocumentId(id)).ToArray();
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
using System;
2+
using System.Collections.Generic;
3+
4+
using Corpus;
5+
using InformationRetrieval.Indexing;
6+
using InformationRetrieval.Indexing.PostingsList;
7+
8+
namespace InformationRetrieval.Test.Indexing.PostingsList
9+
{
/// <summary>
/// Plugs <c>VarintPostingsListBuilder</c> into the <c>IndexUnitTestsBase</c> harness by
/// wrapping the builder in a minimal in-memory index.
/// </summary>
/// <remarks>
/// The class must be public: xUnit only discovers tests on public classes, so an
/// internal class here would be skipped silently.
/// </remarks>
public class VarintPostingsListBuilderTest : IndexUnitTestsBase<VarintPostingsListBuilderTest.Index>
{
    /// <summary>Creates the adapter index and feeds it the given corpus.</summary>
    /// <param name="corpus">Corpus passed on to <c>IndexHelper.BuildIndex</c>.</param>
    /// <returns>The populated adapter index.</returns>
    protected override Index CreateIndex(string[][] corpus)
    {
        var index = new Index();
        IndexHelper.BuildIndex(index, corpus);
        return index;
    }

    /// <summary>
    /// Adapter that exposes a <c>VarintPostingsListBuilder</c> both as a buildable
    /// and as a searchable index.
    /// </summary>
    public class Index : IBuildableIndex<string>, ISearchableIndex<string>
    {
        private readonly VarintPostingsListBuilder<string> builder = new VarintPostingsListBuilder<string>();

        /// <summary>The adapter is searchable as-is, so building returns the same instance.</summary>
        public ISearchableIndex<string> Build()
        {
            return this;
        }

        /// <summary>Returns the documents collection tracked by the builder.</summary>
        public IReadOnlyCollection<DocumentId> GetAll()
        {
            return builder.Documents;
        }

        /// <summary>Returns the size of the postings list for <paramref name="word"/>, or 0 if there is none.</summary>
        public int GetCount(string word)
        {
            if (builder.VarintPostingsLists.TryGetValue(word, out var postings))
            {
                return postings.Count;
            }

            return 0;
        }

        /// <summary>Returns the number of documents tracked by the builder.</summary>
        public int GetCount()
        {
            return builder.Documents.Count;
        }

        /// <summary>Forwards the term to the builder; <paramref name="position"/> is not used.</summary>
        public void IndexTerm(DocumentId id, string term, int position)
        {
            builder.AddTerm(id, term);
        }

        /// <summary>Returns the postings list for <paramref name="word"/>, or an empty collection if there is none.</summary>
        public IReadOnlyCollection<DocumentId> Search(string word)
        {
            if (builder.VarintPostingsLists.TryGetValue(word, out var postings))
            {
                return postings;
            }

            return Array.Empty<DocumentId>();
        }
    }
}
64+
}

src/InformationRetrieval/Indexing/External/BlockedExternalBuildableIndex.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,10 @@ public void Clear()
198198
}
199199
indices.Clear();
200200

201-
fileSystem.Directory.Delete(basePath, recursive: true);
201+
if (fileSystem.Directory.Exists(basePath))
202+
{
203+
fileSystem.Directory.Delete(basePath, recursive: true);
204+
}
202205
}
203206

204207
public void Dispose()

src/InformationRetrieval/Indexing/External/DictonaryBasedExternalBuildableIndex.cs

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,18 @@ namespace InformationRetrieval.Indexing.External
77
{
88
public class DictonaryBasedExternalBuildableIndex<T> : IExternalBuildableIndex<T> where T : notnull
99
{
10-
public static Func<Stream, IExternalBuildableIndex<T>> GetCreateMethod(int rangeThreshold)
11-
=> s => new DictonaryBasedExternalBuildableIndex<T>(rangeThreshold, s);
/// <summary>
/// Returns a factory that, for a given postings stream, creates an index backed by a
/// new <c>MixedPostingsListBuilder</c> configured with <paramref name="rangeThreshold"/>.
/// </summary>
/// <param name="rangeThreshold">Threshold passed to every builder the factory creates.</param>
public static Func<Stream, IExternalBuildableIndex<T>> GetCreateMethodWithMixedPostingsLists(int rangeThreshold)
{
    // The builder is constructed inside the delegate, so each created index gets its own instance.
    return stream =>
        new DictonaryBasedExternalBuildableIndex<T>(new MixedPostingsListBuilder<T>(rangeThreshold), stream);
}
12+
/// <summary>
/// Returns a factory that, for a given postings stream, creates an index backed by a
/// new <c>VarintPostingsListBuilder</c>.
/// </summary>
public static Func<Stream, IExternalBuildableIndex<T>> GetCreateMethodWithVarintPostingsLists()
{
    // The builder is constructed inside the delegate, so each created index gets its own instance.
    return stream =>
        new DictonaryBasedExternalBuildableIndex<T>(new VarintPostingsListBuilder<T>(), stream);
}
1215

1316
private readonly Stream postingsStream;
14-
private readonly MixedPostingsListBuilder<T> builder;
17+
private readonly IPostingsListBuilder<T> builder;
1518

16-
public DictonaryBasedExternalBuildableIndex(int rangeThreshold, Stream postingsStream)
19+
public DictonaryBasedExternalBuildableIndex(IPostingsListBuilder<T> postingListBuilder, Stream postingsStream)
1720
{
18-
builder = new MixedPostingsListBuilder<T>(rangeThreshold);
21+
builder = postingListBuilder;
1922
this.postingsStream = postingsStream;
2023
}
2124

@@ -26,14 +29,9 @@ public ExternalIndex<T> BuildExternalIndex()
2629
{
2730
var composer = new ExternalIndexComposer<T>(postingsStream);
2831

29-
composer.AddAllDocuments(builder.AllDocuments);
30-
31-
foreach (var postings in builder.RangedPostingsLists)
32-
{
33-
composer.AddPostingsList(postings.Key, postings.Value);
34-
}
32+
composer.AddAllDocuments(builder.Documents);
3533

36-
foreach (var postings in builder.UncompressedPostingsLists)
34+
foreach (var postings in builder.PostingsLists)
3735
{
3836
composer.AddPostingsList(postings.Key, postings.Value);
3937
}
@@ -42,6 +40,6 @@ public ExternalIndex<T> BuildExternalIndex()
4240
}
4341

4442
public void IndexTerm(DocumentId id, T term, int position)
45-
=> builder.Add(id, term);
43+
=> builder.AddTerm(id, term);
4644
}
4745
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
using Corpus;
2+
using System;
3+
using System.Collections.Generic;
4+
using System.Text;
5+
6+
namespace InformationRetrieval.Indexing.PostingsList
7+
{
/// <summary>
/// Common surface of postings list builders: accepts (document, term) pairs and exposes
/// the resulting document collection and per-term postings lists.
/// </summary>
/// <typeparam name="T">Term's type</typeparam>
public interface IPostingsListBuilder<T> where T : notnull
{
    /// <summary>Documents collection maintained by the builder.</summary>
    IReadOnlyCollection<DocumentId> Documents { get; }

    /// <summary>Postings lists keyed by term, each exposed as a read-only collection of document ids.</summary>
    IEnumerable<KeyValuePair<T, IReadOnlyCollection<DocumentId>>> PostingsLists { get; }

    /// <summary>Adds <paramref name="term"/> for the document <paramref name="id"/>.</summary>
    /// <param name="id">Id of the document containing the term.</param>
    /// <param name="term">The term to add.</param>
    void AddTerm(DocumentId id, T term);
}
15+
}

src/InformationRetrieval/Indexing/PostingsList/MixedPostingsListBuilder.cs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ namespace InformationRetrieval.Indexing.PostingsList
1010
/// and uncompressed postings list otherwise.
1111
/// </summary>
1212
/// <typeparam name="T">Term's type</typeparam>
13-
public class MixedPostingsListBuilder<T> where T : notnull
13+
public class MixedPostingsListBuilder<T> : IPostingsListBuilder<T> where T : notnull
1414
{
1515
private readonly int rangeThreshold;
1616
public RangePostingsList AllDocuments;
@@ -25,7 +25,25 @@ public MixedPostingsListBuilder(int rangeThreshold)
2525
this.rangeThreshold = rangeThreshold;
2626
}
2727

28-
public void Add(DocumentId id, T word)
/// <summary>
/// Read-only view of <see cref="AllDocuments"/>, exposed under the name used by
/// <c>IPostingsListBuilder</c>.
/// </summary>
public IReadOnlyCollection<DocumentId> Documents
{
    get { return AllDocuments; }
}
29+
/// <summary>
/// Lazily enumerates every postings list held by the builder: first the entries of
/// <see cref="RangedPostingsLists"/>, then those of <see cref="UncompressedPostingsLists"/>,
/// each exposed as a read-only collection of document ids.
/// </summary>
public IEnumerable<KeyValuePair<T, IReadOnlyCollection<DocumentId>>> PostingsLists
{
    get
    {
        foreach (var ranged in RangedPostingsLists)
        {
            yield return AsPostingsEntry(ranged.Key, ranged.Value);
        }

        foreach (var uncompressed in UncompressedPostingsLists)
        {
            yield return AsPostingsEntry(uncompressed.Key, uncompressed.Value);
        }
    }
}

// Pairs a term with its postings list, typed as a read-only collection.
private static KeyValuePair<T, IReadOnlyCollection<DocumentId>> AsPostingsEntry(
    T term, IReadOnlyCollection<DocumentId> postings)
    => new KeyValuePair<T, IReadOnlyCollection<DocumentId>>(term, postings);
45+
46+
public void AddTerm(DocumentId id, T word)
2947
{
3048
if (RangedPostingsLists.TryGetValue(word, out var blockList))
3149
{

0 commit comments

Comments
 (0)