Skip to content

Commit eb90664

Browse files
committed
implement MixedPostingsListBuilder
1 parent 0f4bd8a commit eb90664

File tree

2 files changed

+146
-0
lines changed

2 files changed

+146
-0
lines changed
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
using System;
2+
using System.Collections.Generic;
3+
4+
using Corpus;
5+
using InformationRetrieval.Indexing;
6+
using InformationRetrieval.Indexing.PostingsList;
7+
8+
namespace InformationRetrieval.Test.Indexing.PostingsList
9+
{
10+
/// <summary>
/// Runs the tests inherited from <c>IndexUnitTestsBase</c> against an index
/// whose postings are stored in a <see cref="MixedPostingsListBuilder{T}"/>.
/// </summary>
public class MixedPostingsListBuilderTest : IndexUnitTestsBase<MixedPostingsListBuilderTest.Index>
{
    // Threshold handed to every MixedPostingsListBuilder created by these tests.
    private const int rangeThreshold = 3;

    /// <summary>
    /// Creates a fresh <see cref="Index"/> and fills it from <paramref name="corpus"/>
    /// via <c>IndexHelper.BuildIndex</c>.
    /// </summary>
    protected override Index CreateIndex(string[][] corpus)
    {
        var result = new Index();
        IndexHelper.BuildIndex(result, corpus);
        return result;
    }

    /// <summary>
    /// Minimal buildable and searchable index that delegates all storage
    /// to a <see cref="MixedPostingsListBuilder{T}"/>.
    /// </summary>
    public class Index : IBuildableIndex<string>, ISearchableIndex<string>
    {
        private readonly MixedPostingsListBuilder<string> postings =
            new MixedPostingsListBuilder<string>(rangeThreshold);

        /// <summary>The index is searchable as-is, so building just returns this instance.</summary>
        public ISearchableIndex<string> Build() => this;

        /// <summary>Returns the builder's list of all added documents.</summary>
        public IEnumerable<DocumentId> GetAll() => postings.AllDocuments;

        /// <summary>
        /// Returns the size of the postings list stored for <paramref name="word"/>,
        /// checking the ranged lists first and the uncompressed lists second;
        /// 0 when the word is in neither.
        /// </summary>
        public int GetCount(string word)
        {
            return postings.RangedPostingsLists.TryGetValue(word, out var ranged)
                ? ranged.Count
                : postings.UncompressedPostingsLists.TryGetValue(word, out var plain)
                    ? plain.Count
                    : 0;
        }

        /// <summary>Returns the count reported by the builder's list of all documents.</summary>
        public int GetCount() => postings.AllDocuments.Count;

        /// <summary>
        /// Records that <paramref name="term"/> occurs in document <paramref name="id"/>.
        /// The <paramref name="position"/> argument is not used by this index.
        /// </summary>
        public void IndexTerm(DocumentId id, string term, int position) => postings.Add(id, term);

        /// <summary>
        /// Returns the postings list stored for <paramref name="word"/>,
        /// checking the ranged lists first and the uncompressed lists second;
        /// an empty sequence when the word is in neither.
        /// </summary>
        public IEnumerable<DocumentId> Search(string word)
        {
            IEnumerable<DocumentId> found = Array.Empty<DocumentId>();

            if (postings.RangedPostingsLists.TryGetValue(word, out var ranged))
            {
                found = ranged;
            }
            else if (postings.UncompressedPostingsLists.TryGetValue(word, out var plain))
            {
                found = plain;
            }

            return found;
        }
    }
}
76+
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
using System.Collections.Generic;
2+
3+
using Corpus;
4+
5+
namespace InformationRetrieval.Indexing.PostingsList
6+
{
7+
/// <summary>
/// Builds postings lists using a mixed strategy: each term starts with an
/// uncompressed list of document ids and is moved to a ranged postings list
/// once its most recent <c>rangeThreshold</c> ids form a run of consecutive ids.
/// </summary>
/// <typeparam name="T">Term's type</typeparam>
public class MixedPostingsListBuilder<T>
{
    // Length of the consecutive-id run that triggers promotion to a ranged list.
    private readonly int rangeThreshold;

    /// <summary>Receives every document id passed to <see cref="Add"/>.</summary>
    public RangePostingsList AllDocuments;

    /// <summary>Terms that have been promoted to a ranged postings list.</summary>
    public IDictionary<T, RangePostingsList> RangedPostingsLists;

    /// <summary>Terms that are still stored as a plain list of document ids.</summary>
    public IDictionary<T, IList<DocumentId>> UncompressedPostingsLists;

    /// <summary>
    /// Creates an empty builder.
    /// </summary>
    /// <param name="rangeThreshold">
    /// Number of consecutive document ids a term must accumulate before it is
    /// moved to a ranged postings list. Must be at least 1.
    /// </param>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// <paramref name="rangeThreshold"/> is less than 1.
    /// </exception>
    public MixedPostingsListBuilder(int rangeThreshold)
    {
        // A non-positive threshold would make Add index at or beyond the end of
        // the uncompressed list (ids[count - rangeThreshold]) on every call.
        // Fail here with a message that names the real cause instead.
        if (rangeThreshold < 1)
        {
            throw new System.ArgumentOutOfRangeException(
                nameof(rangeThreshold),
                rangeThreshold,
                "Range threshold must be at least 1.");
        }

        AllDocuments = new RangePostingsList();
        RangedPostingsLists = new Dictionary<T, RangePostingsList>();
        UncompressedPostingsLists = new Dictionary<T, IList<DocumentId>>();
        this.rangeThreshold = rangeThreshold;
    }

    /// <summary>
    /// Registers that <paramref name="word"/> occurs in document <paramref name="id"/>.
    /// </summary>
    /// <param name="id">Document containing the term.</param>
    /// <param name="word">The term being indexed.</param>
    public void Add(DocumentId id, T word)
    {
        if (RangedPostingsLists.TryGetValue(word, out var blockList))
        {
            // Already promoted: the ranged list takes the id directly.
            blockList.Add(id);
        }
        else
        {
            if (!UncompressedPostingsLists.TryGetValue(word, out var ids))
            {
                // First occurrence of this term.
                ids = new List<DocumentId>(1);
                UncompressedPostingsLists.Add(word, ids);
                ids.Add(id);
            }
            else if (ids[^1] != id)
            {
                // Skip the id when it repeats the most recently stored one.
                ids.Add(id);
            }

            // Promote when the newest id and the id (rangeThreshold - 1) slots
            // before it differ by exactly rangeThreshold - 1.
            // NOTE(review): this identifies a consecutive run only if ids arrive
            // in strictly increasing order - confirm against callers.
            int count = ids.Count;
            if (count >= rangeThreshold && id.Id - ids[count - rangeThreshold].Id == (rangeThreshold - 1))
            {
                MoveToIndex(word, ids);
            }
        }

        // Performed for every call, including repeated ids.
        AllDocuments.Add(id);
    }

    /// <summary>
    /// Moves <paramref name="word"/> from the uncompressed lists to the ranged
    /// lists, copying <paramref name="ids"/> into a new <see cref="RangePostingsList"/>.
    /// </summary>
    private void MoveToIndex(T word, IList<DocumentId> ids)
    {
        UncompressedPostingsLists.Remove(word);

        var list = new RangePostingsList();
        foreach (var id in ids)
        {
            list.Add(id);
        }

        RangedPostingsLists.Add(word, list);
    }
}
70+
}

0 commit comments

Comments
 (0)